In [4]:
import logging
import pandas as pd
import matplotlib.pyplot as plt

def setup_logging() -> None:
    """Point the root logger at the ``78-ipynb.log`` file.

    Records at INFO level and above are written using the layout
    ``<time> - <level> - <function name> | <message>``. Once the
    configuration has been requested, a confirmation message is logged.

    Note:
        ``logging.basicConfig`` does nothing when the root logger already
        has handlers attached, so calling this a second time (or after
        another library configured logging) leaves the existing setup as is.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(funcName)s | %(message)s'
    log_file_handler = logging.FileHandler('78-ipynb.log')

    logging.basicConfig(
        level=logging.INFO,
        format=record_layout,
        handlers=[log_file_handler],
    )
    logging.info('Logging setup complete.')

def fetch_df(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame, logging the outcome.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        pd.errors.EmptyDataError: the file contains no data to parse.
        pd.errors.ParserError: the file content could not be parsed.
        Exception: any other error raised while reading; it is logged and
            propagated unchanged.
    """
    logging.info('Fetching DataFrame from %s', file_path)

    try:
        df = pd.read_csv(file_path)
    # Each handler only adds a log line; the bare ``raise`` re-raises the
    # active exception without touching its original traceback.
    except FileNotFoundError as na_error:
        logging.error('File not found in %s: %s', file_path, na_error)
        raise
    except pd.errors.EmptyDataError as empty_error:
        logging.error('No data found in %s: %s', file_path, empty_error)
        raise
    except pd.errors.ParserError as parse_error:
        logging.error('Error parsing data in %s: %s', file_path, parse_error)
        raise
    except Exception as e:
        logging.error(
            'An unexpected error occurred while reading %s. Error: %s', file_path, e
        )
        raise

    logging.info('Data successfully fetched from %s', file_path)
    return df
    
def log_df_info(df: pd.DataFrame) -> None:
    """Print a short structural summary of ``df`` to standard output.

    Four items are printed, in order: the frame's shape, whether any cell
    is NaN, whether any row is a duplicate of an earlier row, and the
    dtype of every column.

    Args:
        df: The DataFrame to summarise. It is not modified.
    """
    print(f'DataFrame shape: {df.shape}')

    any_missing = df.isna().to_numpy().any()
    print(f'NaN values: {any_missing}')

    any_repeated_row = df.duplicated().to_numpy().any()
    print(f'Duplicate: {any_repeated_row}')

    column_types = df.dtypes
    print(f'Data Types:\n{column_types}')

# Explore and Clean the Data

In [7]:
if __name__ == "__main__":
    setup_logging()

    file_path = 'input-78.csv'

    in_df = fetch_df(file_path)
    # Summary of the frame exactly as loaded, before any type conversion.
    log_df_info(in_df)

    # The three USD columns get the same treatment: drop every "$" and ","
    # and convert what is left to numbers. The pattern is a raw string so
    # that "\$" reaches the regex engine as written instead of being an
    # invalid Python escape sequence.
    currency_columns = (
        'USD_Production_Budget',
        'USD_Worldwide_Gross',
        'USD_Domestic_Gross',
    )
    for currency_column in currency_columns:
        in_df[currency_column] = pd.to_numeric(
            in_df[currency_column].replace(r'[\$,]', '', regex=True),
            # Unparseable values become NaN rather than raising; the summary
            # printed below reports whether any NaN is present.
            errors='coerce',
        )

    # Unparseable dates become NaT rather than raising.
    in_df['Release_Date'] = pd.to_datetime(
        in_df['Release_Date'],
        errors='coerce',
    )

    # Summary after conversion, to confirm the new dtypes.
    log_df_info(in_df)

DataFrame shape: (5391, 6)
NaN values: False
Duplicate: False
Data Types:
Rank                      int64
Release_Date             object
Movie_Title              object
USD_Production_Budget    object
USD_Worldwide_Gross      object
USD_Domestic_Gross       object
dtype: object
DataFrame shape: (5391, 6)
NaN values: False
Duplicate: False
Data Types:
Rank                              int64
Release_Date             datetime64[ns]
Movie_Title                      object
USD_Production_Budget             int64
USD_Worldwide_Gross               int64
USD_Domestic_Gross                int64
dtype: object
