In [2]:
import logging
import pandas as pd
import matplotlib.pyplot as plt

def setup_logging() -> None:
    """Route root-logger output at INFO level and above to '78-ipynb.log'.

    Each record is laid out as
    ``<time> - <level> - <calling function> | <message>``.
    Once configured, a confirmation record is written through the root logger.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(funcName)s | %(message)s'
    # The handler is created up front, so the log file is opened here.
    file_sink = logging.FileHandler('78-ipynb.log')

    logging.basicConfig(level=logging.INFO, format=record_layout, handlers=[file_sink])
    logging.info('Logging setup complete.')

def fetch_df(file_path: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, logging the attempt and its outcome.

    Args:
        file_path: Location of the CSV file handed to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: The file does not exist.
        pd.errors.EmptyDataError: The file contains no data.
        pd.errors.ParserError: The file content could not be parsed.
        Exception: Anything else raised while reading is logged and re-raised.
    """
    logging.info(f'Fetching DataFrame from {file_path}')

    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Choose the log text from the failure type; the exception itself is
        # always propagated unchanged to the caller.
        if isinstance(err, FileNotFoundError):
            problem = f'File not found in {file_path}: {err}'
        elif isinstance(err, pd.errors.EmptyDataError):
            problem = f'No data found in {file_path}: {err}'
        elif isinstance(err, pd.errors.ParserError):
            problem = f'Error parsing data in {file_path}: {err}'
        else:
            problem = f"An unexpected error occurred while reading {file_path}. Error: {err}"
        logging.error(problem)
        raise

    logging.info(f'Data successfully fetched from {file_path}')
    return frame
    
def log_df_info(df: pd.DataFrame) -> None:
    """Print a short structural summary of ``df`` to standard output.

    The summary lists, in order: the frame's shape, whether any cell is NaN,
    whether any row is a duplicate, and the dtype of every column.

    Args:
        df: The DataFrame to summarise.
    """
    # Each value is computed right before its line is printed, so the lines
    # appear one by one in this order.
    report = (
        ('DataFrame shape: ', lambda: df.shape),
        ('NaN values: ', lambda: df.isna().values.any()),
        ('Duplicate: ', lambda: df.duplicated().values.any()),
        ('Data Types:\n', lambda: df.dtypes),
    )
    for label, compute in report:
        print(f'{label}{compute()}')

# Explore and Clean the Data

In [3]:
def clean_df(in_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``in_df`` with numeric money columns and parsed dates.

    The columns ``USD_Production_Budget``, ``USD_Worldwide_Gross`` and
    ``USD_Domestic_Gross`` have every ``$`` and ``,`` character removed and
    are converted with ``pd.to_numeric``. ``Release_Date`` is converted with
    ``pd.to_datetime``. Values that cannot be converted become NaN / NaT
    (``errors='coerce'``).

    Args:
        in_df: Frame containing the four columns named above.

    Returns:
        A new DataFrame with the converted columns; ``in_df`` itself is left
        unmodified.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    money_columns = (
        'USD_Production_Budget',
        'USD_Worldwide_Gross',
        'USD_Domestic_Gross',
    )

    # Work on a copy so the caller's frame is not altered behind its back and
    # re-running the cell always starts from the same input.
    out_df = in_df.copy()

    for column in money_columns:
        # Raw string: '\$' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        stripped = out_df[column].replace(r'[\$,]', '', regex=True)
        out_df[column] = pd.to_numeric(stripped, errors='coerce')

    out_df['Release_Date'] = pd.to_datetime(out_df['Release_Date'], errors='coerce')

    return out_df

# Investigate Films with Zero Revenue

In [23]:
def challenge1(in_df: pd.DataFrame) -> None:
    """Print headline statistics for the budget and revenue columns.

    Output, in order: the mean of ``USD_Production_Budget``, the mean of
    ``USD_Worldwide_Gross``, then the full row holding the minimum worldwide
    gross, the minimum domestic gross, the maximum production budget and the
    maximum worldwide gross.

    Args:
        in_df: Frame with ``USD_Production_Budget``, ``USD_Worldwide_Gross``
            and ``USD_Domestic_Gross`` columns.
    """
    averages = (
        ('Average production budget', 'USD_Production_Budget'),
        ('Average worldwide gross revenue', 'USD_Worldwide_Gross'),
    )
    for heading, column in averages:
        mean_value = getattr(in_df, column).mean()
        print(f'{heading}: {mean_value: .2f}')

    # (heading, column to scan, Series method giving the label of the extreme row)
    extremes = (
        ('Minimum worldwide revenue', 'USD_Worldwide_Gross', 'idxmin'),
        ('Minimum domestic revenue', 'USD_Domestic_Gross', 'idxmin'),
        ('Highest production budget', 'USD_Production_Budget', 'idxmax'),
        ('Highest worldwide gross revenue', 'USD_Worldwide_Gross', 'idxmax'),
    )
    for heading, column, locator in extremes:
        row_label = getattr(getattr(in_df, column), locator)()
        print(f'{heading}:\n{in_df.loc[row_label]}')


if __name__ == "__main__":
    # Driver: configure logging, load the CSV, clean it, then report the
    # films that earned nothing, ordered from the largest budget down.
    setup_logging()

    file_path = 'input-78.csv'

    in_df = fetch_df(file_path)
    in_df = clean_df(in_df)

    # challenge1(in_df)

    # The filtered frames are built outside the f-strings: reusing the same
    # quote character inside an f-string expression is only accepted by
    # Python 3.12 and later.

    # challenge 2
    zero_domestic = in_df[in_df.USD_Domestic_Gross == 0].sort_values(
        by='USD_Production_Budget', ascending=False
    )
    print(f'Films grossed 0 domestic:\n{zero_domestic}')

    # challenge 3
    zero_worldwide = in_df[in_df.USD_Worldwide_Gross == 0].sort_values(
        by='USD_Production_Budget', ascending=False
    )
    print(f'Films grossed 0 worldwide:\n{zero_worldwide}')

    

Films grossed 0 domestic:
      Rank Release_Date                         Movie_Title  \
5388    96   2020-12-31                         Singularity   
5387   126   2018-12-18                             Aquaman   
5384   321   2018-09-03                   A Wrinkle in Time   
5385   366   2018-10-08                      Amusement Park   
5090   556   2015-12-31  Don Gato, el inicio de la pandilla   
...    ...          ...                                 ...   
4787  5371   2014-12-31                Stories of Our Lives   
3056  5374   2007-12-31                         Tin Can Man   
4907  5381   2015-05-19                    Family Motocross   
5006  5389   2015-09-29             Signed Sealed Delivered   
5007  5390   2015-09-29                A Plague So Pleasant   

      USD_Production_Budget  USD_Worldwide_Gross  USD_Domestic_Gross  
5388              175000000                    0                   0  
5387              160000000                    0                   0  
5384