# import statements

In [1]:
import logging
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


def setup_logging() -> None:
    """Set up root-logger output to the ``77-ipynb.log`` file.

    The root logger is configured at INFO level with a format that carries
    the timestamp, level name, emitting function name and message. A
    confirmation record is logged once the configuration call returns.

    Note: ``logging.basicConfig`` leaves an already-configured root logger
    untouched, so calling this a second time does not add another handler.
    """
    record_format = '%(asctime)s - %(levelname)s - %(funcName)s | %(message)s'
    file_handler = logging.FileHandler('77-ipynb.log')

    logging.basicConfig(
        level=logging.INFO,
        format=record_format,
        handlers=[file_handler],
    )

    logging.info('Logging setup complete.')


def fetch_df(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, logging success or failure.

    On success this also sets the global pandas option
    ``display.float_format`` so floats are rendered with thousands
    separators and two decimal places.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        pd.errors.EmptyDataError: The file contains no data to parse.
        pd.errors.ParserError: The file content could not be parsed.
        Exception: Any other error raised by ``pd.read_csv``; it is logged
            and propagated unchanged.
    """
    logging.info('Initiating reading csv')

    try:
        in_df = pd.read_csv(file_path)
    except FileNotFoundError as notfound_err:
        logging.error('File not found at: %s. Error: %s', file_path, notfound_err)
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise
    except pd.errors.EmptyDataError as empty_err:
        logging.error('File empty: %s', empty_err)
        raise
    except pd.errors.ParserError as parse_err:
        logging.error('Parse error: %s', parse_err)
        raise
    except Exception as err:
        # Catch-all boundary: record the failure, then let it propagate.
        logging.error('Unexpected error: %s', err)
        raise
    else:
        logging.info('Dataframe loaded successfully from %s', file_path)
        # Global display setting; affects how every DataFrame is printed afterwards.
        pd.options.display.float_format = '{:,.2f}'.format
        return in_df

# Explore and Clean the Dataset

In [None]:
def explore_df(in_df: pd.DataFrame) -> None:
    """Print a structural overview of ``in_df``.

    The report covers, in order: the shape, the column names, the dtype of
    each column, whether any value is missing, and the per-column count of
    missing values.

    Args:
        in_df: DataFrame to describe; it is only read, never modified.
    """
    # Each entry is printed on its own, matching one print() call per item.
    report_lines = [
        f'Dataframe shape: {in_df.shape}',
        f'Column names: {in_df.columns.to_list()}',
        f'Data types: {in_df.dtypes}',
        f'NaN: {in_df.isna().values.any()}',
        f'NaN details:\n{in_df.isna().sum()}',
    ]

    for report_line in report_lines:
        print(report_line)


def explore_df_data(in_df: pd.DataFrame) -> None:
    """Print the award year range and the rows with missing key fields.

    Prints the earliest and latest values of the ``year`` column, then the
    rows whose ``birth_date`` is missing and the rows whose
    ``organization_name`` is missing, each restricted to a small set of
    identifying columns.

    Args:
        in_df: DataFrame with at least the columns ``year``, ``category``,
            ``laureate_type``, ``birth_date``, ``full_name`` and
            ``organization_name``. It is only read, never modified.
    """
    print(f'Nobel prize first awarded: {in_df.year.min()}')
    print(f'Nobel prize latest awarded: {in_df.year.max()}')

    birth_cols = ['year', 'category', 'laureate_type',
                  'birth_date', 'full_name', 'organization_name']
    org_cols = ['year', 'category', 'laureate_type', 'full_name', 'organization_name']

    # Select rows and columns in a single .loc call rather than chaining two indexers.
    missing_birth = in_df.loc[in_df.birth_date.isna(), birth_cols]
    print(f'NaN Birth Date:\n{missing_birth}')

    missing_org = in_df.loc[in_df.organization_name.isna(), org_cols]
    print(f'NaN Org Name:\n{missing_org}')


def conv_df_dtype(in_df: pd.DataFrame) -> pd.DataFrame:
    """Convert ``birth_date`` to datetimes and derive ``share_pct``.

    ``in_df`` is modified in place and the same object is returned.

    * ``birth_date`` is parsed with ``pd.to_datetime``; values that cannot
      be parsed become ``NaT``.
    * ``share_pct`` is added as the numeric quotient of the two parts of
      ``prize_share`` split on ``'/'`` (e.g. ``'1/4'`` gives ``0.25``);
      parts that are not numeric yield ``NaN``.

    Args:
        in_df: DataFrame with ``birth_date`` and ``prize_share`` columns.

    Returns:
        The same ``in_df`` object, after the changes above.
    """
    in_df['birth_date'] = pd.to_datetime(in_df.birth_date, errors='coerce')

    # Column 0 holds the text before the '/', column 1 the text after it.
    pieces = in_df.prize_share.str.split('/', expand=True)
    numerator = pd.to_numeric(pieces[0], errors='coerce')
    denominator = pd.to_numeric(pieces[1], errors='coerce')
    in_df['share_pct'] = numerator.div(denominator)

    return in_df


if __name__ == '__main__':
    # Configure file logging first so the load step below is recorded.
    setup_logging()

    # Load the dataset, print its structure and missing-value summaries on the
    # raw frame, then convert column types. `in_df` stays bound at module level.
    in_df = fetch_df('input-79.csv')
    explore_df(in_df)
    explore_df_data(in_df)
    in_df = conv_df_dtype(in_df)


Dataframe shape: (962, 16)
Column names: ['year', 'category', 'prize', 'motivation', 'prize_share', 'laureate_type', 'full_name', 'birth_date', 'birth_city', 'birth_country', 'birth_country_current', 'sex', 'organization_name', 'organization_city', 'organization_country', 'ISO']
Data types: year                      int64
category                 object
prize                    object
motivation               object
prize_share              object
laureate_type            object
full_name                object
birth_date               object
birth_city               object
birth_country            object
birth_country_current    object
sex                      object
organization_name        object
organization_city        object
organization_country     object
ISO                      object
dtype: object
NaN: True
NaN details:
year                       0
category                   0
prize                      0
motivation                88
prize_share                0
laureate_type 