# import statements

# In [29]:
import logging
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


def setup_logging() -> None:
    """Configure the root logger to write INFO-and-above records to a file.

    A single ``logging.FileHandler`` targeting ``77-ipynb.log`` (relative to
    the current working directory) is handed to ``logging.basicConfig``; each
    record carries the timestamp, level, calling function name and message.
    A confirmation record is emitted once configuration has been requested.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(funcName)s | %(message)s'
    file_sink = logging.FileHandler('77-ipynb.log')

    logging.basicConfig(level=logging.INFO, format=record_layout, handlers=[file_sink])

    logging.info('Logging setup complete.')


def fetch_df(file_path: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, logging the outcome.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: The file does not exist.
        pd.errors.EmptyDataError: The file contains no data to parse.
        pd.errors.ParserError: The file could not be tokenised as CSV.
        Exception: Any other failure from ``pd.read_csv``; it is logged and
            re-raised unchanged.

    Side effects:
        On success, sets the process-wide ``pd.options.display.float_format``
        so floats are displayed with thousands separators and two decimals.
    """
    logging.info('Initiating reading csv')

    try:
        loaded_df = pd.read_csv(file_path)
    except FileNotFoundError as notfound_err:
        logging.error('File not found at: %s. Error: %s', file_path, notfound_err)
        # A bare ``raise`` re-raises the active exception as-is, without
        # adding a second traceback entry for this frame.
        raise
    except pd.errors.EmptyDataError as empty_err:
        logging.error('File empty: %s', empty_err)
        raise
    except pd.errors.ParserError as parse_err:
        logging.error('Parse error: %s', parse_err)
        raise
    except Exception as err:
        # Boundary catch-all: record the failure, then let it propagate.
        logging.error('Unexpected error: %s', err)
        raise
    else:
        logging.info('Dataframe loaded successfully from %s', file_path)
        pd.options.display.float_format = '{:,.2f}'.format
        return loaded_df

# Explore and Clean the Dataset

# In [30]:
def explore_df(in_df: pd.DataFrame) -> None:
    """Print a structural overview of ``in_df``.

    Reports, in order: the shape, the list of column names, the per-column
    dtypes, whether any value at all is missing, and the per-column count of
    missing values.
    """
    frame_shape = in_df.shape
    print(f'Dataframe shape: {frame_shape}')

    column_names = in_df.columns.to_list()
    print(f'Column names: {column_names}')
    column_types = in_df.dtypes
    print(f'Data types: {column_types}')

    # One boolean mask serves both the "any missing?" flag and the per-column tally.
    missing_mask = in_df.isna()
    print(f'NaN: {missing_mask.values.any()}')
    print(f'NaN details:\n{missing_mask.sum()}')


def explore_df_data(in_df: pd.DataFrame) -> None:
    """Print the year range of the data and the rows missing key fields.

    Reads the ``year``, ``category``, ``laureate_type``, ``birth_date``,
    ``full_name`` and ``organization_name`` columns of ``in_df`` and prints:

    * the smallest and largest ``year``;
    * the rows whose ``birth_date`` is missing;
    * the rows whose ``organization_name`` is missing.
    """
    print(f'Nobel prize first awarded: {in_df.year.min()}')
    print(f'Nobel prize latest awarded: {in_df.year.max()}')

    birth_cols = ['year', 'category', 'laureate_type',
                  'birth_date', 'full_name', 'organization_name']
    # Row mask and column list go through a single ``.loc`` call rather than
    # chained ``.loc[mask][cols]`` indexing.
    missing_birth = in_df.loc[in_df.birth_date.isna(), birth_cols]
    print(f'NaN Birth Date:\n{missing_birth}')

    org_cols = ['year', 'category', 'laureate_type', 'full_name', 'organization_name']
    missing_org = in_df.loc[in_df.organization_name.isna(), org_cols]
    print(f'NaN Org Name:\n{missing_org}')


def conv_df_dtype(in_df: pd.DataFrame) -> pd.DataFrame:
    """Convert ``birth_date`` to datetimes and add a numeric ``share_pct`` column.

    ``in_df`` is modified in place and the same object is returned.

    * ``birth_date`` is parsed with ``pd.to_datetime``; unparseable values
      become ``NaT``.
    * ``prize_share`` strings are split on ``'/'`` and ``share_pct`` is the
      first piece divided by the second (so ``'1/4'`` yields ``0.25``);
      pieces that are not numeric become ``NaN``.
    """
    in_df.birth_date = pd.to_datetime(in_df.birth_date, errors='coerce')

    split_share = in_df.prize_share.str.split('/', expand=True)
    numerator = pd.to_numeric(split_share[0], errors='coerce')
    denominator = pd.to_numeric(split_share[1], errors='coerce')
    in_df['share_pct'] = numerator / denominator

    return in_df


# if __name__ == '__main__':
#     setup_logging()

#     in_df = fetch_df('input-79.csv')
#     explore_df(in_df)
#     explore_df_data(in_df)
#     in_df = conv_df_dtype(in_df)


# plotly Bar & Donut Charts: Analyse Prize Categories & Women Winning Prizes

# In [None]:
def generate_gender_donut(in_df: pd.DataFrame) -> None:
    """Show a donut chart of how many rows fall under each value of ``sex``.

    The counts come from ``in_df.sex.value_counts()``. The chart is a
    ``px.pie`` with a 0.5 hole, with each slice annotated outside the ring by
    its percentage and label. Start and completion are logged at INFO level.
    """
    logging.info('Initiating donut figure generation')

    gender_count = in_df.sex.value_counts()
    # Slice names are supplied through ``names``. ``labels`` is meant for a
    # dict of column-name -> display-name overrides, so it is not passed here.
    donut_fig = px.pie(
        names=gender_count.index,
        values=gender_count.values,
        title='Nobel Prize by Gender',
        hole=0.5,
    )
    donut_fig.update_traces(
        textposition='outside',
        textinfo='percent+label',
    )
    donut_fig.show()

    logging.info('Donut figure generation complete')


def explore_female_df(in_df: pd.DataFrame) -> None:
    """Print summary information about the rows whose ``sex`` is ``'Female'``.

    The matching rows are sorted by ``year`` (ascending) and re-indexed from
    zero, then three things are printed:

    * the first three rows;
    * the count of rows per ``category``;
    * ``birth_country``, ``birth_country_current`` and ``organization_name``
      for the first three rows.
    """
    # Boolean indexing, sort_values and reset_index each return a new frame,
    # so no explicit copy is needed before reading from the result.
    female_df = (
        in_df[in_df.sex == 'Female']
        .sort_values(by='year', ascending=True)
        .reset_index(drop=True)
    )
    print(f'First three female nobel info:\n{female_df.head(3)}')

    print(f'Prize won for:\n{female_df.category.value_counts()}')

    # The column selection lives outside the f-string: reusing the enclosing
    # quote character inside a replacement field is a SyntaxError before
    # Python 3.12.
    origin_cols = ['birth_country', 'birth_country_current', 'organization_name']
    origin_preview = female_df[origin_cols].head(3)
    print(f'First 3 female birth country and organization:\n{origin_preview}')


def multiple_nobel(in_df: pd.DataFrame) -> None:
    """Print every row belonging to a ``full_name`` that occurs more than once.

    If no name repeats, ``'No multiple winners'`` is printed instead.
    Otherwise the ``year``, ``category``, ``laureate_type`` and ``full_name``
    columns of the matching rows are printed without the index, ordered by
    name and then by year.
    """
    logging.info('Checking for multiple Nobel winners')

    # Names whose occurrence count exceeds one.
    name_counts = in_df['full_name'].value_counts()
    repeat_names = name_counts.loc[name_counts > 1].index

    if repeat_names.empty:
        print('No multiple winners')
        return

    # Keep only the repeat winners' rows and the columns worth showing.
    shown_columns = ['year', 'category', 'laureate_type', 'full_name']
    is_repeat_winner = in_df['full_name'].isin(repeat_names)
    repeat_rows = in_df.loc[is_repeat_winner, shown_columns]
    repeat_rows = repeat_rows.sort_values(by=['full_name', 'year'])

    print(f'Multiple Nobel Prize winners:\n{repeat_rows.to_string(index=False)}')


def gen_category_bar(ind_df: pd.DataFrame) -> None:
    """Show a bar chart of the number of rows per ``category``.

    Args:
        ind_df: Frame with a ``category`` column. The counts are taken from
            this argument, not from any module-level variable.

    The counts from ``value_counts()`` are reshaped into a two-column frame
    (``Category``, ``Count``) and drawn with ``px.bar``. Bars are coloured by
    count on the ``Aggrnyl`` scale, and the colour-scale legend is hidden.
    """
    logging.info('Initiating Bar chart for categories')

    category_df = ind_df.category.value_counts().reset_index()
    # Fix the column names explicitly so the px.bar arguments below do not
    # depend on how reset_index names the value_counts columns.
    category_df.columns = ['Category', 'Count']

    bar_fig = px.bar(
        data_frame=category_df,
        x='Category',
        y='Count',
        title='Nobel prize distribution by Category',
        labels={'Category': 'Category', 'Count': 'Nobel count'},
        color='Count',
        color_continuous_scale='Aggrnyl',
    )
    bar_fig.update_layout(coloraxis_showscale=False)
    bar_fig.show()


def gen_bar_gender_category(in_df: pd.DataFrame) -> None:
    """Show a bar chart of prize counts per category, coloured by ``sex``.

    Rows are grouped by ``category`` and ``sex``, the non-null ``prize``
    values in each group are counted, and the groups are ordered from the
    largest count to the smallest before being drawn with ``px.bar``.
    """
    grouped = in_df.groupby(['category', 'sex'], as_index=False)
    prize_counts = grouped.agg({'prize': 'count'})
    cat_gender_df = prize_counts.sort_values('prize', ascending=False)

    axis_titles = {'category': 'Nobel prize category', 'prize': 'Number of Prizes'}
    bar_fig = px.bar(
        data_frame=cat_gender_df,
        x='category',
        y='prize',
        color='sex',
        title='Category distribution by Gender',
        labels=axis_titles,
    )
    bar_fig.show()
    

if __name__ == '__main__':
    # Script entry point: configure logging, load and type-convert the data,
    # then run whichever analysis step is currently enabled below.
    setup_logging()

    # Load the CSV, then parse birth dates and derive the share_pct column.
    # Note that in_df is bound at module level here.
    in_df = fetch_df('input-79.csv')
    in_df = conv_df_dtype(in_df)

    # The commented-out calls below are other analysis steps that can be
    # re-enabled individually; only the last call is active.
    # generate_gender_donut(in_df)
    
    # explore_female_df(in_df)

    # multiple_nobel(in_df)

    # gen_category_bar(in_df)

    # print(f'First economics award info:\n{in_df[in_df.category == 'Economics'].sort_values(by="year").iloc[0]}')

    # Active step: bar chart of prize counts per category, split by sex.
    gen_bar_gender_category(in_df)