In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.model_selection import train_test_split
from itertools import chain
import ast
from itertools import combinations
from collections import Counter, defaultdict
import eda as e #code from the original EDA we started, extracted from Oviya's notebook
# Switch Altair's active data transformer to "vegafusion" for every chart built after this point.
alt.data_transformers.enable("vegafusion")
import ast  # NOTE(review): duplicate of the `import ast` a few lines above in this cell
import warnings
# Intended to hide warnings that come from Altair.
# NOTE(review): called with only the 'ignore' action and no category/module filter, so this
# silences every Python warning raised after this line, not just Altair's.
warnings.filterwarnings('ignore') #to ignore warnings that come from Altair


In [16]:
#As per Professor Paulik's feedback: Show each feature category as 100% and stack the injury types as proportions within that category

# Injury levels, in the order used for the ordered categorical (and the chart legend) below
injury_order = ['No Injury', 'Minor', 'Serious', 'Fatal']

# Re-type the injury column as an ordered categorical restricted to `injury_order`;
# any value outside these four labels becomes NaN.
injury_dtype = pd.CategoricalDtype(categories=injury_order, ordered=True)
e.train_df['ExtentOfInjuryCode'] = e.train_df['ExtentOfInjuryCode'].astype(injury_dtype)


def _parse_category_values(raw_value, is_list_style):
    """Expand one cell of a feature column into the category values it holds.

    Parameters
    ----------
    raw_value : object
        The cell value taken from the feature column.
    is_list_style : bool
        True when the column stores several categories per row (a list, or the
        string repr of a list).

    Returns
    -------
    list or None
        The category values found in the cell, or ``None`` when a list-style
        cell could not be parsed and the row should be skipped.
    """
    if not is_list_style:
        return [raw_value]
    # Already a real list (e.g. the column was parsed upstream): use it directly
    # rather than letting literal_eval reject it and dropping the row.
    if isinstance(raw_value, list):
        return raw_value
    try:
        parsed = ast.literal_eval(raw_value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of literal_eval on malformed input.
        return None
    return parsed if isinstance(parsed, list) else [parsed]


def _injury_proportion_chart(df_chart, col, category_order, injury_levels):
    """Build the interactive 100% stacked bar chart for one feature.

    Parameters
    ----------
    df_chart : pandas.DataFrame
        Long-format data with 'Category', 'ExtentOfInjuryCode' and 'Percentage' columns.
    col : str
        Feature name, used in the chart title.
    category_order : list of str
        Left-to-right order of the bars on the x axis.
    injury_levels : list of str
        Injury labels in legend order; also used as the colour-scale domain.

    Returns
    -------
    altair.Chart
    """
    # Clicking a segment keeps its colour and greys out every other segment.
    selector = alt.selection_point(encodings=['x', 'color'])

    injury_color = alt.Color(
        'ExtentOfInjuryCode:N',
        sort=injury_levels,
        title='Injury Type',
        scale=alt.Scale(
            domain=injury_levels,
            # green for no injury, then light red to dark red as the level escalates
            range=['lightgreen', '#ff9999', '#ff4c4c', '#990000'],
        ),
    )

    return alt.Chart(df_chart).mark_bar().encode(
        # Every normalized bar has the same height, so sorting by y carries no
        # information; order the bars by how frequent each category is instead.
        x=alt.X('Category:N', sort=category_order),
        # stack='normalize' rescales each bar to the 0-1 range; format the axis
        # as percentages so the ticks agree with the "(%)" title.
        y=alt.Y(
            'Percentage:Q',
            stack='normalize',
            title='Injury Proportion (%)',
            axis=alt.Axis(format='%'),
        ),
        color=alt.condition(selector, injury_color, alt.value('lightgray')),
        tooltip=['Category:N', 'ExtentOfInjuryCode:N', alt.Tooltip('Percentage:Q', format='.1f')]
    ).add_params(
        selector
    ).properties(
        title=f"'{col}' by Extent of Injury (Top 10 Categories)",
        width=600,
        height=400
    )


# One chart per feature: the 10 most frequent categories, each shown as a 100% bar
# split by injury level.
for col in e.all_vars:
    is_list_style = col in e.list_style_vars
    count_dict = defaultdict(Counter)  # category -> Counter of injury level
    skipped_rows = 0

    # Only rows where both the feature and the injury level are present.
    pairs = e.train_df[[col, 'ExtentOfInjuryCode']].dropna()
    for val, injury in pairs.itertuples(index=False, name=None):
        parsed_vals = _parse_category_values(val, is_list_style)
        if parsed_vals is None:
            skipped_rows += 1
            continue
        for v in parsed_vals:
            count_dict[str(v)][injury] += 1

    if skipped_rows:
        # Report dropped rows instead of discarding them silently.
        print(f"'{col}': skipped {skipped_rows} row(s) whose value could not be parsed")
    if not count_dict:
        # Nothing to plot; an empty frame would have no columns for Altair to encode.
        print(f"'{col}': no usable values, chart skipped")
        continue

    # Get top 10 categories by total count (most frequent first)
    totals = {category: sum(counts.values()) for category, counts in count_dict.items()}
    top_10 = sorted(totals, key=totals.get, reverse=True)[:10]

    # Build data for chart: one row per (category, injury level), including zero shares
    chart_data = [
        {
            'Category': category,
            'ExtentOfInjuryCode': injury,
            'Percentage': (count_dict[category][injury] / totals[category]) * 100
        }
        for category in top_10
        for injury in injury_order
    ]

    df_chart = pd.DataFrame(chart_data)

    # Create 100% stacked bar chart
    chart = _injury_proportion_chart(df_chart, col, top_10, injury_order)

    chart.show()
