<h1 align="center" style="margin-bottom: 20px;">Machine Learning 2024-25</h1>
<h3 align="center" style="margin-top: 20px; margin-bottom: 20px;">To Grant or Not to Grant: Deciding on Compensation Benefits</h3>
<h5 align="center" style="margin-top: 20px; margin-bottom: 0px;">Notebook 4: Open-Ended Section</h5>

### Group 38 - Members:
- Ana Marta Azinheira  - 20240496@novaims.unl.pt
- Braulio Damba - 20240007@novaims.unl.pt
- Catarina Ribeirinha - 20240507@novaims.unl.pt
- Marco Galão  - r20201545@novaims.unl.pt
- Rodrigo Sardinha - 20211627@novaims.unl.pt

# Table of Contents

* [1. Imports](#imports)
  * [1.1. Import the Libraries](#import_libraries)
  * [1.2. Import the CV Returns and Best Model](#import_cv_returns_best_model)

* [2. Open-Ended Section](#open_ended)
  * [2.1 Treatment from Notebook 2](#treatment_nb2)
  * [2.2 Treatment and Prediction from Notebook 3](#treatment_prediction_nb3)

# 1. Imports <a id="imports"></a>

## 1.1. Import the Libraries <a class="anchor" id="import_libraries"></a>

In [1]:
# Cross Validation Utils
# NOTE(review): this star import is the only import executed before `sns`, `joblib`, `pd`,
# `np` and `re` are used further down, so those names (as well as helpers such as
# `adjust_dates`, `winsorization` or `feats_dict`) are expected to come from `utils`.
# TODO confirm, and consider explicit imports so each name's origin is visible.
from utils import *

# Visualization Settings
# Ask the inline backend for 'retina' figure output, then apply seaborn's default theme.
%config InlineBackend.figure_format = 'retina'
sns.set()

# Omit Warnings
# NOTE: this filter ignores every warning category for the rest of the session,
# including deprecation / FutureWarning messages emitted by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

## 1.2. Import the CV Returns and Best Model <a class="anchor" id="import_cv_returns_best_model"></a>

In [2]:
# Restore the persisted artefacts from disk.
# NOTE(review): joblib.load unpickles the file content, so only load .pkl files from a trusted source.
cv_returns = joblib.load('cv_returns.pkl')   # dictionary with the cv returns
best_model = joblib.load('best_model.pkl')   # best model (already trained)

# Expose every entry of the cv returns under its own variable name
(
    winsorization_bounds,
    imputers,
    ordinal_encoders,
    freq_encoders,
    fill_values_freq_encoding,
    scaler,
    lencoder,
    final_selected_features,
) = (
    cv_returns[key]
    for key in (
        "winsorization_bounds",
        "imputers",
        "ordinal_encoders",
        "freq_encoders",
        "fill_values_freq_encoding",
        "scaler",
        "lencoder",
        "final_selected_features",
    )
)

# 2. Open-Ended Section <a id="open_ended"></a>

In [3]:
# df_new_inputs = pd.read_csv('df_new_inputs.csv', sep=',', index_col='Claim Identifier')
# df_new_inputs.head()

In [4]:
# Read the claims to score, using the claim identifier as the row index
df_new_inputs = pd.read_csv(
    '../data/test_data.csv',
    sep=',',
    index_col='Claim Identifier',
)

# Keep an independent copy of the data exactly as loaded, before any treatment is applied
df_new_inputs_with_predictions = df_new_inputs.copy()

# Preview the first rows
df_new_inputs.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,2022-12-24,19,N,2023-01-02,N,,2003.0,2023-01-02,,INDEMNITY INSURANCE CO OF,...,IV,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,10466,1
6166141,2022-11-20,19,N,2023-01-02,N,,2003.0,2023-01-02,,A I U INSURANCE COMPANY,...,IV,,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,11691,1
6165907,2022-12-26,59,N,2023-01-02,N,0.0,1963.0,2022-12-31,,AMGUARD INSURANCE COMPANY,...,III,,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,10604,0
6166047,2022-12-28,55,N,2023-01-02,N,0.0,0.0,2023-01-02,,INDEMNITY INS. OF N AMERICA,...,IV,,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,11411,6
6166102,2022-12-20,25,N,2023-01-02,N,0.0,1997.0,2022-12-31,,NEW HAMPSHIRE INSURANCE CO,...,IV,,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,11212,5


## 2.1 Treatment from Notebook 2 <a id="treatment_nb2"></a>

In [5]:
# -------------------- Drop columns with a lot of missing values and replace placeholder values with NaN --------------------

# Drop columns that are not needed
df_new_inputs = df_new_inputs.drop(columns=["OIICS Nature of Injury Description", "IME-4 Count"])

# Replace placeholder values with NaN
# Maps each column to the placeholder string that stands for "missing" in that column.
placeholder_replacements = {
    "Carrier Type": "UNKNOWN",
    "Gender": "U",
    "Medical Fee Region": "UK",
    "Alternative Dispute Resolution": "U",
    "County of Injury": "UNKNOWN"
}
for col, placeholder in placeholder_replacements.items():
    # Assign the result back rather than calling replace(..., inplace=True) on the
    # selected column: the in-place call acts on an intermediate object, which pandas
    # deprecates and which leaves the DataFrame unchanged under Copy-on-Write.
    df_new_inputs[col] = df_new_inputs[col].replace(placeholder, np.nan)

# -------------------- Adjust data types for columns --------------------

# Column groups, one per kind of conversion
columns_object_to_datetime = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']
code_columns = ['Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']
columns_float_to_int = ['Age at Injury', 'Birth Year', 'Number of Dependents']
columns_object_to_binary = ['Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator']

# Date columns: object -> datetime (missing dates become NaT)
df_new_inputs = df_new_inputs.assign(
    **{date_col: pd.to_datetime(df_new_inputs[date_col]) for date_col in columns_object_to_datetime}
)

# Code columns: float -> nullable Int64 -> object
df_new_inputs = (
    df_new_inputs
    .astype(dict.fromkeys(code_columns, 'Int64'))
    .astype(dict.fromkeys(code_columns, object))
)

# Count-like columns: float -> nullable Int64 (keeps missing values)
df_new_inputs = df_new_inputs.astype(dict.fromkeys(columns_float_to_int, 'Int64'))

# Yes/No columns: 'Y' -> 1, 'N' -> 0, anything else -> missing
df_new_inputs = df_new_inputs.assign(
    **{
        flag_col: df_new_inputs[flag_col].map({'Y': 1, 'N': 0}).astype('Int64')
        for flag_col in columns_object_to_binary
    }
)

# Gender: encode as the binary 'Male' column ('M' -> 1, 'F' -> 0) and remove the original column
df_new_inputs = (
    df_new_inputs
    .assign(Male=df_new_inputs["Gender"].map({'M': 1, 'F': 0}).astype('Int64'))
    .drop(columns=['Gender'])
)

# -------------------- Adjust date columns for consistency --------------------

# Run the date adjustment helper on every row
df_new_inputs = df_new_inputs.apply(adjust_dates, axis=1)

# -------------------- Additional adjustments for specific columns --------------------

# Columns to cast (back) to nullable integers after the row-wise apply
columns_object_to_int = ["Birth Year", "Alternative Dispute Resolution"]

df_new_inputs = (
    df_new_inputs
    .astype({"Male": 'Int64'})                              # 'Male' back to nullable integers
    .astype(dict.fromkeys(columns_object_to_int, 'Int64'))  # object -> nullable integers
    .drop(columns=['Birth Year', 'Number of Dependents'])   # columns that are no longer needed
)

# -------------------- Validate and clean ZIP codes --------------------

# A ZIP code is valid when its string form matches five digits
valid_zip_pattern = r'^\d{5}$'


def _keep_valid_zip(zip_value):
    """Return the value unchanged when its string form matches the ZIP pattern, otherwise NaN."""
    if re.match(valid_zip_pattern, str(zip_value)) is None:
        return np.nan
    return zip_value


df_new_inputs['Zip Code'] = df_new_inputs['Zip Code'].apply(_keep_valid_zip)

# -------------------- Handle "WCIO Part Of Body Code" adjustments --------------------

# The codes are compared exactly as stored. pd.to_numeric returns a new object rather
# than converting in place, so calling it without assigning the result changes nothing;
# that discarded element-wise call is therefore not performed here.

# Rows whose code is -9 or 90; computed BEFORE recoding so both groups get the new description
condition = df_new_inputs['WCIO Part Of Body Code'].isin([-9, 90])

# Recode -9 to 90, then give every affected row the same description
df_new_inputs.loc[df_new_inputs['WCIO Part Of Body Code'] == -9, 'WCIO Part Of Body Code'] = 90
df_new_inputs.loc[condition, 'WCIO Part Of Body Description'] = 'Multiple Body Parts'

# Make sure "WCIO Part Of Body Code" ends up with object dtype
df_new_inputs['WCIO Part Of Body Code'] = df_new_inputs['WCIO Part Of Body Code'].astype('object')

## 2.2 Treatment and Prediction from Notebook 3 <a id="treatment_prediction_nb3"></a>

In [6]:
# -------------------- Treatment --------------------
# Every step below is called with the objects loaded from `cv_returns` (bounds, imputers,
# encoders, scaler) instead of fitting new ones on the new inputs.
# NOTE(review): the helpers and `feats_dict` are not defined in this notebook; their exact
# behaviour and return values should be checked in the module that provides them.

# Winsorization
for col in feats_dict["winsorization"]:
    # Apply the same bounds to the new inputs
    # (only the first returned element, the DataFrame, is kept)
    df_new_inputs, _ = winsorization(df_new_inputs, col, bounds=winsorization_bounds)

# Missing values imputation (using the loaded imputers)
df_new_inputs, _ = impute_missing_values(df_new_inputs, feats_dict, imputers=imputers)

# Feature engineering
df_new_inputs = create_features(df_new_inputs)

# Drop the code, description and date columns listed in feats_dict
df_new_inputs.drop(columns = feats_dict["codes_drop"] + feats_dict["descriptions_drop"] + feats_dict["dates_drop"], inplace=True)

# Ordinal encoding (the second returned element is the transformed DataFrame)
_, df_new_inputs, _ = ordinal_encoder(feats_dict, data=df_new_inputs, encoders=ordinal_encoders)

# Frequency encoding (the second returned element is the transformed DataFrame)
_, df_new_inputs, _, _ = frequency_encoder(feats_dict, data=df_new_inputs, encoders=freq_encoders, fill_values=fill_values_freq_encoding)

# Data scaling (the second returned element is the transformed DataFrame)
_, df_new_inputs, _ = scale_data(data=df_new_inputs, scaler=scaler)

# Feature selection: keep only the columns stored under "final_selected_features"
df_new_inputs = df_new_inputs[final_selected_features]

# -------------------- Prediction --------------------

# Predict the 'Claim Injury Type' for the new inputs dataset by using the trained model
predictions = best_model.predict(df_new_inputs)

# Decode the predicted labels back to their original categorical values
decoded_predictions = lencoder.inverse_transform(predictions)

# Add the predictions as a new column to the copy of the data made right after loading
# NOTE(review): assumes the treatment above kept every row in its original order, so the
# predictions line up with the rows of the untouched copy; TODO confirm.
df_new_inputs_with_predictions['Claim Injury Type'] = decoded_predictions

# Display the DataFrame
df_new_inputs_with_predictions.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,2022-12-24,19,N,2023-01-02,N,,2003.0,2023-01-02,,INDEMNITY INSURANCE CO OF,...,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,10466,1,2. NON-COMP
6166141,2022-11-20,19,N,2023-01-02,N,,2003.0,2023-01-02,,A I U INSURANCE COMPANY,...,,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,11691,1,2. NON-COMP
6165907,2022-12-26,59,N,2023-01-02,N,0.0,1963.0,2022-12-31,,AMGUARD INSURANCE COMPANY,...,,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,10604,0,2. NON-COMP
6166047,2022-12-28,55,N,2023-01-02,N,0.0,0.0,2023-01-02,,INDEMNITY INS. OF N AMERICA,...,,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,11411,6,2. NON-COMP
6166102,2022-12-20,25,N,2023-01-02,N,0.0,1997.0,2022-12-31,,NEW HAMPSHIRE INSURANCE CO,...,,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,11212,5,2. NON-COMP


In [13]:
# # Export predictions from new inputs (the frame that holds the 'Claim Injury Type' column)
# df_new_inputs_with_predictions.to_csv('group_38_df_new_inputs_predictions.csv')

In [None]:
pip install streamlit

In [None]:
# Streamlit dashboard: upload a CSV of raw claims, run the same treatment as sections 2.1
# and 2.2 of this notebook, and show the predicted 'Claim Injury Type' with a few analytics.
# NOTE: the Streamlit widgets are only rendered when this code is executed with
# `streamlit run <script>.py`; consider moving this cell into a standalone script.
import re

import streamlit as st
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
# The preprocessing helpers and `feats_dict` are the ones this notebook obtains through
# `from utils import *` (the previous placeholder module name could not be imported).
from utils import (
    adjust_dates,
    create_features,
    feats_dict,
    frequency_encoder,
    impute_missing_values,
    ordinal_encoder,
    scale_data,
    winsorization,
)

# Load your pre-trained model and other necessary objects
# NOTE(review): joblib.load unpickles the files; only load artefacts from a trusted source.
best_model = joblib.load('best_model.pkl')
cv_returns = joblib.load('cv_returns.pkl')


def apply_notebook2_treatment(df_raw):
    """Apply the cleaning steps of section 2.1 (Treatment from Notebook 2) to raw claims.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw claims indexed by 'Claim Identifier', with the same columns as the test data
        loaded earlier in this notebook.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame; `df_raw` itself is not modified.
    """
    # drop() returns a new frame, so the caller's raw data stays untouched
    df = df_raw.drop(columns=["OIICS Nature of Injury Description", "IME-4 Count"])

    # Replace placeholder values with NaN
    placeholder_replacements = {
        "Carrier Type": "UNKNOWN",
        "Gender": "U",
        "Medical Fee Region": "UK",
        "Alternative Dispute Resolution": "U",
        "County of Injury": "UNKNOWN"
    }
    for col, placeholder in placeholder_replacements.items():
        df[col] = df[col].replace(placeholder, np.nan)

    # Object -> datetime (missing dates become NaT)
    for col in ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']:
        df[col] = pd.to_datetime(df[col])

    # Float codes -> nullable Int64 -> object
    for col in ['Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']:
        df[col] = df[col].astype('Int64').astype(object)

    # Float -> nullable Int64
    for col in ['Age at Injury', 'Birth Year', 'Number of Dependents']:
        df[col] = df[col].astype('Int64')

    # Y/N -> 1/0
    for col in ['Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator']:
        df[col] = df[col].map({'Y': 1, 'N': 0}).astype('Int64')

    # Gender -> binary 'Male' column
    df["Male"] = df["Gender"].map({'M': 1, 'F': 0}).astype('Int64')
    df = df.drop(columns=['Gender'])

    # Row-wise date adjustment, then restore the nullable integer dtypes
    df = df.apply(adjust_dates, axis=1)
    for col in ["Male", "Birth Year", "Alternative Dispute Resolution"]:
        df[col] = df[col].astype('Int64')

    # Drop columns that are no longer needed
    df = df.drop(columns=['Birth Year', 'Number of Dependents'])

    # Keep only ZIP codes whose string form matches five digits
    valid_zip_pattern = r'^\d{5}$'
    df['Zip Code'] = df['Zip Code'].apply(lambda x: x if re.match(valid_zip_pattern, str(x)) else np.nan)

    # Recode part-of-body code -9 to 90 and relabel both codes
    condition = df['WCIO Part Of Body Code'].isin([-9, 90])
    df.loc[df['WCIO Part Of Body Code'] == -9, 'WCIO Part Of Body Code'] = 90
    df.loc[condition, 'WCIO Part Of Body Description'] = 'Multiple Body Parts'
    df['WCIO Part Of Body Code'] = df['WCIO Part Of Body Code'].astype('object')

    return df


# Set up the Streamlit application interface
st.title('Workers Compensation Claim Injury Type Prediction Dashboard')

# File upload section
st.subheader("Upload Claims Data (CSV format)")
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Step 1: Load the uploaded data into a DataFrame, indexed like the notebook's test data
    df_raw_inputs = pd.read_csv(uploaded_file, sep=',', index_col='Claim Identifier')

    # Step 2: Feature engineering and preprocessing
    # Notebook 2 treatment first: the fitted objects below expect the cleaned columns
    df_new_inputs = apply_notebook2_treatment(df_raw_inputs)

    # Apply winsorization
    for col in feats_dict["winsorization"]:
        df_new_inputs, _ = winsorization(df_new_inputs, col, bounds=cv_returns["winsorization_bounds"])

    # Missing values imputation
    df_new_inputs, _ = impute_missing_values(df_new_inputs, feats_dict, imputers=cv_returns["imputers"])

    # Feature engineering
    df_new_inputs = create_features(df_new_inputs)

    # Drop code, description and date columns (same three lists as section 2.2)
    df_new_inputs = df_new_inputs.drop(
        columns=feats_dict["codes_drop"] + feats_dict["descriptions_drop"] + feats_dict["dates_drop"]
    )

    # Ordinal encoding
    _, df_new_inputs, _ = ordinal_encoder(feats_dict, data=df_new_inputs, encoders=cv_returns["ordinal_encoders"])

    # Frequency encoding
    _, df_new_inputs, _, _ = frequency_encoder(feats_dict, data=df_new_inputs, encoders=cv_returns["freq_encoders"], fill_values=cv_returns["fill_values_freq_encoding"])

    # Data scaling
    _, df_new_inputs, _ = scale_data(data=df_new_inputs, scaler=cv_returns["scaler"])

    # Feature selection
    df_new_inputs = df_new_inputs[cv_returns["final_selected_features"]]

    # Step 3: Predict claim injury type for each row in the dataset
    predictions = best_model.predict(df_new_inputs)
    decoded_predictions = cv_returns["lencoder"].inverse_transform(predictions)

    # Attach the predictions to a copy of the RAW claims: the model matrix only holds the
    # scaled, selected features, so it is not suitable for display or for the charts below.
    # NOTE(review): assumes the preprocessing keeps every row in its original order; TODO confirm.
    df_results = df_raw_inputs.copy()
    df_results['Predicted Claim Injury Type'] = decoded_predictions

    # Step 4: Display results as a table
    st.subheader('Predictions for Uploaded Claims')
    st.dataframe(df_results)

    # Step 5: Provide analytics of the predictions
    st.subheader('Analytics Dashboard')

    # Show the count of predicted categories
    prediction_counts = df_results['Predicted Claim Injury Type'].value_counts()
    st.write("Prediction Distribution:")
    st.bar_chart(prediction_counts)

    # Show some basic stats (e.g., Age vs Average Weekly Wage)
    st.subheader('Age vs Average Weekly Wage')
    # Name x and y explicitly so the chart plots wage against age instead of both against the index
    st.scatter_chart(df_results, x='Age at Injury', y='Average Weekly Wage')

    # Display some statistics
    st.write("Basic Statistics of the Claims Data:")
    st.write(df_results.describe())

    # Show the number of claims processed
    st.write(f"Total number of claims processed: {len(df_results)}")

else:
    st.write("Please upload a CSV file to get started.")

In [483]:
# # Export predictions from new inputs
# df_test.to_csv('group_38_df_new_inputs_predictions.csv')