### **Group 31** <br>
* Ana Margarida Valente, nr 20240936
* Eduardo Mendes, nr 20240850
* Julia Karpienia, nr 20240514
* Marta Boavida, nr 20240519
* Victoria Goon, nr 20240550

## 0. Import Packages

In [1]:
## Import standard data processing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Setting seaborn style
sns.set()

from sklearn.preprocessing import LabelEncoder

## Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Import Cross Validation methods
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb


from imblearn.over_sampling import SMOTE

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Do not truncate the text inside a cell (the number of columns shown is a different option, 'display.max_columns', which is not set here)

## Suppress warnings
# NOTE(review): this filter hides every warning for the rest of the notebook,
# including any convergence or deprecation warnings the models below may raise.
import warnings
warnings.filterwarnings('ignore')


In [2]:
# import sys
# !{sys.executable} -m pip install imbalanced-learn

<a class="anchor" id="importdatasets">

## 1. Import Datasets

</a>

Import datasets that you got from the notebook group_31_notebook

* If you want to use the data scaled with the standard scaler:

In [3]:
# Load the standard-scaled, encoded splits.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. It is passed for all three splits so that the test
# set is parsed the same way as the training and validation data.
train_data = pd.read_csv("train_encoded_std.csv", low_memory=False)
validation_data = pd.read_csv("validation_encoded_std.csv", low_memory=False)
test_data = pd.read_csv("test_encoded_std.csv", low_memory=False)

* If you want to use the data scaled with the min-max scaler instead (uncomment the cell below):

In [None]:
# train_data = pd.read_csv("train_encoded_minmax.csv", low_memory=False)
# validation_data = pd.read_csv("validation_encoded_minmax.csv", low_memory=False)
# test_data = pd.read_csv("test_encoded_minmax.csv")

In [5]:
# Move the claim identifier out of the columns and into the row index.
# Each split is re-indexed only while the column is still present, so running
# this cell a second time is a no-op instead of a KeyError.
claim_id_column = "Claim Identifier"

if claim_id_column in train_data.columns:
    train_data = train_data.set_index(claim_id_column)
if claim_id_column in validation_data.columns:
    validation_data = validation_data.set_index(claim_id_column)
if claim_id_column in test_data.columns:
    test_data = test_data.set_index(claim_id_column)

* Make a copy of the loaded data. Each time you want to try a new model with a different feature combination, re-run this cell to create a fresh copy: it "resets" the dataset to its default state (with all variables).

In [21]:
# Work on copies so the frames loaded from disk stay untouched; re-running this
# cell restores every column before another feature combination is tried.
train_data_c, validation_data_c, test_data_c = (
    frame.copy() for frame in (train_data, validation_data, test_data)
)

In [22]:
# Separate the target column from the features for the two labelled splits.
y_train = train_data_c['Claim Injury Type']
X_train = train_data_c.drop(columns=['Claim Injury Type'])

y_val = validation_data_c['Claim Injury Type']
X_val = validation_data_c.drop(columns=['Claim Injury Type'])

In [23]:
# Preview the first five rows of the training features (five is the pandas default).
X_train.head(n=5)

Unnamed: 0_level_0,WCIO Part of Body_cat,WCIO Nature of Injury Code_cat,WCIO Cause of Injury_cat,Industry Code_21,Industry Code_22,Industry Code_23,Industry Code_31,Industry Code_32,Industry Code_33,Industry Code_42,...,WCIO Part Of Body Code_99,Age at Injury,Average Weekly Wage,IME-4 Count,Number of Dependents,Days Between Accident_Assembly,Days Between Accident_C2,Industry_Avg_Weekly_Wage,Claim Count by Carrier,Claim Count by Gender
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5935707,Multiple Body Parts,Specific Injury,Strain or Injury By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.28305,0.597492,-0.420035,1.493541,-0.118281,-0.12743,-0.30368,-1.538812,-1.20498
5868764,Lower Extremities,Specific Injury,Strain or Injury By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.431636,-0.208755,-0.420035,-0.005048,-0.123253,-0.131062,1.783977,0.982451,0.829889
5986945,Head,Specific Injury,Struck or Injured By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.257718,1.264815,-0.420035,-0.005048,-0.123253,-0.131062,-0.30368,-0.908064,0.829889
5665055,Trunk,Specific Injury,Struck or Injured By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.20968,-0.535991,-0.420035,-1.004108,-0.085134,-0.091109,-0.30368,0.982451,-1.20498
5595404,Upper Extremities,Specific Injury,Miscellaneous Causes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.402611,1.72855,0.243009,-0.504578,-0.116624,0.074153,1.756214,-0.787763,0.829889


In [9]:
# Preview the first five rows of the validation features (five is the pandas default).
X_val.head(n=5)

Unnamed: 0_level_0,WCIO Part of Body_cat,WCIO Nature of Injury Code_cat,WCIO Cause of Injury_cat,Industry Code_21,Industry Code_22,Industry Code_23,Industry Code_31,Industry Code_32,Industry Code_33,Industry Code_42,...,WCIO Part Of Body Code_99,Age at Injury,Average Weekly Wage,IME-4 Count,Number of Dependents,Days Between Accident_Assembly,Days Between Accident_C2,Industry_Avg_Weekly_Wage,Claim Count by Carrier,Claim Count by Gender
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5517094,Lower Extremities,Specific Injury,"Fall, Slip or Trip Injury",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.477828,-0.535991,-0.420035,-1.004108,-0.13154,-0.140142,-0.30368,0.982451,0.829889
6133770,Upper Extremities,Specific Injury,Struck or Injured By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.331088,-0.535991,-0.420035,0.994011,-0.134855,-0.143774,-0.801528,0.982451,0.829889
5741413,Trunk,Specific Injury,Strain or Injury By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.06294,-0.535991,-0.420035,-1.004108,-0.129883,-0.140142,1.783977,0.982451,0.829889
6082466,Upper Extremities,Specific Injury,Struck or Injured By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.404458,-0.535991,-0.420035,-0.005048,0.027567,0.028751,-0.904869,-0.787763,-1.20498
6086244,Lower Extremities,Specific Injury,Strain or Injury By,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.578376,-0.535991,-0.420035,-1.004108,-0.13154,-0.140142,-0.904869,-0.787763,-1.20498


In [10]:
# Preview the first five validation targets, still in their readable string form.
y_val.head(n=5)

Claim Identifier
5517094    2. NON-COMP
6133770    2. NON-COMP
5741413    2. NON-COMP
6082466    2. NON-COMP
6086244    3. MED ONLY
Name: Claim Injury Type, dtype: object

### 1.1 Encode Target Variable
Label Encoder for target variable (training and validation):
<br/> <br/>
(This needs to be done in both the preprocessing notebook and here, so that the results can be interpreted properly when a model is tested.)

In [24]:
# The encoder is always fitted on the raw string labels held in the working
# copies, never on y_train / y_val themselves. Those two names are overwritten
# with integer codes at the end of this cell, so fitting on them would make a
# second run of the cell learn the integer codes as "classes" and lose the
# readable class names used when the scores are printed.
raw_target_train = train_data_c['Claim Injury Type']
raw_target_val = validation_data_c['Claim Injury Type']

# Initiate Label encoder
label_encoder = LabelEncoder()

# Fit the encoder on the training target variable and encode it
Y_train_encoded = label_encoder.fit_transform(raw_target_train)

# Encode the validation target with the mapping learned on the training target
Y_val_encoded = label_encoder.transform(raw_target_val)

# Readable labels that print_scores uses to name the per-class rows.
# NOTE: despite its name this holds the *training* labels, i.e. the same values
# the encoder was fitted on. The name is kept because print_scores looks it up.
y_val_unencoded = raw_target_train.copy()

# Convert the results back to DataFrames while overriding the previous variable names
y_train = pd.DataFrame(Y_train_encoded, columns=['encoded_target'], index=raw_target_train.index)
y_val = pd.DataFrame(Y_val_encoded, columns=['encoded_target'], index=raw_target_val.index)

Uncomment the variables that you want to drop:

In [25]:
# Variables to leave out of this run. Every entry is matched as a column-name
# *prefix*, so a single entry removes all columns whose name starts with it.
# Uncomment an entry to drop it.
variables_to_drop = [
    # 'Age at Injury',
    # 'Alternative Dispute Resolution',
    # 'Attorney/Representative',
    # 'Average Weekly Wage',
    # 'Carrier Type',
    # 'County of Injury',
    # 'COVID-19 Indicator',
    # 'District Name',
    # 'Gender',
    # 'IME-4 Count',
    # 'Industry Code',
    # 'Medical Fee Region',
    # 'WCIO Cause of Injury Code',
    # 'WCIO Nature of Injury Code',
    # 'WCIO Part Of Body Code',
    'Number of Dependents',
    # 'zip_code_cat',
    # 'First Hearing Date Binary',
    # 'C-2 Date Bin',
    # 'C-3 Date Bin',
    # 'Days Between Accident_Assembly',
    'Days Between Accident_C2',
    # 'Season_of_Accident',
    # 'Industry_Avg_Weekly_Wage',
    # 'Claim Count by Carrier',
    # 'Claim Count by Gender',
]

# str.startswith accepts a tuple of prefixes and is True when any of them matches.
drop_prefixes = tuple(variables_to_drop)

# Identify columns to drop for each dataset
columns_to_drop_train = [name for name in X_train.columns if name.startswith(drop_prefixes)]
columns_to_drop_val = [name for name in X_val.columns if name.startswith(drop_prefixes)]
columns_to_drop_test = [name for name in test_data_c.columns if name.startswith(drop_prefixes)]

# Drop the identified columns
X_train = X_train.drop(columns_to_drop_train, axis=1)
X_val = X_val.drop(columns_to_drop_val, axis=1)
test_data_c = test_data_c.drop(columns_to_drop_test, axis=1)


#### Undersampling

In [14]:
# # add the encoded variables back to the x set
# training_data_undersampled = pd.concat([X_train, y_train], axis=1)

# # Separate majority and minority classes
# majority_classes = {}
# for x in range(0,8):
#     if x != 6:
#         majority_classes[x] = training_data_undersampled[training_data_undersampled["encoded_target"] == x]

# minority_class = training_data_undersampled[training_data_undersampled["encoded_target"] == 6]

# size = int(len(minority_class) + (len(minority_class) * 2))

# print(size)

# # Perform undersampling
# undersampled_majority_0 = majority_classes[0].sample(n=size, random_state=42)
# undersampled_majority_1 = majority_classes[1].sample(n=size, random_state=42)
# undersampled_majority_2 = majority_classes[2].sample(n=size, random_state=42)
# undersampled_majority_3 = majority_classes[3].sample(n=size, random_state=42)
# undersampled_majority_4 = majority_classes[4].sample(n=size, random_state=42)
# undersampled_majority_5 = majority_classes[5].sample(n=size, random_state=42)
# undersampled_majority_7 = majority_classes[7].sample(n=size, random_state=42)
# # undersampled_majority.head()
# balanced_data = pd.concat([undersampled_majority_0, undersampled_majority_1, undersampled_majority_2, 
#                            undersampled_majority_3, undersampled_majority_4, undersampled_majority_5, 
#                            minority_class, undersampled_majority_7])

# # Separate features and target
# X_train = balanced_data.drop(columns='encoded_target')
# y_train = balanced_data['encoded_target']

# # Check class distribution after undersampling
# print("Class distribution after undersampling:", y_train.value_counts())

#### Oversampling

In [15]:
# # Initialize SMOTE
# smote = SMOTE(sampling_strategy='auto', random_state=42)

# # Fit and resample the dataset
# X_train, y_train = smote.fit_resample(X_train, y_train)

# print("Class distribution after oversampling:", y_train.value_counts())


<a class="anchor" id="model">

## 2. Model
</a>

Type of Problem <br/>
This is a multiclass classification problem: each claim must be assigned to one of 8 possible classes. We start with a simple Logistic Regression model configured to handle multiple classes.<br/>
<br/>
Metric used:<br/>
As a classification problem, we observed the following metrics to determine the effectiveness of our model:
 - accuracy
 - precision
 - recall
 - f1 score

 Each metric measures performance in a different way, and observing them all gives us an accurate view of our model's results.

In [13]:
# Functions to help display metrics for all models

# helper method for score_model - not to be used separately
def print_scores(per_class, class_names=None):
    """Print one ``[label]: score`` line per class, scores rounded to 2 decimals.

    Parameters
    ----------
    per_class : iterable of float
        One metric value per class.
    class_names : sequence, optional
        Readable label for each value in ``per_class``, in the same order.
        When omitted, the sorted unique values of the notebook-level
        ``y_val_unencoded`` are used, which is the original behaviour.

    Notes
    -----
    Scores and labels are paired positionally with ``zip``; if the two
    sequences differ in length, the surplus items are silently ignored.
    """
    if class_names is None:
        class_names = np.unique(y_val_unencoded)

    for score, label in zip(per_class, class_names):
        # add an extra tab after "7. PTD" for better alignment
        pad = "\t\t" if str(label) == "7. PTD" else "\t"
        print("[" + str(label) + "]:     " + pad + str(round(score, 2)))

# displays the scores for Precision, Recall, and F1
def score_model(y_actual, y_predicted, score_train, score_test):
    """Print F1, accuracy, precision and recall for one set of predictions.

    Parameters
    ----------
    y_actual : array-like
        True encoded class labels.
    y_predicted : array-like
        Predicted encoded class labels, aligned with ``y_actual``.
    score_train : float
        Score of the model on the training set (printed as "Train Score").
    score_test : float
        Score of the model on the evaluation set (printed as "Test Score").

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Always score the full set of encoded classes. Without an explicit label
    # list scikit-learn only reports the classes that occur in y_actual or
    # y_predicted, so a class missing from both would shorten the per-class
    # arrays and shift every later score onto the wrong name in print_scores.
    # The class count comes from the same source print_scores uses for names.
    all_labels = np.arange(len(np.unique(y_val_unencoded)))
    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # class is never predicted, without emitting a warning for it.
    metric_kwargs = {"labels": all_labels, "zero_division": 0}

    print("------------ F1 ------------")
    f1_per_class = f1_score(y_actual, y_predicted, average=None, **metric_kwargs)
    print_scores(f1_per_class)
    f1_macro = f1_score(y_actual, y_predicted, average='macro', **metric_kwargs)
    print("\nMacro f1: " + str(round(f1_macro, 3)) + "\n")

    print("------ Individual Score Comparisons ------ ")
    print("Train Score: " + str(score_train))
    print("Test Score: " + str(score_test))
    # A large gap between the two scores is read as overfitting in this notebook
    diff = np.abs(score_train - score_test)
    print("Difference: " + str(diff))

    print("--------- Accuracy ---------\n")
    acc_score = accuracy_score(y_actual, y_predicted)
    print("Accuracy Score: " + str(acc_score) + "\n")

    print("--------- Precision ---------")
    precision_per_class = precision_score(y_actual, y_predicted, average=None, **metric_kwargs)
    print_scores(precision_per_class)
    precision_macro = precision_score(y_actual, y_predicted, average='macro', **metric_kwargs)
    print("\nMacro precision: " + str(round(precision_macro, 3)) + "\n")

    print("---------- Recall ----------")
    recall_per_class = recall_score(y_actual, y_predicted, average=None, **metric_kwargs)
    print_scores(recall_per_class)
    recall_macro = recall_score(y_actual, y_predicted, average='macro', **metric_kwargs)
    print("\nMacro recall: " + str(round(recall_macro, 3)) + "\n")


#### Logistic Regression

Grid Search - Logistic Regression:

In [17]:
# param_grid = {'C': [0.1, 1, 10], 'solver': ['lbfgs'], 'class_weight': [None, 'balanced']}
# grid_search = GridSearchCV(LogisticRegression(multi_class='multinomial', random_state=42), param_grid, cv=5, scoring='f1_macro')
# grid_search.fit(X_train_std_scaler_encoded, Y_train_encoded_df)

# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

Model - Logistic Regression:

In [None]:
# Previously recorded score: 0.379
# With oversampling: 0.323 (~40 s)

# Create the model.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver and more than two classes the
# default setting already fits a multinomial model, so the fit is unchanged.
lr_model = LogisticRegression(solver='lbfgs', C=10)

# Fit the model to the training set
lr_model.fit(X_train, y_train)

# Determine the scores for the model for both train and validation sets
score_train = lr_model.score(X_train, y_train)
score_test = lr_model.score(X_val, y_val)

# Use the model to predict on the validation set
lr_y_pred = lr_model.predict(X_val)

# Display the model metrics using the score_model function
score_model(y_val, lr_y_pred, score_train, score_test)

#### DECISION TREE

Gridsearch - decision tree:

In [None]:
# # # Initialize the Decision Tree Classifier
# dt_classifier = DecisionTreeClassifier(random_state=42)

# # Define the parameter grid to search
# param_grid = {
#     'criterion': ['gini', 'entropy'],                          
#     'splitter': ['best', 'random'],                             
#     'max_depth': [None, 10, 20, 30],                           
#     'min_samples_split': [2, 5, 10],                            
#     'min_samples_leaf': [1, 2, 4],                              
#     'max_features': [None, 'sqrt', 'log2'],                    
#     'max_leaf_nodes': [None, 10, 20, 30],                       
#     'min_impurity_decrease': [0.0, 0.1, 0.2]                   
# }

# # Initialize GridSearchCV:
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit GridSearchCV on the training data
# grid_search.fit(X_train, y_train)

# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# best_model = grid_search.best_estimator_

# #Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
# #Best Score: 0.7769977245887005


Model - Decision Tree:

In [None]:
# Previously recorded score: 0.366
# With oversampling: 0.343 (~21 s)

# Build the Decision Tree with the parameters reported as best by the grid
# search cell above and train it in the same statement (fit returns the
# fitted estimator).
decision_tree = DecisionTreeClassifier(
    random_state=42,
    splitter='best',
    criterion='gini',
    max_depth=10,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
).fit(X_train, y_train)

# Predictions on the validation set
dt_y_pred = decision_tree.predict(X_val)

# Scores on the training and validation sets
score_train = decision_tree.score(X_train, y_train)
score_test = decision_tree.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, dt_y_pred, score_train, score_test)

#### K Nearest Neighbors

Grid Search - KNN 

In [None]:
# # Define the parameter grid for Randomized Search
# param_distributions = {
#     'n_neighbors' : [5,10], 
#     'leaf_size': [30, 50],                                  
#     'metric': ['euclidean', 'manhattan'],          
# }

# # Initialize RandomizedSearchCV with KNN classifier
# random_search = RandomizedSearchCV(
#     estimator=KNeighborsClassifier(),
#     param_distributions=param_distributions,
#     n_iter=5,                                     
#     cv=2,                                          
#     scoring='f1_macro',                            
#     verbose=2,                                     
#     random_state=42,                               
#     n_jobs=-1                                      
# )

# # Fit the random search to the training data
# random_search.fit(X_train, y_train)

# print("Best Parameters:", random_search.best_params_)
# print("Best Score:", random_search.best_score_)

# # Best Parameters: {'n_neighbors': 5, 'metric': 'euclidean', 'leaf_size': 30}
# # Best Score: 0.328445945439427

Model - KNN<br/>
- KNN is not appropriate for very large datasets: it is too computationally expensive because it has to memorize the whole training set.
- KNN takes too long to run on our large dataset, so it is commented out.

In [None]:
# # 0.334
# # Initialize the KNN Classifier
# knn_model = KNeighborsClassifier(n_neighbors=5, leaf_size=30, metric='euclidean')  

# # Train the model
# knn_model.fit(X_train, y_train)

# # Determine the scores for the model for both train and validation sets
# score_train = knn_model.score(X_train, y_train)
# score_test = knn_model.score(X_val, y_val)

# # Predict on the validation set
# y_pred = knn_model.predict(X_val)

# # Display the model metrics using the score_model function
# score_model(y_val, y_pred, score_train, score_test)

Model - Gaussian Naive Bayes: 

- Assumes normality, independence, Homogeneity of Variance (Homoscedasticity): 
- Will be commented out

In [None]:
# # create the model
# model = GaussianNB()

# # fit the model to the training set
# model.fit(X_train, y_train)

# # determine the scores for the model for both train and validation
# score_train = model.score(X_train, y_train)
# score_test = model.score(X_val, y_val)

# # use model to predict on validation set
# y_pred = model.predict(X_val)

# # display the model metrics
# score_model(y_val, y_pred, score_train, score_test)


#### NEURAL NETWORK:

GridSearch - MLPClassifier:

In [None]:
# # Define the parameter grid
# param_grid = {  
#     'hidden_layer_sizes': [
#         (50),        # Larger single-layer model
#         (50, 30),      # Moderate two-layer model
#         (100, 50),     # Larger two-layer model
#         (128, 64, 32)  # Complex three-layer model
#     ],
#     'activation': ['relu', 'logistic'],                
#     'solver': ['adam', 'sgd'],                     
#     'alpha': [0.0001, 0.001],                       
#     'learning_rate': ['adaptive', 'invscaling']          
# }

# # Initialize the Neural Network model
# mlp = MLPClassifier(random_state=42)

# # Initialize Random Search for hyperparameter tuning
# random_search = RandomizedSearchCV(
#     estimator=mlp,
#     param_distributions=param_grid,  # Using param_distributions for randomized search
#     n_iter=10,  # Number of random combinations to try
#     cv=2,  # 2-fold cross-validation
#     scoring='f1_macro',  # Evaluation metric
#     verbose=2,  # Display progress logs
#     n_jobs=-1,  # Use all available processors for parallel computation
#     random_state=42  # For reproducibility
# )

# # Fit the randomized search to the training data
# random_search.fit(X_train, y_train)

# 
# print("Best Parameters:", random_search.best_params_)
# print("Best Score:", random_search.best_score_)

# # Best Parameters: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (50, 30), 'alpha': 0.001, 'activation': 'logistic'}
# # Best Score: 0.4192846184298069


MODEL - Neural Network:

In [28]:
# UPDATED VERSION

# Neural network with two hidden layers (50 and 30 units), logistic activation
# and the adam solver; created and trained in one statement (fit returns the
# fitted estimator).
model = MLPClassifier(
    random_state=42,
    max_iter=200,
    hidden_layer_sizes=(50, 30),
    activation='logistic',
    solver='adam',
    alpha=0.0001,
    learning_rate='adaptive',
).fit(X_train, y_train)

# Predictions on the validation set
y_pred = model.predict(X_val)

# Accuracy on the training and on the validation data
score_train = model.score(X_train, y_train)
score_test = model.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, y_pred, score_train, score_test)

------------ F1 ------------
[1. CANCELLED]:     	0.55
[2. NON-COMP]:     	0.9
[3. MED ONLY]:     	0.15
[4. TEMPORARY]:     	0.77
[5. PPD SCH LOSS]:     	0.61
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.32

Macro f1: 0.412

------ Individual Score Comparisons ------ 
Train Score: 0.7896692782013076
Test Score: 0.7751614326860541
Difference: 0.014507845515253526
--------- Accuracy ---------

Accuracy Score: 0.7751614326860541

--------- Precision ---------
[1. CANCELLED]:     	0.7
[2. NON-COMP]:     	0.85
[3. MED ONLY]:     	0.48
[4. TEMPORARY]:     	0.69
[5. PPD SCH LOSS]:     	0.69
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.4

Macro precision: 0.476

---------- Recall ----------
[1. CANCELLED]:     	0.45
[2. NON-COMP]:     	0.95
[3. MED ONLY]:     	0.09
[4. TEMPORARY]:     	0.88
[5. PPD SCH LOSS]:     	0.55
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.27

Macro recall: 0.398



- Using StandardScaler and dropping only Number of Dependents, I got a score of 0.432
- Using StandardScaler and dropping Number of Dependents and Days Between Accident_C2, I got a score of 0.412

In [None]:
# Previously recorded f1 score: 0.415
# With oversampling: 0.407 (0.11 diff in scores) - 12m 3s

# Neural network with two hidden layers (64 and 32 units), ReLU activation and
# the adam solver; created and trained in one statement (fit returns the fitted
# estimator).
mlpc_model = MLPClassifier(
    random_state=42,
    max_iter=200,
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate_init=0.001,
).fit(X_train, y_train)

# Predictions on the validation set
mplc_y_pred = mlpc_model.predict(X_val)

# Accuracy on the training and on the validation data
score_train = mlpc_model.score(X_train, y_train)
score_test = mlpc_model.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, mplc_y_pred, score_train, score_test)


- using StandardScaler and with deleting only Number of Dependents i got score 0.425

<a href="https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html">Random Forest</a> -> (overfits) <br/>
Fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. <br/>

In [27]:
# Previously recorded score: 0.379
# With oversampling: 0.433 (overfitting w/ 0.23 diff) - 6m 5s

# Build a 100-tree Random Forest and train it in the same statement
# (fit returns the fitted estimator).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predictions on the validation set
rf_y_pred = rf_model.predict(X_val)

# Accuracy on the training and on the validation data
score_train = rf_model.score(X_train, y_train)
score_test = rf_model.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, rf_y_pred, score_train, score_test)


------------ F1 ------------
[1. CANCELLED]:     	0.55
[2. NON-COMP]:     	0.9
[3. MED ONLY]:     	0.12
[4. TEMPORARY]:     	0.77
[5. PPD SCH LOSS]:     	0.58
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.09

Macro f1: 0.378

------ Individual Score Comparisons ------ 
Train Score: 0.9999447205624431
Test Score: 0.7760034377032425
Difference: 0.22394128285920056
--------- Accuracy ---------

Accuracy Score: 0.7760034377032425

--------- Precision ---------
[1. CANCELLED]:     	0.73
[2. NON-COMP]:     	0.85
[3. MED ONLY]:     	0.5
[4. TEMPORARY]:     	0.68
[5. PPD SCH LOSS]:     	0.71
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.54

Macro precision: 0.501

---------- Recall ----------
[1. CANCELLED]:     	0.45
[2. NON-COMP]:     	0.96
[3. MED ONLY]:     	0.07
[4. TEMPORARY]:     	0.89
[5. PPD SCH LOSS]:     	0.49
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.05

Macro recall: 0.364



- Using StandardScaler and dropping only Number of Dependents, I got a score of 0.433
- Using StandardScaler and dropping Number of Dependents and Days Between Accident_C2, I got a score of 0.378 (2 min)

In [None]:
# pos_weight = np.sum(y_train == 6) / np.sum(y_train != 6)

# score = 0
# score_settings = ""

# for x in range(1,20):
#     for y in range(50, 151, 10):
#         for z in np.arange(0, 1.1, 0.1):
#             xgb_model = xgb.XGBClassifier(
#                 n_estimators=y,  # Number of trees
#                 learning_rate=z,  # Step size shrinkage
#                 max_depth=x,       # Maximum depth of a tree
#                 random_state=42,   # For reproducibility
#                 use_label_encoder=False,  # Avoid warning for encoding
#                 eval_metric='mlogloss',    # Evaluation metric for multi-class classification
#                 scale_pos_weight = pos_weight
#             )
#             xgb_model.fit(X_train, y_train)
#             xgb_y_pred = xgb_model.predict(X_val)
#             f1 = f1_score(y_val, xgb_y_pred, average="macro")

#             if f1 > score:
#                 score = f1
#                 score_settings = "max_depth: " + str(x) + " | n_estimators: " + str(y) + " | lr: " + str(z)

# print(score)
# print(score_settings)

<a href="https://xgboost.readthedocs.io/en/stable/tutorials/index.html">XGBoost</a> -> (tends to overfit): <br/>
Also using decision trees


In [26]:
# Previously recorded score: 0.442
# With oversampling: 0.453 (overfit by 0.10 diff) - 4m 18s

# max_depth = 19, n_estimators = 150, lr = 0.6 -> overfitting
# Create the XGBoost classifier and train it in the same statement
# (fit returns the fitted estimator).
xgb_model = xgb.XGBClassifier(
    random_state=42,          # For reproducibility
    max_depth=7,              # Maximum depth of a tree
    n_estimators=110,         # Number of trees
    learning_rate=0.2,        # Step size shrinkage
    eval_metric='mlogloss',   # Evaluation metric for multi-class classification
    use_label_encoder=False,  # Avoid warning for encoding
).fit(X_train, y_train)

# Predictions on the validation set
xgb_y_pred = xgb_model.predict(X_val)

# Accuracy on the training and on the validation data
score_train = xgb_model.score(X_train, y_train)
score_test = xgb_model.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, xgb_y_pred, score_train, score_test)

------------ F1 ------------
[1. CANCELLED]:     	0.58
[2. NON-COMP]:     	0.91
[3. MED ONLY]:     	0.13
[4. TEMPORARY]:     	0.78
[5. PPD SCH LOSS]:     	0.62
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.39

Macro f1: 0.426

------ Individual Score Comparisons ------ 
Train Score: 0.7979662192382494
Test Score: 0.785671978073028
Difference: 0.012294241165221376
--------- Accuracy ---------

Accuracy Score: 0.785671978073028

--------- Precision ---------
[1. CANCELLED]:     	0.72
[2. NON-COMP]:     	0.85
[3. MED ONLY]:     	0.51
[4. TEMPORARY]:     	0.71
[5. PPD SCH LOSS]:     	0.68
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.6

Macro precision: 0.509

---------- Recall ----------
[1. CANCELLED]:     	0.49
[2. NON-COMP]:     	0.97
[3. MED ONLY]:     	0.07
[4. TEMPORARY]:     	0.88
[5. PPD SCH LOSS]:     	0.57
[6. PPD NSL]:     	0.0
[7. PTD]:     		0.0
[8. DEATH]:     	0.29

Macro recall: 0.408



- Using StandardScaler and dropping Number of Dependents, I got a score of 0.432
- Using StandardScaler and dropping Number of Dependents and Days Between Accident_Assembly, I got a score of 0.427
- Using StandardScaler and dropping Number of Dependents and Days Between Accident_C2, I got a score of 0.426


Gradient Boosted Decision Trees

In [None]:
# Previously recorded: max_depth = 6 -> 0.402 f1, about 16 min
# Create the gradient-boosted trees model and train it in the same statement
# (fit returns the fitted estimator).
gbdt_model = GradientBoostingClassifier(
    random_state=42,     # For reproducibility
    max_depth=6,         # Limits depth of each tree to prevent overfitting
    learning_rate=0.1,   # Shrinks contribution of each tree
    n_estimators=100,    # Number of boosting stages
).fit(X_train, y_train)

# Predictions on the validation set
gbdt_y_pred = gbdt_model.predict(X_val)

# Accuracy on the training and on the validation data
score_train = gbdt_model.score(X_train, y_train)
score_test = gbdt_model.score(X_val, y_val)

# Report all metrics through score_model
score_model(y_val, gbdt_y_pred, score_train, score_test)

Bagging Methods

In [None]:
# Previously recorded f1 macro score: 0.4
# With oversampling: 0.405 - 6m 55s - overfitting by 0.29

# Bag 10 default Decision Trees; created and trained in one statement
# (fit returns the fitted estimator).
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

# Scores on the training and validation sets
score_train = bagging_model.score(X_train, y_train)
score_test = bagging_model.score(X_val, y_val)

# Predictions on the validation set, then the full metric report
bagging_y_pred = bagging_model.predict(X_val)
score_model(y_val, bagging_y_pred, score_train, score_test)

- using StandardScaler and with deleting only Number of Dependents i got score 0.403

In [None]:
# Previously recorded f1 macro score: 0.425
# With oversampling: 0.448 - 17m 59s - 0.107 diff for overfitting
# With oversampling and correct hyperparameters: 0.451 - 0.104 for overfitting

# Bag 10 XGBoost classifiers that use the same settings as the single XGBoost
# model; created and trained in one statement (fit returns the fitted estimator).
bagging_model = BaggingClassifier(
    estimator=xgb.XGBClassifier(
        random_state=42,          # For reproducibility
        max_depth=7,              # Maximum depth of a tree
        n_estimators=110,         # Number of trees
        learning_rate=0.2,        # Step size shrinkage
        eval_metric='mlogloss',   # Evaluation metric for multi-class classification
        use_label_encoder=False,  # Avoid warning for encoding
    ),
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

# Scores on the training and validation sets
score_train = bagging_model.score(X_train, y_train)
score_test = bagging_model.score(X_val, y_val)

# Predictions on the validation set, then the full metric report
bagging_y_pred = bagging_model.predict(X_val)
score_model(y_val, bagging_y_pred, score_train, score_test)

- using StandardScaler and with deleting only Number of Dependents i got score 0.423

In [None]:
# Earlier single-hidden-layer trials (settings - score - run time):
# (hidden_layer_sizes=(13,), max_iter=500, random_state=42) - 0.395 (no overfit) 12m 58s
# (hidden_layer_sizes=(15,), max_iter=500, random_state=42) - 0.407 (no overfit) 11m 34s
# (hidden_layer_sizes=(20,), max_iter=500, random_state=42) - 0.407 (no overfit) 15m 20s
# (hidden_layer_sizes=(10,), max_iter=500, random_state=42) - 0.389 (no overfit) 4m 6s
# (hidden_layer_sizes=(10,), max_iter=1000, random_state=42) - 0.389 (no overfit) 4m 8s

# Base learner: MLP with two hidden layers of 64 and 32 neurons
base_model = MLPClassifier(
    random_state=42,             # For reproducibility
    max_iter=200,                # Maximum number of iterations
    hidden_layer_sizes=(64, 32),
    activation='relu',           # ReLU activation function
    solver='adam',               # Adam optimizer
    alpha=0.0001,                # Regularization term (L2 penalty)
    learning_rate_init=0.001,    # Initial learning rate
)

# Bag 10 copies of the base learner; created and trained in one statement
# (fit returns the fitted estimator).
bagging_model = BaggingClassifier(
    estimator=base_model,
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

# Scores on the training and validation sets
score_train = bagging_model.score(X_train, y_train)
score_test = bagging_model.score(X_val, y_val)

# Predictions on the validation set, then the full metric report
bagging_y_pred = bagging_model.predict(X_val)
score_model(y_val, bagging_y_pred, score_train, score_test)
# try this one

In [None]:
# Stacking ensemble: logistic regression and XGBoost as base estimators, with an MLP
# as the final estimator that combines their predictions.
# Experiment notes (as logged):
#   0.440 - LR -> XGB -> MLP w/ a 0.015 difference in scores (3m 55s)
#   0.425 - LR -> MLP -> XGB w/ a 0.0099 difference in scores (37m)
#   0.410 - MLP -> XGB -> GBC w/ a 0.011 difference in scores (93m 35s)

base_models = [
    # The label is the key under which the fitted estimator is exposed
    # (named_estimators_ / get_params), so it must describe the estimator: this one
    # is a logistic regression, hence 'lr' rather than 'mlpc'.
    ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10)),
    ('xgb', xgb.XGBClassifier(
        n_estimators=110,         # Number of trees
        learning_rate=0.2,        # Step size shrinkage
        max_depth=7,              # Maximum depth of a tree
        random_state=42,          # For reproducibility
        use_label_encoder=False,  # Avoid warning for encoding
        eval_metric='mlogloss',   # Evaluation metric for multi-class classification
    )),
]

# Final estimator of the stack.
nn = MLPClassifier(
    hidden_layer_sizes=(64, 32),  # Two hidden layers: 64 and 32 neurons
    activation='relu',            # ReLU activation function
    solver='adam',                # Adam optimizer
    alpha=0.0001,                 # Regularization term (L2 penalty)
    learning_rate_init=0.001,     # Initial learning rate
    max_iter=200,                 # Maximum number of iterations
    random_state=42,              # For reproducibility
)

stacked_model = StackingClassifier(estimators=base_models, final_estimator=nn)
stacked_model.fit(X_train, y_train)

# Predict once on the validation set; the predictions are reused for every metric below.
y_pred = stacked_model.predict(X_val)

# Mean accuracy on the training data.
score_train = stacked_model.score(X_train, y_train)
# Same value as stacked_model.score(X_val, y_val), but computed from the predictions
# we already have instead of running the whole stack over X_val a second time.
score_test = accuracy_score(y_val, y_pred)

# Display the model metrics using the score_model function
score_model(y_val, y_pred, score_train, score_test)

- Using StandardScaler and dropping only the `Number of Dependents` feature, I got a score of 0.429.

Weighted Averaging

In [None]:
# NOTE: this entire cell is commented out — it is a disabled draft of a weighted-averaging
# ensemble (weights = normalized macro-F1 of each model) and nothing here is executed.
# NOTE(review): the draft takes only column 1 of each predict_proba output and thresholds
# the weighted sum into 0/1 predictions, so as written it can only ever predict two
# classes — rework (e.g. weight the full probability matrices and take argmax) before
# enabling it if the target has more than two classes.
# # lr_y_pred_f1   = f1_score(y_val, lr_y_pred, average='macro')
# # dt_y_pred_f1   = f1_score(y_val, dt_y_pred, average='macro')
# # knn_y_pred_f1  = f1_score(y_val, knn_y_pred, average='macro')
# mplc_y_pred_f1 = f1_score(y_val, mplc_y_pred, average='macro')
# # rf_y_pred_f1   = f1_score(y_val, rf_y_pred, average='macro')
# xgb_y_pred_f1  = f1_score(y_val, xgb_y_pred, average='macro')
# gbdt_y_pred_f1 = f1_score(y_val, gbdt_y_pred, average='macro')

# # f1_score(y_actual, y_predicted, average='macro')

# # Assign weights based on F1 scores
# #weights = [lr_y_pred_f1, dt_y_pred_f1, knn_y_pred_f1, mplc_y_pred_f1, rf_y_pred_f1, xgb_y_pred_f1, gbdt_y_pred_f1]
# weights = [mplc_y_pred_f1, xgb_y_pred_f1, gbdt_y_pred_f1]
# weights = np.array(weights) / np.sum(weights)  # Normalize weights

# # Make weighted predictions
# # lr_probs    = lr_model.predict_proba(X_val)[:, 1]
# # dt_probs    = decision_tree.predict_proba(X_val)[:, 1]
# # knn_probs   = knn_model.predict_proba(X_val)[:, 1]
# mplc_probs  = mlpc_model.predict_proba(X_val)[:, 1]
# # rf_probs    = rf_model.predict_proba(X_val)[:, 1]
# xgb_probs   = xgb_model.predict_proba(X_val)[:, 1]
# gbdt_probs  = gbdt_model.predict_proba(X_val)[:, 1]

# # Aggregate predictions using weights
# weighted_probs = (
#                     # weights[0] * lr_probs +
#                 #   weights[1] * dt_probs +
#                 #   weights[2] * knn_probs +
#                   weights[0] * mplc_probs + 
#                 #   weights[4] * rf_probs + 
#                   weights[1] * xgb_probs + 
#                   weights[2] * gbdt_probs)

# # Final predictions (NOTE(review): this comment used to say threshold = 0.5, but the line below uses 0.2)
# final_predictions = (weighted_probs >= 0.2).astype(int)

# # Evaluate the ensemble
# final_f1 = f1_score(y_val, final_predictions, average='macro')
# print(f"Weighted Ensemble F1 Score: {final_f1:.2f}")


<a class="anchor" id="kaggle">

## 11. Kaggle Submission
</a>

In [37]:
# Get the predictions for `test_data_c` from `xgb_model` (both defined in earlier cells).
# Earlier generic form, kept for reference: y_pred_test = model.predict(test_data)
# NOTE(review): predictions are made on `test_data_c`, while the submission cell takes
# the identifiers from `test_data.index` — confirm both have the same rows in the same order.
y_pred_test = xgb_model.predict(test_data_c)

In [None]:
# Display the raw predictions (still label-encoded; they are decoded in the next cell).
y_pred_test

In [None]:
# Decode the predicted labels back to their original values using `label_encoder`
# (defined in an earlier cell), then display the decoded array.
decoded_labels = label_encoder.inverse_transform(y_pred_test)
decoded_labels

In [None]:
# Sanity check: (rows, columns) of the test set that supplies the submission identifiers.
test_data.shape

In [None]:
# Build the submission table: one row per test claim, pairing its identifier with the
# decoded prediction.
# NOTE(review): assumes `test_data.index` holds the claim identifiers — confirm that the
# index was set accordingly in an earlier cell.
submission_columns = {
    "Claim Identifier": test_data.index,
    "Claim Injury Type": decoded_labels,
}
kaggle_submission = pd.DataFrame(submission_columns)

# Preview the first rows
kaggle_submission.head()

In [44]:
# Write the submission dataframe to a CSV file named "Kaggle_Submission.csv".
# The path is relative, so the file is created in the current working directory.
# If a file with the same name already exists, it is overwritten with the new output.
# index=False keeps the dataframe's row index out of the file.
kaggle_submission.to_csv("Kaggle_Submission.csv", index=False)