In [1]:
from os.path import join
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

## Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Import Cross Validation methods
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
sns.set()

<a class="anchor" id="importdatasets">

## 1. Import Datasets

</a>

Import datasets that you got from the notebook **Preprocessing for Agreement Reached**

In [2]:
df_train = pd.read_csv("train_encoded.csv", low_memory=False)
df_val = pd.read_csv("validation_encoded.csv")
df_test = pd.read_csv("test_encoded.csv")

In [3]:
# df_train.head()

In [4]:
# df_val.head()

In [5]:
# df_test.head()

In [6]:
df_train.set_index('Claim Identifier', inplace=True)
df_val.set_index('Claim Identifier', inplace=True)
df_test.set_index('Claim Identifier', inplace=True)

In [7]:
#Just to check the index
# df_val.head()

Defining the target variable

In [8]:
X_train = df_train.drop('Agreement Reached', axis= 1)
y_train = df_train['Agreement Reached']

X_val = df_val.drop('Agreement Reached', axis= 1)
y_val = df_val['Agreement Reached']

### 1.1 Encode Target Variable
Label Encoder for target variable (training and validation):
<br/> <br/>
(This needs to be done in both the proprocessing notebook as well as here to be able to interpret the results properly when a model is tested.)

In [9]:
from sklearn.preprocessing import LabelEncoder

#Initiate Label encoder
label_encoder = LabelEncoder()

#Fit the encoder on the training target variable
Y_train_encoded = label_encoder.fit_transform(y_train)

#Transform the training and validation target variable
Y_val_encoded = label_encoder.transform(y_val)

y_val_unencoded = y_train.copy()

#Convert the results back to DataFrames while overriding the previous variable names
y_train = pd.DataFrame(Y_train_encoded, columns=['encoded_target'], index=pd.Series(y_train.index))
y_val = pd.DataFrame(Y_val_encoded, columns=['encoded_target'], index=pd.Series(y_val.index))

In [10]:
# Functions to help display metrics for all models

# helper method for score_model - not to be used seperately
def print_scores(per_class):
    for x,y in zip(per_class, np.unique(y_val_unencoded)):
        if str(y) == "7. PTD": # add an extra tab for better alignment
            print("["+str(y)+"]:     \t\t" + str(round(x,2))) 
        else:
            print("["+str(y)+"]:     \t" + str(round(x,2)))

# displays the scores for Precision, Recall, and F1
def score_model(y_actual, y_predicted, score_train, score_test):

    print("------------ F1 ------------")
    f1_per_class = f1_score(y_actual, y_predicted, average=None)
    print_scores(f1_per_class)#, y_actual)
    f1_per_weighted = f1_score(y_actual, y_predicted, average='macro')
    print("\nMacro f1: " + str(round(f1_per_weighted, 3)) + "\n")

    print("------ Individual Score Comparisons ------ ")
    print("Train Score: " + str(score_train))
    print("Test Score: " + str(score_test))
    diff = np.abs(score_train - score_test)
    print("Difference: " + str(diff))

    print("--------- Accuracy ---------\n")
    acc_score = accuracy_score(y_actual, y_predicted)
    print("Accuracy Score: " + str(acc_score) + "\n")

    print("--------- Precision ---------")
    precision_per_class = precision_score(y_actual, y_predicted, average=None)
    print_scores(precision_per_class)#, y_actual)
    precision_weighted = precision_score(y_actual, y_predicted, average='macro')
    print("\nMacro precision: " + str(round(precision_weighted, 3)) + "\n")

    print("---------- Recall ----------")
    recall_per_class = recall_score(y_actual, y_predicted, average=None)
    print_scores(recall_per_class)#, y_actual)
    recall_per_weighted = recall_score(y_actual, y_predicted, average='macro')
    print("\nMacro recall: " + str(round(recall_per_weighted, 3)) + "\n")


Undersampling

In [16]:
import pandas as pd
from sklearn.utils import resample

# Assuming X_train and y_train are your original training data
training_data = pd.concat([X_train, y_train], axis=1)

# Identify minority and majority classes
class_counts = y_train.value_counts()
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()

# Separate minority and majority data
minority_data = training_data[training_data['encoded_target'] == minority_class]
majority_data = training_data[training_data['encoded_target'] == majority_class]

# Calculate the target size for undersampling (using integer division)
target_size = len(minority_data) * 3

# Function to undersample a specific class
def undersample_class(data, n):
    return resample(data, n_samples=n, replace=False, random_state=42)

# Undersample the majority class
undersampled_majority = undersample_class(majority_data, target_size)

# Combine undersampled majority class with minority class
balanced_data = pd.concat([undersampled_majority, minority_data])

# Separate features and target
X_train_balanced = balanced_data.drop(columns='encoded_target')
y_train_balanced = balanced_data['encoded_target']

# Check class distribution after undersampling
print("Class distribution after undersampling:")
print(y_train_balanced.value_counts().sort_index())


Class distribution after undersampling:
encoded_target
0    54147
1    18049
Name: count, dtype: int64


We've decided to try this model, because it was one of the best models, based on f1_macro, to predict Claim Injury Type

In [12]:
xgb_model = xgb.XGBClassifier(
    n_estimators=110,  # Number of trees
    learning_rate=0.2,  # Step size shrinkage
    max_depth=7,       # Maximum depth of a tree
    random_state=42,   # For reproducibility
    use_label_encoder=False,  # Avoid warning for encoding
    eval_metric='mlogloss'    # Evaluation metric for multi-class classification
)

# Train the model
xgb_model.fit(X_train, y_train)

# Determine the scores for the model for both train and validation sets
score_train = xgb_model.score(X_train, y_train)  # Accuracy on training data
score_test = xgb_model.score(X_val, y_val)      # Accuracy on validation data

# Use the model to predict on the validation set
xgb_y_pred = xgb_model.predict(X_val)

# Display the model metrics using the score_model function
score_model(y_val, xgb_y_pred, score_train, score_test)

Parameters: { "use_label_encoder" } are not used.



------------ F1 ------------
[0]:     	0.98
[1]:     	0.2

Macro f1: 0.589

------ Individual Score Comparisons ------ 
Train Score: 0.960822405291702
Test Score: 0.95666287280498
Difference: 0.004159532486722073
--------- Accuracy ---------

Accuracy Score: 0.95666287280498

--------- Precision ---------
[0]:     	0.96
[1]:     	0.72

Macro precision: 0.84

---------- Recall ----------
[0]:     	1.0
[1]:     	0.12

Macro recall: 0.557



Since the model is not capable of predictiong that well the class 1, we are going to perform the same model with undersampling

In [17]:
xgb_model_under = xgb.XGBClassifier(
    n_estimators=110,  # Number of trees
    learning_rate=0.2,  # Step size shrinkage
    max_depth=7,       # Maximum depth of a tree
    random_state=42,   # For reproducibility
    use_label_encoder=False,  # Avoid warning for encoding
    eval_metric='mlogloss'    # Evaluation metric for multi-class classification
)

# Train the model
xgb_model_under.fit(X_train_balanced, y_train_balanced)

# Determine the scores for the model for both train and validation sets
score_train_under = xgb_model_under.score(X_train_balanced, y_train_balanced)  # Accuracy on training data
score_test_under = xgb_model_under.score(X_val, y_val)      # Accuracy on validation data

# Use the model to predict on the validation set
xgb_y_pred_under = xgb_model_under.predict(X_val)

# Display the model metrics using the score_model function
score_model(y_val, xgb_y_pred_under, score_train, score_test)

Parameters: { "use_label_encoder" } are not used.



------------ F1 ------------
[0]:     	0.94
[1]:     	0.4

Macro f1: 0.668

------ Individual Score Comparisons ------ 
Train Score: 0.960822405291702
Test Score: 0.95666287280498
Difference: 0.004159532486722073
--------- Accuracy ---------

Accuracy Score: 0.8880191396450804

--------- Precision ---------
[0]:     	0.99
[1]:     	0.27

Macro precision: 0.627

---------- Recall ----------
[0]:     	0.89
[1]:     	0.79

Macro recall: 0.842



<hr>

In [29]:
base_models = [
    ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10)),
    ('xgb', xgb.XGBClassifier(
    n_estimators=110,  # Number of trees
    learning_rate=0.2,  # Step size shrinkage
    max_depth=7,       # Maximum depth of a tree
    random_state=42,   # For reproducibility
    use_label_encoder=False,  # Avoid warning for encoding
    eval_metric='mlogloss'    # Evaluation metric for multi-class classification
) )
]

nn = MLPClassifier(hidden_layer_sizes=(64, 32),  # Two hidden layers: 64 and 32 neurons
                      activation='relu',           # ReLU activation function
                      solver='adam',               # Adam optimizer
                      alpha=0.0001,                # Regularization term (L2 penalty)
                      learning_rate_init=0.001,    # Initial learning rate
                      max_iter=200,                # Maximum number of iterations
                      random_state=42) 

stacked_model = StackingClassifier(estimators=base_models, final_estimator=nn)
stacked_model.fit(X_train_balanced, y_train_balanced)
y_pred = stacked_model.predict(X_val)

score_train = stacked_model.score(X_train_balanced, y_train_balanced)
score_test = stacked_model.score(X_val, y_val)

score_model(y_val, y_pred, score_train, score_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also re

------------ F1 ------------
[0]:     	0.94
[1]:     	0.4

Macro f1: 0.668

------ Individual Score Comparisons ------ 
Train Score: 0.8970857111197297
Test Score: 0.8880017188516213
Difference: 0.00908399226810841
--------- Accuracy ---------

Accuracy Score: 0.8880017188516213

--------- Precision ---------
[0]:     	0.99
[1]:     	0.27

Macro precision: 0.627

---------- Recall ----------
[0]:     	0.89
[1]:     	0.79

Macro recall: 0.843



In [30]:
base_models = [
    ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10)),
    ('xgb', xgb.XGBClassifier(
    n_estimators=110,  # Number of trees
    learning_rate=0.2,  # Step size shrinkage
    max_depth=7,       # Maximum depth of a tree
    random_state=42,   # For reproducibility
    use_label_encoder=False,  # Avoid warning for encoding
    eval_metric='mlogloss'    # Evaluation metric for multi-class classification
) )
]

nn = MLPClassifier(hidden_layer_sizes=(64, 32),  # Two hidden layers: 64 and 32 neurons
                      activation='relu',           # ReLU activation function
                      solver='adam',               # Adam optimizer
                      alpha=0.0001,                # Regularization term (L2 penalty)
                      learning_rate_init=0.001,    # Initial learning rate
                      max_iter=200,                # Maximum number of iterations
                      random_state=42) 

stacked_model = StackingClassifier(estimators=base_models, final_estimator=nn)
stacked_model.fit(X_train, y_train)
y_pred = stacked_model.predict(X_val)

score_train = stacked_model.score(X_train, y_train)
score_test = stacked_model.score(X_val, y_val)

score_model(y_val, y_pred, score_train, score_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



------------ F1 ------------
[0]:     	0.98
[1]:     	0.19

Macro f1: 0.583

------ Individual Score Comparisons ------ 
Train Score: 0.9604052213336818
Test Score: 0.9567035213230511
Difference: 0.0037017000106306375
--------- Accuracy ---------

Accuracy Score: 0.9567035213230511

--------- Precision ---------
[0]:     	0.96
[1]:     	0.75

Macro precision: 0.855

---------- Recall ----------
[0]:     	1.0
[1]:     	0.11

Macro recall: 0.553



<hr>

Now let's make our predictions, For the model without Undersampling:

In [25]:
# Make predictions on X_test
xgb_test_predictions = xgb_model.predict(df_test)

In [26]:
# Get probability scores for each class
xgb_test_probabilities = xgb_model.predict_proba(df_test)

In [27]:
# Create a DataFrame with predictions 
agreement_reached_predictions = pd.DataFrame({
    'Agreement Reached': xgb_test_predictions
}, index=df_test.index)

Now let's make our predictions, For the model with Undersampling:

In [18]:
# Make predictions on X_test
xgb_test_predictions_under = xgb_model_under.predict(df_test)

In [19]:
# Get probability scores for each class
xgb_test_probabilities_under = xgb_model_under.predict_proba(df_test)

In [20]:
# Create a DataFrame with predictions 
agreement_reached_predictions_under = pd.DataFrame({
    'Agreement Reached': xgb_test_predictions_under
}, index=df_test.index)

<hr>

In [24]:
print(f"Number of 0s with undersampling: {len(agreement_reached_predictions_under[agreement_reached_predictions_under['Agreement Reached'] == 0])}")
print(f"Number of 1s with undersampling: {len(agreement_reached_predictions_under[agreement_reached_predictions_under['Agreement Reached'] == 1])}")

Number of 0s with undersampling: 357911
Number of 1s with undersampling: 30064


In [28]:
print(f"Number of 0s without undersampling: {len(agreement_reached_predictions[agreement_reached_predictions['Agreement Reached'] == 0])}")
print(f"Number of 1s without undersampling: {len(agreement_reached_predictions[agreement_reached_predictions['Agreement Reached'] == 1])}")

Number of 0s without undersampling: 386982
Number of 1s without undersampling: 993


Download predictions for XGBoost

In [98]:
# agreement_reached_predictions.to_csv('Agreement_Reached_Predictions.csv')

Download predictions for XGBoost with Undersampling
Since our main goal is to maximize the f1_macro the one chosen was this one!

In [99]:
agreement_reached_predictions_under.to_csv('Agreement_Reached_Predictions_under.csv')

In [100]:
# df_test = pd.concat([df_test, agreement_reached_predictions], axis=1)
# df_test.to_csv('df_test_AR.csv')