# Model Training

In [18]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from imblearn.over_sampling import SMOTE


#### Load Dataset

In [2]:
file_name = '../data/cleaned_auto_insurance_claims.csv'
data = pd.read_csv(file_name)

#### Split the Data into Training and Testing sets

In [12]:
# Split the data into X and y
X = data.drop(columns=['fraud_reported'])
y = data['fraud_reported']

smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Train the model

In [13]:
# Training a knn model

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'KNN Model - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

KNN Model - Accuracy: 0.838495575221239, Precision: 0.7921146953405018, Recall: 0.9364406779661016, F1: 0.858252427184466


In [14]:
# Training a logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Logistic Regression Model - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

Logistic Regression Model - Accuracy: 0.7809734513274337, Precision: 0.8215962441314554, Recall: 0.7415254237288136, F1: 0.7795100222717148


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Training a Random Forest Model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Random Forest Model - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

Random Forest Model - Accuracy: 0.8694690265486725, Precision: 0.8968609865470852, Recall: 0.847457627118644, F1: 0.8714596949891068


In [16]:
# Training a Support Vector Machine Model
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Support Vector Machine Model - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

Support Vector Machine Model - Accuracy: 0.8053097345132744, Precision: 0.8333333333333334, Recall: 0.7838983050847458, F1: 0.8078602620087337


In [20]:
# Training a stacking ensemble with gradient boosting
# classifier with knn and gradient boosting classifier
# as base models and logistic regression as the meta learner

base_models = [ 
    ('gb', GradientBoostingClassifier( 
        n_estimators=150, 
        learning_rate=0.1, 
        max_depth=3, 
        random_state=42)), 
    ('knn', KNeighborsClassifier( 
        n_neighbors=1)) ] 
meta_learner = LogisticRegression( max_iter=1000)

model = StackingClassifier(
    estimators=base_models, 
    final_estimator=meta_learner) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Stacking Model - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')


Stacking Model - Accuracy: 0.9070796460176991, Precision: 0.9041666666666667, Recall: 0.9194915254237288, F1: 0.9117647058823529


### Summary

- Different models were tested and evaluated, including a Random Forest, SVM, Logistic Regression, K Nearest Neighbor, and Gradient Boosting Classifier
- Creating a Stacking Ensemble with a combination of these models yields the best evaulation metrics with scores at 90%+
- The stacking ensemble was tuned through use of trial and error, as using other measures such as GridSearchCV was causing extremely long processing times 

Next Steps:
- Implement model within a Flask application