# Travel Insurance Claims Prediction

Change log
| Date     | Name | Changes | To-Do | 
| -------- | -----| --------|-------|
| 2022/08/11 | Matthew | FE & Modelling | 1. Reproducible code (use functions) <br> 2. Focus on FE & imbalance rather than modelling <br> 3. File structure (separate notebook and data) 

In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import autokeras as ak

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from autosklearn.classification import AutoSklearnClassifier
from imblearn.over_sampling import *
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier, BalancedRandomForestClassifier
from pandas_profiling import ProfileReport

### Load & Quick Examination of the Dataset

In [34]:
travel_df = pd.read_csv('travel_insurance_dataset.csv')

In [35]:
travel_df.head()
# 63326 rows

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


### EDA using Pandas Profiling

In [55]:
prof = ProfileReport(travel_df)
prof.to_file(output_file='travel_eda.html')

Summarize dataset: 100%|██████████| 41/41 [00:50<00:00,  1.23s/it, Completed]                                         
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.23s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 168.37it/s]


### Feature Engineering

In [47]:
# add to remove duplicated data
def feature_engineering(df):
        df.loc[df['Age']>100,'Age'] = np.NaN 
        df = pd.get_dummies(df,drop_first=True)
        col = df.columns
        imputer = KNNImputer(n_neighbors=2)
        df = pd.DataFrame(imputer.fit_transform(df))
        df.columns = col
    return df

cleaned_df = feature_engineering(travel_df)

### Data Splitting

In [63]:
def data_splitting(df,target_col,test_size):
    global X_train, X_test, y_train, y_test
    X = df.loc[:, df.columns != target_col]
    y = df.loc[:, target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 0)

data_splitting(cleaned_df,'Claim_Yes',0.3)

### Imbalanced Data

In [64]:
cleaned_df['Claim_Yes'].value_counts()

0.0    62409
1.0      917
Name: Claim_Yes, dtype: int64

SMOTE

In [98]:
sm = SMOTE(random_state = 1)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

Modelling

In [None]:
import xgboost as xgb
# fit model to training data
model = xgb.XGBRFClassifier(n_estimators=1000, random_state=100)
# cross validation score
score = cross_val_score(model, X_train_res, y_train_res, cv=2, scoring="roc_auc", n_jobs=-1)
print("XGB ROC-AUC Mean Score: ", np.mean(score))

In [111]:
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings('ignore')

In [115]:
# Convert to special data format
# https://xgboost.readthedocs.io/en/latest/python/python_intro.html
dtrain = xgb.DMatrix(X_train_res, y_train_res, feature_names=X_train.columns.values)

def hyp_xgb(max_depth, subsample, colsample_bytree,min_child_weight, gamma, learning_rate):
    params = {
    'objective': 'binary:logistic',
    'eval_metric':'auc',
    'nthread':-1
     }
    
    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['learning_rate'] = learning_rate
    scores = xgb.cv(params, dtrain, num_boost_round=500,verbose_eval=False, 
                    early_stopping_rounds=10, nfold=3)
    return scores['test-auc-mean'].iloc[-1]

In [116]:
pds ={
  'min_child_weight':(3, 20),
  'gamma':(0, 10),
  'subsample':(0.5, 1),
  'colsample_bytree':(0.1, 1),
  'max_depth': (2, 15),
  'learning_rate': (0.01, 0.5)
}

In [117]:
optimizer = BayesianOptimization(hyp_xgb, pds, random_state=1)
optimizer.maximize(init_points=4, n_iter=10)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9848  [0m | [0m 0.4753  [0m | [0m 7.203   [0m | [0m 0.01006 [0m | [0m 5.93    [0m | [0m 5.495   [0m | [0m 0.5462  [0m |
| [95m 2       [0m | [95m 0.9976  [0m | [95m 0.2676  [0m | [95m 3.456   [0m | [95m 0.2044  [0m | [95m 9.005   [0m | [95m 10.13   [0m | [95m 0.8426  [0m |
| [0m 3       [0m | [0m 0.9967  [0m | [0m 0.284   [0m | [0m 8.781   [0m | [0m 0.02342 [0m | [0m 10.72   [0m | [0m 10.09   [0m | [0m 0.7793  [0m |
| [0m 4       [0m | [0m 0.9976  [0m | [0m 0.2263  [0m | [0m 1.981   [0m | [0m 0.4024  [0m | [0m 14.59   [0m | [0m 8.328   [0m | [0m 0.8462  [0m |


TypeError: 'float' object is not subscriptable

In [118]:
optimizer.max['params']

{'colsample_bytree': 0.2676341902399038,
 'gamma': 3.4556072704304777,
 'learning_rate': 0.20441606237302828,
 'max_depth': 9.00461754204364,
 'min_child_weight': 10.126306744856013,
 'subsample': 0.8426097501983798}

In [119]:
# Copied from above
# Some params need to be an integer
params = {
    'colsample_bytree': 0.2676341902399038,
    'gamma': 3.4556072704304777,
    'learning_rate': 0.20441606237302828,
    'max_depth': 9,
    'min_child_weight': 10,
    'subsample': 0.8426097501983798,
    'objective': 'binary:logistic',
    'eval_metric':'auc',
    'n_jobs':-1
}

In [120]:
xgbr =  xgb.XGBClassifier(**params, random_state=12345, nthread=-1)
xgbr.fit(X_train_res, y_train_res)

In [129]:
y_pred = xgbr.predict(X_test)
testing_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test,y_pred)
print("Test Accuracy score {0}".format(testing_accuracy))
print("ROC AUC Score {0}".format(roc_auc))
print(classification_report(y_test, y_pred))

Test Accuracy score 0.9764712074955259
ROC AUC Score 0.5346194761650499
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     18751
         1.0       0.08      0.08      0.08       247

    accuracy                           0.98     18998
   macro avg       0.54      0.53      0.54     18998
weighted avg       0.98      0.98      0.98     18998



In [79]:
def ml_modelling(xtrain,ytrain,xtest,ytest):
    global model, y_pred
    model = AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        tmp_folder='/tmp/autosklearn_resampling',
        disable_evaluator_output=False,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5}
    )
    model.fit(xtrain,ytrain)
    y_pred = model.predict(xtest)

Modelling (Auto-Sklearn)

In [82]:
# With imbalanced dataset
ml_modelling(X_train,y_train,X_test,y_test)
testing_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test,y_pred)
print("Test Accuracy score {0}".format(testing_accuracy))
print("ROC AUC Score {0}".format(roc_auc))
print(classification_report(y_test, y_pred))

Test Accuracy score 0.9869986314348879
ROC AUC Score 0.5
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     18751
         1.0       0.00      0.00      0.00       247

    accuracy                           0.99     18998
   macro avg       0.49      0.50      0.50     18998
weighted avg       0.97      0.99      0.98     18998



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
# With balanced dataset
combinations = [
    SMOTEENN(random_state=7),
    SMOTETomek(random_state=7)
]

for method in combinations:
    X_train_res, y_train_res = method.fit_resample(X_train, y_train.ravel())
    ml_modelling(X_train_res,y_train_res,X_test,y_test)
    roc_auc = roc_auc_score(y_test,y_pred)
    print("ROC AUC Score {0}".format(roc_auc))

ROC AUC Score 0.5
ROC AUC Score 0.5


Modelling (Auto-Keras)

In [50]:
# It tries 10 different models.
clf = ak.StructuredDataClassifier(overwrite=True, max_trials=3)
# Feed the structured data classifier with training data.
clf.fit(X_train_res, y_train_res, epochs=10)
# Predict with the best model.
predicted_y = clf.predict(X_test)
# Evaluate the best model with testing data.
print(clf.evaluate(X_test, y_test))

Trial 3 Complete [00h 10m 28s]
val_accuracy: 1.0

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 31m 37s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[3039.328125, 0.015264764428138733]


In [51]:
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     18748
         1.0       0.01      1.00      0.03       250

    accuracy                           0.02     18998
   macro avg       0.51      0.50      0.02     18998
weighted avg       0.99      0.02      0.00     18998

