In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, plot_confusion_matrix, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler

In [16]:
data = pd.read_csv('TravelInsurancePrediction.csv', usecols=['Age', 'Employment Type', 'GraduateOrNot', 'AnnualIncome',
       'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer',
       'EverTravelledAbroad', 'TravelInsurance'])

data

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0
...,...,...,...,...,...,...,...,...,...
1982,33,Private Sector/Self Employed,Yes,1500000,4,0,Yes,Yes,1
1983,28,Private Sector/Self Employed,Yes,1750000,5,1,No,Yes,0
1984,28,Private Sector/Self Employed,Yes,1150000,6,1,No,No,0
1985,34,Private Sector/Self Employed,Yes,1000000,6,0,Yes,Yes,1


In [17]:
enc = OneHotEncoder()
transformed = enc.fit_transform(data[['Employment Type']])
data[enc.categories_[0]] = transformed.toarray()
data = data.drop(['Employment Type'], axis=1)
data['GraduateOrNot'] = data['GraduateOrNot'].replace({'Yes': 1, 'No': 0})
data['FrequentFlyer'] = data['FrequentFlyer'].replace({'Yes': 1, 'No': 0})
data['EverTravelledAbroad'] = data['EverTravelledAbroad'].replace({'Yes': 1, 'No': 0})

data

Unnamed: 0,Age,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance,Government Sector,Private Sector/Self Employed
0,31,1,400000,6,1,0,0,0,1.0,0.0
1,31,1,1250000,7,0,0,0,0,0.0,1.0
2,34,1,500000,4,1,0,0,1,0.0,1.0
3,28,1,700000,3,1,0,0,0,0.0,1.0
4,28,1,700000,8,1,1,0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1982,33,1,1500000,4,0,1,1,1,0.0,1.0
1983,28,1,1750000,5,1,0,1,0,0.0,1.0
1984,28,1,1150000,6,1,0,0,0,0.0,1.0
1985,34,1,1000000,6,0,1,1,1,0.0,1.0


In [24]:
rand = 0
x = data.drop(['TravelInsurance'], axis=1)
cols = x.columns
y = data['TravelInsurance']
ss = StandardScaler()
x = ss.fit_transform(x)
pd.DataFrame(x, columns=cols)

Unnamed: 0,Age,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,Government Sector,Private Sector/Self Employed
0,0.463430,0.417552,-1.414061,0.774964,1.612339,-0.515369,-0.486277,1.576694,-1.576694
1,0.463430,0.417552,0.842012,1.396373,-0.620217,-0.515369,-0.486277,-0.634238,0.634238
2,1.493446,0.417552,-1.148641,-0.467855,1.612339,-0.515369,-0.486277,-0.634238,0.634238
3,-0.566587,0.417552,-0.617800,-1.089265,1.612339,-0.515369,-0.486277,-0.634238,0.634238
4,-0.566587,0.417552,-0.617800,2.017783,1.612339,1.940358,-0.486277,-0.634238,0.634238
...,...,...,...,...,...,...,...,...,...
1982,1.150107,0.417552,1.505563,-0.467855,-0.620217,1.940358,2.056440,-0.634238,0.634238
1983,-0.566587,0.417552,2.169114,0.153554,1.612339,-0.515369,2.056440,-0.634238,0.634238
1984,-0.566587,0.417552,0.576591,0.774964,1.612339,-0.515369,-0.486277,-0.634238,0.634238
1985,1.493446,0.417552,0.178461,0.774964,-0.620217,1.940358,2.056440,-0.634238,0.634238


In [19]:
data.groupby('TravelInsurance').count()

Unnamed: 0_level_0,Age,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,Government Sector,Private Sector/Self Employed
TravelInsurance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1277,1277,1277,1277,1277,1277,1277,1277,1277
1,710,710,710,710,710,710,710,710,710


In [20]:
ros = RandomOverSampler(random_state=rand)
x_oversampled, y_oversampled = ros.fit_resample(x, y)
x_oversampled, y_oversampled = shuffle(x_oversampled, y_oversampled, random_state=rand)
x_train, x_test, y_train, y_test = train_test_split(x_oversampled, y_oversampled, test_size=0.2, random_state=rand)
pd.concat([pd.DataFrame(x_oversampled), pd.DataFrame(y_oversampled)], axis=1).groupby('TravelInsurance').count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
TravelInsurance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1277,1277,1277,1277,1277,1277,1277,1277,1277
1,1277,1277,1277,1277,1277,1277,1277,1277,1277


In [22]:
rf = BaggingClassifier(n_estimators=600, random_state=rand, bootstrap_features=True)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f'Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred)}')
print(f'F1 accuracy score: {f1_score(y_test, y_pred)}')
print(f'ROC AUC accuracy score: {roc_auc_score(y_test, y_pred)}')

Balanced accuracy score: 0.8447503986262725
F1 accuracy score: 0.8377823408624231
ROC AUC accuracy score: 0.8447503986262725
