# insurance-prediction Notebook

This notebook covers:
- Data preprocessing
- Creating models

### Importing libraries

In [1]:
# dataframe and plotting
import pandas as pd
import numpy as np

# machine learning
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, f1_score,roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split ,StratifiedKFold,GridSearchCV

##feature Scaling
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings('ignore')

###  Load the dataset

In [2]:
# Load files into a pandas dataframe
train = pd.read_csv('cleaned_train.csv')
test = pd.read_csv('cleaned_test.csv')

In [3]:
train.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,52.0,0,1,0,1,1,290.0,1,1960.0,.,5,0
1,H2037,2015,52.0,0,0,1,0,0,490.0,1,1850.0,4,5,0
2,H3802,2014,52.0,0,1,0,1,1,595.0,1,1960.0,.,5,0
3,H3834,2013,52.0,0,0,0,1,1,2840.0,1,1960.0,.,5,0
4,H5053,2014,52.0,0,0,1,0,0,680.0,1,1800.0,3,5,0


## Data preprocessing

In [4]:
#getting dummies for the number of windows column

wind_train_dummies = pd.get_dummies(train['NumberOfWindows'])
wind_test_dummies = pd.get_dummies(test['NumberOfWindows'])

In [5]:
#adding the dummy column to the original column

train = pd.concat([train, wind_train_dummies], axis=1)
test = pd.concat([test, wind_test_dummies], axis=1)

In [6]:
train = train.drop('NumberOfWindows', axis=1)
test = test.drop('NumberOfWindows', axis=1)

In [7]:
test_2=test.copy()

In [8]:
train.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,...,1,2,3,4,5,6,7,8,9,>=10
0,H14663,2013,52.0,0,1,0,1,1,290.0,1,...,0,0,0,0,0,0,0,0,0,0
1,H2037,2015,52.0,0,0,1,0,0,490.0,1,...,0,0,0,1,0,0,0,0,0,0
2,H3802,2014,52.0,0,1,0,1,1,595.0,1,...,0,0,0,0,0,0,0,0,0,0
3,H3834,2013,52.0,0,0,0,1,1,2840.0,1,...,0,0,0,0,0,0,0,0,0,0
4,H5053,2014,52.0,0,0,1,0,0,680.0,1,...,0,0,1,0,0,0,0,0,0,0


In [9]:
train = train.drop('Customer Id', axis=1)
test_data = test.drop('Customer Id', axis=1)

In [10]:
test_data2=test_data.copy()

In [11]:
#Separate training features from target
X_train = train.drop(['Claim'], axis=1)
y_train = train['Claim']

In [12]:
# Split train_data
from sklearn.model_selection import train_test_split

X_Train, X_Val, y_Train, y_val = train_test_split(X_train, y_train, stratify = y_train, 
                                                  test_size = 0.1, random_state=42)

## model building using XGBoost

In [13]:
xg_model = XGBClassifier()

In [14]:
# Optimize model parameters
param_grid = {'min_child_weighth': [1, 3],
        'n_estimators':[600,700,500],
        'learning_rate':[0.1,0.05],
        'colsample_bylevel':[0.8],
        'reg_alpha':[0.8],              
        'subsample': [0.8, 1.0],
        'max_depth': [3, 5]
        }
model = GridSearchCV(xg_model, param_grid,scoring='roc_auc',n_jobs=-1,verbose=2)

In [15]:
model.fit(X_Train,y_Train)
print(model.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Parameters: { "min_child_weighth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'colsample_bylevel': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weighth': 1, 'n_estimators': 500, 'reg_alpha': 0.8, 'subsample': 1.0}


In [16]:
print(model.best_score_)

0.7177620540276213


## model evaluation

In [17]:
print(roc_auc_score(y_val, model.predict(X_Val)))

0.5822063701616392


In [18]:
# Get the predicted result for the test Data
test.Claim = model.predict_proba(test_data)[:, 1]

  test.Claim = model.predict_proba(test_data)[:, 1]


## model building using CATBoost

In [22]:
cat_model=CatBoostClassifier()

In [26]:
# Optimize model parameters
param_grid2 = {'learning_rate':[0.1,0.05],
        'n_estimators':[800,600,500],
        'od_wait':[50],
        'reg_lambda':[3],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
model2 = GridSearchCV(cat_model, param_grid2,scoring='roc_auc',n_jobs=-1,verbose=2)

In [27]:
model2.fit(X_Train,y_Train)
print(model2.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
0:	learn: 0.6719583	total: 155ms	remaining: 1m 17s
1:	learn: 0.6521151	total: 164ms	remaining: 40.8s
2:	learn: 0.6357051	total: 174ms	remaining: 28.9s
3:	learn: 0.6206144	total: 184ms	remaining: 22.8s
4:	learn: 0.6067641	total: 193ms	remaining: 19.1s
5:	learn: 0.5938063	total: 202ms	remaining: 16.6s
6:	learn: 0.5821238	total: 210ms	remaining: 14.8s
7:	learn: 0.5727237	total: 219ms	remaining: 13.5s
8:	learn: 0.5645690	total: 228ms	remaining: 12.4s
9:	learn: 0.5564260	total: 237ms	remaining: 11.6s
10:	learn: 0.5493588	total: 246ms	remaining: 10.9s
11:	learn: 0.5434443	total: 254ms	remaining: 10.3s
12:	learn: 0.5384107	total: 263ms	remaining: 9.87s
13:	learn: 0.5337972	total: 272ms	remaining: 9.44s
14:	learn: 0.5287416	total: 281ms	remaining: 9.1s
15:	learn: 0.5247368	total: 290ms	remaining: 8.78s
16:	learn: 0.5207101	total: 298ms	remaining: 8.47s
17:	learn: 0.5170538	total: 305ms	remaining: 8.18s
18:	learn: 0.5139597	total: 31

173:	learn: 0.4642745	total: 856ms	remaining: 1.6s
174:	learn: 0.4641626	total: 859ms	remaining: 1.6s
175:	learn: 0.4640980	total: 863ms	remaining: 1.59s
176:	learn: 0.4640318	total: 866ms	remaining: 1.58s
177:	learn: 0.4639675	total: 869ms	remaining: 1.57s
178:	learn: 0.4638584	total: 873ms	remaining: 1.56s
179:	learn: 0.4637672	total: 877ms	remaining: 1.56s
180:	learn: 0.4637247	total: 881ms	remaining: 1.55s
181:	learn: 0.4636258	total: 885ms	remaining: 1.54s
182:	learn: 0.4635746	total: 888ms	remaining: 1.54s
183:	learn: 0.4634685	total: 892ms	remaining: 1.53s
184:	learn: 0.4634115	total: 895ms	remaining: 1.52s
185:	learn: 0.4633810	total: 900ms	remaining: 1.52s
186:	learn: 0.4633015	total: 904ms	remaining: 1.51s
187:	learn: 0.4632118	total: 909ms	remaining: 1.51s
188:	learn: 0.4631301	total: 912ms	remaining: 1.5s
189:	learn: 0.4630626	total: 915ms	remaining: 1.49s
190:	learn: 0.4630422	total: 918ms	remaining: 1.49s
191:	learn: 0.4629866	total: 921ms	remaining: 1.48s
192:	learn: 0.4

386:	learn: 0.4486946	total: 1.56s	remaining: 455ms
387:	learn: 0.4486381	total: 1.56s	remaining: 451ms
388:	learn: 0.4485494	total: 1.57s	remaining: 447ms
389:	learn: 0.4485079	total: 1.57s	remaining: 443ms
390:	learn: 0.4484623	total: 1.57s	remaining: 439ms
391:	learn: 0.4484160	total: 1.58s	remaining: 434ms
392:	learn: 0.4483539	total: 1.58s	remaining: 430ms
393:	learn: 0.4483190	total: 1.58s	remaining: 426ms
394:	learn: 0.4482121	total: 1.59s	remaining: 422ms
395:	learn: 0.4481445	total: 1.59s	remaining: 418ms
396:	learn: 0.4480859	total: 1.59s	remaining: 414ms
397:	learn: 0.4480552	total: 1.6s	remaining: 409ms
398:	learn: 0.4479830	total: 1.6s	remaining: 405ms
399:	learn: 0.4479296	total: 1.6s	remaining: 401ms
400:	learn: 0.4479212	total: 1.61s	remaining: 397ms
401:	learn: 0.4478978	total: 1.61s	remaining: 392ms
402:	learn: 0.4478555	total: 1.61s	remaining: 388ms
403:	learn: 0.4477649	total: 1.61s	remaining: 384ms
404:	learn: 0.4477280	total: 1.62s	remaining: 379ms
405:	learn: 0.4

In [28]:
print(model2.best_score_)

0.7226135313812564


## model evaluation

In [29]:
print(roc_auc_score(y_val, model2.predict(X_Val)))

0.584563840291106


In [30]:
# Get the predicted result for the test Data
test_2.Claim = model2.predict_proba(test_data2)[:, 1]

  test_2.Claim = model2.predict_proba(test_data2)[:, 1]
