In [185]:
# import the neccessary algorithms
# I will be using Logistic Regression
# The competition says we should AUC as our metric

In [223]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,train_test_split,LeaveOneOut,StratifiedKFold
from sklearn.metrics import roc_auc_score,auc
from sklearn.preprocessing import MinMaxScaler,StandardScaler


In [224]:
# Read in all the data including your submission

In [225]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
submission = pd.read_csv('sample_submission.csv')

In [226]:
new_submission = pd.DataFrame()
new_submission['Customer Id'] = test['Customer Id']
new_submission['Claim'] = np.zeros(len(test))

In [227]:
#checking the first three rows train data 
train.head(3)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0


In [228]:
#checking the first three rows test data 
test.head(3)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310


In [229]:
# This is an example of what our submission file to look like
submission.head()

Unnamed: 0,Customer Id,Claim
0,H0,1
1,H10000,1
2,H10001,1
3,H10002,1
4,H10003,1


You'll notice that the test data doesn't have the column <b>Claim</b> because that's what are predicting in this project

In [230]:
# checking the info of the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7054 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   6652 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [231]:

print("Test shape",test.shape)
print("Train shape",train.shape)

Test shape (3069, 13)
Train shape (7160, 14)


In [232]:
# you see we have a lot of missing values
train.isnull().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [233]:
# let deal with that

In [234]:
train['Garden'].fillna(train['Garden'].mode()[0],inplace=True)
train['Building Dimension'].fillna(train['Building Dimension'].mean(),inplace=True)
train['Date_of_Occupancy'].fillna(np.round(train['Date_of_Occupancy'].mean()),inplace=True)
train['Geo_Code'].fillna(train['Garden'].mode()[0],inplace=True)

test['Garden'].fillna(test['Garden'].mode()[0],inplace=True)
test['Building Dimension'].fillna(test['Building Dimension'].mean(),inplace=True)
test['Date_of_Occupancy'].fillna(np.round(test['Date_of_Occupancy'].mean()),inplace=True)
test['Geo_Code'].fillna(train['Garden'].mode()[0],inplace=True)

In [235]:
train.isna().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
Claim                 0
dtype: int64

In [236]:
#convering categorical variable into numerical values
# Here i used label encoding


In [237]:
catgorcial_cols = ['Building_Painted','Building_Fenced','Garden','Settlement']

In [238]:
for col in catgorcial_cols:
    train[col] = train[col].factorize()[0] # x.factorize() is a pandas way to achieve label encoding without using sklearn
    test[col] = test[col].factorize()[0]
    

In [239]:
# I drop some columns based on my intuition, try checking them later may be you could generate ineresting features 

In [240]:
to_drop = ['Customer Id','NumberOfWindows','Geo_Code']

In [241]:
train.drop(to_drop,axis=1,inplace=True)
test.drop(to_drop,axis=1,inplace=True)

In [242]:
train

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,Claim
0,2013,1.000000,0,0,0,0,0,290.00000,1,1960.0,0
1,2015,1.000000,0,1,1,1,1,490.00000,1,1850.0,0
2,2014,1.000000,0,0,0,0,0,595.00000,1,1960.0,0
3,2013,1.000000,0,1,0,0,0,2840.00000,1,1960.0,0
4,2014,1.000000,0,1,1,1,1,680.00000,1,1800.0,0
...,...,...,...,...,...,...,...,...,...,...,...
7155,2012,1.000000,1,1,0,0,0,1883.72753,1,2001.0,0
7156,2013,1.000000,0,1,0,0,0,1883.72753,2,1980.0,1
7157,2016,0.038251,0,1,0,0,0,1883.72753,1,1992.0,0
7158,2013,1.000000,0,1,0,0,0,1883.72753,1,1972.0,0


In [243]:
#Let's get our x and y from train data
X = train.drop('Claim',axis=1)
y = train['Claim']

In [244]:
X.shape,test.shape

((7160, 10), (3069, 10))

### Using Simple Hold out

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [245]:
# To simply achieve stratify in simple holdout we just add stratify=y 
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.3,random_state=42) 

In [246]:
model = LogisticRegression(C=0.001)

In [247]:
model.fit(X_train,y_train)

LogisticRegression(C=0.001)

In [248]:
y_hat = model.predict_proba(X_test)[:,1]

In [249]:
roc_auc_score(y_test,y_hat)

0.6891232367494645

In [250]:
holdout_pred_on_test_data = model.predict_proba(test)[:,1]

In [251]:
new_submission['Claim'] = holdout_pred_on_test_data
new_submission.to_csv('holdout.csv',index=False)

### Using KFOLD

In [252]:
validation_scores = []
test_prediction = np.zeros(len(test))

kf = KFold(n_splits=5,random_state=42,shuffle=True)
for fold,(train_index,test_index) in enumerate(kf.split(X)):
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y.iloc[train_index],y.iloc[test_index]
    
    model2 = LogisticRegression(C=0.001)
    model2.fit(X_train,y_train)
    y_hat2 = model2.predict_proba(X_test)[:,1]
    validation_scores.append(roc_auc_score(y_test,y_hat2))
    test_prediction += model2.predict_proba(test)[:,1]
    
    

In [253]:
validation_scores

[0.6985428051001821,
 0.7029230558598029,
 0.687006010998849,
 0.7097171843513307,
 0.7380727509720749]

In [254]:
np.mean(validation_scores)

0.707252361456448

In [255]:
test_prediction

array([0.64972984, 0.65165219, 0.72836526, ..., 0.95872355, 1.22770915,
       1.03759143])

In [256]:
# Here we divide it by the number of split used during KFOLD
average_prediction = test_prediction/5

In [257]:
new_submission['Claim'] = average_prediction

In [258]:
new_submission.to_csv('kfold.csv',index=False)

### USING StratifieKFOLD

In [260]:
stratify_validation_scores = []
stratify_test_prediction = np.zeros(len(test))

skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
for fold,(train_index,test_index) in enumerate(skf.split(X,y)):
    X_train,X_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = y.iloc[train_index],y.iloc[test_index]
    
    model3 = LogisticRegression(C=0.001)
    model3.fit(X_train,y_train)
    y_hat3 = model3.predict_proba(X_test)[:,1]
    stratify_validation_scores.append(roc_auc_score(y_test,y_hat3))
    stratify_test_prediction += model3.predict_proba(test)[:,1]
    
    

In [261]:
stratify_validation_scores

[0.6884450681724892,
 0.6839442622497129,
 0.7239182476095589,
 0.724543705979216,
 0.7168693871338232]

In [262]:
np.mean(stratify_validation_scores)

0.7075441342289601

In [263]:
stratify_test_prediction

array([0.64896622, 0.65089583, 0.73052816, ..., 0.95828526, 1.22454685,
       1.03688364])

In [264]:
stratify_pred = stratify_test_prediction/5

In [265]:
new_submission['Claim'] = stratify_pred

In [266]:
new_submission.to_csv("StratifiedKFOLD.csv",index=False)