# Importing libraries

In [79]:
import numpy as np
import scipy 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold,train_test_split,LeaveOneOut,StratifiedKFold
from sklearn.metrics import roc_auc_score,auc
from sklearn.preprocessing import MinMaxScaler,StandardScaler

%matplotlib inline

# Load the data


In [91]:
train=pd.read_csv("train_data.csv")
submission=pd.read_csv("sample_submission.csv")
test=pd.read_csv("test_data.csv")

In [92]:
new_submission = pd.DataFrame()
new_submission['Customer Id'] = test['Customer Id']
new_submission['Claim'] = np.zeros(len(test))

In [93]:
train.head

<bound method NDFrame.head of      Customer Id  YearOfObservation  Insured_Period  Residential  \
0         H14663               2013        1.000000            0   
1          H2037               2015        1.000000            0   
2          H3802               2014        1.000000            0   
3          H3834               2013        1.000000            0   
4          H5053               2014        1.000000            0   
...          ...                ...             ...          ...   
7155       H5290               2012        1.000000            1   
7156       H5926               2013        1.000000            0   
7157       H6204               2016        0.038251            0   
7158       H6537               2013        1.000000            0   
7159       H7470               2014        1.000000            0   

     Building_Painted Building_Fenced Garden Settlement  Building Dimension  \
0                   N               V      V          U               290.

In [94]:
test.head

<bound method NDFrame.head of      Customer Id  YearOfObservation  Insured_Period  Residential  \
0         H11920               2013        1.000000            0   
1         H11921               2016        0.997268            0   
2          H9805               2013        0.369863            0   
3          H7493               2014        1.000000            0   
4          H7494               2016        1.000000            0   
...          ...                ...             ...          ...   
3064      H11583               2015        1.000000            0   
3065      H11720               2012        1.000000            0   
3066      H11721               2012        1.000000            0   
3067      H12408               2013        1.000000            0   
3068       H9021               2012        1.000000            0   

     Building_Painted Building_Fenced Garden Settlement  Building Dimension  \
0                   V               N      O          R               300.

In [95]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7054 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   6652 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [96]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         3069 non-null   object 
 1   YearOfObservation   3069 non-null   int64  
 2   Insured_Period      3069 non-null   float64
 3   Residential         3069 non-null   int64  
 4   Building_Painted    3069 non-null   object 
 5   Building_Fenced     3069 non-null   object 
 6   Garden              3065 non-null   object 
 7   Settlement          3069 non-null   object 
 8   Building Dimension  3056 non-null   float64
 9   Building_Type       3069 non-null   int64  
 10  Date_of_Occupancy   2341 non-null   float64
 11  NumberOfWindows     3069 non-null   object 
 12  Geo_Code            3056 non-null   object 
dtypes: float64(3), int64(3), object(7)
memory usage: 311.8+ KB


In [97]:
train.isnull().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [98]:
train['Garden'].fillna(train['Garden'].mode()[0],inplace=True)
train['Building Dimension'].fillna(train['Building Dimension'].mean(),inplace=True)
train['Date_of_Occupancy'].fillna(np.round(train['Date_of_Occupancy'].mean()),inplace=True)
train['Geo_Code'].fillna(train['Garden'].mode()[0],inplace=True)

test['Garden'].fillna(test['Garden'].mode()[0],inplace=True)
test['Building Dimension'].fillna(test['Building Dimension'].mean(),inplace=True)
test['Date_of_Occupancy'].fillna(np.round(test['Date_of_Occupancy'].mean()),inplace=True)
test['Geo_Code'].fillna(train['Geo_Code'].mode()[0],inplace=True)

In [99]:
train.isna().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
Claim                 0
dtype: int64

In [100]:
#Changing categorical colmns to numerical values

In [101]:
CategoricalCols=["Building_Painted","Building_Fenced","Garden","Settlement"]

In [102]:
#Looping through the categorical columns
for col in CategoricalCols:
    train[col]=train[col].factorize()[0]
    test[col]=test[col].factorize()[0]

In [103]:
# drop four columns
dropCOl=["Customer Id", "NumberOfWindows", "Geo_Code"]

In [104]:
train.drop(dropCOl,axis=1,inplace=True)
test.drop(dropCOl,axis=1,inplace=True)

In [105]:
train

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,Claim
0,2013,1.000000,0,0,0,0,0,290.00000,1,1960.0,0
1,2015,1.000000,0,1,1,1,1,490.00000,1,1850.0,0
2,2014,1.000000,0,0,0,0,0,595.00000,1,1960.0,0
3,2013,1.000000,0,1,0,0,0,2840.00000,1,1960.0,0
4,2014,1.000000,0,1,1,1,1,680.00000,1,1800.0,0
...,...,...,...,...,...,...,...,...,...,...,...
7155,2012,1.000000,1,1,0,0,0,1883.72753,1,2001.0,0
7156,2013,1.000000,0,1,0,0,0,1883.72753,2,1980.0,1
7157,2016,0.038251,0,1,0,0,0,1883.72753,1,1992.0,0
7158,2013,1.000000,0,1,0,0,0,1883.72753,1,1972.0,0


In [106]:
# Getting x and y from the train data
x=train.drop("Claim",axis=1)
y=train["Claim"]

In [107]:
x.shape,test.shape

((7160, 10), (3069, 10))

# Train the model with train test split


In [108]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [109]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.4,random_state=42)

In [110]:
logreg=LogisticRegression(C=0.001)


In [111]:
logreg.fit(x_train,y_train)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [112]:
y_pred1=logreg.predict_proba(x_test)[:,1]

In [113]:
roc_auc_score(y_test,y_pred1)

0.7008213855347613

In [114]:
holdout_pred_on_test_data=logreg.predict_proba(test)[:,1]

In [115]:
new_submission['Claim'] = holdout_pred_on_test_data
new_submission.to_csv('holdout.csv',index=False)

# Building our model using KFOld


In [139]:
Validation_scores=[]
test_pred=np.zeros(len(test))
kf=KFold(n_splits=5,random_state=42,shuffle=True)
for fold,(train_index,test_index) in enumerate(kf.split(x)):
    x_train,x_test=x.iloc[train_index],x.iloc[test_index]
    y_train,y_test=y.iloc[train_index],y.iloc[test_index]
    logreg1=LogisticRegression(C=0.0001)
    logreg1.fit(x_train,y_train)
    y_pred=logreg1.predict_proba(x_test)[:,1]
    Validation_scores.append(roc_auc_score(y_test,y_pred))
    test_pred+=logreg1.predict_proba(test)[:,1]

    
    
    

    

In [140]:
Validation_scores

[0.6877120076786318,
 0.7014115553121577,
 0.6804379645024229,
 0.7025682718365646,
 0.730506031283139]

In [141]:
np.mean(Validation_scores)

0.7005271661225831

In [144]:
test_pred

array([0.73276273, 0.73595566, 0.82653149, ..., 0.96645884, 1.41873143,
       1.06047763])

In [145]:
#calculate average_pred
#this is test_pred divided by the number of KFold used
average_pred=test_pred/5

In [146]:
average_pred

array([0.14655255, 0.14719113, 0.1653063 , ..., 0.19329177, 0.28374629,
       0.21209553])

In [147]:
new_submission["Claim"]=average_pred


In [148]:
new_submission.to_csv("kfold.csv",index=False)

# Using  of stratified KFold model

In [152]:
Stratify_validation_scores=[]
Stratify_test_pred=np.zeros(len(test))
skf=StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
for fold,(train_index,test_index) in enumerate(skf.split(x,y)):
    x_train,x_test=x.iloc[train_index],x.iloc[test_index]
    y_train,y_test=y.iloc[train_index],y.iloc[test_index]
    logreg2=LogisticRegression(C=0.001)
    logreg2.fit(x_train,y_train)
    y_pred=logreg2.predict_proba(x_test)[:,1]
    Stratify_validation_scores.append(roc_auc_score(y_test,y_pred))
    Stratify_test_pred+=logreg2.predict_proba(test)[:,1]
    

In [153]:
Stratify_validation_scores

[0.6884450681724892,
 0.6839442622497129,
 0.7239182476095589,
 0.724543705979216,
 0.7168693871338232]

In [154]:
np.mean(Stratify_validation_scores)

0.7075441342289601

In [155]:
Stratify_test_pred

array([0.64896622, 0.65089583, 0.73052816, ..., 0.95828526, 1.22454685,
       1.03688364])

In [157]:
#calculate average_pred
#this is Stratify_test_pred divided by the number of KFold used
average_pred=Stratify_test_pred/5

In [158]:
average_pred

array([0.12979324, 0.13017917, 0.14610563, ..., 0.19165705, 0.24490937,
       0.20737673])

In [159]:
new_submission['Claim']=average_pred

In [160]:
new_submission.to_csv("Stratifiedkfold.csv",index=False)