In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('Sample_submission.csv')

In [5]:
train.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [7]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [9]:
train.select_dtypes(exclude= np.number).isna().sum()

Gender            0
Vehicle_Age       0
Vehicle_Damage    0
dtype: int64

In [11]:
train.select_dtypes(include= np.number).isna().sum()

id                      0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [13]:
numcols=['id','Age','Driving_License','Region_Code','Previously_Insured','Annual_Premium','Policy_Sales_Channel','Vintage']
catohcols=['Gender','Vehicle_Age','Vehicle_Damage']

In [15]:
X=train[numcols+catohcols]

In [17]:
y=train['Response']

In [19]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.3, random_state=312)

In [21]:
numercial_pipeline = Pipeline(steps = 
         [('scale',StandardScaler())
         ])
categorical_onehot_pipeline = Pipeline(steps = 
         [('onehot',OneHotEncoder(handle_unknown='ignore'))
         ])

In [23]:
pipe_processing = ColumnTransformer(transformers=[
    ('numerical', numercial_pipeline, numcols),
    ('categorical_onehot', categorical_onehot_pipeline, catohcols)
])

In [27]:
pipe = Pipeline(steps = 
        [
            ('preprocessing', pipe_processing),
            ('model', DecisionTreeClassifier())
        ])

In [29]:
pipe

In [31]:
train_X

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender,Vehicle_Age,Vehicle_Damage
331810,331811,23,1,30.0,1,28846.0,152.0,224,Male,< 1 Year,No
121364,121365,38,1,43.0,0,2630.0,156.0,177,Male,1-2 Year,Yes
38972,38973,61,1,15.0,1,51190.0,14.0,291,Male,1-2 Year,No
376034,376035,23,1,13.0,1,21146.0,152.0,227,Female,< 1 Year,No
70257,70258,26,1,41.0,1,34306.0,152.0,37,Male,< 1 Year,No
...,...,...,...,...,...,...,...,...,...,...,...
225466,225467,38,1,28.0,1,32816.0,152.0,37,Male,1-2 Year,No
160339,160340,39,1,8.0,1,41481.0,124.0,174,Male,1-2 Year,No
373378,373379,22,1,13.0,0,26590.0,152.0,151,Female,< 1 Year,Yes
89555,89556,23,1,43.0,1,2630.0,152.0,238,Female,< 1 Year,No


In [33]:
train_y

331810    0
121364    1
38972     0
376034    0
70257     0
         ..
225466    0
160339    0
373378    0
89555     0
223648    0
Name: Response, Length: 266776, dtype: int64

In [39]:
pipe.fit(train_X,train_y)

In [36]:
train_X

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender,Vehicle_Age,Vehicle_Damage
331810,331811,23,1,30.0,1,28846.0,152.0,224,Male,< 1 Year,No
121364,121365,38,1,43.0,0,2630.0,156.0,177,Male,1-2 Year,Yes
38972,38973,61,1,15.0,1,51190.0,14.0,291,Male,1-2 Year,No
376034,376035,23,1,13.0,1,21146.0,152.0,227,Female,< 1 Year,No
70257,70258,26,1,41.0,1,34306.0,152.0,37,Male,< 1 Year,No
...,...,...,...,...,...,...,...,...,...,...,...
225466,225467,38,1,28.0,1,32816.0,152.0,37,Male,1-2 Year,No
160339,160340,39,1,8.0,1,41481.0,124.0,174,Male,1-2 Year,No
373378,373379,22,1,13.0,0,26590.0,152.0,151,Female,< 1 Year,Yes
89555,89556,23,1,43.0,1,2630.0,152.0,238,Female,< 1 Year,No


In [41]:
train_pred=pipe.predict(train_X)

In [43]:
test_pred=pipe.predict(test_X)

In [51]:
print("train_Acc:",accuracy_score(train_y,train_pred))
print("test ROC_AUC:",roc_auc_score(train_y,train_pred))

train_Acc: 1.0
test ROC_AUC: 1.0


In [180]:
print("test_acc:",accuracy_score(test_y,test_pred))
print("test_ROC_AUC:",roc_auc_score(test_y,test_pred))

test_acc: 0.8220198892708142
test_ROC_AUC: 0.6022707401424165


In [182]:
train_with_valid_columns=train[numcols+catohcols]

In [184]:
test_with_valid_columns=test[numcols+catohcols]

In [186]:
Response=pipe.predict(test_with_valid_columns)

In [188]:
result=pd.DataFrame(Response,columns=['Response'])

In [190]:
result_dt = pd.concat([test['id'], result], axis = 1)

In [192]:
result_dt.to_csv('dt_result1.csv',index=False)

In [72]:
pipe['model']

In [74]:
pipe

In [76]:
ohe = pipe.named_steps['preprocessing'].named_transformers_['numerical']

encoded_columns = ohe.get_feature_names_out(numcols)

In [78]:
encoded_columns

array(['id', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Annual_Premium', 'Policy_Sales_Channel',
       'Vintage'], dtype=object)

In [80]:
from sklearn import tree
import matplotlib as plt

In [None]:
tree.plot_tree(pipe['model'])

Another Model

In [97]:
def preprocessing(data):
     data=data.dropna()
     data=data.drop('id',axis=1)
     cols=['Gender','Vehicle_Damage','Vehicle_Age']
     d_dummy=pd.get_dummies(data=data,columns=cols,drop_first=True )
     d_dummy=d_dummy.drop('Vintage', axis = 1)
     return d_dummy

In [81]:
train_dummy=preprocessing(train)
test_dummy=preprocessing(test)

In [89]:
in_cols=train_dummy.drop('Response',axis=1)
t_cols=train_dummy['Response']
X_test=test.copy()

In [91]:
in_cols.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Gender_Male,Vehicle_Damage_Yes,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years
0,44,1,28.0,0,40454.0,26.0,True,True,False,True
1,76,1,3.0,0,33536.0,26.0,True,False,False,False
2,47,1,28.0,0,38294.0,26.0,True,True,False,True
3,21,1,11.0,1,28619.0,152.0,True,False,True,False
4,29,1,41.0,1,27496.0,152.0,False,False,True,False


In [133]:
sKf=StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
for train_tx,val_tx in sKf.split(in_cols,t_cols):
    X_train,X_val = in_cols.iloc[train_tx],in_cols.iloc[val_tx]
    y_train,y_val=t_cols.iloc[train_tx],t_cols.iloc[val_tx]

In [141]:
s=MinMaxScaler()
X_train_s=s.fit_transform(X_train)
X_val_s=s.transform(X_val)

In [144]:
r=RandomForestClassifier(max_depth=7,n_estimators=500)
r.fit(X_train_s,y_train)
y_val_pred = rf.predict_proba(X_val_s)[:,1]
score=roc_auc_score(y_val, y_val_pred)
print(score)

0.8562305372508647


In [166]:
y_pred=r.predict_proba(X_val_s)[:,1]

In [168]:
result_two=pd.DataFrame(y_pred,columns=['Response'])

In [170]:
result_rf = pd.concat([test['id'], result_two], axis = 1)

In [172]:
result_rf.to_csv('result_random_forest.csv',index=False)