In [2]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [3]:
raw_train_df=pd.read_csv('Dataset/train.csv')
raw_test_df=pd.read_csv('Dataset/test.csv')

#Exploratory Data Analysis

In [None]:
raw_train_df

In [None]:
raw_test_df

In [None]:
raw_train_df.info()

In [None]:
raw_test_df.info()

In [None]:
raw_train_df.isna().sum()

In [None]:
raw_train_df.isnull().sum()

In [None]:
raw_test_df.isna().sum()

In [None]:
raw_test_df.isnull().sum()

In [None]:
raw_test_df.nunique()

In [None]:
raw_train_df.describe()

In [None]:
raw_train_df.describe(include=['object'])

#Visualization Of Data

In [None]:
fig=px.histogram(raw_train_df,
                x='Response',
                marginal='box',
                color='Gender',
                color_discrete_sequence=['blue', 'red'],
                title='Gender and Response Correlation'
                )
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
sns.catplot(x="Response",y="Age", data=raw_train_df, kind='box')
plt.title("Age and Response Correlation", size=20, y=1.0);

In [None]:
fig=px.histogram(raw_train_df,
                x='Response',
                marginal='box',
                color='Driving_License',
                color_discrete_sequence=['blue', 'red'],
                title='Driving License and Response Correlation'
                )
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
sns.catplot(x="Response", y="Region_Code", data=raw_train_df, kind="box")
plt.title("Region code and Response Correlation", size=20, y=1.0);

In [None]:
# Box plot of Vintage split by Response.
sns.catplot(x="Response", y="Vintage", data=raw_train_df, kind="box")
# The plotted column is `Vintage`; the earlier "Vintage code" wording was carried
# over from the Region_Code cell above.
plt.title("Vintage and Response Correlation", size=20, y=1.0);

In [None]:
fig=px.histogram(raw_train_df,
                x='Response',
                marginal='box',
                color='Vehicle_Damage',
                color_discrete_sequence=['blue', 'red'],
                title='Vehicle_Damage and Response Correlation'
                )
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
sns.countplot(data=raw_train_df,x='Vehicle_Age',hue='Vehicle_Damage');

In [None]:
corr_data=raw_train_df.drop(['id'], inplace=False, axis=1)

In [None]:
# Lower-triangle correlation heatmap of the numeric columns in corr_data.
#
# Restrict to numeric dtypes explicitly: at this point Gender, Vehicle_Age and
# Vehicle_Damage have not been encoded yet (that happens further down), and
# DataFrame.corr() on pandas >= 2.0 raises on non-numeric columns instead of
# silently dropping them.
corr_matrix = corr_data.select_dtypes(include='number').corr()

# Build the mask from positions (upper triangle including the diagonal) rather
# than from the correlation values, so a coefficient of exactly 0 in the upper
# triangle is still hidden.
upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(20, 17))
sns.heatmap(corr_matrix, annot=True, linewidth=.8, mask=upper_mask, cmap="rainbow");

##Data Preprocessing

In [None]:
# Remove the `id` column from both frames before preprocessing.
# Rebinding (instead of inplace=True) together with errors="ignore" makes the
# cell safe to re-run: a second execution no longer raises KeyError because
# `id` is already gone.
raw_train_df = raw_train_df.drop(columns="id", errors="ignore")
raw_test_df = raw_test_df.drop(columns="id", errors="ignore")

In [None]:
raw_train_df['Gender']=raw_train_df['Gender'].replace({'Male':1,'Female':0})
raw_train_df.head()

In [None]:
raw_train_df['Vehicle_Age'].unique()

In [None]:
# Encode the remaining string categoricals of the training frame as integers.
# The result is assigned back to the column (same pattern as the Gender cell)
# instead of calling replace(..., inplace=True) on the selected column: the
# in-place form operates on an intermediate Series and does not update the
# DataFrame when pandas Copy-on-Write is active.
raw_train_df['Vehicle_Damage'] = raw_train_df['Vehicle_Damage'].replace({'Yes':1,'No':0})
raw_train_df['Vehicle_Age'] = raw_train_df['Vehicle_Age'].replace({'< 1 Year':1,'1-2 Year':2,'> 2 Years':3})
raw_train_df.head()

In [None]:
# Take an independent copy: a plain assignment would only alias raw_train_df,
# so the in-place scaling applied to train_df later on would also rewrite the
# "raw" frame.
train_df=raw_train_df.copy()

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(train_df.corr(), annot=True)

In [None]:
hig_corr = train_df.corr()
hig_corr_features = hig_corr.index[abs(hig_corr["Response"]) >= 0.2]
hig_corr_features

In [None]:
raw_test_df['Gender']=raw_test_df['Gender'].replace({'Male':1,'Female':0})

In [None]:
# Apply the same integer encoding to the test frame as was used for training.
# Assign back rather than using replace(..., inplace=True) on the selected
# column; the chained in-place form does not update the DataFrame when pandas
# Copy-on-Write is active.
raw_test_df['Vehicle_Damage'] = raw_test_df['Vehicle_Damage'].replace({'Yes':1,'No':0})
raw_test_df['Vehicle_Age'] = raw_test_df['Vehicle_Age'].replace({'< 1 Year':1,'1-2 Year':2,'> 2 Years':3})

In [None]:
# Work on an independent copy so the scaling applied to test_df later on does
# not also modify raw_test_df (a plain assignment would only alias it).
test_df=raw_test_df.copy()
test_df

###Scaling Numeric Data

In [None]:
numeric_cols=['Age','Region_Code','Annual_Premium','Policy_Sales_Channel','Vintage']

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler=MinMaxScaler()

In [None]:
scaler.fit(train_df[numeric_cols])

In [None]:
train_df[numeric_cols]=scaler.transform(train_df[numeric_cols])
test_df[numeric_cols]=scaler.transform(test_df[numeric_cols])

In [None]:
train_df.describe().loc[['min', 'max']]

In [None]:
test_df.describe().loc[['min', 'max']]

In [None]:
train_df.head()

In [None]:
test_df.head()

##Training, Validation and Test Data

In [None]:
# Name the target first, then treat every other column as a model input.
# Selecting by name (rather than slicing off the last column) keeps the split
# correct even if the column order of train_df changes.
target_col='Response'
input_cols=[col for col in train_df.columns if col != target_col]

In [None]:
inputs=train_df[input_cols]
targets=train_df[target_col]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_inputs, val_inputs, train_targets,val_targets=train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [None]:
val_inputs.head()

In [None]:
len(val_inputs)

In [None]:
len(val_targets)

#Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [None]:
model=LogisticRegression(solver='liblinear')

In [None]:
model.fit(train_inputs, train_targets)

In [None]:
%%time
train_preds=model.predict(train_inputs)
train_probs=model.predict_proba(train_inputs)
accuracy_score(train_targets, train_preds)

In [None]:
from sklearn.metrics import confusion_matrix
def predict_and_plot(inputs, targets, name='', estimator=None):
    """Predict with a fitted classifier, print its accuracy and plot a confusion matrix.

    Parameters
    ----------
    inputs
        Feature data passed to ``predict``.
    targets
        True labels corresponding to ``inputs``.
    name : str, optional
        Prefix used in the plot title (``'<name> Confusion Matrix'``).
    estimator : optional
        Fitted classifier exposing ``predict``. When omitted, the notebook-level
        ``model`` is used, so existing ``predict_and_plot(inputs, targets, name)``
        calls behave exactly as before. Passing an estimator lets the same
        helper evaluate any other fitted model.

    Returns
    -------
    The predictions returned by ``predict(inputs)``.
    """
    # Resolve the global lazily so `model` only has to exist when it is actually needed.
    clf = model if estimator is None else estimator
    preds = clf.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    # normalize='true' asks scikit-learn to normalise over the true labels (rows).
    cf = confusion_matrix(targets, preds, normalize='true')
    plt.figure()
    sns.heatmap(cf, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name));
    return preds

In [None]:
%%time
train_preds=predict_and_plot(train_inputs, train_targets,'Train')

In [None]:
%%time
val_preds=predict_and_plot(val_inputs, val_targets,'Validation')

#Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model_2=RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
model_2.fit(train_inputs, train_targets)

In [None]:
%%time
model_2.score(train_inputs, train_targets)

In [None]:
from sklearn.metrics import confusion_matrix
def predict_and_plot_2(inputs, targets,name=''):
    """Evaluate ``model_2`` on ``inputs``: print accuracy, draw a confusion matrix.

    ``name`` is used as the prefix of the plot title. Returns the array of
    predictions produced by ``model_2.predict(inputs)``.
    """
    predictions = model_2.predict(inputs)

    # Accuracy is reported as a percentage with two decimals.
    score = accuracy_score(targets, predictions)
    print("Accuracy: {:.2f}%".format(score * 100))

    # Confusion matrix normalised over the true labels, drawn on a fresh figure.
    normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure()
    sns.heatmap(normalised_cm, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

    return predictions

In [None]:
%%time
val_preds_2 = predict_and_plot_2(val_inputs, val_targets, 'Validation')

###Hyperparameter Tuning of Random Forest

In [None]:
model_2.feature_importances_

In [None]:
importance_df=pd.DataFrame({
    'feature':train_inputs.columns,
    'importance':model_2.feature_importances_
}).sort_values('importance',ascending=False)

In [None]:
importance_df

In [None]:
model_2_1_1 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 10)

In [None]:
model_2_1_1.fit(train_inputs,train_targets)

In [None]:
%%time
model_2_1_1.score(train_inputs, train_targets)

In [None]:
%%time
model_2_1_1.score(val_inputs, val_targets)

In [None]:
%%time
model_2_1_2 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 20)
model_2_1_2.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_2.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_2.score(val_inputs, val_targets))   

In [None]:
%%time
model_2_1_3 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 100)
model_2_1_3.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_3.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_3.score(val_inputs, val_targets))

In [None]:
%%time
model_2_1_4 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 200)
model_2_1_4.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_4.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_4.score(val_inputs, val_targets))

In [None]:
%%time
model_2_2_1 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 2)
model_2_2_1.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_1.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_1.score(val_inputs, val_targets)) 

In [None]:
%%time
model_2_2_2 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 10)
model_2_2_2.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_2.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_2.score(val_inputs, val_targets))  

In [None]:
%%time
model_2_2_3 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 6)
model_2_2_3.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_3.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_3.score(val_inputs, val_targets)) 

In [None]:
model_2_all = RandomForestClassifier(n_jobs=-1, 
                               random_state=42, 
                               n_estimators=100,
                               max_features=6)

In [None]:
model_2_all.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_all.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_all.score(val_inputs, val_targets)) 

#Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model_3=DecisionTreeClassifier(random_state=42)

In [None]:
model_3.fit(train_inputs, train_targets)

In [None]:
%%time
train_preds_3=model_3.predict(train_inputs)
train_probs_3=model_3.predict_proba(train_inputs)
accuracy_score(train_targets, train_preds_3)

In [None]:
def predict_and_plot_3(inputs, targets, name=''):
    """Evaluate ``model_3`` on ``inputs``: print accuracy, draw a confusion matrix.

    ``name`` is used as the prefix of the plot title. Returns the array of
    predictions produced by ``model_3.predict(inputs)``.
    """
    predictions = model_3.predict(inputs)

    # Accuracy is reported as a percentage with two decimals.
    score = accuracy_score(targets, predictions)
    print("Accuracy: {:.2f}%".format(score * 100))

    # Confusion matrix normalised over the true labels, drawn on a fresh figure.
    normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure()
    sns.heatmap(normalised_cm, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

    return predictions

In [None]:
%%time
# Evaluate the decision tree (model_3) on the validation split. This cell
# previously called predict_and_plot_2, which scores the random forest, and
# stored the result in `val_preds`, overwriting the logistic-regression
# predictions. The `_3` suffix matches train_preds_3 above.
val_preds_3 = predict_and_plot_3(val_inputs, val_targets, 'Validation')

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
# Draw the top levels of the fitted decision tree on a wide canvas.
plt.figure(figsize=(100,20))
# Pass feature names as a plain list (as the export_text call below does):
# scikit-learn releases with strict parameter validation reject a pandas Index here.
plot_tree(model_3, feature_names=list(train_inputs.columns), max_depth=3, filled=True);

In [None]:
model_3_text=export_text(model_3, feature_names=list(train_inputs.columns))
print(model_3_text[:3000])

In [None]:
model_3.feature_importances_

In [None]:
importance_df = pd.DataFrame({
    'feature': train_inputs.columns,
    'importance': model_3.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df

###Hyperparameter Tuning of Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params_grid = {
    'max_leaf_nodes':[5,10,15,20,25]
}

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_1=grid.best_estimator_

In [None]:
model_3_1.score(train_inputs, train_targets)

In [None]:
model_3_1.score(val_inputs, val_targets)

In [None]:
params_grid = {
    'max_depth':[10,20,30,40,50]
}

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_2=grid.best_estimator_

In [None]:
model_3_2.score(train_inputs, train_targets)

In [None]:
model_3_2.score(val_inputs, val_targets)

In [None]:
params_grid = {
    'criterion':['gini','entropy'],
    'splitter':['best','random']
}

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_3=grid.best_estimator_

In [None]:
model_3_3.score(train_inputs, train_targets)

In [None]:
model_3_3.score(val_inputs, val_targets)

In [None]:
model_3_all = DecisionTreeClassifier(random_state=42,
                                       criterion='entropy',
                                       splitter='best',
                                       max_leaf_nodes=5,
                                       max_depth=10)

In [None]:
model_3_all.fit(train_inputs, train_targets)

In [None]:
model_3_all.score(train_inputs, train_targets)

In [None]:
model_3_all.score(val_inputs, val_targets)