In [61]:
# Import the necessary packages of Python that we will/may use in this notebook
# pandas and numpy for dataframe creation and manipulation
# matplot lib for data visualization
# sklearn for statistical algorithms and splitting the dataset to training and testing datasets

# General
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.datasets import make_classification
import warnings
from IPython.display import display
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as stats

# Features pre-processing and principal component analysis (pca)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Train-test split
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

# Classifiers ensembling
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from mlxtend.classifier import StackingClassifier

# Classifiers evaluation metrics
from sklearn.metrics import accuracy_score, roc_auc_score, auc, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

# Random resampling
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Tuning hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# Other
from time import time
from scipy.stats import ttest_ind

# Ploting
from matplotlib import pyplot as plt
%matplotlib inline
sns.set_style('white')
pd.options.display.float_format = '{:.3f}'.format

from matplotlib.ticker import MaxNLocator
from collections import namedtuple
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300


# Suppressing annoying harmless error
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)
warnings.simplefilter('ignore')

In [62]:
# Read the data from the local drive

safe_driver = pd.read_excel('IT_3.xlsx')

In [63]:
# Check if the data has any negaitve values

safe_driver.where(safe_driver < 0).sum()

ID                                                                     0.000
target                                                                 0.000
Gender                     FFMMMFFFFFMFFFMMMFMFMMMMFFMMMFMFMMMFFMMFMMMMMM...
EngineHP                                                               0.000
credit_history                                                         0.000
Years_Experience                                                       0.000
annual_claims                                                          0.000
Marital_Status             MarriedMarriedMarriedMarriedMarriedMarriedMarr...
Vehicle_Type               CarCarVanVanVanTruckTruckCarCarTruckUtilityTru...
Miles_driven_annually                                                  0.000
size_of_family                                                         0.000
Age_bucket                 <1828-34>4018-27>40>40>40>40>4035-40>4035-40>4...
EngineHP_bucket            >350>35090-16090-16090-16090-16090-160<90>3509...

In [64]:
# Check if there are any NULL data that need to be dropped
safe_driver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30240 entries, 0 to 30239
Data columns (total 17 columns):
ID                              30240 non-null int64
target                          30240 non-null int64
Gender                          30240 non-null object
EngineHP                        30240 non-null int64
credit_history                  30240 non-null int64
Years_Experience                30240 non-null int64
annual_claims                   30240 non-null int64
Marital_Status                  30240 non-null object
Vehicle_Type                    30240 non-null object
Miles_driven_annually           30232 non-null float64
size_of_family                  30240 non-null int64
Age_bucket                      30240 non-null object
EngineHP_bucket                 30240 non-null object
Years_Experience_bucket         30240 non-null object
Miles_driven_annually_bucket    30232 non-null object
credit_history_bucket           30240 non-null object
State                           3

In [65]:
safe_driver.describe()

Unnamed: 0,ID,target,EngineHP,credit_history,Years_Experience,annual_claims,Miles_driven_annually,size_of_family
count,30240.0,30240.0,30240.0,30240.0,30240.0,30240.0,30232.0,30240.0
mean,15120.5,0.708,196.604,685.77,13.256,1.138,17422.939,4.521
std,8729.68,0.455,132.347,102.454,9.89,1.083,17483.783,2.287
min,1.0,0.0,80.0,300.0,1.0,0.0,5000.0,1.0
25%,7560.75,0.0,111.0,668.0,5.0,0.0,9668.5,3.0
50%,15120.5,1.0,141.0,705.0,10.0,1.0,12280.0,5.0
75%,22680.25,1.0,238.0,753.0,20.0,2.0,14697.25,7.0
max,30240.0,1.0,1005.0,850.0,40.0,4.0,99943.0,8.0


In [66]:
# Check and see if we have an imbalanced class label in the dataset
# Calculate the percentage of success data ('target' == 1) with respect to the failure data ('target' == 0)

true_claims = (safe_driver['target'] == 1).sum()
print('True Claims is  {}'.format(true_claims))

total_records = len(safe_driver['target'])
print('Total number of records is {}'.format(total_records))

print('The percentage of true claims is {}%'.format(
    round(true_claims / total_records * 100), 2))

True Claims is  21396
Total number of records is 30240
The percentage of true claims is 71.0%


Our dataset is indeed imbalanced. We will balance it later using SMOTE technique.

The dataset contains several categorical data that ends with `_bucket` that need to be either dropped or converted to numerical values using dummies. All features that are of type object are categorical variables that needs to either:<br>
<br>
a. Converted to numeric using dummies<br>
b. Dropped or<br>
c. Assigned a binary value<br>

In [67]:
cat_features = safe_driver.select_dtypes(include=['object']).copy()
print(cat_features.columns)

Index(['Gender', 'Marital_Status', 'Vehicle_Type', 'Age_bucket',
       'EngineHP_bucket', 'Years_Experience_bucket',
       'Miles_driven_annually_bucket', 'credit_history_bucket', 'State'],
      dtype='object')


Among the categorical variables we retain the following:<br>
<br>
1. Gender<br>
2. Marital_Status<br>
3. Vehicle_Type, and<br>
4. Age_bucket<br>
<br>
EngineHP_bucket, Years_Experience_bucket, Miles_driven_annually_bucket, credit_history_bucket have a corresponding continuous variable. Creating each with their own dummies along with the continuous variable does not make sense. We will keep the Age_bucket as there is no continuous variable to represent age.<br>
<br>
We can split the dataset by State (one sub-dataset for each state) and analyze each state by itself. As each US state has its own regulations it may make sense to analyze each state by itself. We could aggregate our results across states later to get a national statistic.<br>
<br>
Or, for now, we could drop the State column and analyze the data across the nation later.

In [68]:
# Drop these 5 columns: ID, EngineHP_bucket, Years_Experience_bucket, Miles_driven_annually_bucket, credit_history_bucket

safe_driver.drop(['ID', 'EngineHP_bucket', 'Years_Experience_bucket',
                  'Miles_driven_annually_bucket',
                  'credit_history_bucket'], axis=1, inplace=True)

In [69]:
# Check if the dataset has any NaN values as these values will make our algorithms throw an exception

safe_driver.isnull().sum()

target                   0
Gender                   0
EngineHP                 0
credit_history           0
Years_Experience         0
annual_claims            0
Marital_Status           0
Vehicle_Type             0
Miles_driven_annually    8
size_of_family           0
Age_bucket               0
State                    0
dtype: int64

The Miles_driven_annually feature has some null values. Let us explore which particular cells have NaN and ingest them with the median data.

In [70]:
safe_driver[safe_driver.isnull().any(axis=1)]

Unnamed: 0,target,Gender,EngineHP,credit_history,Years_Experience,annual_claims,Marital_Status,Vehicle_Type,Miles_driven_annually,size_of_family,Age_bucket,State
1235,1,F,124,793,27,0,Married,Truck,,3,>40,NJ
7365,0,F,465,696,5,0,Married,Truck,,8,18-27,SD
11464,1,F,137,787,18,1,Married,Truck,,1,>40,CT
18158,0,F,108,747,8,1,Married,Truck,,1,18-27,OR
19795,1,F,121,774,19,0,Married,Truck,,2,28-34,NY
25731,1,F,355,694,15,1,Married,Truck,,5,28-34,CT
26512,1,F,109,743,40,0,Married,Truck,,1,>40,OR
27045,1,F,83,784,21,0,Married,Truck,,1,>40,CT


It may make sense to ingest the median of  `Vehicle_Type=='Truck'` as all the NaN values are for Truck only. Let us look at the median of Miles_driven_annually by each vehicle type. 

In [71]:
median_values = safe_driver.groupby('Vehicle_Type').median()
median_values

Unnamed: 0_level_0,target,EngineHP,credit_history,Years_Experience,annual_claims,Miles_driven_annually,size_of_family
Vehicle_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Car,1,148,695,7,1,13147.5,4
Truck,1,150,694,8,1,12370.5,5
Utility,1,132,741,14,1,11117.0,5
Van,1,128,721,15,1,11272.0,5


In [72]:
# Replace NaN values in Miles_driven_annually with the median value for Truck
# There may be better ways to impute missing data. But we have just 8 NaN cells out of some 30,000+ rows which is
# less than 0.03%
# So, imputing with median for all the 8 cells is not going to skew our results.

safe_driver.fillna(
    median_values.loc['Truck', 'Miles_driven_annually'], inplace=True)

In [73]:
# Check for null values again to make sure we did not miss any accidentally

safe_driver[safe_driver.isnull().any(axis=1)]

Unnamed: 0,target,Gender,EngineHP,credit_history,Years_Experience,annual_claims,Marital_Status,Vehicle_Type,Miles_driven_annually,size_of_family,Age_bucket,State


In [74]:
# Check the data types of all remaining features

safe_driver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30240 entries, 0 to 30239
Data columns (total 12 columns):
target                   30240 non-null int64
Gender                   30240 non-null object
EngineHP                 30240 non-null int64
credit_history           30240 non-null int64
Years_Experience         30240 non-null int64
annual_claims            30240 non-null int64
Marital_Status           30240 non-null object
Vehicle_Type             30240 non-null object
Miles_driven_annually    30240 non-null float64
size_of_family           30240 non-null int64
Age_bucket               30240 non-null object
State                    30240 non-null object
dtypes: float64(1), int64(6), object(5)
memory usage: 2.8+ MB


Looking at the feature values above, the range of values of each vary a lot. For example `'Miles_driven_annually'` is in the 10s of thousands, whereas 'credit_history' is in the 100s and 'annual-claims' is in single digit. Due to the varying magnitudes of the feature values we will scale the features with Z-scores using `sklearn.preprocessing.scale`.<br>
<br>

In [76]:
# To standardize the numeric features we need to isolate them first into a separate dataframe

safe_driver_num_features = safe_driver.drop(
    safe_driver.select_dtypes(['object']), axis=1)

# Do not standardize 'target' which is our label

safe_driver_num_features.drop(['target'], axis=1, inplace=True)

safe_driver_cat_features = safe_driver.select_dtypes(['object'])

In [77]:
from sklearn.preprocessing import MinMaxScaler

# Restore the column names from the original dataset

safe_driver_scaled = pd.DataFrame(preprocessing.scale(safe_driver_num_features),
                                  columns=safe_driver_num_features.columns)
scaler = MinMaxScaler()

safe_driver_scaled = pd.DataFrame(scaler.fit_transform(
    safe_driver_num_features), columns=safe_driver_num_features.columns)

# We now have the scaled feature set. Now we need to concatenate the categorical features back with our scaled
# dataset before running OneHotEncoder or dummies.

In [78]:
# We will concatenate the scaled dataframe with the categorical feature set

safe_driver = pd.concat(
    [safe_driver_scaled, safe_driver['target'], safe_driver_cat_features], axis=1)

# We will add the 'target' label back to the scaled dataframe as we may need it later
safe_driver_scaled = pd.concat(
    [safe_driver_scaled, safe_driver['target']], axis=1)

In [79]:
# Check if there are any NaN values one more time

safe_driver_num_features[safe_driver_num_features.isnull().any(axis=1)]

Unnamed: 0,EngineHP,credit_history,Years_Experience,annual_claims,Miles_driven_annually,size_of_family


In [80]:
safe_driver_num_features = pd.concat(
    [safe_driver_num_features, safe_driver['target']], axis=1)

In [81]:
# Convert Gender to a 1 or a 2
safe_driver['Gender'] = np.where(safe_driver['Gender'] == 'F', 1, 2)

# Convert Marital_Status to a 1 or a 2
safe_driver['Marital_Status'] = np.where(
    safe_driver['Marital_Status'] == 'Single', 1, 2)

# Convert Vehicle_Type using LabelEncoder
le = preprocessing.LabelEncoder()
le.fit(safe_driver['Vehicle_Type'])

safe_driver['Vehicle_Type'] = le.transform(safe_driver['Vehicle_Type'])

# Convert Age_bucket using LabelEncoder
le.fit(safe_driver['Age_bucket'])

safe_driver['Age_bucket'] = le.transform(safe_driver['Age_bucket'])

In [82]:
safe_driver.head(10)

Unnamed: 0,EngineHP,credit_history,Years_Experience,annual_claims,Miles_driven_annually,size_of_family,target,Gender,Marital_Status,Vehicle_Type,Age_bucket,State
0,0.478,0.647,0.0,0.0,0.103,0.571,1,1,2,0,3,IL
1,0.661,0.735,0.385,0.0,0.109,0.714,1,1,2,0,1,NJ
2,0.057,0.711,0.359,0.0,0.052,0.286,1,2,2,3,4,CT
3,0.071,0.764,0.205,0.0,0.762,0.286,1,2,2,3,0,CT
4,0.052,0.856,0.821,0.25,0.097,0.429,1,2,2,3,4,WY
5,0.069,0.767,0.436,0.25,0.076,1.0,1,1,2,1,4,DE
6,0.077,0.887,0.769,0.75,0.094,0.143,1,1,2,1,4,NJ
7,0.009,0.813,0.513,0.25,0.097,0.571,1,1,1,0,4,ME
8,0.619,0.758,0.846,0.0,0.127,0.0,1,1,2,0,4,CA
9,0.043,0.882,0.462,0.25,0.012,0.0,0,1,2,1,2,NJ


In [83]:
# Drop the 'target' column from training dataframe as that is our label
X = safe_driver.drop(['target', 'State'], 1)

# The 'target' column is our label or outcome that we want to predict
y = safe_driver['target']

# Drop and NaN values
X = X.dropna(axis=1)

In [84]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

os = SMOTE(random_state=0)

columns = X.columns
os_data_X, os_data_y = os.fit_sample(X, y)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Split the resulting balanced data set as train and test

X_train, X_test, y_train, y_test = train_test_split(
    os_data_X, os_data_y, test_size=0.3, random_state=0)

# Check the size of our new data
print("length of oversampled data is ", len(os_data_X))
print("Number of negative class in oversampled data",
      len(os_data_y[os_data_y['y'] == 0]))
print("Number of positive class in oversampled data",
      len(os_data_y[os_data_y['y'] == 1]))
print("Proportion of negative class in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 0])/len(os_data_X))
print("Proportion of positive class in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 1])/len(os_data_X))
from sklearn.model_selection import train_test_split

# Split the resulting balanced data set as train and test

X_train, X_test, y_train, y_test = train_test_split(os_data_X, os_data_y, test_size = 0.3, random_state = 0)

length of oversampled data is  42792
Number of negative class in oversampled data 21396
Number of positive class in oversampled data 21396
Proportion of negative class in oversampled data is  0.5
Proportion of positive class in oversampled data is  0.5


<h2>Gradient Boosting Classifier</h2>

In [88]:
# Finally, we try GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(loss='deviance', max_depth=10)
clf_model = clf.fit(X_train, y_train)
print(clf_model)
print('Training set score:', clf.score(X_train, y_train))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Training set score: 0.9001135073779796


In [89]:
CLF_score = cross_val_score(clf, X_train, y_train, cv=5)
print('\nEach Cross Validated Accuracy: \n', CLF_score)
print("\nOverall Gradient Boosted Classifier Accuracy: %0.2f (+/- %0.2f)\n" %
      (CLF_score.mean(), CLF_score.std() * 2))


Each Cross Validated Accuracy: 
 [0.74399199 0.7325989  0.74561843 0.73789649 0.74123539]

Overall Gradient Boosted Classifier Accuracy: 0.74 (+/- 0.01)



In [90]:
CLF_test_score = cross_val_score(clf, X_test, y_test, cv=5)

In [91]:
y_predict = clf.predict(X_train)
target_names = ['Safe Driver', 'Non-safe Driver']
GB_scores = classification_report(
    y_train, y_predict, target_names=target_names, output_dict=True)
confusion_matrix(y_train, y_predict)

array([[12127,  2851],
       [  141, 14835]])

In [98]:
from sklearn.pipeline import Pipeline
from sklearn import metrics, linear_model
from sklearn.neural_network import BernoulliRBM
from sklearn.model_selection import train_test_split, cross_val_score
#from skimage import color

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(2000,))
mlp.fit(X_train, y_train)

# Compare MLP and GBM models
print("Logistic regression using MLP features:\n%s\n" % (
    metrics.classification_report(
        y_test,
        mlp.predict(X_test))))

print("Logistic regression using GBM features:\n%s\n" % (
    metrics.classification_report(
        y_test,
        clf.predict(X_test))))

Logistic regression using MLP features:
              precision    recall  f1-score   support

           0       0.55      0.61      0.58      6418
           1       0.56      0.50      0.53      6420

   micro avg       0.56      0.56      0.56     12838
   macro avg       0.56      0.56      0.56     12838
weighted avg       0.56      0.56      0.56     12838


Logistic regression using GBM features:
              precision    recall  f1-score   support

           0       0.84      0.60      0.70      6418
           1       0.69      0.89      0.78      6420

   micro avg       0.74      0.74      0.74     12838
   macro avg       0.77      0.74      0.74     12838
weighted avg       0.77      0.74      0.74     12838




In [99]:
mlp.score(X_train, y_train)

0.5936435868331441

In [100]:
cross_val_score(mlp, X_test, y_test, cv=5)

array([0.5288162 , 0.52803738, 0.52258567, 0.53992988, 0.52201013])

In [101]:
print(mlp)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
