In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [42]:
# Read the safe driver prediction data (fro Kaggle)
safe_driver = pd.read_excel('IT_3.xlsx')

In [43]:
# Check if there are any NULL data that need to be dropped
safe_driver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30240 entries, 0 to 30239
Data columns (total 17 columns):
ID                              30240 non-null int64
target                          30240 non-null int64
Gender                          30240 non-null object
EngineHP                        30240 non-null int64
credit_history                  30240 non-null int64
Years_Experience                30240 non-null int64
annual_claims                   30240 non-null int64
Marital_Status                  30240 non-null object
Vehicle_Type                    30240 non-null object
Miles_driven_annually           30232 non-null float64
size_of_family                  30240 non-null int64
Age_bucket                      30240 non-null object
EngineHP_bucket                 30240 non-null object
Years_Experience_bucket         30240 non-null object
Miles_driven_annually_bucket    30232 non-null object
credit_history_bucket           30240 non-null object
State                           3

In [44]:
# Look for unique values in columns so that such columns may be dropped.
categorical = safe_driver.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

Gender
2
Marital_Status
2
Vehicle_Type
4
Age_bucket
5
EngineHP_bucket
4
Years_Experience_bucket
5
Miles_driven_annually_bucket
3
credit_history_bucket
5
State
50


In [45]:
# Drop other columns with many unique variables, and the ID column as it does not 
# contribute in anyway to the prediction
safe_driver.drop(['ID', 'Age_bucket', 'EngineHP_bucket', 'Years_Experience_bucket', \
            'Miles_driven_annually_bucket'], 1, inplace=True)

In [47]:
# Drop the 'target' column from training dataframe as that is our label
X = safe_driver.drop('target', 1)

# The 'target' column is our label or outcome that we want to predict
y = safe_driver['target']

# Use pd.dummies to resolve the categorical data (e.g. State) into numerical values
X = pd.get_dummies(X)

# Drop and NaN values
X = X.dropna(axis=1)

# Break the dataset into test and train with 2/3 (train) and 1/3 (test) split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [48]:
# Run DecisionTree Classifier model on the training data set

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

predict_lbl = tree.predict(X_test)
tree.score(X_test, y_test) 

0.575250501002004

In [49]:
# Check how many positives and negatives were predicted by the model
# A 1 = Safe Driver and a 0 = Unsafe driver

pd.Series(predict_lbl).value_counts()

1    6817
0    3163
dtype: int64

In [62]:
logisticRegr = LogisticRegression(penalty ='l2', solver='newton-cg', max_iter=100)

# Fit the data to the model
logisticRegr.fit(X_train, y_train)

# Convert the returned dataframes to numpy arrays for prediction
#X_test = X_test.values
#y_test = y_test.values

# Predict for Observation
predict_lbl = logisticRegr.predict(X_test)
print('Prediction = ', predict_lbl)

logisticRegr.score(X_test, y_test)

Prediction =  [1 1 1 ... 1 1 1]


0.7102204408817635

The plain vanilla LogisticRegression model is not able to fit a good model for our dataset. It always returns a 1 (= Safe Driver) with a 71% accuracy. See the next cell below where all 9980 rows data are predicted with a 1.

In [63]:
pd.Series(predict_lbl).value_counts()

1    9980
dtype: int64

In [113]:
# For RidgeClassifier change the zeros in y_train to a -1
#y_ridge_test = pd.Series()
y_train[y_train == 0] = -1

In [110]:
from sklearn import linear_model
from sklearn import preprocessing

ridgeregr = linear_model.RidgeClassifier(alpha=1.0, fit_intercept=False, max_iter=1000) 
ridgeregr.fit(X_train, y_train)
print(ridgeregr.score(X_train, y_train))
origparams = ridgeregr.coef_[0]
print(origparams)

0.7062191510365252
[-1.07311988e-04  1.49555229e-04 -2.96491974e-04  2.77366042e-04
 -1.97070420e-03  1.01825587e-01  1.33588430e-01  1.12226914e-01
  1.23187104e-01  6.79876194e-02  6.43555335e-02  6.29025139e-02
  4.01683509e-02  1.14772170e-02  4.82631208e-02  4.84359406e-02
  3.14231472e-02  9.58145921e-02]


In [111]:
print(ridgeregr.score(X_train, y_train))

0.7062191510365252


In [112]:
predict_label = ridgeregr.predict(X_test)
pd.Series(predict_label).value_counts()

1    9980
dtype: int64

The Ridge linear classifier model also, unfortunately, returns poor results. But RandomForest model below gives a higher accuracy than DecisionTree with a 67% score and a good split of 1s and 0s predicted.

In [114]:
from sklearn.ensemble import RandomForestClassifier

tree = RandomForestClassifier(n_estimators=500)
tree.fit(X_train, y_train)

predict_label = tree.predict(X_test)
print(tree.score(X_test, y_test))
print(pd.Series(predict_label).value_counts())

0.6532064128256513
 1    9194
-1     786
dtype: int64


In [115]:
lass = linear_model.Lasso(alpha=.6)
lassfit = lass.fit(X_train, y_train)
print('R² for the model with few features:')
print(lass.score(X_train, y_train))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(origparams)

R² for the model with few features:
0.00015136490697265081

Parameter estimates for the model with few features:
[-5.69208861e-05 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  4.23643802e-01]


In [116]:
print(lass.score(X_test, y_test))

-0.4309387230050008
