In [58]:
import pandas as pd
import numpy as np
import scipy.stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn import metrics
from sklearn.metrics import accuracy_score

### Import the data into a dataframe

In [59]:
# Read the dataset; index_col=0 makes the first CSV column the row index
Ins_df = pd.read_csv("CarInsurance-1-1.csv",index_col=0) 

In [60]:
# Check whether the load was successful by previewing the first rows
Ins_df.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
1,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
3,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
4,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
5,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No


In [62]:
# Summarise each column's dtype and non-null count
Ins_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5822 entries, 1 to 5822
Data columns (total 86 columns):
MOSTYPE     5822 non-null int64
MAANTHUI    5822 non-null int64
MGEMOMV     5822 non-null int64
MGEMLEEF    5822 non-null int64
MOSHOOFD    5822 non-null int64
MGODRK      5822 non-null int64
MGODPR      5822 non-null int64
MGODOV      5822 non-null int64
MGODGE      5822 non-null int64
MRELGE      5822 non-null int64
MRELSA      5822 non-null int64
MRELOV      5822 non-null int64
MFALLEEN    5822 non-null int64
MFGEKIND    5822 non-null int64
MFWEKIND    5822 non-null int64
MOPLHOOG    5822 non-null int64
MOPLMIDD    5822 non-null int64
MOPLLAAG    5822 non-null int64
MBERHOOG    5822 non-null int64
MBERZELF    5822 non-null int64
MBERBOER    5822 non-null int64
MBERMIDD    5822 non-null int64
MBERARBG    5822 non-null int64
MBERARBO    5822 non-null int64
MSKA        5822 non-null int64
MSKB1       5822 non-null int64
MSKB2       5822 non-null int64
MSKC        5822 non-null int

### 1. Separate given ‘carInsurance.csv’ data set into train and test sets 

In [63]:
# Separate the target labels ("Purchase") from the predictors (every other
# column) and expose both as plain NumPy arrays, then show their types and
# contents.
Y = Ins_df["Purchase"].values
X = Ins_df.drop("Purchase", axis=1).values
print(type(X), type(Y), sep="\n")
print(X, Y, sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[33  1  3 ...  0  0  0]
 [37  1  2 ...  0  0  0]
 [37  1  2 ...  0  0  0]
 ...
 [33  1  3 ...  0  0  0]
 [34  1  3 ...  0  0  0]
 [33  1  3 ...  0  0  0]]
['No' 'No' 'No' ... 'Yes' 'No' 'No']


In [64]:
# Split 70% train and 30% test
test_size = 0.30 # fraction of rows held out for testing (70:30 train/test split)
seed = 2  # random-number seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [65]:
# Confirm the dimensions of each piece of the train/test split, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(4075, 85)
(4075,)
(1747, 85)
(1747,)


### 2. Find whether a given individual will buy a car insurance policy or not by using   

#### 2.1. Logistic regression

In [66]:
# Fit a logistic regression classifier with default hyper-parameters on the training split
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [67]:
# Predict test-set labels with the fitted logistic regression model
y_predict_logistic = model.predict(x_test)

In [68]:
# Accuracy of the logistic regression predictions against the true test labels
acc_log = accuracy_score(y_test,y_predict_logistic)
acc_log

0.9398969662278192

#### 2.2 Naive Bayes

In [69]:
# Gaussian Naive Bayes classifier with default settings
gnb = GaussianNB()

In [70]:
# Fit Gaussian Naive Bayes on the same training split used for the other models
gnb.fit(x_train, y_train)

GaussianNB(priors=None)

In [71]:
# Predict test-set labels with the fitted Gaussian Naive Bayes model
y_predict_gnb = gnb.predict(x_test)

In [72]:
# Accuracy of the Gaussian Naive Bayes predictions against the true test labels
acc_gnb = accuracy_score(y_test,y_predict_gnb)
acc_gnb

0.1877504293073841

In [73]:
# Per-class precision, recall and F1 for the GNB predictions
# (this is a classification report, not a confusion matrix)
cr=metrics.classification_report(y_test,y_predict_gnb)
print(cr)

             precision    recall  f1-score   support

         No       0.98      0.14      0.24      1645
        Yes       0.06      0.96      0.12       102

avg / total       0.93      0.19      0.24      1747



#### 2.3 KNN

In [74]:
# k-NN classifier: 30 neighbours, uniform vote weights, Euclidean distance
# NOTE(review): no feature scaling is visible before this distance-based model — confirm that is intended
knn = KNeighborsClassifier(n_neighbors=30, weights ='uniform', metric='euclidean')

In [75]:
# Fit the k-NN classifier on the training split
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=30, p=2,
           weights='uniform')

In [76]:
# Predict test-set labels with the fitted k-NN model
y_predict_knn = knn.predict(x_test)

In [77]:
# Accuracy of the k-NN predictions against the true test labels
acc_knn = accuracy_score(y_test,y_predict_knn)
acc_knn

0.9416141957641672

#### 2.4. Support vector machines

In [78]:
# Support vector classifier with a linear kernel and penalty parameter C = 100,
# fitted on the training split
svm_model = svm.SVC(kernel = 'linear', C = 100)
svm_model.fit(x_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
# Predict test-set labels with the fitted support vector classifier
y_predict_svm = svm_model.predict(x_test)

In [80]:
# Accuracy of the support vector classifier predictions against the true test labels
acc_svm = accuracy_score(y_test,y_predict_svm)
acc_svm

0.9416141957641672

In [81]:
### Summary
# Report each model's test accuracy as a percentage with two decimals,
# in the order the models were trained.
summary_lines = (
    ("Accuracy by Logistic Reg model: {0:.2%}", acc_log),
    ("Accuracy by GNB model: {0:.2%}", acc_gnb),
    ("Accuracy by KNN model: {0:.2%}", acc_knn),
    ("Accuracy by Support Vector model: {0:.2%}", acc_svm),
)
for template, score in summary_lines:
    print(template.format(score))

Accuracy by Logistic Reg model: 93.99%
Accuracy by GNB model: 18.78%
Accuracy by KNN model: 94.16%
Accuracy by Support Vector model: 94.16%


From the above models and accuracy scores, the models based on Logistic Regression, K-Nearest Neighbors and Support Vector Machines predict the results with an accuracy of 93% and above. However, the model based on Naive Bayes (GNB) performs very poorly, with an accuracy score of about 19%. The reason behind the poor accuracy is that no feature selection was done on the independent variables, so the GNB model was fitted on all of them regardless of their relevance to the dependent variable. Selecting the relevant variables is therefore very important in boosting the accuracy of classifiers. Note that 1,645 of the 1,747 test rows are "No", so a model that always predicts "No" would already score 94.16% (the same figure reported for KNN and the Support Vector model); accuracy alone should therefore be read with caution on this data set.

In [107]:
# Let's try to apply the Chi-Square test for feature selection.
# For every independent variable, build a contingency table against the
# Purchase column, run a chi-square test of independence, and keep only the
# variables whose p-value is at or below P_VALUE_CUTOFF.
# NOTE(review): the test is run on the full dataframe, i.e. before the
# train/test split, so test rows also influence which columns are selected.
P_VALUE_CUTOFF = 0.01

col_list = []

for column in Ins_df.drop("Purchase", axis=1):
    cont = pd.crosstab(Ins_df[column], Ins_df['Purchase'])
    chi_cont = scipy.stats.chi2_contingency(cont)
    # chi2_contingency returns (statistic, p-value, dof, expected frequencies);
    # position 1 is the p-value.
    p_value = chi_cont[1]
    if p_value <= P_VALUE_CUTOFF:
        col_list.append(column)

len(col_list)
# Number of variables associated with the dependent variable at this cutoff
# (39 in the recorded run).

39

In [108]:
# Keep only the columns selected by the chi-square test (the names in col_list)
Ins_df_gnb = Ins_df[col_list]
Ins_df_gnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5822 entries, 1 to 5822
Data columns (total 39 columns):
MOSTYPE     5822 non-null int64
MOSHOOFD    5822 non-null int64
MGODPR      5822 non-null int64
MRELGE      5822 non-null int64
MRELOV      5822 non-null int64
MOPLHOOG    5822 non-null int64
MOPLLAAG    5822 non-null int64
MBERHOOG    5822 non-null int64
MBERMIDD    5822 non-null int64
MBERARBG    5822 non-null int64
MBERARBO    5822 non-null int64
MSKA        5822 non-null int64
MSKC        5822 non-null int64
MSKD        5822 non-null int64
MHHUUR      5822 non-null int64
MHKOOP      5822 non-null int64
MAUT1       5822 non-null int64
MAUT0       5822 non-null int64
MZFONDS     5822 non-null int64
MZPART      5822 non-null int64
MINKM30     5822 non-null int64
MINK4575    5822 non-null int64
MINK7512    5822 non-null int64
MINKGEM     5822 non-null int64
MKOOPKLA    5822 non-null int64
PWAPART     5822 non-null int64
PPERSAUT    5822 non-null int64
PMOTSCO     5822 non-null int

In [101]:
# Feature matrix restricted to the selected columns; the target is still the Purchase column
X2 = Ins_df_gnb.values
Y2 = Ins_df['Purchase'].values

In [102]:
# Split 70% train and 30% test
test_size = 0.30 # fraction of rows held out for testing (70:30 train/test split)
seed = 10  # random-number seed so the split is repeatable
# NOTE(review): this seed differs from the seed of 2 used for the first split, so the
# two GaussianNB models are scored on different test rows — confirm this is intended
x2_train, x2_test, y2_train, y2_test = train_test_split(X2, Y2, test_size=test_size, random_state=seed)

In [103]:
# Second Gaussian Naive Bayes classifier, for the reduced feature set
gnb2 = GaussianNB()

In [104]:
# Fit on the training split of the reduced feature set
gnb2.fit(x2_train, y2_train)

GaussianNB(priors=None)

In [105]:
# Predict test-set labels with the GNB model trained on the reduced feature set
y_predict_gnb2 = gnb2.predict(x2_test)

In [106]:
# Accuracy of the reduced-feature GNB predictions against the true test labels
accuracy_score(y2_test,y_predict_gnb2)

0.8746422438465942

### Conclusion
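After applying the Chi-Square contingency test to select the independent variables based on p-value (columns with a chi-square p-value of 0.01 or less were kept), the accuracy of the Naive Bayes model improved substantially, from about 19% to 87%. This indicates the importance of doing feature selection before applying any model for prediction. Note that the two runs use different random train/test splits (seeds 2 and 10), so the two figures are not measured on exactly the same test rows.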