In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm



# Load the training data; lift the column-display limit so that head()
# shows every feature of this wide table.
pd.set_option("display.max_columns", None)
train = pd.read_csv("train_auto.csv")

# Show the first ten rows as a sample of the data
train.head(n=10)

Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,SEX,EDUCATION,JOB,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE,URBANICITY
0,1,0,0.0,0,60.0,0,11.0,"$67,349",No,$0,z_No,M,PhD,Professional,14,Private,"$14,230",11,Minivan,yes,"$4,461",2,No,3,18.0,Highly Urban/ Urban
1,2,0,0.0,0,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,z_High School,z_Blue Collar,22,Commercial,"$14,940",1,Minivan,yes,$0,0,No,0,1.0,Highly Urban/ Urban
2,4,0,0.0,0,35.0,1,10.0,"$16,039",No,"$124,191",Yes,z_F,z_High School,Clerical,5,Private,"$4,010",4,z_SUV,no,"$38,690",2,No,3,10.0,Highly Urban/ Urban
3,5,0,0.0,0,51.0,0,14.0,,No,"$306,251",Yes,M,<High School,z_Blue Collar,32,Private,"$15,440",7,Minivan,yes,$0,0,No,0,6.0,Highly Urban/ Urban
4,6,0,0.0,0,50.0,0,,"$114,986",No,"$243,925",Yes,z_F,PhD,Doctor,36,Private,"$18,000",1,z_SUV,no,"$19,217",2,Yes,3,17.0,Highly Urban/ Urban
5,7,1,2946.0,0,34.0,1,12.0,"$125,301",Yes,$0,z_No,z_F,Bachelors,z_Blue Collar,46,Commercial,"$17,430",1,Sports Car,no,$0,0,No,0,7.0,Highly Urban/ Urban
6,8,0,0.0,0,54.0,0,,"$18,755",No,,Yes,z_F,<High School,z_Blue Collar,33,Private,"$8,780",1,z_SUV,no,$0,0,No,0,1.0,Highly Urban/ Urban
7,11,1,4021.0,1,37.0,2,,"$107,961",No,"$333,680",Yes,M,Bachelors,z_Blue Collar,44,Commercial,"$16,970",1,Van,yes,"$2,374",1,Yes,10,7.0,Highly Urban/ Urban
8,12,1,2501.0,0,34.0,0,10.0,"$62,978",No,$0,z_No,z_F,Bachelors,Clerical,34,Private,"$11,200",1,z_SUV,no,$0,0,No,0,1.0,Highly Urban/ Urban
9,13,0,0.0,0,50.0,0,7.0,"$106,952",No,$0,z_No,M,Bachelors,Professional,48,Commercial,"$18,510",7,Van,no,$0,0,No,1,17.0,z_Highly Rural/ Rural


In [2]:
# First thing to note: the target flag shows zeroes and ones in the
# sample above. Count the distinct values to make sure there are only two:
n_target_values = train.TARGET_FLAG.nunique()
print(n_target_values)

2


In [3]:
# So with two classes this is going to be a classification problem
# or an anomaly detection problem (which is also classification in the larger sense)
target_values = train['TARGET_FLAG'].unique()
print(target_values)

[0 1]


In [4]:
# Check the number of occurrences of both classes, as well as of unassigned cases
TARGET_FLAG = train['TARGET_FLAG'].astype(int)  # not used by the cells shown below
count_flags = Counter(train['TARGET_FLAG'])
print(count_flags)

# Shares in percent; rows that are neither 0 nor 1 count as unassigned.
n_rows = float(train.shape[0])
share_case_0 = float(count_flags[0]) / n_rows * 100.
share_case_1 = float(count_flags[1]) / n_rows * 100.
share_unassigned = float(train.shape[0] - count_flags[0] - count_flags[1]) / n_rows * 100.
print('Case 0:', share_case_0, '%',
      ',  Case 1: ', share_case_1, '%',
      ',  Unasigned cases: ', share_unassigned, '%')


Counter({0: 6008, 1: 2153})
Case 0: 73.61842911407915 % ,  Case 1:  26.38157088592084 % ,  Unasigned cases:  0.0 %


In [5]:
# There are a fair amount of cases in both classes, and therefore 
# I think that we can use a classification algorithm.


# Before thinking about the features to use lets think a second about 
# what the target_flag could be. Since it is related to car insurance
# the two that come immediately to my mind, are either that it is linked
# to the occurence of damage over a period of time or that it is linked to
# whether insurance is provided or not. The fact that there is a target amount
# suggests that target_flag probably refers to damage occurence.


# With this in mind let's now have a look at features. Some of them will need
# some transformation, in particular those with $ values: drop the thousands
# separators and keep only the run of digits (which discards the '$').
# The result is still a string column (missing entries stay NaN); the
# conversion to float happens later, when the feature arrays are built.
for dollar_col in ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM']:
    train[dollar_col] = (
        train[dollar_col]
        .str.replace(',', '', regex=False)      # literal comma, not a pattern
        .str.extract(r'(\d+)', expand=False)    # raw string: '\d' is not a valid str escape
    )


In [6]:
# Now lets check for missing values
# among the features.
# We have several ways to proceed:
# We can ignore training examples with
# missing values.
# We can also try to replace missing values
# with the mean of the feature (if possible)


# Keep only the rows in which every single column holds a value.
train_nomiss = train.dropna(axis=0, how='any')

print(len(train_nomiss))
print(Counter(train_nomiss['TARGET_FLAG']))

6045
Counter({0: 4443, 1: 1602})


In [7]:
# If we remove all examples with at least one
# missing feature we are still left with about
# 75% of the data. So let's for now proceed in this
# manner. We could still see if this number drops
# later on, if we remove redundant features for example.


# Now lets divide the data into features
# that divide the cases into "categories"
# and into features that take a wider range
# of values that we denote as "numeric".
# For the numeric type, let's have a look at
# a few simple statistics and see whether
# we have any unrealistic looking values.

train_np = train_nomiss.to_numpy()
features = train_nomiss.loc[:, 'KIDSDRIV':'URBANICITY']

# A feature counts as "numeric" when it takes at least six distinct values
# and is not one of the multi-category text columns; everything else is
# treated as categorical.
multi_category_names = ('JOB', 'EDUCATION', 'CAR_TYPE')

num_list, cat_list, feat_name_list = [], [], []
for feat in features:
    is_numeric = features[feat].nunique() >= 6 and feat not in multi_category_names
    if not is_numeric:
        cat_list.append(feat)
        continue
    num_list.append(feat)
    feat_name_list.append(feat)
    # Convert once and reuse the array for all four summary statistics.
    values = np.array(features[feat]).astype(float)
    print(feat, ' min: ', np.amin(values),
          ' max: ', np.amax(values),
          ' mean: ', np.mean(values),
          ' median: ', np.median(values))
print("  ")

features_num = np.array(features.loc[:, num_list].astype(float))
print("Numerical features: ", features_num.shape[1])

features_cat = features.loc[:, cat_list]
print("Class features: ", features_cat.shape[1])

target = np.array(train_nomiss.loc[:, 'TARGET_FLAG'])



AGE  min:  16.0  max:  81.0  mean:  44.628453267162946  median:  45.0
HOMEKIDS  min:  0.0  max:  5.0  mean:  0.743424317617866  median:  0.0
YOJ  min:  0.0  max:  23.0  mean:  10.494623655913978  median:  11.0
INCOME  min:  0.0  max:  367030.0  mean:  58177.01323407775  median:  51624.0
HOME_VAL  min:  0.0  max:  885282.0  mean:  150102.07460711332  median:  159152.0
TRAVTIME  min:  5.0  max:  142.0  mean:  33.69429280397022  median:  33.0
BLUEBOOK  min:  1500.0  max:  65970.0  mean:  15235.60959470637  median:  14080.0
TIF  min:  1.0  max:  25.0  mean:  5.36029776674938  median:  4.0
OLDCLAIM  min:  0.0  max:  57037.0  mean:  4004.875599669148  median:  0.0
CLM_FREQ  min:  0.0  max:  5.0  mean:  0.7841191066997518  median:  0.0
MVR_PTS  min:  0.0  max:  13.0  mean:  1.6997518610421836  median:  1.0
CAR_AGE  min:  -3.0  max:  28.0  mean:  7.920926385442514  median:  8.0
  
Numerical features:  12
Class features:  11


In [8]:
# The only value that looks out of the ordinary is the negative CAR_AGE.
# Maybe the maximum travtime also seems high, but since I am unaware of
# the units and since commercially driven cars could spend a lot of time
# on the road this need not be an issue.
# I could now set the minimum car age to zero, but I suspect that this is
# not going to affect the prediction substantially.


# Let's start transforming the categorical data with two values.
# To these, I will just assign 0 and 1. The order should not be important.

# Work on an independent copy so the column assignments below never write
# into a slice of another frame (SettingWithCopyWarning).
features_cat = features_cat.copy()

# Two-valued text columns and the code assigned to each label.
binary_codes = {
    'PARENT1':    {'No': '0', 'Yes': '1'},
    'MSTATUS':    {'z_No': '0', 'Yes': '1'},
    'SEX':        {'z_F': '0', 'M': '1'},
    'CAR_USE':    {'Private': '0', 'Commercial': '1'},
    'RED_CAR':    {'no': '0', 'yes': '1'},
    'REVOKED':    {'No': '0', 'Yes': '1'},
    'URBANICITY': {'z_Highly Rural/ Rural': '0', 'Highly Urban/ Urban': '1'},
}
for binary_col, codes in binary_codes.items():
    features_cat[binary_col] = features_cat[binary_col].replace(codes)

# KIDSDRIV is passed through without recoding; it is only cast to float.
binary_columns = ['KIDSDRIV', 'PARENT1', 'MSTATUS', 'SEX',
                  'CAR_USE', 'RED_CAR', 'REVOKED', 'URBANICITY']
features_cat_2 = np.array(features_cat.loc[:, binary_columns]).astype(float)

# extend (not append) keeps feat_name_list flat, one name per feature column;
# the membership test keeps a re-run of this cell from adding duplicates.
feat_name_list.extend(name for name in binary_columns if name not in feat_name_list)

features_merge = np.concatenate((features_num, features_cat_2), axis=1)
print(features_merge.shape)




(6045, 20)


In [9]:
# Let's integrate the remainder of the
# categorical data. For the multi-category categories, I
# will assign each category the percentage of positives. This may
# lead to overfitting the data, but I will check anyway if 
# this is a problem later with the test set.


# Target-rate encoding: every category is replaced by the fraction of rows
# with TARGET_FLAG == 1 within that category.
# NOTE(review): the rates are computed on all of train_nomiss, i.e. before the
# train/test split further down, so held-out rows contribute to their own
# encoding (target leakage) and the hold-out scores may be a bit optimistic.
features_CAR_TYPE = train_nomiss.groupby(['CAR_TYPE'])['TARGET_FLAG'].mean()
features_EDUCATION = train_nomiss.groupby(['EDUCATION'])['TARGET_FLAG'].mean()
features_JOB = train_nomiss.groupby(['JOB'])['TARGET_FLAG'].mean()

# Independent copy so the column assignments below never write into a slice.
features_cat = features_cat.copy()

for cat_col, rates in (('CAR_TYPE', features_CAR_TYPE),
                       ('EDUCATION', features_EDUCATION),
                       ('JOB', features_JOB)):
    print(rates)
    # Map from the untouched labels in train_nomiss (features_cat was sliced
    # from it, so the row index is the same). Every label gets its rate
    # without listing the categories by hand, and re-running the cell gives
    # the same result.
    features_cat[cat_col] = train_nomiss[cat_col].map(rates)

multi_category_columns = ['CAR_TYPE', 'EDUCATION', 'JOB']
features_cat_3 = np.array(features_cat.loc[:, multi_category_columns]).astype(float)

# extend (not append) keeps feat_name_list flat, one name per feature column.
feat_name_list.extend(name for name in multi_category_columns
                      if name not in feat_name_list)

features_merge_full = np.concatenate((features_merge, features_cat_3), axis=1)
print(features_merge_full.shape)


CAR_TYPE
Minivan        0.161670
Panel Truck    0.299712
Pickup         0.321499
Sports Car     0.343137
Van            0.250000
z_SUV          0.297586
Name: TARGET_FLAG, dtype: float64
EDUCATION
<High School     0.327749
Bachelors        0.231034
Masters          0.169651
PhD              0.172662
z_High School    0.339209
Name: TARGET_FLAG, dtype: float64
JOB
Clerical         0.302619
Doctor           0.135000
Home Maker       0.274793
Lawyer           0.189552
Manager          0.127086
Professional     0.218894
Student          0.372439
z_Blue Collar    0.348238
Name: TARGET_FLAG, dtype: float64
(6045, 23)


In [10]:
# We note that these addiational categories have some
# differences in their positive rates.

# Now that we have the data arranged let's divide it into a training and a testing dataset:



# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(features_merge_full, target,
                               test_size=0.30, random_state=11)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Let's try to make a first prediction using logistic regression 
# which is among the most popular machine machine learning techniques
# and often gives satisfactory performance.

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit the logistic regression and predict on both splits, so that training
# and test performance can be compared.
LR = LogisticRegression(random_state=0)
LR.fit(X_train, y_train)
y_train_pred, y_test_pred = LR.predict(X_train), LR.predict(X_test)

# To evaluate the model I will look at the number of occurrences of
# True Negatives, False Negatives, False Positives and True Positives,
# as well as at a few quantities derived from these (Hit rate, Recall
# and Precision). This will allow me to assess how often we predict
# a certain outcome and how often that prediction is correct.

def _print_confusion_report(title, y_true, y_pred):
    """Print the confusion-matrix counts and derived scores for one data set.

    Parameters
    ----------
    title : str
        Heading printed above the report.
    y_true, y_pred : array-like
        True and predicted labels, passed unchanged to ``confusion_matrix``.

    Notes
    -----
    The cells are read as [true class, predicted class], so [1, 0] is a false
    negative and [0, 1] a false positive. Indexing [1, 1] needs at least two
    classes to be present in the labels.
    """
    Confusion = confusion_matrix(y_true, y_pred)
    true_neg, false_pos = Confusion[0, 0], Confusion[0, 1]
    false_neg, true_pos = Confusion[1, 0], Confusion[1, 1]
    n_cases = true_neg + true_pos + false_pos + false_neg

    # The printed labels (including their spelling) are kept as they were, so
    # the output stays comparable with earlier runs.
    print(title)
    print('--------------------------------:')
    print('True negative: ', true_neg)
    print('False negative: ', false_neg)
    print('False positve: ', false_pos)
    print('True positive: ', true_pos)
    print('------------------------------')
    print('Hit rate: ', (true_neg + true_pos) / n_cases)
    print('Pecision: ', true_pos / (true_pos + false_pos))
    print('Recall: ', true_pos / (true_pos + false_neg))
    print('Fraction of positive predictions',
          (true_pos + false_pos) / n_cases)
    print('   ')


def evaluation(B_train,B_train_pred,B_test,B_test_pred):
    """Print classification reports for the training and the test set.

    Parameters
    ----------
    B_train, B_train_pred : array-like
        True and predicted labels of the training set.
    B_test, B_test_pred : array-like
        True and predicted labels of the test set.

    Returns
    -------
    None
        The reports are written to standard output only.
    """
    _print_confusion_report('Performance on training set:', B_train, B_train_pred)
    _print_confusion_report('Performance on test set:', B_test, B_test_pred)
    print('------------------------------')
    
evaluation(B_train=y_train, B_train_pred=y_train_pred,
           B_test=y_test, B_test_pred=y_test_pred)

# Heading, feature names and fitted coefficients, one item per line.
print('Weights of features:', feat_name_list, LR.coef_, sep='\n')




Performance on training set:
--------------------------------:
True negative:  2882
False negative:  627
False positve:  225
True positive:  497
------------------------------
Hit rate:  0.7986291656818719
Pecision:  0.6883656509695291
Recall:  0.44217081850533807
Fraction of positive predictions 0.17064523753249822
   
Performance on test set:
--------------------------------:
True negative:  1219
False negative:  280
False positve:  117
True positive:  198
------------------------------
Hit rate:  0.7811466372657111
Pecision:  0.6285714285714286
Recall:  0.41422594142259417
Fraction of positive predictions 0.17364939360529216
   
------------------------------
Weights of features:
['AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'OLDCLAIM', 'CLM_FREQ', 'MVR_PTS', 'CAR_AGE', ['KIDSDRIV', 'PARENT1', 'MSTATUS', 'SEX', 'CAR_USE', 'RED_CAR', 'REVOKED', 'URBANICITY'], ['CAR_TYPE', 'EDUCATION', 'JOB']]
[[-0.00669893  0.04484545 -0.02993572 -0.10157288 -0.22780

In [12]:
# The performance is about similar on the training set and on the
# test set. There is certainly some skill, but the overall performance
# could be better. In particular, the model strongly underestimates 
# positive events. If the variable that we would like to predict is
# indeed that damage has occured, an insurance company would not want
# to underestimate this.
# Looking at the coefficients (weights), I find interesting that the Urbanicity
# gets such a large weight. Since we rescaled the values before training
# comparing the coefficients can be an indication of the influence of parameters
# although one needs to be careful, as the range of possible feature values
# can still vary somewhat, since the rescaling occured using the variance.

# So let us try to do a bit better on the positive cases. An idea to
# train the model towards more readily predicting damage (positive cases) is to choose a
# cost function that adds a higher cost to false negative than to false
# positive classifications. For this purpose we will choose a suport vector
# machine (SVM), in which we can adjust the class weight.

# Let's first see how the SVM does without balanced weights. The default svm
# here uses a gaussian-like kernel.

# Baseline SVM: default settings, no class re-weighting.
Tsvm = svm.SVC()
svm_tr = Tsvm.fit(X_train, y_train)

# Predict on both splits and report with the same metrics as before.
y_train_pred, y_test_pred = svm_tr.predict(X_train), svm_tr.predict(X_test)

evaluation(y_train, y_train_pred, y_test, y_test_pred)


Performance on training set:
--------------------------------:
True negative:  2966
False negative:  510
False positve:  141
True positive:  614
------------------------------
Hit rate:  0.8461356653273457
Pecision:  0.8132450331125828
Recall:  0.5462633451957295
Fraction of positive predictions 0.17844481210115812
   
Performance on test set:
--------------------------------:
True negative:  1227
False negative:  281
False positve:  109
True positive:  197
------------------------------
Hit rate:  0.7850055126791621
Pecision:  0.6437908496732027
Recall:  0.4121338912133891
Fraction of positive predictions 0.16868798235942667
   
------------------------------


In [13]:
# The SVM does better on the training set, but
# is comparable to logistic regression on the
# test set. The fact that the performance is not
# increasing anymore in the test set despite the improvement
# in the training set, could indicate that with our features
# and the amount of training data we have, we will not be 
# able to do much better.

# Now let's see what happens if we choose to change class weight.

# Same SVM, now with class weights set to 'balanced'.
Tsvm = svm.SVC(class_weight='balanced')
svmW_tr = Tsvm.fit(X_train, y_train)

# Predict on both splits and report with the same metrics as before.
y_train_pred, y_test_pred = svmW_tr.predict(X_train), svmW_tr.predict(X_test)

evaluation(y_train, y_train_pred, y_test, y_test_pred)

Performance on training set:
--------------------------------:
True negative:  2478
False negative:  169
False positve:  629
True positive:  955
------------------------------
Hit rate:  0.8113921058851336
Pecision:  0.6029040404040404
Recall:  0.849644128113879
Fraction of positive predictions 0.3743795792956748
   
Performance on test set:
--------------------------------:
True negative:  1001
False negative:  135
False positve:  335
True positive:  343
------------------------------
Hit rate:  0.74090407938258
Pecision:  0.5058997050147492
Recall:  0.7175732217573222
Fraction of positive predictions 0.3737596471885336
   
------------------------------


In [14]:
# With the adjusted cost function, the resulting
# fraction of predicted positive results is much higher,
# at the expense of substantially losing Precision due to
# a high rate of false positives. If avoiding false negatives
# matters more than avoiding false positives, then this could
# be an option. However, the overall hit rate is comparable to
# what it would be if we had always predicted TARGET_FLAG=0.

# Possible paths forward could be to try and use decision tree
# algorithms, or to find more features or more training data.
# For instance, I could fill unassigned values in the data I 
# discarded.

In [15]:
# Let's finally do a few predictions.
test = pd.read_csv('test_auto.csv')

test_features = test.loc[:, 'INDEX':'URBANICITY']

# Now if this were prediction that were going to be used
# I would think about what values to assign to the missing
# values. For numerical values good missing values could be
# the mean or the median. For categorical data, either I could
# choose the most frequent category, or I could choose one which
# is unlikely going to affect the prediction (if that is possible)
# I also checked the data and saw that only a few features have missing
# values, so an alternative strategy could have been to train the
# models without the affected features and check performance.
#
# In the end, to save some time, I will just ignore the data with
# missing values. I hope this is ok.

# Drop every row with a missing value. TARGET_FLAG and TARGET_AMT are left out
# of the check (presumably they are empty in the test file - confirm).
checked_columns = [col for col in test_features.columns
                   if col not in ('TARGET_FLAG', 'TARGET_AMT')]
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of `test` (SettingWithCopyWarning).
test_nomiss = test_features.dropna(subset=checked_columns).copy()

# Same cleaning as for the training data: drop the thousands separators and
# keep only the run of digits (which discards the '$').
for dollar_col in ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM']:
    test_nomiss[dollar_col] = (
        test_nomiss[dollar_col]
        .str.replace(',', '', regex=False)      # literal comma, not a pattern
        .str.extract(r'(\d+)', expand=False)    # raw string: '\d' is not a valid str escape
    )

test_nomiss_index = np.array(test_nomiss.INDEX.astype(int))

# The numeric/categorical column lists of the TRAINING data (num_list,
# cat_list) are reused on purpose: the models expect the features in exactly
# that order, whatever the number of distinct values in the test file.
test_features_num = np.array(test_nomiss.loc[:, num_list].astype(float))
print("Numerical features: ", test_features_num.shape[1])

test_features_cat = test_nomiss.loc[:, cat_list].copy()
print("Class features: ", test_features_cat.shape[1])

# Two-valued text columns and the code assigned to each label
# (same codes as used for the training data).
binary_codes = {
    'PARENT1':    {'No': '0', 'Yes': '1'},
    'MSTATUS':    {'z_No': '0', 'Yes': '1'},
    'SEX':        {'z_F': '0', 'M': '1'},
    'CAR_USE':    {'Private': '0', 'Commercial': '1'},
    'RED_CAR':    {'no': '0', 'yes': '1'},
    'REVOKED':    {'No': '0', 'Yes': '1'},
    'URBANICITY': {'z_Highly Rural/ Rural': '0', 'Highly Urban/ Urban': '1'},
}
for binary_col, codes in binary_codes.items():
    test_features_cat[binary_col] = test_features_cat[binary_col].replace(codes)

binary_columns = ['KIDSDRIV', 'PARENT1', 'MSTATUS', 'SEX',
                  'CAR_USE', 'RED_CAR', 'REVOKED', 'URBANICITY']
test_features_cat_2 = np.array(test_features_cat.loc[:, binary_columns]).astype(float)

test_features_merge = np.concatenate((test_features_num, test_features_cat_2), axis=1)

# Multi-category columns: be careful here, the per-category positive rates
# have to be the ones computed on the training set.
training_rates = {
    'CAR_TYPE': features_CAR_TYPE,
    'EDUCATION': features_EDUCATION,
    'JOB': features_JOB,
}
for cat_col, rates in training_rates.items():
    encoded = test_nomiss[cat_col].map(rates)
    # A label without a training rate cannot be encoded; fail with a clear
    # message here instead of a conversion error further down.
    unseen = sorted(set(test_nomiss.loc[encoded.isna(), cat_col]))
    if unseen:
        raise ValueError('{}: categories not present in the training data: {}'
                         .format(cat_col, unseen))
    test_features_cat[cat_col] = encoded

test_features_cat_3 = np.array(
    test_features_cat.loc[:, ['CAR_TYPE', 'EDUCATION', 'JOB']]).astype(float)

test_features_merge_full = np.concatenate((test_features_merge, test_features_cat_3), axis=1)

print(test_features_merge_full.shape)


Numerical features:  12
Class features:  11
(1612, 23)


In [16]:
# Now that we have processed that data and put it in the
# same order as for the training examples, we just need to
# do the feature scaling and we are good to go:
# Apply the scaler that was fitted on the training split.
X_pred = scaler.transform(test_features_merge_full)

# Predict with both SVMs: without and with adjusted class weights.
y_pred_svm, y_pred_svmW = svm_tr.predict(X_pred), svmW_tr.predict(X_pred)

# Predictions are printed as the mean of the predicted labels, i.e. the
# share of positive predictions.
print('Fraction of positive predictions for SVM without adjustment: ',
      np.mean(y_pred_svm))
print('Fraction of positive predictions for SVM with adjustment: ',
      np.mean(y_pred_svmW))

Fraction of positive predictions for SVM without adjustment:  0.16377171215880892
Fraction of positive predictions for SVM with adjustment:  0.3939205955334988


In [17]:
# This shows what we could expect: The SVM with adjustment 
# predicts more often positive cases. The value of positive
# cases was about 26% (see above), so the SVM without adjustment
# tends to underestimate positive cases and the SVM with adjustment
# tends to overestimate positive cases.
# We could now try to go back to the training and the test set and try
# to adjust the weights further. But in that case we would need to split the
# held-out data into a validation set and a separate test set, because tuning
# the weight parameter against one set effectively fits the model to that set.

# Now let's prepare the data to write it to the output.
# The selection of the categorical data retained the original
# indexing, so despite dropping cases with missing values
# the different cases are still identifiable.
# One row per retained test case: its INDEX and the two SVM predictions.
prediction_columns = (test_nomiss_index, y_pred_svm, y_pred_svmW)
y_pred_con = np.stack(prediction_columns, axis=1)

Prediction = pd.DataFrame(
    y_pred_con,
    index=test_features_cat.index,
    columns=['INDEX', 'TARGET_FLAG_SVM', 'TARGET_FLAG_SVMW'],
)

Prediction.to_csv('Insurance_prediction.csv', index=False)