See : https://www.kaggle.com/c/prudential-life-insurance-assessment/data


### Variable Description
Id                      A unique identifier associated with an application.

Product_Info_1-7        A set of normalized variables relating to the product applied for

Ins_Age                 Normalized age of applicant

Ht                      Normalized height of applicant

Wt                      Normalized weight of applicant

BMI                     Normalized BMI of applicant

Employment_Info_1-6     A set of normalized variables relating to the employment history of the applicant.

InsuredInfo_1-7         A set of normalized variables providing information about the applicant.

Insurance_History_1-9   A set of normalized variables relating to the insurance history of the applicant.

Family_Hist_1-5         A set of normalized variables relating to the family history of the applicant.

Medical_History_1-41    A set of normalized variables relating to the medical history of the applicant.

Medical_Keyword_1-48    A set of dummy variables indicating the presence or absence of a medical keyword associated with the application.

**Response**            This is the target variable, an ordinal variable relating to the final decision associated with an application


#### The following variables are all categorical (nominal):
Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7,

Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3,

InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7,

Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7,

Insurance_History_8, Insurance_History_9,

Family_Hist_1,

Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7,

Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13,

Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19,

Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25,

Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30,

Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36,

Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41


#### The following variables are continuous:
Product_Info_4, Ins_Age, Ht, Wt, BMI,

Employment_Info_1, Employment_Info_4, Employment_Info_6,

Insurance_History_5,

Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5


#### The following variables are discrete:
Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32

Medical_Keyword_1-48 are dummy variables.

In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# machine learning
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [118]:
# Load the gzipped training set from a hard-coded path under the user's home directory.
df0 = pandas.read_csv("~/caffe/neuralnets/data/train.csv.gz")
# Shuffle all rows (frac=1 keeps every row) so that the positional train/test
# splits used in later cells are not tied to the file order.
# NOTE(review): no random_state is given, so the shuffle — and every score
# computed from it — differs from one run to the next.
df = df0.sample(frac=1) # shuffle to better split the train/test sets

In [119]:
# Summary statistics, one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,59381,39507.211515,22815.883089,2,19780.000000,39487.000000,59211.000000,79146
Product_Info_1,59381,1.026355,0.160191,1,1.000000,1.000000,1.000000,2
Product_Info_3,59381,24.415655,5.072885,1,26.000000,26.000000,26.000000,38
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1
Product_Info_5,59381,2.006955,0.083107,2,2.000000,2.000000,2.000000,3
Product_Info_6,59381,2.673599,0.739103,1,3.000000,3.000000,3.000000,3
Product_Info_7,59381,1.043583,0.291949,1,1.000000,1.000000,1.000000,3
Ins_Age,59381,0.405567,0.197190,0,0.238806,0.402985,0.567164,1
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1


In [120]:
# Summary statistics for the target followed by the continuous features.
L1 = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6']
L2 = ['Insurance_History_5', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5']
L = ['Response'] + L1 + L2
df[L].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Response,59381,5.636837,2.456833,1,4.0,6.0,8.0,8.0
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1.0
Ins_Age,59381,0.405567,0.19719,0,0.238806,0.402985,0.567164,1.0
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1.0
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1.0
BMI,59381,0.469462,0.122213,0,0.385517,0.451349,0.532858,1.0
Employment_Info_1,59362,0.077582,0.082347,0,0.035,0.06,0.1,1.0
Employment_Info_4,52602,0.006283,0.032816,0,0.0,0.0,0.0,1.0
Employment_Info_6,48527,0.361469,0.349551,0,0.06,0.25,0.55,1.0
Insurance_History_5,33985,0.001733,0.007338,0,0.0004,0.000973,0.002,1.0


In [121]:
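### Note: some variables have missing values (their count is below 59381 in the table above), e.g. Employment_Info_4, Employment_Info_6 and Insurance_History_5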

In [122]:
# Restrict the summary to the target plus the continuous columns that are
# populated on every row.
L1 = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI']
L = ['Response'] + L1
df[L].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Response,59381,5.636837,2.456833,1,4.0,6.0,8.0,8
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1
Ins_Age,59381,0.405567,0.19719,0,0.238806,0.402985,0.567164,1
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1
BMI,59381,0.469462,0.122213,0,0.385517,0.451349,0.532858,1


In [123]:
# Feature matrix: the five continuous columns that are populated on every row.
# `.values` is used instead of `.as_matrix()`, which is deprecated and was
# removed in pandas 1.0; both return the same NumPy array.
X = df[['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI']].values
# Target vector: the ordinal Response column.
Y = df['Response'].values

In [124]:
# Logistic regression on the five continuous features.
# Per the scikit-learn documentation C is the inverse regularization strength,
# so C=1e5 leaves the model almost unregularized.
logreg = LogisticRegression(C=1e5)
# Kept as the last expression so the notebook displays the fitted estimator.
logreg.fit(X, Y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [125]:
# WARNING : check how Logistic handles more than 2 classes
# (the estimator repr printed by the previous cell reports multi_class='ovr').
# Fraction of rows whose predicted class equals the true class, measured on
# the same rows the model was fitted on (training accuracy).
logreg.score(X, Y)

0.41191626951381755

In [126]:
# k-nearest-neighbours classifier with the library defaults, fitted on the
# five continuous features.
knn = KNeighborsClassifier()
# Kept as the last expression so the notebook displays the fitted estimator.
knn.fit(X, Y) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [127]:
# Training accuracy of the KNN model: share of rows predicted correctly on
# the data it was fitted on.
knn.score(X, Y)

0.5538808709856688

In [128]:
# chi2 statistic of every continuous feature against the target,
# displayed from the largest to the smallest.
c2val, c2prob = chi2(X, Y)
c2val = np.sort(c2val)[::-1]
print(c2val)

[ 716.82528432  485.96687396  404.88763251  346.41630164   11.27036626]


In [129]:
# Keep only the two features with the highest chi2 statistic and compare
# the matrix shape before and after the selection.
print(X.shape)
X_new = SelectKBest(score_func=chi2, k=2).fit(X, Y).transform(X)
print(X_new.shape)

(59381, 5)
(59381, 2)


### Turn categorical variables into dummies with OneHotEncoding

List of variables:

Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7,
Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3,
InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7,
Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7,
Insurance_History_8, Insurance_History_9,
Family_Hist_1,
Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7,
Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13,
Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19,
Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25,
Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30,
Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36,
Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41


In [130]:
# Comma-separated names of the nominal (categorical) columns, as listed above.
catstring = 'Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41'
# One clean column name per list entry (separator spaces removed).
categories = [name.strip() for name in catstring.split(',')]
print(categories[:10])

['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'InsuredInfo_1']


In [131]:
# One-hot encode the nominal features.
encX = OneHotEncoder()
# Product_Info_2 is not numeric, so it is left out here (it should be converted
# separately). `axis=1` is passed by keyword because newer pandas no longer
# accepts it positionally, and `.values` replaces the removed `.as_matrix()`.
Xcat = df[categories].drop('Product_Info_2', axis=1).values
encX.fit(Xcat)
Xohe = encX.transform(Xcat).toarray()
print(Xohe.shape)

# Response takes 8 distinct values (1 to 8), so it can be useful to one-hot
# encode it and treat each class separately.
encY = OneHotEncoder()
encY.fit(Y.reshape(-1, 1))  # reshape as Y is a vector and OHE requires a matrix
Yohe = encY.transform(Y.reshape(-1, 1))
print(Yohe.shape)

(59381, 810)
(59381, 8)


### Discrete variables

In [132]:
# Names of the discrete (count-like) medical-history columns.
discstring = 'Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32'
discretes = [name.strip() for name in discstring.split(',')]
# Sub-frame holding only those columns, followed by its summary statistics.
Xdisc = df[discretes]
Xdisc.describe()

Unnamed: 0,Medical_History_1,Medical_History_10,Medical_History_15,Medical_History_24,Medical_History_32
count,50492.0,557.0,14785.0,3801.0,1107.0
mean,7.962172,141.118492,123.760974,50.635622,11.965673
std,13.027697,107.759559,98.516206,78.149069,38.718774
min,0.0,0.0,0.0,0.0,0.0
25%,2.0,8.0,17.0,1.0,0.0
50%,4.0,229.0,117.0,8.0,0.0
75%,9.0,240.0,240.0,64.0,2.0
max,240.0,240.0,240.0,240.0,240.0


### Merge the continuous data with the one-hot-encoded categorical variables and the original dummy variables

In [133]:
# Column names Medical_Keyword_1 ... Medical_Keyword_48 (already 0/1 dummies).
dummies = ['Medical_Keyword_' + str(i) for i in range(1, 49)]
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
Xdummies = df[dummies].values

In [134]:
# Column-wise stack: continuous features, then one-hot categoricals, then keyword dummies.
Xmerge = np.hstack((X, Xohe, Xdummies))

### chi2 selection

In [135]:
def getbests(Xarray, Yarray, nbkeep=20):
    """Return the column indices of the `nbkeep` features with the highest chi2 score.

    Also prints, as a diagnostic, the fraction of features whose chi2 p-value
    is below 0.01.

    Parameters:
        Xarray: 2-D feature matrix accepted by `chi2` (non-negative values).
        Yarray: 1-D array of class labels, one per row of `Xarray`.
        nbkeep: number of columns to keep; values larger than the number of
            features keep every feature instead of raising IndexError.

    Returns:
        A list of column indices in ascending order. Exactly
        `min(nbkeep, n_features)` indices are returned, even when several
        features share the same score (ties go to the lower column index).
    """
    c2val, c2prob = chi2(Xarray, Yarray)
    print(float(np.count_nonzero(c2prob < 0.01)) / len(c2prob))
    # A NaN score cannot be ranked; treat it as the worst possible score so
    # it is only selected when nothing else is left.
    scores = np.where(np.isnan(c2val), -np.inf, c2val)
    nbkeep = max(0, min(nbkeep, len(scores)))
    # Stable sort on the negated scores: descending score, ties by column index.
    order = np.argsort(-scores, kind='mergesort')
    return sorted(order[:nbkeep].tolist())

# Reduced design matrices built from the 20, 30, 40 and 50 columns that
# getbests selects; each shape is printed as a sanity check.
bests20 = getbests(Xmerge, Y, nbkeep=20)
Xbests20 = Xmerge[:, bests20]
print(Xbests20.shape)

bests30 = getbests(Xmerge, Y, nbkeep=30)
Xbests30 = Xmerge[:, bests30]
print(Xbests30.shape)

bests40 = getbests(Xmerge, Y, nbkeep=40)
Xbests40 = Xmerge[:, bests40]
print(Xbests40.shape)

bests50 = getbests(Xmerge, Y, nbkeep=50)
Xbests50 = Xmerge[:, bests50]
print(Xbests50.shape)

0.273464658169
(59381, 20)
0.273464658169
(59381, 30)
0.273464658169
(59381, 40)
0.273464658169
(59381, 50)


#### KNN

In [136]:
# KNN on the 20 selected columns: rows 0..49999 train, rows 50000..58999 test.
# Plain slices select the same rows as the former range(...) index lists,
# without building those lists or copying the data.
knn2 = KNeighborsClassifier()
Xknntrain = Xbests20[:50000, :]
Yknntrain = Y[:50000]
Xknntest = Xbests20[50000:59000, :]
Yknntest = Y[50000:59000]
# Lower the "bests" threshold to include more variables ... but KNN will slow drastically.
knn2.fit(Xknntrain, Yknntrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [137]:
# Mean accuracy of knn2 on its training rows, then on the held-out rows.
print(knn2.score(Xknntrain, Yknntrain))
print(knn2.score(Xknntest, Yknntest))

0.38608
0.362333333333


In [138]:
# split the set into different Y classes to measure their importance
np.mean(encY.transform(Yknntrain.reshape(-1, 1)).toarray(), axis=0)

array([ 0.1036 ,  0.11054,  0.01726,  0.02364,  0.0924 ,  0.19006,
        0.13494,  0.32756])

In [147]:
# One-vs-rest experiment: predict whether a row belongs to a single Response
# class, identified by its column in the one-hot encoded target.
classcol = 7
# Alternative estimators that can be swapped in for comparison:
#   LogisticRegression(), KNeighborsClassifier(),
#   RandomForestClassifier(n_estimators=50), GaussianNB()
model = SVC()
# Rows 0..39999 train, rows 40000..58999 test, on the 40 selected columns.
# Slices replace the former range(...) index lists (same rows, no copy).
Xrftrain = Xbests40[:40000, :]
Yrftrain = Y[:40000]
Xrftest = Xbests40[40000:59000, :]
Yrftest = Y[40000:59000]
# Binary target: 1.0 where the row is in the chosen class, 0.0 elsewhere.
colYrftrain = encY.transform(Yrftrain.reshape(-1, 1)).getcol(classcol).toarray().flatten()
colYrftest = encY.transform(Yrftest.reshape(-1, 1)).getcol(classcol).toarray().flatten()
model.fit(Xrftrain, colYrftrain)
# Compare these accuracies with the class frequency: always predicting
# "not in class" already scores 1 - frequency.
print(model.score(Xrftrain, colYrftrain))
print(model.score(Xrftest, colYrftest))

0.810825
0.809789473684


#### Random Forests

In [145]:
# Random forest (50 trees) on the 20 selected columns, predicting the
# multi-class Response directly.
random_forest = RandomForestClassifier(n_estimators=50)
# Rows 0..49999 train, rows 50000..58999 test. Slices replace the former
# range(...) index lists (same rows, no copy).
Xrftrain = Xbests20[:50000, :]
Yrftrain = Y[:50000]
Xrftest = Xbests20[50000:59000, :]
Yrftest = Y[50000:59000]
# Kept as the last expression so the notebook displays the fitted estimator.
random_forest.fit(Xrftrain, Yrftrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [146]:
# Mean accuracy of the forest on its training rows, then on the held-out rows.
print(random_forest.score(Xrftrain, Yrftrain))
print(random_forest.score(Xrftest, Yrftest))

0.46628
0.433777777778
