See : https://www.kaggle.com/c/prudential-life-insurance-assessment/data


### Variable Description
Id                      A unique identifier associated with an application.

Product_Info_1-7        A set of normalized variables relating to the product applied for

Ins_Age                 Normalized age of applicant

Ht                      Normalized height of applicant

Wt                      Normalized weight of applicant

BMI                     Normalized BMI of applicant

Employment_Info_1-6     A set of normalized variables relating to the employment history of the applicant.

InsuredInfo_1-7         A set of normalized variables providing information about the applicant.

Insurance_History_1-9   A set of normalized variables relating to the insurance history of the applicant.

Family_Hist_1-5         A set of normalized variables relating to the family history of the applicant.

Medical_History_1-41    A set of normalized variables relating to the medical history of the applicant.

Medical_Keyword_1-48    A set of dummy variables indicating the presence or absence of a medical keyword associated with the application.

**Response**            This is the target variable, an ordinal variable relating to the final decision associated with an application


#### The following variables are all categorical (nominal) :
Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7,

Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3,

InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7,

Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7,

Insurance_History_8, Insurance_History_9,

Family_Hist_1,

Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7,

Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13,

Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19,

Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25,

Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30,

Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36,

Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41


#### The following variables are continuous :
Product_Info_4, Ins_Age, Ht, Wt, BMI,

Employment_Info_1, Employment_Info_4, Employment_Info_6,

Insurance_History_5,

Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5


#### The following variables are discrete :
Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32

#### The following variables are dummy variables :
Medical_Keyword_1-48

In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# machine learning
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
df0 = pandas.read_csv("../data/train.csv.gz")
# Shuffle all rows once so that the positional train/test slices taken further
# down are random samples.  The fixed seed makes the shuffle, and therefore
# every score that depends on the split, reproducible across kernel restarts.
df = df0.sample(frac=1, random_state=0)

In [78]:
#df.describe().transpose()

### Continuous variables
##### Product_Info_4, Ins_Age, Ht, Wt, BMI
##### Employment_Info_1, Employment_Info_4, Employment_Info_6
##### Insurance_History_5
##### Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5

In [93]:
# Names of the 13 continuous features, split in two groups for readability.
L1 = [
    'Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI',
    'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
]
L2 = [
    'Insurance_History_5',
    'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
]
# L is the full list, in the same order as before (L1 followed by L2).
L = L1 + L2
df[L].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1.0
Ins_Age,59381,0.405567,0.19719,0,0.238806,0.402985,0.567164,1.0
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1.0
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1.0
BMI,59381,0.469462,0.122213,0,0.385517,0.451349,0.532858,1.0
Employment_Info_1,59362,0.077582,0.082347,0,0.035,0.06,0.1,1.0
Employment_Info_4,52602,0.006283,0.032816,0,0.0,0.0,0.0,1.0
Employment_Info_6,48527,0.361469,0.349551,0,0.06,0.25,0.55,1.0
Insurance_History_5,33985,0.001733,0.007338,0,0.0004,0.000973,0.002,1.0
Family_Hist_2,30725,0.47455,0.154959,0,0.362319,0.463768,0.57971,1.0


In [94]:
### Note: some variables have missing values (their count is below the 59381 rows of the dataset)

In [95]:
# Re-bind L1 to the five continuous columns whose count equals the full row
# count in the summary above (i.e. no missing values); the imputation loop
# below skips every column listed here.
L1 = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI']
df[L1].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1
Ins_Age,59381,0.405567,0.19719,0,0.238806,0.402985,0.567164,1
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1
BMI,59381,0.469462,0.122213,0,0.385517,0.451349,0.532858,1


In [98]:
# Mean-impute the continuous columns that are not in L1 (the complete ones).
for col in L:
    if col in L1:
        continue
    # Compute the column mean once and reuse it for the log line and the fill.
    col_mean = df[col].mean()
    print((col, col_mean))
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df[col] selection: the chained in-place form may act on a temporary
    # copy and leave df unchanged, depending on the pandas version.
    df[col] = df[col].fillna(col_mean)

('Employment_Info_1', 0.07758209953084583)
('Employment_Info_4', 0.0062826743249304795)
('Employment_Info_6', 0.3614688040014903)
('Insurance_History_5', 0.0017330636999304842)
('Family_Hist_2', 0.47455006427179613)
('Family_Hist_3', 0.4977373765798334)
('Family_Hist_4', 0.4448902535379505)
('Family_Hist_5', 0.4846349296655123)


In [99]:
# Re-check the summary after imputation: every count should now equal the
# full number of rows.
df[L].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Product_Info_4,59381,0.328952,0.282562,0,0.076923,0.230769,0.487179,1.0
Ins_Age,59381,0.405567,0.19719,0,0.238806,0.402985,0.567164,1.0
Ht,59381,0.707283,0.074239,0,0.654545,0.709091,0.763636,1.0
Wt,59381,0.292587,0.089037,0,0.225941,0.288703,0.345188,1.0
BMI,59381,0.469462,0.122213,0,0.385517,0.451349,0.532858,1.0
Employment_Info_1,59381,0.077582,0.082334,0,0.035,0.06,0.1,1.0
Employment_Info_4,59381,0.006283,0.030887,0,0.0,0.0,0.0,1.0
Employment_Info_6,59381,0.361469,0.315993,0,0.1,0.35,0.5,1.0
Insurance_History_5,59381,0.001733,0.005551,0,0.000667,0.001733,0.001733,1.0
Family_Hist_2,59381,0.47455,0.111464,0,0.449275,0.47455,0.47455,1.0


In [100]:
# Feature matrix (continuous columns only, in the order of L) and target
# vector as plain NumPy arrays.  `.values` is used because `as_matrix()` was
# deprecated and then removed from pandas (1.0).
X = df[L].values
Y = df['Response'].values

In [101]:
# Baseline logistic regression on the continuous features only.  C is sklearn's
# inverse regularisation strength, so C=1e5 leaves the penalty almost off.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [102]:
# In-sample accuracy: the model is scored on the very rows it was fitted on,
# so the figure is optimistic.
# WARNING: double-check how LogisticRegression handles more than two classes
# (the estimator repr printed by fit() reports multi_class='ovr').
logreg.score(X, Y)

0.4160758491773463

In [11]:
# k-nearest-neighbours baseline with sklearn's default settings, fitted on the
# same feature matrix X and target Y as the logistic regression.
knn = KNeighborsClassifier()
knn.fit(X, Y) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# In-sample accuracy of the KNN baseline (scored on its own training rows).
knn.score(X, Y)

0.5535440629157474

In [13]:
# Chi-squared statistic of every column of X against the target, printed from
# the largest to the smallest value.
c2val, c2prob = chi2(X, Y)
# Sort ascending, then reverse with a slice (replaces the fliplr round-trip).
c2val = np.sort(c2val)[::-1]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(c2val)

[ 716.82528432  485.96687396  404.88763251  346.41630164   11.27036626]


In [14]:
# Keep only the two columns of X with the highest chi-squared score and show
# the shape before and after the selection.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(X.shape)
X_new = SelectKBest(chi2, k=2).fit_transform(X, Y)
print(X_new.shape)

(59381, 5)
(59381, 2)


### Turn categorical variables into dummies with OneHotEncoding

List of variables:

Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7,
Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3,
InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7,
Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7,
Insurance_History_8, Insurance_History_9,
Family_Hist_1,
Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7,
Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13,
Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19,
Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25,
Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30,
Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36,
Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41


In [72]:
# Names of the nominal (categorical) columns, built family by family from the
# column numbers instead of concatenating and re-parsing one long string.
categories = (
    ['Product_Info_%d' % i for i in (1, 2, 3, 5, 6, 7)]
    + ['Employment_Info_%d' % i for i in (2, 3, 5)]
    + ['InsuredInfo_%d' % i for i in range(1, 8)]
    + ['Insurance_History_%d' % i for i in (1, 2, 3, 4, 7, 8, 9)]
    + ['Family_Hist_1']
    # Medical_History_2..41 except the discrete columns 10, 15, 24 and 32
    # (Medical_History_1 is discrete as well, hence the range starts at 2).
    + ['Medical_History_%d' % i for i in range(2, 42) if i not in (10, 15, 24, 32)]
)
# Kept for backward compatibility: the same comma-separated string as before.
catstring = ', '.join(categories)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(categories[0:10])

['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'InsuredInfo_1']


In [73]:
# Summary of the nominal columns.  Product_Info_2 does not show up in the
# table because it is not numeric.
df[categories].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Product_Info_1,59381,1.026355,0.160191,1,1,1,1,2
Product_Info_3,59381,24.415655,5.072885,1,26,26,26,38
Product_Info_5,59381,2.006955,0.083107,2,2,2,2,3
Product_Info_6,59381,2.673599,0.739103,1,3,3,3,3
Product_Info_7,59381,1.043583,0.291949,1,1,1,1,3
Employment_Info_2,59381,8.641821,4.227082,1,9,9,9,38
Employment_Info_3,59381,1.300904,0.715034,1,1,1,1,3
Employment_Info_5,59381,2.142958,0.350033,2,2,2,2,3
InsuredInfo_1,59381,1.209326,0.417939,1,1,1,1,3
InsuredInfo_2,59381,2.007427,0.085858,2,2,2,2,3


### WARNING : Product_Info_2 is not numeric / still need to include it

In [91]:
# Product_Info_2 holds string codes, so it cannot go through the numeric
# one-hot encoder as-is.  Check that it has no missing values, then look at
# the first rows.
print( df[['Product_Info_2']].count() )
df[['Product_Info_2']].head(5)

Product_Info_2    59381
dtype: int64


Unnamed: 0,Product_Info_2
50227,A8
44980,A1
45755,C4
17816,D1
2552,D4


In [16]:
# One-hot encode the nominal predictors.  Product_Info_2 is left out because
# it is not numeric and still has to be converted separately.
encX = OneHotEncoder()
# `axis=1` is passed by keyword and `.values` replaces `as_matrix()`: the
# positional axis argument and as_matrix() were both removed from pandas.
Xcat = df[categories].drop('Product_Info_2', axis=1).values
encX.fit(Xcat)
Xohe = encX.transform(Xcat).toarray()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(Xohe.shape)

# Y has 8 categories (see the printed shape), so it can be useful to treat
# them separately, one indicator column per class.
encY = OneHotEncoder()
# Y is a 1-D vector while the encoder requires a 2-D matrix, hence the reshape.
encY.fit(Y.reshape(-1, 1))
# Yohe is left as returned by transform(); later cells call .toarray() on it.
Yohe = encY.transform(Y.reshape(-1, 1))
print(Yohe.shape)

(59381, 810)
(59381, 8)


### Discrete variables (WARNING: these still need to be included in the feature matrix)

In [76]:
# Discrete medical-history columns.  They are only collected here; the merged
# feature matrix built further down does not contain them.
discretes = ['Medical_History_%d' % num for num in (1, 10, 15, 24, 32)]
# Same comma-separated string as before, derived from the list.
discstring = ', '.join(discretes)
Xdisc = df[discretes]

In [77]:
# Summary of the discrete columns; the counts show how many rows actually
# have a value for each of them.
Xdisc.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Medical_History_1,50492,7.962172,13.027697,0,2,4,9,240
Medical_History_10,557,141.118492,107.759559,0,8,229,240,240
Medical_History_15,14785,123.760974,98.516206,0,17,117,240,240
Medical_History_24,3801,50.635622,78.149069,0,1,8,64,240
Medical_History_32,1107,11.965673,38.718774,0,0,0,2,240


### Dummy variables

In [18]:
# Column names Medical_Keyword_1 ... Medical_Keyword_48.
dummies = ['Medical_Keyword_%d' % idx for idx in range(1, 49)]

In [74]:
# Summary of the medical-keyword indicator columns.
df[dummies].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Medical_Keyword_1,59381,0.042,0.200591,0,0,0,0,1
Medical_Keyword_2,59381,0.008942,0.094141,0,0,0,0,1
Medical_Keyword_3,59381,0.049275,0.216443,0,0,0,0,1
Medical_Keyword_4,59381,0.01455,0.119744,0,0,0,0,1
Medical_Keyword_5,59381,0.008622,0.092456,0,0,0,0,1
Medical_Keyword_6,59381,0.012597,0.111526,0,0,0,0,1
Medical_Keyword_7,59381,0.01391,0.117119,0,0,0,0,1
Medical_Keyword_8,59381,0.010407,0.101485,0,0,0,0,1
Medical_Keyword_9,59381,0.006652,0.081289,0,0,0,0,1
Medical_Keyword_10,59381,0.036459,0.187432,0,0,0,0,1


In [None]:
# Keyword indicators as a NumPy array.  `.values` is used because
# `as_matrix()` was deprecated and then removed from pandas (1.0).
Xdummies = df[dummies].values

### Merge

In [103]:
# Stack the three feature groups side by side (same rows, columns appended in
# this order): continuous features, one-hot encoded categoricals, keyword
# dummies.
Xmerge = np.concatenate((X, Xohe, Xdummies), axis=1)

In [104]:
# Sanity check: 13 continuous + 810 one-hot + 48 keyword columns = 871.
Xmerge.shape

(59381, 871)

### chi2 selection

In [20]:
def getbests(Xarray, Yarray, nbkeep=20):
    """Return the column indices of the `nbkeep` best chi-squared features.

    Runs sklearn's `chi2` on every column of `Xarray` against `Yarray`,
    prints the fraction of columns whose p-value is below 0.01, and returns
    the indices of the selected columns as a list sorted in ascending order.

    The columns are ranked with a stable argsort rather than compared with
    the (nbkeep+1)-th largest score.  As a consequence:
      * columns tied with the cut-off score are no longer dropped, so the
        result has exactly min(nbkeep, number of columns) entries;
      * `nbkeep` greater than or equal to the number of columns returns
        every column instead of raising IndexError;
      * NaN scores are ranked last instead of disturbing the sort.

    Parameters:
        Xarray: 2-D array of non-negative features, as required by `chi2`.
        Yarray: 1-D array of class labels.
        nbkeep: number of columns to keep.

    Returns:
        list of int column indices into `Xarray`.
    """
    c2val, c2prob = chi2(Xarray, Yarray)
    # Share of columns that are significant at the 1% level.
    print(float(np.mean(c2prob < 0.01)))
    # Push NaN scores to the bottom of the ranking.
    scores = np.where(np.isnan(c2val), -np.inf, c2val)
    # Stable sort on the negated scores: descending order, with ties broken
    # by column position.
    ranking = np.argsort(-scores, kind='mergesort')
    return sorted(ranking[:nbkeep].tolist())

# Column subsets of Xmerge restricted to the 20 / 30 / 40 / 50 columns with
# the highest chi-squared score; each shape is printed as a sanity check.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
bests20 = getbests(Xmerge, Y, 20)
Xbests20 = Xmerge[:, bests20]
print(Xbests20.shape)

bests30 = getbests(Xmerge, Y, 30)
Xbests30 = Xmerge[:, bests30]
print(Xbests30.shape)

bests40 = getbests(Xmerge, Y, 40)
Xbests40 = Xmerge[:, bests40]
print(Xbests40.shape)

bests50 = getbests(Xmerge, Y, 50)
Xbests50 = Xmerge[:, bests50]
print(Xbests50.shape)

0.273464658169
(59381, 20)
0.273464658169
(59381, 30)
0.273464658169
(59381, 40)
0.273464658169
(59381, 50)


#### KNN

In [21]:
# Hold-out evaluation of KNN on the 20 selected columns: rows 0-49999 train
# the model, rows 50000-58999 are kept aside for testing.
knn2 = KNeighborsClassifier()
Xknntrain, Yknntrain = Xbests20[:50000, :], Y[:50000]
Xknntest, Yknntest = Xbests20[50000:59000, :], Y[50000:59000]
# Selecting more "best" columns would feed KNN more variables, at the price
# of a drastic slow-down.
knn2.fit(Xknntrain, Yknntrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [22]:
# Accuracy on the training rows, then on the held-out rows.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(knn2.score(Xknntrain, Yknntrain))
print(knn2.score(Xknntest, Yknntest))

0.41682
0.395666666667


In [23]:
# Relative frequency of each Response class among the KNN training rows:
# the column means of the one-hot encoded target.
encY.transform(Yknntrain.reshape(-1, 1)).toarray().mean(axis=0)

array([ 0.10442,  0.11038,  0.017  ,  0.0241 ,  0.09196,  0.18908,
        0.13442,  0.32864])

In [24]:
# One-vs-rest experiment: predict whether a row belongs to the class stored in
# column `classcol` of the one-hot encoded target, using the 40 selected
# columns.  Rows 0-39999 are used for fitting, rows 40000-58999 for testing.
classcol = 7
# Alternative estimators for the same binary task (uncomment one to try it):
#model = LogisticRegression()
#model = KNeighborsClassifier()
#model = RandomForestClassifier(n_estimators=50)
#model = GaussianNB()
model = SVC()
Xrftrain = Xbests40[range(0,40000), :]
Yrftrain = Y[range(0,40000)]
Xrftest = Xbests40[range(40000,59000), :]
Yrftest = Y[range(40000,59000)]
# Binary targets: the selected indicator column, flattened to a 1-D vector.
colYrftrain = encY.transform(Yrftrain.reshape(-1, 1)).getcol(classcol).toarray().flatten()
colYrftest = encY.transform(Yrftest.reshape(-1, 1)).getcol(classcol).toarray().flatten()
model.fit(Xrftrain, colYrftrain)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(model.score(Xrftrain, colYrftrain))
print(model.score(Xrftest, colYrftest))

0.81185
0.809894736842


#### Random Forests

In [25]:
# Random forest (50 trees) on the 20 selected columns, with the same
# positional split as the KNN cell: rows 0-49999 for fitting, rows
# 50000-58999 for testing.
random_forest = RandomForestClassifier(n_estimators=50)
Xrftrain, Yrftrain = Xbests20[:50000, :], Y[:50000]
Xrftest, Yrftest = Xbests20[50000:59000, :], Y[50000:59000]
random_forest.fit(Xrftrain, Yrftrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
# Accuracy of the random forest on the training rows, then on the held-out
# rows.  Single-argument print(...) behaves the same on Python 2 and 3.
print(random_forest.score(Xrftrain, Yrftrain))
print(random_forest.score(Xrftest, Yrftest))

0.46624
0.436


### Neural Net

In [28]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Embedding, Activation, LSTM, merge, Flatten, Dropout, Lambda
from keras.models import Model, Sequential
from keras.engine.topology import Merge
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.convolutional import *
from keras.utils.data_utils import get_file

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled)


In [111]:
# Fully connected classifier: three ReLU hidden layers (400 -> 200 -> 100),
# each followed by batch normalisation and 40% dropout, then an 8-way softmax.
# input_dim=871 is the number of columns of Xmerge.
model = Sequential()
model.add(Dense(400, init='glorot_uniform', activation='relu', input_dim=871))
model.add(BatchNormalization())
model.add(Dropout(0.4))
# The two remaining hidden blocks only differ by their width.
for hidden_units in (200, 100):
    model.add(Dense(hidden_units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
model.add(Dense(8, activation='softmax'))
model.compile(
    optimizer=Adam(1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [106]:
# Shapes of the network input and of the one-hot target; the sparse matrix
# reports the same shape as its dense form.
(Xmerge.shape, Yohe.shape)

((59381, 871), (59381, 8))

In [107]:
from sklearn import preprocessing
import numpy as np
# Rescale every column of Xmerge with a min-max scaler before feeding the
# network.
# NOTE(review): the scaler is fitted on all rows, including the ones later
# used for validation; consider fitting it on the training slice only.
min_max_scaler = preprocessing.MinMaxScaler()
Xmerge_minmax = min_max_scaler.fit_transform(Xmerge)

In [108]:
# Positional split of the rows: the first 45000 for training, the remaining
# ones for validation.
Xnn_train = Xmerge_minmax[0:45000]
Xnn_valid = Xmerge_minmax[45000:]

# Densify the sparse one-hot target once and slice the result, instead of
# converting the whole matrix again for every slice.
Ynn_dense = Yohe.toarray()
Ynn_train = Ynn_dense[0:45000]
Ynn_valid = Ynn_dense[45000:]

In [113]:
from keras import backend as K

# Lower the learning rate to 1e-4 by writing into the optimizer's existing
# backend variable.  Plain assignment (model.optimizer.lr = 1e-4) would swap
# that variable for a Python float, which an already-built training function
# would not see.
K.set_value(model.optimizer.lr, 1e-4)
model.fit(Xnn_train, Ynn_train, nb_epoch=10, batch_size=64, validation_data=(Xnn_valid, Ynn_valid), verbose=1)

Train on 45000 samples, validate on 14381 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff739236250>

### GATED MIXTURE OF EXPERTS

In [25]:
from keras.layers import Input, Dense, merge, Flatten
from keras.models import Model
from keras import backend as K

# Symbolic input tensor of the functional-API model.
# NOTE(review): 784 inputs and 10 output classes look carried over from
# another example; the insurance data above has 871 features and 8 classes.
inputs = Input(shape=(784,))

# Expert 1: two 64-unit ReLU layers followed by a 10-way softmax.
x1 = Dense(64, activation='relu')(inputs)
x1 = Dense(64, activation='relu')(x1)
predictions1 = Dense(10, activation='softmax')(x1)

# Expert 2: same architecture as expert 1, built from separate Dense layers.
x2 = Dense(64, activation='relu')(inputs)
x2 = Dense(64, activation='relu')(x2)
predictions2 = Dense(10, activation='softmax')(x2)


# Concatenate the two experts' outputs into a single tensor.
merged = merge([predictions1,predictions2], mode='concat', concat_axis=1) # open question: should concat_axis be 1 or 0?

# Gating network: a single softmax layer with one output per expert.
gate = Dense(2, activation='softmax')(inputs)

# Two-output model: the gate weights and the concatenated expert
# predictions, both computed from the same input.
model = Model(input=inputs, output=[gate, merged])

'''model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])'''

def me_loss(y_true, modeloutput):
    """Negative log of a gate-weighted sum of Gaussian-shaped expert scores.

    Literal computation:
      * g  = modeloutput[0]
      * o1 = entry 0 of modeloutput[1], o2 = entry 1 of modeloutput[1]
        (both taken with K.gather)
      * A  = g[0] * transpose(exp(-0.5 * (y_true - o1)**2))
      * B  = g[1] * transpose(exp(-0.5 * (y_true - o2)**2))
      * returns -log(sum(A + B)), the sum running over all elements.

    A stray expression statement that rebuilt the first exponential term and
    discarded the result has been removed; the returned value is unchanged.

    NOTE(review): the indexing assumes `modeloutput` is the pair
    [gate, merged] produced by the model.  Keras normally calls a loss once
    per output tensor with (y_true, y_pred); if so, modeloutput[0] and
    modeloutput[1] select samples rather than outputs -- confirm before
    relying on this loss.
    NOTE(review): K.gather indexes along the first axis, which is presumably
    the batch axis here, not the expert axis -- confirm.

    Parameters:
        y_true: target tensor supplied by Keras.
        modeloutput: prediction tensor supplied by Keras.

    Returns:
        A scalar backend tensor.
    """
    g = modeloutput[0]
    o1 = K.gather(modeloutput[1], 0)
    o2 = K.gather(modeloutput[1], 1)
    # Gate weight of each expert times that expert's unnormalised Gaussian
    # likelihood of the target.
    A = K.gather(g, 0) * K.transpose(K.exp(-0.5 * K.square(y_true - o1)))
    B = K.gather(g, 1) * K.transpose(K.exp(-0.5 * K.square(y_true - o2)))
    return -K.log(K.sum(A+B))

# Compile the two-output model with the custom loss me_loss instead of the
# categorical cross-entropy of the commented-out compile call above.
# NOTE(review): with a single loss given for a multi-output model, Keras
# presumably applies it to each output separately -- confirm this is what
# me_loss expects.
model.compile(optimizer='Adam', loss=me_loss)