In [1]:
import pandas as pd

### Read in Data

In [168]:
# Load the training set and replace every missing value with the constant 5.0.
# NOTE(review): 5.0 presumably acts as a sentinel that pairs with the
# `*_missing` indicator columns already present in the file — confirm it is
# a sensible fill for every column.
df = pd.read_csv('insurance_prediction_training.csv').fillna(value=5.0)
df.head()

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,...,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,...,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,...,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,...,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,...,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,...,1,0,0,0,0,1,0,0,0,0


### Make dummy variables for region and educ, drop village

In [169]:
# One-hot encode the categorical columns `educ` and `region`, then drop
# `village`. Each indicator column is named '<column>_<k>', where k is the
# position of the level in get_dummies' output (0, 1, 2, ...) and not the
# level's own value.
for source_col in ('educ', 'region'):
    indicators = pd.get_dummies(df[source_col])
    indicators.columns = [source_col + '_' + str(k) for k in range(indicators.shape[1])]
    # Swap the raw column for its indicators, appended on the right-hand side.
    df = pd.concat([df.drop([source_col], axis=1), indicators], axis=1)

df = df.drop(['village'], axis=1)
df.head()

### Split into train and val

In [171]:
from sklearn.model_selection import train_test_split

# Separate the features from the `takeup` target and hold out 30% of the rows
# for validation. The split is seeded so that the validation scores reported
# in the cells below are reproducible across kernel restarts.
# NOTE(review): `id` is still among the features at this point; it is unlikely
# to carry real signal and may leak row ordering — confirm before relying on it.
X, y = df.drop(['takeup'], axis=1), df['takeup']
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=.3, random_state=42
)

### Try several candidate models

In [143]:
from sklearn.linear_model import Lasso # .69269
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score

# Baseline: L1-regularised linear regression fitted directly on the `takeup`
# target, with alpha chosen by 5-fold cross-validation on the training split.
clf1 = Lasso()
params = [{'alpha': [0.5, 1]}]
clf2 = GridSearchCV(clf1, param_grid=params, cv=5)
clf2.fit(x_train, y_train)
prediction = clf2.predict(x_val)
# A linear regressor's output is not confined to [0, 1], but log_loss expects
# probabilities, so bound the scores before computing it. AUC depends only on
# the ranking of the scores, so it is computed on the raw predictions.
log_loss(y_val, prediction.clip(0, 1)), roc_auc_score(y_val, prediction)

(0.6881000003895026, 0.5517550673176507)

In [144]:
from sklearn.linear_model import ElasticNet

# Elastic net (mixed L1/L2 penalty) regression on the `takeup` target; alpha
# and l1_ratio are chosen by 5-fold cross-validation on the training split.
clf1 = ElasticNet()
params = [{'alpha': [.8, 1],
           'l1_ratio': [.05, .1, .3]}]
clf2 = GridSearchCV(clf1, param_grid=params, cv=5)
clf2.fit(x_train, y_train)
prediction = clf2.predict(x_val)
# A linear regressor's output is not confined to [0, 1], but log_loss expects
# probabilities, so bound the scores before computing it. AUC depends only on
# the ranking of the scores, so it is computed on the raw predictions.
log_loss(y_val, prediction.clip(0, 1)), roc_auc_score(y_val, prediction)

(0.6863568905604147, 0.5573180204172214)

In [145]:
from sklearn.linear_model import Ridge

# Ridge (L2-penalised) regression on the `takeup` target; alpha is chosen by
# 5-fold cross-validation on the training split.
clf1 = Ridge()
params = [{'alpha': [.5, .6, .7, .8, .9, 1]}]
clf2 = GridSearchCV(clf1, param_grid=params, cv=5)
clf2.fit(x_train, y_train)
prediction = clf2.predict(x_val)
# A linear regressor's output is not confined to [0, 1], but log_loss expects
# probabilities, so bound the scores before computing it. AUC depends only on
# the ranking of the scores, so it is computed on the raw predictions.
log_loss(y_val, prediction.clip(0, 1)), roc_auc_score(y_val, prediction)

(0.6656053612308962, 0.6365901020861073)

In [146]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree: grid-search tree depth and minimum leaf size with 5-fold
# cross-validation, then score column 1 of predict_proba on the validation
# split. The estimator is seeded so repeated runs grow the same trees.
# GridSearchCV is given no `scoring`, so candidates are ranked by the
# estimator's default score rather than by the log loss reported here.
clf1 = DecisionTreeClassifier(random_state=42)
params = [{'max_depth': [3, 5, 10],
           'min_samples_leaf': [1, 5, 10]}]
clf2 = GridSearchCV(clf1, param_grid=params, cv=5)
clf2.fit(x_train, y_train)
prediction = clf2.predict_proba(x_val)[:, 1]
log_loss(y_val, prediction), roc_auc_score(y_val, prediction)

(1.6912305210259468, 0.8543821201361148)

### Gradient boosting (the model I expected to pick from the start, despite trying the others)

In [172]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: grid-search tree depth and minimum leaf size with 5-fold
# cross-validation, then score column 1 of predict_proba on the validation
# split. `clf2` from this cell is the model used by the feature-importance and
# test-prediction cells, so the estimator is seeded to make it reproducible.
clf1 = GradientBoostingClassifier(random_state=42)
params = [{'max_depth': [3, 5, 10],
           'min_samples_leaf': [1, 5, 10]}]
clf2 = GridSearchCV(clf1, param_grid=params, cv=5)
clf2.fit(x_train, y_train)
prediction = clf2.predict_proba(x_val)[:, 1]
log_loss(y_val, prediction), roc_auc_score(y_val, prediction)

(0.1705490578808012, 0.9790971105318618)

In [173]:
# Check feature importance of the best estimator found by the grid search.
# Note: it is probably not good that `id` is so important — maybe fix that?
# The loop variables are deliberately not named `x`/`y`: `y` is the target
# Series created in the train/validation split cell and must not be replaced
# by a column-name string.
for importance, feature in zip(clf2.best_estimator_.feature_importances_, x_train.columns):
    print(importance, feature)

0.04976206570384889 id
0.21719420624467814 age
0.1017660743656746 agpop
0.10059497756722105 rice_inc
0.1811420315026488 ricearea_2010
0.024584086771555574 general_trust
0.014086387013268432 educ_good
0.021800698092000713 male
0.08154468314901134 disaster_loss
0.00974826730125566 disaster_yes
0.06253476201703731 risk_averse
0.013474809770470319 literacy
0.0 age_missing
2.3775746189792347e-05 agpop_missing
0.003729684389431356 rice_inc_missing
0.004143104161742537 ricearea_2010_missing
0.014548035848782117 disaster_loss_missing
3.838173733135276e-05 educ_missing
0.00010541867951109503 male_missing
3.493510225115859e-05 literacy_missing
0.01072441041595983 educ_0
0.016206803255032747 educ_1
0.010488727567569975 educ_2
0.01046246515713939 educ_3
0.0025015536791255124 educ_4
7.956897091184347e-05 educ_5
0.021154037792618704 region_0
0.015950605696462106 region_1
0.011575442301269348 region_2


### Predict on test data and write to csv

In [174]:
# Score the to-predict file with the tuned model (`clf2`) and write the
# predicted take-up probabilities to a CSV.
test = pd.read_csv('insurance_prediction_to_predict.csv')
test = test.fillna(5.0)  # same constant fill that is applied to the training data

# Rebuild the training-time encoding: replace `educ` and `region` with
# indicator columns named '<column>_<position>', then drop `village`.
test_features = test
for column in ('educ', 'region'):
    dummies = pd.get_dummies(test_features[column])
    dummies.columns = [column + '_' + str(i) for i in range(len(dummies.columns))]
    test_features = pd.concat([test_features.drop([column], axis=1), dummies], axis=1)
test_features = test_features.drop(['village'], axis=1)

# The indicator columns depend on which levels occur in this file, so verify
# that the encoded frame has exactly the training feature columns before
# predicting, and put them in the training order.
# NOTE(review): because indicators are named by position, this check cannot
# detect a file that has the same number of levels as the training data but
# different ones — confirm the category levels match.
expected_columns = list(x_train.columns)
missing = [c for c in expected_columns if c not in test_features.columns]
unexpected = [c for c in test_features.columns if c not in expected_columns]
if missing or unexpected:
    raise ValueError(
        'Test features do not match the training features: '
        'missing={}, unexpected={}'.format(missing, unexpected)
    )
test_features = test_features[expected_columns]

# Column 1 of predict_proba is used as the predicted `takeup` probability.
test_prediction = clf2.predict_proba(test_features)[:, 1]

# Build the output separately instead of writing the prediction into the
# feature frame, so re-running the prediction line never sees a `takeup` column.
submission = pd.DataFrame({'id': test['id'], 'takeup': test_prediction})
submission.to_csv('my_predictions.csv', index=False)