### Prudential Life Insurance Assessment 
Develop a predictive model that accurately classifies risk.

More info: [kaggle competition webpage](https://www.kaggle.com/c/prudential-life-insurance-assessment)

Word on the street is that the people at the top of the leaderboard are using Gradient Boosting. The current version of this notebook uses scikit-learn's `GradientBoostingClassifier` and achieves a rather mediocre score. Increasing the number of estimators improves the results, but each run then takes over an hour. The next step is to try xgboost instead, which is supposed to be faster.

In [88]:
import time

import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.svm

def read_test_train(sample_training=False, train_path='train.csv', test_path='test.csv', random_state=None):
    """Load the training and test sets from CSV files.

    Parameters
    ----------
    sample_training : float or False, optional
        False (the default) returns the training set exactly as read.
        Otherwise, the fraction of training rows (a value from 0-1) to draw
        with ``DataFrame.sample``. Passing 1 keeps every row but returns them
        in shuffled order.
    train_path, test_path : str, optional
        Locations of the two CSV files. The defaults are the file names the
        notebook has always read from the working directory.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so a subsample can be
        reproduced. None (the default) leaves the draw unseeded.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The test set is never sampled.
    """
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    if sample_training:
        df_train = df_train.sample(frac=sample_training, random_state=random_state)

    return df_train, df_test

# sample_training=1 is truthy, so the training frame goes through
# DataFrame.sample(frac=1): every row is kept, but in shuffled order.
df_train, df_test = read_test_train(sample_training=1)
df_test.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
0,1,1,D3,26,0.487179,2,3,1,0.61194,0.781818,...,0,0,0,0,0,0,0,0,0,0
1,3,1,A2,26,0.076923,2,3,1,0.626866,0.727273,...,0,0,0,0,0,0,0,0,0,0
2,4,1,D3,26,0.144667,2,3,1,0.58209,0.709091,...,0,0,0,0,0,0,0,0,0,0
3,9,1,A1,26,0.151709,2,1,1,0.522388,0.654545,...,0,0,0,0,0,0,0,0,1,1
4,12,1,A1,26,0.076923,2,3,1,0.298507,0.672727,...,0,0,0,0,0,0,0,0,0,0


In [89]:
def get_labels_features_ids(df_train, df_test):
    """Split the raw frames into labels, feature matrices and test ids.

    Features are every column of ``df_train`` other than "Id" and
    "Response", selected by name so the label can never end up among the
    features if the column layout changes. The same columns, in the same
    order, are taken from ``df_test``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a "Response" column.
    df_test : pandas.DataFrame
        Must contain an "Id" column and every feature column of ``df_train``.

    Returns
    -------
    y_train : pandas.Series
        The "Response" column of ``df_train``.
    X_train, X_test : pandas.DataFrame
        Independent copies of the feature columns, so later preprocessing
        does not write back into the input frames.
    id_test : pandas.Series
        The "Id" column of ``df_test``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    feature_cols = [col for col in df_train.columns if col not in ("Id", "Response")]

    y_train = df_train["Response"]
    X_train = df_train[feature_cols].copy()
    X_test = df_test[feature_cols].copy()
    id_test = df_test["Id"]
    return y_train, X_train, X_test, id_test

# Separate the training labels, the train/test feature matrices and the test Ids.
y_train, X_train, X_test, id_test = get_labels_features_ids(df_train, df_test)

In [90]:
# Function to list features with missing values
def listNA(X):
    """Print a table of the columns of ``X`` that contain missing values.

    A header line is always printed. It is followed by one line per column
    that has at least one null, showing the column name, the number of
    nulls and the percentage of rows they represent. Columns without nulls
    are omitted.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    None
    """
    print("%20s \tCount \tPct missing" % 'Feature')
    n_rows = X.shape[0]
    # One vectorised pass over the frame; avoids DataFrame.iteritems, which
    # recent pandas releases no longer provide.
    na_counts = X.isnull().sum()
    for column_name, naCount in zip(na_counts.index, na_counts.values):
        if naCount > 0:
            print("%20s \t%5d  \t%2.2f%%" % (column_name, naCount, 100. * naCount / n_rows))
        
#listNA(X_train)
    
#fill NAs using the mean for now
# The means come from the training set only and are applied to both sets, so
# train and test are imputed with the same values. numeric_only skips text
# columns, which have no mean. The frames are rebound rather than filled in
# place so this does not write into a slice of another frame.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [91]:
def addDummies(X_train, X_test):
    """
    Convert the categorical variables from the Prudential dataset into
    dummy (indicator) variables. Returns new train/test dfs.

    The two frames are stacked before encoding, so a category level seen in
    only one of them still gets a column in both outputs. Each categorical
    column is replaced, in place in the column order, by columns named
    "<column>.<level>"; every other column is passed through unchanged.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames with the same columns.

    Returns
    -------
    (new_X_train, new_X_test) : tuple of pandas.DataFrame
        The first ``len(X_train)`` rows and the remaining rows of the
        encoded frame, respectively.
    """

    categoricalColNames = [
        "Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5",
        "Product_Info_6", "Product_Info_7",
        "Employment_Info_2", "Employment_Info_3", "Employment_Info_5",
        "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4",
        "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7",
        "Insurance_History_1", "Insurance_History_2", "Insurance_History_3",
        "Insurance_History_4", "Insurance_History_7", "Insurance_History_8",
        "Insurance_History_9",
        "Family_Hist_1",
        "Medical_History_2", "Medical_History_3", "Medical_History_4",
        "Medical_History_5", "Medical_History_6", "Medical_History_7",
        "Medical_History_8", "Medical_History_9", "Medical_History_10",
        "Medical_History_11", "Medical_History_12", "Medical_History_13",
        "Medical_History_14", "Medical_History_16", "Medical_History_17",
        "Medical_History_18", "Medical_History_19", "Medical_History_20",
        "Medical_History_21", "Medical_History_22", "Medical_History_23",
        "Medical_History_25", "Medical_History_26", "Medical_History_27",
        "Medical_History_28", "Medical_History_29", "Medical_History_30",
        "Medical_History_31", "Medical_History_33", "Medical_History_34",
        "Medical_History_35", "Medical_History_36", "Medical_History_37",
        "Medical_History_38", "Medical_History_39", "Medical_History_40",
        "Medical_History_41",
    ]
    categorical = set(categoricalColNames)
    n_train = X_train.shape[0]

    X = pd.concat([X_train, X_test], axis=0)

    newColumns = []
    for position, colName in enumerate(X.columns):
        # Positional access replaces the old .ix lookup, which newer pandas
        # no longer offers.
        column = X.iloc[:, position]
        if colName in categorical:
            newColumns.append(pd.get_dummies(column, prefix=colName, prefix_sep="."))
        else:
            newColumns.append(column)

    new_X = pd.concat(newColumns, axis=1)

    # Split by position: the stacked frame may repeat index labels.
    new_X_train = new_X.iloc[:n_train]
    new_X_test = new_X.iloc[n_train:]

    return new_X_train, new_X_test


def scaleFeatures(X_train, X_test):
    """Standardize both feature sets with a scaler fitted on the training set.

    A ``StandardScaler`` (zero mean, unit variance) is fitted on ``X_train``
    only; the fitted scaler is then applied to ``X_train`` and ``X_test``.

    Returns
    -------
    (X_train_sc, X_test_sc)
        The transformed training and test features, in that order.
    """
    scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


# Train and test are passed to addDummies together (not one at a time) so both
# come back with the same dummy columns; they are then standardized.
# NOTE: this cell rebinds X_train/X_test to the outputs of scaleFeatures, so
# re-running it without re-running the cells above feeds already-processed
# data back into addDummies.
X_train, X_test = addDummies(X_train, X_test)
X_train, X_test = scaleFeatures(X_train, X_test)

X_train.shape, X_test.shape

((59381, 1079), (19765, 1079))

In [92]:
#Dicts with classifiers and their parameters for Grid Search CV
# - classifiers maps a display name to an estimator; commented-out entries
#   are alternatives that are currently disabled.
# - classifiers_gridparameters maps the same names to a parameter grid, or to
#   None when the estimator should be fitted as configured without a search.

classifiers = {
    #'Logistic Regression': sklearn.linear_model.LogisticRegression(),
    #'Random Forests': sklearn.ensemble.RandomForestClassifier(n_estimators=100),
    #'SVC': sklearn.svm.SVC(C=1.0, kernel='linear', probability=False),
    #'LinearSVC': sklearn.svm.LinearSVC(),
    #'MultinomialNB': sklearn.naive_bayes.BernoulliNB(),
    # subsample < 1 makes each boosting stage draw a random subset of rows;
    # random_state pins that draw so a run can be reproduced.
    'Gradient Boosting': sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.7, random_state=0),
}

classifiers_gridparameters = {
    'Logistic Regression': None,
    'Random Forests': None,
    'SVC': [{'kernel': ['linear'], 'C': [0.2, 0.4, 0.6, 0.8, 1.0]}],
    'LinearSVC': {'C': [0.1, 0.4, 0.6, 1.0, 10.]},
    'MultinomialNB': None,
    'Gradient Boosting': None,
    #'Gradient Boosting': {"n_estimators": [10, 50, 100], 'learning_rate': [0.1, 0.2, 0.3, 0.5], 'max_depth': [1, 2, 4],},
}

# Fit every configured classifier. When a parameter grid is configured the
# estimator is tuned with a stratified 5-fold grid search (accuracy scoring);
# otherwise it is fitted as-is. clf_fitted is left bound to the estimator
# fitted last, which is what the prediction cell reads.
for clf_name, clf_notoptimized in classifiers.items():

    # A classifier with no grid entry is treated like an explicit None.
    param_grid = classifiers_gridparameters.get(clf_name)

    st = time.time()

    if param_grid is None:
        print("Skipping grid search for %s" % clf_name)
        clf_fitted = clf_notoptimized.fit(X_train, y_train)
    else:
        print("Doing grid search for %s" % clf_name)
        # The splitter is only needed for the search, so build it here.
        skf = sklearn.model_selection.StratifiedKFold(n_splits=5)
        clf = sklearn.model_selection.GridSearchCV(estimator=clf_notoptimized, param_grid=param_grid, cv=skf, scoring='accuracy')
        clf_fitted = clf.fit(X_train, y_train).best_estimator_
        clf_optimal_params = clf.best_params_
        print("Best parameters: %s" % (clf_optimal_params,))

    #scores = sklearn.model_selection.cross_val_score(clf_fitted, X_train, y_train, cv=5, scoring='accuracy')
    #print("CV Accuracy: %0.4f (+/- %0.4f) %s" % (scores.mean(), scores.std(), clf_name))

    elapsed_time = time.time() - st
    print("Elapsed time: %.2fs" % elapsed_time)

Skipping grid search for Gradient Boosting
Elapsed time: 4107.34s


In [93]:
# Predict test responses with the estimator the training loop left in clf_fitted.
y_pred = clf_fitted.predict(X_test)

In [94]:
# Submission file: the test Ids alongside their predicted Response,
# written without the DataFrame index.
submission = id_test.to_frame().assign(Response=y_pred)
submission.to_csv('pred_submission.csv', index=False)