In [1]:
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline 

# Seaborn for easier visualization
import seaborn as sns

In [2]:
# Read the three CSV inputs: the sample submission file, the training set
# and the test set (in that order).
sub_df, train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'sample_submission_sLex1ul.csv',
        'train_x4ELi83.csv',
        'test_Tu0FSbF.csv',
    )
)

In [3]:
# Remove exact duplicate rows from the training frame, printing its
# (rows, columns) shape before and after so any dropped rows are visible.
print(train_df.shape)
train_df = train_df.drop_duplicates(keep='first')
print(train_df.shape)

(79853, 13)
(79853, 13)


In [4]:
# Remove exact duplicate rows from the test frame, printing its
# (rows, columns) shape before and after so any dropped rows are visible.
print(test_df.shape)
test_df = test_df.drop_duplicates(keep='first')
print(test_df.shape)

(34224, 12)
(34224, 12)


In [5]:
# Count missing values per column of the training frame.
train_df.isna().sum()

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 97
Count_6-12_months_late                97
Count_more_than_12_months_late        97
application_underwriting_score      2974
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
premium                                0
renewal                                0
dtype: int64

In [6]:
# Count missing values per column of the test frame.
test_df.isna().sum()

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 31
Count_6-12_months_late                31
Count_more_than_12_months_late        31
application_underwriting_score      1323
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
premium                                0
dtype: int64

In [7]:
# For every column that contains nulls, add a "<column>_missing" indicator
# (1 where the original value is null, 0 otherwise) to the training frame.
for col in ('Count_3-6_months_late',
            'Count_6-12_months_late',
            'Count_more_than_12_months_late',
            'application_underwriting_score'):
    train_df[col + '_missing'] = train_df[col].isnull().astype(int)

In [8]:
# For every column that contains nulls, add a "<column>_missing" indicator
# (1 where the original value is null, 0 otherwise) to the test frame.
for col in ('Count_3-6_months_late',
            'Count_6-12_months_late',
            'Count_more_than_12_months_late',
            'application_underwriting_score'):
    test_df[col + '_missing'] = test_df[col].isnull().astype(int)

In [9]:
# Mean-impute the columns of the training frame that contain nulls.
# Each column is filled with its OWN mean: the three Count_*_late columns are
# separate features and must not all receive the Count_3-6_months_late mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# modify train_df (it is a no-op under pandas Copy-on-Write).
for col in ("Count_3-6_months_late",
            "Count_6-12_months_late",
            "Count_more_than_12_months_late",
            "application_underwriting_score"):
    train_df[col] = train_df[col].fillna(train_df[col].mean())

In [10]:
# Mean-impute the columns of the test frame that contain nulls.
# Each column is filled with its OWN mean: the three Count_*_late columns are
# separate features and must not all receive the Count_3-6_months_late mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# modify test_df (it is a no-op under pandas Copy-on-Write).
# NOTE(review): these are test-set means; consider reusing the training-set
# means so that train and test rows are imputed with identical values.
for col in ("Count_3-6_months_late",
            "Count_6-12_months_late",
            "Count_more_than_12_months_late",
            "application_underwriting_score"):
    test_df[col] = test_df[col].fillna(test_df[col].mean())

In [11]:
# Re-count missing values per column of the training frame after imputation.
train_df.isna().sum()

id                                        0
perc_premium_paid_by_cash_credit          0
age_in_days                               0
Income                                    0
Count_3-6_months_late                     0
Count_6-12_months_late                    0
Count_more_than_12_months_late            0
application_underwriting_score            0
no_of_premiums_paid                       0
sourcing_channel                          0
residence_area_type                       0
premium                                   0
renewal                                   0
Count_3-6_months_late_missing             0
Count_6-12_months_late_missing            0
Count_more_than_12_months_late_missing    0
application_underwriting_score_missing    0
dtype: int64

In [12]:
# Re-count missing values per column of the test frame after imputation.
test_df.isna().sum()

id                                        0
perc_premium_paid_by_cash_credit          0
age_in_days                               0
Income                                    0
Count_3-6_months_late                     0
Count_6-12_months_late                    0
Count_more_than_12_months_late            0
application_underwriting_score            0
no_of_premiums_paid                       0
sourcing_channel                          0
residence_area_type                       0
premium                                   0
Count_3-6_months_late_missing             0
Count_6-12_months_late_missing            0
Count_more_than_12_months_late_missing    0
application_underwriting_score_missing    0
dtype: int64

In [13]:
import numpy as np
# Scikit-Learn for Modeling
import sklearn
# Pickle for saving model files
import pickle

# Function for splitting training and test set
from sklearn.model_selection import train_test_split # Scikit-Learn 0.18+

# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

# Classification metrics (added later)
from sklearn.metrics import roc_curve, auc

In [14]:
# Display the training frame's column labels (now including the *_missing indicators).
train_df.columns

Index(['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type',
       'premium', 'renewal', 'Count_3-6_months_late_missing',
       'Count_6-12_months_late_missing',
       'Count_more_than_12_months_late_missing',
       'application_underwriting_score_missing'],
      dtype='object')

In [15]:
# Create separate object for target variable
y = train_df.renewal

# Create separate object for input features
X = train_df.drop(['renewal','id'], axis=1)

In [16]:
# Feature matrix to score: the test frame without its 'id' column.
X_pred = test_df.drop(columns=['id'])

In [21]:
# One-hot encode the two string-valued columns in both feature matrices.
X = pd.get_dummies(X, columns=['sourcing_channel', 'residence_area_type'])
X_pred = pd.get_dummies(X_pred, columns=['sourcing_channel', 'residence_area_type'])

# The two frames are encoded independently, so a category level present in only
# one of them would yield different columns. Align X_pred to the training
# layout: same columns, same order, with 0 for any dummy column absent from
# the test data (and any test-only dummy column dropped).
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

In [22]:
# Display the training feature columns after one-hot encoding.
X.columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'premium', 'Count_3-6_months_late_missing',
       'Count_6-12_months_late_missing',
       'Count_more_than_12_months_late_missing',
       'application_underwriting_score_missing', 'sourcing_channel_A',
       'sourcing_channel_B', 'sourcing_channel_C', 'sourcing_channel_D',
       'sourcing_channel_E', 'residence_area_type_Rural',
       'residence_area_type_Urban'],
      dtype='object')

In [23]:
# Display the test feature columns after one-hot encoding (for comparison with X.columns).
X_pred.columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'premium', 'Count_3-6_months_late_missing',
       'Count_6-12_months_late_missing',
       'Count_more_than_12_months_late_missing',
       'application_underwriting_score_missing', 'sourcing_channel_A',
       'sourcing_channel_B', 'sourcing_channel_C', 'sourcing_channel_D',
       'sourcing_channel_E', 'residence_area_type_Rural',
       'residence_area_type_Urban'],
      dtype='object')

In [24]:
# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.4, 
                                                    random_state=1234)

# Print number of observations in X_train, X_test, y_train, and y_test
print( len(X_train), len(X_test), len(y_train), len(y_test) )

47911 31942 47911 31942


In [26]:
import xgboost as xgb

from sklearn import metrics

def auc(m, train, test):
    """Return the ROC AUC of a fitted classifier on the train and hold-out sets.

    Parameters
    ----------
    m : fitted estimator exposing ``predict_proba``.
    train : feature matrix scored against the global ``y_train``.
    test : feature matrix scored against the global ``y_test``.

    Returns
    -------
    tuple
        ``(train_auc, test_auc)`` computed with ``metrics.roc_auc_score`` from
        the second column of ``predict_proba``.

    Notes
    -----
    Reads ``y_train`` and ``y_test`` from the notebook's global namespace.
    This definition also replaces the ``auc`` name imported earlier from
    ``sklearn.metrics``.
    """
    train_auc = metrics.roc_auc_score(y_train, m.predict_proba(train)[:, 1])
    test_auc = metrics.roc_auc_score(y_test, m.predict_proba(test)[:, 1])
    return (train_auc, test_auc)

# Hyper-parameter tuning: exhaustive grid search over an XGBoost classifier.
xgb_model = xgb.XGBClassifier()

# 3 x 3 x 1 x 3 = 27 candidate settings.
param_dist = {
    "max_depth": [10, 30, 50],
    "min_child_weight": [1, 3, 6],
    "n_estimators": [200],
    "learning_rate": [0.05, 0.1, 0.16],
}

# Without `scoring`, GridSearchCV ranks candidates by the estimator's default
# score (accuracy for classifiers). Candidates are ranked by ROC AUC instead,
# which is the metric this notebook's own `auc` helper reports and which
# matches the probability output written to the submission file.
xgb_model = GridSearchCV(
    xgb_model,
    param_grid=param_dist,
    scoring="roc_auc",
    cv=3,
    verbose=10,
    n_jobs=-1,
)

xgb_model.fit(X_train, y_train)

# Display the refitted estimator carrying the best parameter combination.
xgb_model.best_estimator_


Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 36.9min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 46.9min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=10, min_child_weight=6, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [53]:
# Final model, trained on the training split with hand-set hyper-parameters.
# `verbose` is not an XGBClassifier constructor parameter (it was only passed
# through as an unused extra keyword), so it is no longer supplied.
# NOTE(review): these values (max_depth=50, min_child_weight=1,
# learning_rate=0.16) differ from the best estimator reported by the grid
# search output above (max_depth=10, min_child_weight=6, learning_rate=0.05)
# -- confirm this choice is intentional.
xgb_model = xgb.XGBClassifier(
    max_depth=50,
    min_child_weight=1,
    n_estimators=200,
    n_jobs=-1,
    learning_rate=0.16,
)

xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.16, max_delta_step=0,
       max_depth=50, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, verbose=1)

In [31]:
# Class probabilities for the test rows. Select exactly the training feature
# columns, in training order: a later cell adds a 'prob' column to X_pred, and
# without this selection re-running this cell would feed that extra column to
# the model.
pred_xgb = xgb_model.predict_proba(X_pred[X.columns])

In [38]:
# Keep the second predict_proba column alongside the test features as 'prob'.
X_pred = X_pred.assign(prob=pred_xgb[:, 1])

In [50]:
# Write the second predict_proba column into the submission frame's 'renewal'
# column. Values are assigned by position, so sub_df rows must be in the same
# order as the rows that were scored.
sub_df = sub_df.assign(renewal=pred_xgb[:, 1])

In [52]:
# Save the submission frame to CSV without writing the row index.
sub_df.to_csv(path_or_buf='xgb_submission.csv', index=False)