# Configurations

In [1]:
from sklearn import preprocessing
from IPython.display import display, HTML
import numpy as np
import pandas as pd
#from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics, linear_model, svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Locations of the raw train / test CSV files, relative to the notebook.
Training_file = './Dataset/train_ZoGVYWq.csv'
Test_file     = './Dataset/test_66516Ee.csv'

# Column dtypes of the test file.
# The builtin `str` / `int` replace the `np.str` / `np.int` aliases: the aliases
# pointed at exactly these builtins, were deprecated in NumPy 1.20 and removed
# in NumPy 1.24, where accessing them raises AttributeError.
Test_dtype = {
    'id': str,
    'perc_premium_paid_by_cash_credit': np.float32,
    'age_in_days': np.float32,
    'Income': np.float32,
    'Count_3-6_months_late': np.float32,
    'Count_6-12_months_late': np.float32,
    'Count_more_than_12_months_late': np.float32,
    'application_underwriting_score': np.float32,
    'no_of_premiums_paid': np.float32,
    'sourcing_channel': str,
    'residence_area_type': str,
    'premium': int,
}

# The training file has the same columns plus the `renewal` label, so its
# dtype map is derived from the test map instead of being repeated.
Training_dtype = dict(Test_dtype, renewal=int)


## 1) Loading Dataset - Train and test

In [2]:
# Read the raw train / test files with the dtypes declared in the config cell;
# empty fields are parsed as missing values.
trn_origin = pd.read_csv(Training_file, na_values='', dtype=Training_dtype)
tst_origin = pd.read_csv(Test_file, na_values='', dtype=Test_dtype)

# Peek at the first rows of each frame.
display(trn_origin.head(), tst_origin.head())

# (rows, columns) of each frame, one per line.
print(trn_origin.shape, tst_origin.shape, sep='\n')

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,renewal
0,110936,0.429,12058.0,355060.0,0.0,0.0,0.0,99.019997,13.0,C,Urban,3300,1
1,41492,0.01,21546.0,315150.0,0.0,0.0,0.0,99.889999,21.0,A,Urban,18000,1
2,31300,0.917,17531.0,84140.0,2.0,3.0,1.0,98.690002,7.0,C,Rural,3300,0
3,19415,0.049,15341.0,250510.0,0.0,0.0,0.0,99.57,9.0,A,Urban,9600,1
4,99379,0.052,31400.0,198680.0,0.0,0.0,0.0,99.870003,12.0,B,Urban,9600,1


Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium
0,649,0.001,27384.0,51150.0,0.0,0.0,0.0,99.889999,7.0,A,Rural,3300
1,81136,0.124,23735.0,285140.0,0.0,0.0,0.0,98.93,19.0,A,Urban,11700
2,70762,1.0,17170.0,186030.0,0.0,0.0,0.0,,2.0,B,Urban,11700
3,53935,0.198,16068.0,123540.0,0.0,0.0,0.0,99.0,11.0,B,Rural,5400
4,15476,0.041,10591.0,200020.0,1.0,0.0,0.0,99.169998,14.0,A,Rural,9600


(79853, 13)
(34224, 12)


## 1-2) Check the descriptive statistics for each dataset
- Some columns contain missing values: their `count` is smaller than the number of rows in the dataset.

In [3]:
# Summary statistics of the numeric columns, train first and test second.
display(
    trn_origin.describe(),
    tst_origin.describe(),
)

# Numeric columns of interest: age_in_days, Income, Count_3-6_months_late,
# Count_6-12_months_late, Count_more_than_12_months_late,
# application_underwriting_score, no_of_premiums_paid, premium

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium,renewal
count,79853.0,79853.0,79853.0,79756.0,79756.0,79756.0,76879.0,79853.0,79853.0,79853.0
mean,0.314291,18846.738281,208848.6,0.248671,0.078188,0.060008,99.071289,10.863888,10924.507533,0.93741
std,0.334902,5208.866699,496577.8,0.691676,0.436455,0.311902,0.739792,5.170848,9401.676542,0.242226
min,0.0,7670.0,24030.0,0.0,0.0,0.0,91.900002,2.0,1200.0,0.0
25%,0.034,14974.0,108010.0,0.0,0.0,0.0,98.809998,7.0,5400.0,1.0
50%,0.167,18625.0,166560.0,0.0,0.0,0.0,99.209999,10.0,7500.0,1.0
75%,0.538,22636.0,252090.0,0.0,0.0,0.0,99.540001,14.0,13800.0,1.0
max,1.0,37602.0,90262600.0,13.0,17.0,11.0,99.889999,60.0,60000.0,1.0


Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium
count,34224.0,34224.0,34224.0,34193.0,34193.0,34193.0,32901.0,34224.0,34224.0
mean,0.314454,18824.125,202820.2,0.238733,0.080718,0.058111,99.062012,10.890428,10834.721248
std,0.334058,5246.466797,270253.2,0.686056,0.454576,0.307093,0.742933,5.216792,9263.074506
min,0.0,7671.0,24030.0,0.0,0.0,0.0,91.900002,2.0,1200.0
25%,0.034,14972.0,106397.5,0.0,0.0,0.0,98.800003,7.0,5400.0
50%,0.169,18623.0,165070.0,0.0,0.0,0.0,99.209999,10.0,7500.0
75%,0.54,22636.0,250020.0,0.0,0.0,0.0,99.529999,14.0,13800.0
max,1.0,35785.0,21914550.0,12.0,10.0,7.0,99.889999,59.0,60000.0


## 2) Data preprocessing


### 2-0) Split X and Y dataset

In [4]:
# Features: every training column except the `renewal` label (drop returns a new frame).
trn_origin_X = trn_origin.drop('renewal', axis=1)
# Label: the `renewal` column on its own.
trn_origin_Y = trn_origin['renewal'].copy()

# The test file has no label, so the whole frame is the feature set.
tst_origin_X = tst_origin.copy()

## 2-1) Imputing missing values
- Count the missing values per column to find out which columns need imputation.

In [5]:
# Number of missing entries per column, train first and test second.
display(
    trn_origin_X.isnull().sum(),
    tst_origin_X.isnull().sum(),
)

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 97
Count_6-12_months_late                97
Count_more_than_12_months_late        97
application_underwriting_score      2974
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
premium                                0
dtype: int64

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 31
Count_6-12_months_late                31
Count_more_than_12_months_late        31
application_underwriting_score      1323
no_of_premiums_paid                    0
sourcing_channel                       0
residence_area_type                    0
premium                                0
dtype: int64

### Imputation Strategy 1 : Mean

In [6]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `SimpleImputer` is its replacement and always imputes
# column-wise (what `axis=0` requested from the old class).
from sklearn.impute import SimpleImputer

# Columns reported as containing missing values by the .isnull().sum() check.
na_in_cols = ['Count_3-6_months_late', 'Count_6-12_months_late',
              'Count_more_than_12_months_late', 'application_underwriting_score']

# Work on copies so the un-imputed frames stay available.
trn_imputed_X = trn_origin_X.copy()
tst_imputed_X = tst_origin_X.copy()

# Learn the column means on the training data only, then apply those same
# means to both train and test.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(trn_imputed_X[na_in_cols])

trn_imputed_X[na_in_cols] = imp.transform(trn_imputed_X[na_in_cols])
tst_imputed_X[na_in_cols] = imp.transform(tst_imputed_X[na_in_cols])


### Imputation Strategy 2 : K-nearest Neighbors

In [None]:
# ToDo

In [7]:
# Re-count missing entries per column after imputation, train first and test second.
display(
    trn_imputed_X.isnull().sum(),
    tst_imputed_X.isnull().sum(),
)

id                                  0
perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
sourcing_channel                    0
residence_area_type                 0
premium                             0
dtype: int64

id                                  0
perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
sourcing_channel                    0
residence_area_type                 0
premium                             0
dtype: int64

## 2-2) Convert string values into dummy variables

In [8]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# String-valued columns to expand into 0/1 indicator columns.
cat_cols = ['sourcing_channel', 'residence_area_type']

# Remember the shapes so the stacked frame can be split back afterwards.
n_row_trn, ncol_trn = trn_imputed_X.shape
n_row_tst, ncol_tst = tst_imputed_X.shape

# Stack train on top of test and encode them together, so both halves end up
# with the same indicator columns.
agg_data_X = pd.concat([trn_imputed_X, tst_imputed_X], axis=0)

for cat_col in cat_cols:
    print(' Convert string values from col : ', cat_col)

    # get_dummies(columns=...) removes the source column and appends the
    # `<col>_<value>` indicators in one step, returning a new frame (no
    # manual concat + in-place drop needed).
    agg_data_X = pd.get_dummies(agg_data_X, columns=[cat_col], prefix=cat_col)

# Split the stacked frame back into train and test by position.
trn_imputed_dum_X = agg_data_X.iloc[:n_row_trn, :].copy()
tst_imputed_dum_X = agg_data_X.iloc[n_row_trn:, :].copy()

# Sanity checks: no rows lost in the round trip, no raw categorical column left.
assert len(trn_imputed_dum_X) == n_row_trn, 'train rows changed during encoding'
assert len(tst_imputed_dum_X) == n_row_tst, 'test rows changed during encoding'
assert not set(cat_cols) & set(agg_data_X.columns), 'categorical column not encoded'

print(trn_imputed_dum_X.shape)
print(tst_imputed_dum_X.shape)

    #display(agg_data_X)
#     label_encoder = LabelEncoder()
#     onehot_encoder = OneHotEncoder(sparse=False)

#     trn_col_arr = trn_imputed[cat_col]
#     tst_col_arr = tst_imputed[cat_col]
    
#     ## Str to Integer encoding
#     label_encoder.fit(trn_col_arr)
#     trn_int_encoded = label_encoder.transform(trn_col_arr)
#     tst_int_encoded = label_encoder.transform(tst_col_arr)
#     #print(integer_encoded)

#     ## Integer to Binary encoding
#     trn_int_encoded = trn_int_encoded.reshape(len(trn_int_encoded), 1)
#     tst_int_encoded = tst_int_encoded.reshape(len(tst_int_encoded), 1)
#     onehot_encoder.fit(trn_int_encoded)
#     trn_onehot_encoded = onehot_encoder.transform(trn_int_encoded)
#     tst_onehot_encoded = onehot_encoder.transform(tst_int_encoded)
    
#     print(trn_onehot_encoded)
#     print(tst_onehot_encoded)
    
    ## Merge to the dataset
    

 Convert string values from col :  sourcing_channel
 Convert string values from col :  residence_area_type
(79853, 17)
(34224, 17)


## 2-3) Data standardization

In [9]:
# Numeric columns that get rescaled; the remaining columns are left as they are.
cols_for_std = [
    'age_in_days',
    'Income',
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
    'application_underwriting_score',
    'no_of_premiums_paid',
    'premium',
]

# Fit the scaler on the training columns only ...
scaler = preprocessing.StandardScaler()
scaler.fit(trn_imputed_dum_X[cols_for_std])

# ... then overwrite those columns in both frames with the scaled values.
for split_X in (trn_imputed_dum_X, tst_imputed_dum_X):
    split_X[cols_for_std] = scaler.transform(split_X[cols_for_std])


## 3) Classification Models

In [10]:
# Cross-validation settings. Neither value is consumed by the grid-search
# helpers defined below, which pass their own cv / scoring values.
n_fold = 10

# Display name -> scikit-learn scorer string.
scoring = dict(AUC='roc_auc', F1='f1', Prec='precision', Rec='recall')

## 3-1) Logistic Regression

### Hyperparameter optimization

In [12]:
# Hyperparameter grid explored for logistic regression.
lr_param = {'C':[0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1],
           'tol' : [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# Empty results table (columns: c, tol, AUROC, f1); nothing in this cell fills it.
lr_results = pd.DataFrame(np.empty((0,4), float), columns=['c', 'tol', 'AUROC', 'f1'])

def clf_LogisticLR(trn_X, trn_Y, lr_param, cv=10, n_jobs=8, scoring='roc_auc'):
    """Grid-search a logistic-regression classifier and return the fitted search.

    Parameters
    ----------
    trn_X : training features, passed unchanged to ``GridSearchCV.fit``.
    trn_Y : training labels, passed unchanged to ``GridSearchCV.fit``.
    lr_param : parameter grid handed to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 10,
        the value that used to be hard-coded).
    n_jobs : number of parallel jobs forwarded to ``GridSearchCV`` (default 8,
        the value that used to be hard-coded).
    scoring : scorer forwarded to ``GridSearchCV`` (default ``'roc_auc'``).

    Returns
    -------
    The ``GridSearchCV`` object after ``fit`` has been called on it.
    """
    search = GridSearchCV(
        linear_model.LogisticRegression(),
        lr_param,
        n_jobs=n_jobs,
        cv=cv,
        scoring=scoring,
    )
    search.fit(trn_X, trn_Y)
    return search

## 3-2) Support Vector Machine

### Hyperparameter optimization

In [13]:
from sklearn.svm import SVC
# Hyperparameter grid explored for the support vector classifier.
svc_param = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}

def clf_SVC(trn_X, trn_Y, svc_param, cv=10, n_jobs=8, scoring='roc_auc'):
    """Grid-search a support vector classifier and return the fitted search.

    Parameters
    ----------
    trn_X : training features, passed unchanged to ``GridSearchCV.fit``.
    trn_Y : training labels, passed unchanged to ``GridSearchCV.fit``.
    svc_param : parameter grid handed to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 10,
        the value that used to be hard-coded).
    n_jobs : number of parallel jobs forwarded to ``GridSearchCV`` (default 8,
        the value that used to be hard-coded).
    scoring : scorer forwarded to ``GridSearchCV`` (default ``'roc_auc'``).

    Returns
    -------
    The ``GridSearchCV`` object after ``fit`` has been called on it.
    """
    # The estimator gets its own local name; it used to be called `clf_SVC`,
    # which shadowed this function's name inside its own body.
    estimator = SVC()
    search = GridSearchCV(estimator, svc_param, n_jobs=n_jobs, cv=cv, scoring=scoring)
    search.fit(trn_X, trn_Y)
    return search

## 3-3) Stochastic Gradient Descent

## 3-4) Gradient Tree Boosting


## 3-x) DO PREDICTION

In [128]:
# Fit the logistic-regression grid search. This call was commented out, so on a
# fresh kernel (Restart & Run All) the next cell failed with a NameError on
# `lr_clf`; it only worked from leftover kernel state.
#
# NOTE(review): trn_imputed_dum_X still contains the string `id` column, so the
# identifier is handed to the model as a feature. Confirm that this is intended,
# or drop `id` from both the train and the test feature frames before fitting
# and predicting.
lr_clf = clf_LogisticLR(trn_imputed_dum_X, trn_origin_Y, lr_param)

# Best hyperparameters found; per-candidate details live in `lr_clf.cv_results_`
# (the older `grid_scores_` attribute no longer exists in scikit-learn).
lr_clf.best_params_


In [132]:
# Best cross-validated score found by the grid search.
print(lr_clf.best_score_)

# Score the test features with the best estimator found by the search.
best_lr = lr_clf.best_estimator_
lr_yhat = best_lr.predict(tst_imputed_dum_X)
lr_prob = best_lr.predict_proba(tst_imputed_dum_X)

# Keep the second probability column as the renewal probability.
lr_prob_renewal = lr_prob[:, 1]
display(lr_prob_renewal)

0.8315741784184437


array([0.98760972, 0.97732018, 0.91497082, ..., 0.95718645, 0.76679178,
       0.96973406])

In [None]:
# Store the fitted search under its own name (mirroring `lr_clf`). Assigning it
# to `clf_SVC` replaced the helper function with its result, so this cell could
# not be run a second time.
svc_clf = clf_SVC(trn_imputed_dum_X, trn_origin_Y, svc_param)

## 4) Export to output file

In [133]:
# Build the submission frame: id and premium from the test set, the predicted
# renewal probability, and two zero-filled float32 columns.
n_predictions = len(lr_prob_renewal)

tst_output = tst_origin_X.loc[:, ['id', 'premium']]

lr_prob_df = pd.DataFrame({'renewal': lr_prob_renewal})
pd_incentives = pd.DataFrame({'incentives': np.zeros(n_predictions, dtype=np.float32)})
pd_improve = pd.DataFrame({'improvement': np.zeros(n_predictions, dtype=np.float32)})

# Column order in the file: id, premium, renewal, improvement, incentives.
tst_output = pd.concat([tst_output, lr_prob_df, pd_improve, pd_incentives], axis=1)

# Write the result next to the input data, without the row index.
tst_output.to_csv('./Dataset/tst_renewal_predicted.csv', index=False)