# Configurations

In [None]:
from sklearn import preprocessing
from IPython.display import display, HTML
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn import  metrics, grid_search, linear_model, svm
from sklearn.model_selection import GridSearchCV

# Locations of the input CSV files, relative to the notebook's working directory.
Training_file = './Dataset/train_ZoGVYWq.csv'
Test_file     = './Dataset/test_66516Ee.csv'

# Column name -> dtype map passed to pd.read_csv for the training file.
# The built-in `str` / `int` are used instead of `np.str` / `np.int`: those NumPy
# names were plain aliases of the built-ins, were deprecated in NumPy 1.20 and
# removed in 1.24, so the built-ins behave identically and keep working.
Training_dtype = {
    'id': str,
    'perc_premium_paid_by_cash_credit': np.float32,
    'age_in_days': np.float32,
    'Income': np.float32,
    'Count_3-6_months_late': np.float32,
    'Count_6-12_months_late': np.float32,
    'Count_more_than_12_months_late': np.float32,
    'application_underwriting_score': np.float32,
    'no_of_premiums_paid': np.float32,
    'sourcing_channel': str,
    'residence_area_type': str,
    'premium': int,
    'renewal': int,
}

# The test map is the training map without the 'renewal' target column;
# deriving it keeps the two definitions from drifting apart.
Test_dtype = {
    column: column_dtype
    for column, column_dtype in Training_dtype.items()
    if column != 'renewal'
}


## 1) Loading Dataset - Train and test

In [None]:
# Read the raw train / test CSVs with the dtype maps from the configuration cell;
# empty fields are treated as missing values.
trn_origin = pd.read_csv(Training_file, na_values='', dtype=Training_dtype)
tst_origin = pd.read_csv(Test_file, na_values='', dtype=Test_dtype)

# Preview the first rows of each frame (train, then test)
display(trn_origin.head(), tst_origin.head())

# Report the (rows, columns) shape of each frame
for raw_frame in (trn_origin, tst_origin):
    print(raw_frame.shape)

## 1-2) Check the descriptive statistics for each dataset
- The `count` row differs from column to column, which shows that some columns contain missing values.

In [None]:
# Descriptive statistics of the numeric columns, train first and test second
display(trn_origin.describe(), tst_origin.describe())

# Columns: age_in_days, Income, Count_3-6_months_late, Count_6-12_months_late,
#          Count_more_than_12_months_late, application_underwriting_score,
#          no_of_premiums_paid, premium

## 2) Data preprocessing


### 2-0) Split X and Y dataset

In [None]:
# Target vector: an independent copy of the 'renewal' column
trn_origin_Y = trn_origin['renewal'].copy()

# Feature frame: every training column except 'renewal' (drop returns a new frame)
trn_origin_X = trn_origin.drop('renewal', axis=1)

# The test frame is used as features in full; copy it so the raw frame stays untouched
tst_origin_X = tst_origin.copy()

## 2-1) Imputing missing values
- First identify which columns contain missing values.

In [None]:
# Number of missing entries per column, training features first and test features second
for feature_frame in (trn_origin_X, tst_origin_X):
    display(feature_frame.isnull().sum())

### Imputation Strategy 1 : Mean

In [None]:
from sklearn.preprocessing import Imputer

# Columns reported as containing missing values by .isnull().sum()
na_in_cols = [
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
    'application_underwriting_score',
]

# Work on copies so the original feature frames stay intact
trn_imputed_X, tst_imputed_X = trn_origin_X.copy(), tst_origin_X.copy()

# Column means are learned from the training rows only ...
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(trn_imputed_X[na_in_cols])

# ... and then used to fill the gaps in both the training and the test frame
for imputed_frame in (trn_imputed_X, tst_imputed_X):
    imputed_frame[na_in_cols] = imp.transform(imputed_frame[na_in_cols])


### Imputation Strategy 2 : K-nearest Neighbors

In [None]:
# TODO: implement K-nearest-neighbours imputation (not done yet)

In [None]:
# Re-count the missing entries per column on the imputed frames (train, then test)
display(
    trn_imputed_X.isnull().sum(),
    tst_imputed_X.isnull().sum(),
)

## 2-2) Convert string values into dummy variables

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# String-valued columns that are replaced by 0/1 indicator (dummy) columns
cat_cols = ['sourcing_channel', 'residence_area_type']

# Row counts of each frame, needed to split the stacked frame back apart
n_row_trn = len(trn_imputed_X)
n_row_tst = len(tst_imputed_X)

# Stack train on top of test so both parts receive the same set of dummy columns,
# whichever part a category value happens to occur in
agg_data_X = pd.concat([trn_imputed_X, tst_imputed_X], axis=0)

for cat_col in cat_cols:
    print(' Convert string values from col : ', cat_col)

# A single call drops every column listed in `columns` and appends its
# '<column>_<value>' indicator columns, in the order given by `cat_cols`.
# This replaces the manual concat + in-place drop per column.
agg_data_X = pd.get_dummies(agg_data_X, columns=cat_cols)

# Split back by position: the first n_row_trn rows are the training rows
trn_imputed_dum_X = agg_data_X.iloc[:n_row_trn, :].copy()
tst_imputed_dum_X = agg_data_X.iloc[n_row_trn:, :].copy()

# The encoding must neither add nor lose rows on either side of the split
assert len(trn_imputed_dum_X) == n_row_trn, 'training row count changed during encoding'
assert len(tst_imputed_dum_X) == n_row_tst, 'test row count changed during encoding'

print(trn_imputed_dum_X.shape)
print(tst_imputed_dum_X.shape)

    #display(agg_data_X)
#     label_encoder = LabelEncoder()
#     onehot_encoder = OneHotEncoder(sparse=False)

#     trn_col_arr = trn_imputed[cat_col]
#     tst_col_arr = tst_imputed[cat_col]
    
#     ## Str to Integer encoding
#     label_encoder.fit(trn_col_arr)
#     trn_int_encoded = label_encoder.transform(trn_col_arr)
#     tst_int_encoded = label_encoder.transform(tst_col_arr)
#     #print(integer_encoded)

#     ## Integer to Binary encoding
#     trn_int_encoded = trn_int_encoded.reshape(len(trn_int_encoded), 1)
#     tst_int_encoded = tst_int_encoded.reshape(len(tst_int_encoded), 1)
#     onehot_encoder.fit(trn_int_encoded)
#     trn_onehot_encoded = onehot_encoder.transform(trn_int_encoded)
#     tst_onehot_encoded = onehot_encoder.transform(tst_int_encoded)
    
#     print(trn_onehot_encoded)
#     print(tst_onehot_encoded)
    
    ## Merge to the dataset
    

## 2-3) Data standardization

In [None]:
# Numeric columns that get standardized
cols_for_std = [
    'age_in_days',
    'Income',
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
    'application_underwriting_score',
    'no_of_premiums_paid',
    'premium',
]

# Fit the scaler on the training rows only, then apply the same transformation
# to the training and the test frame
scaler = preprocessing.StandardScaler()
scaler.fit(trn_imputed_dum_X[cols_for_std])

for split_X in (trn_imputed_dum_X, tst_imputed_dum_X):
    split_X[cols_for_std] = scaler.transform(split_X[cols_for_std])


## 3) Classification Models

In [None]:
# Number of cross-validation folds
n_fold = 10

# Fold generator stratified on the training target
kf = StratifiedKFold(y=trn_origin_Y, n_folds=n_fold)

# Short display name -> scikit-learn scorer name
scoring = dict(AUC='roc_auc', F1='f1', Prec='precision', Rec='recall')

## 3-1) Logistic Regression

### Hyperparameter optimization

In [None]:
# Full hyper-parameter grid. It used to be assigned to `lr_param` and then
# overwritten on the very next statement (a dead store), so it is kept under its
# own name; assign it to `lr_param` to run the exhaustive search.
lr_param_full = {
    'C':   [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5, 1, 5, 10],
    'tol': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10],
}

# Reduced grid; this is the one the grid search actually reads
lr_param = {
    'C':   [0.001, 10],
    'tol': [1e-7, 5, 10],
}

# Empty float-typed results table with one column per reported quantity
lr_results = pd.DataFrame(np.empty((0, 4), float), columns=['c', 'tol', 'AUROC', 'f1'])


def clf_LogisticLR(kf):
    """Tune logistic regression on every fold and score it on the held-out rows.

    For each (train, validation) index pair in `kf`, a GridSearchCV over the
    module-level `lr_param` grid is fitted on the training rows, refitted with
    the parameters that maximise the 'AUC' entry of `scoring`, and evaluated on
    the validation rows.

    Parameters
    ----------
    kf : iterable of (train_indices, validation_indices)
        Positional row indices into `trn_imputed_dum_X` / `trn_origin_Y`.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'c' and 'tol' (selected parameters)
        and 'AUROC' and 'f1' (scores on the validation rows).
    """
    clf_LR = linear_model.LogisticRegression()

    # 'id' is a row identifier, not a predictor; keep it out of the model input.
    # errors='ignore' makes this a no-op when the column is already gone.
    features = trn_imputed_dum_X.drop('id', axis=1, errors='ignore')

    fold_rows = []
    for trn_idx, val_idx in kf:
        # `refit` is a constructor argument (passed to .fit() it would be forwarded
        # to the estimator), and with multi-metric scoring it must name a key of
        # `scoring` -- 'AUC' -- rather than the scorer string 'roc_auc'.
        clf = GridSearchCV(clf_LR, lr_param, n_jobs=8, scoring=scoring, refit='AUC')

        # The fold indices are positions, so select rows with .iloc rather than .loc
        clf.fit(features.iloc[trn_idx, :], trn_origin_Y.iloc[trn_idx])

        val_X = features.iloc[val_idx, :]
        val_Y = trn_origin_Y.iloc[val_idx]

        # Column 1 of predict_proba belongs to clf.classes_[1], the larger label
        val_prob = clf.predict_proba(val_X)[:, 1]
        val_pred = clf.predict(val_X)

        fold_rows.append({
            'c': clf.best_params_['C'],
            'tol': clf.best_params_['tol'],
            'AUROC': metrics.roc_auc_score(val_Y, val_prob),
            'f1': metrics.f1_score(val_Y, val_pred),
        })

    return pd.DataFrame(fold_rows, columns=['c', 'tol', 'AUROC', 'f1'])



## 3-x) DO PREDICTION

In [None]:
# Run the logistic-regression grid search once for every fold in `kf`
clf_LogisticLR(kf)