<a href="https://colab.research.google.com/github/mrninainaidi/Machine-Learning-Projects/blob/master/personal_loan_default_logReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

* jupyter notebook theme (optional)
* package
* global variables

In [None]:
# # Optional: setup theme for Jupyter Notebook
# # comment out if running on Colab
# import jupytertheme as jt
# from jupyterthemes.stylefx import set_nb_theme

# set_nb_theme('chesterish')

In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

import xgboost
from xgboost import plot_importance

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

import gc
from scipy import stats
import time
import datetime
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

!pip install missingno
import missingno as msno

!pip install category_encoders
import category_encoders as ce

!pip install imblearn
from imblearn.over_sampling import SMOTE


# import seaborn as sns
# import altair as alt

In [None]:
# Seed passed as random_state to SMOTE, StratifiedKFold and LogisticRegression
# further down, so repeated runs give the same resampling, folds and fit.
RAND_STATE = 3

# Data Processing and Cleaning

## Import Data

In [None]:
# For colab: mount Google Drive and read the raw loan table from it.
from google.colab import drive

drive.mount('/content/gdrive')

# Absolute path under the mount point, so the read does not depend on the
# kernel's current working directory (the relative form only resolved while
# the working directory happened to be /content).
root_path = '/content/gdrive/My Drive/further_study/machine_learning_projects/personal_loan_rating/'
df = pd.read_csv(root_path + 'default_loan_no_quotes.csv')

In [None]:
# # For others (Jupyter Notebook)
# # NOTE: This requires the data file to be saved under the same directory as this file.

# df = pd.read_csv('default_loan_no_quotes.csv')

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

In [None]:
# Normalise the header in one pass: spaces become underscores, then everything
# is lower-cased, so columns can be addressed as snake_case names below.
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.columns

## Features With Null Values

In [None]:
# missingno bar chart: per-column completeness of the raw table.
msno.bar(df)

In [None]:
# Count the nulls of every column once, then report only the affected columns.
null_counts = df.isna().sum()
n_rows = df.shape[0]

na_names = list()
for col, n_null in null_counts.items():
    if n_null > 0:
        print(f'Feature: {col}, has {100 * n_null / n_rows:.3f}%  or {n_null} null values.')
        na_names.append(col)

# names of all columns that contain at least one null
na_names

## Drop the columns of little interest


In [None]:
# Drop the columns of little interest.
not_interested = [
    'entry_date', 'fist_installment_date',
    'id', 'deposit_amt', 'financed_amt', 'term_remaining',
    'instalment_amt', 'amt_paid_to_merchant_nettofmerchfeesandgst',
    'est_fees', 'proc_fees', 'other_fees', 'total_merchant_charges',
    'total_consumer_charges',
]

# Fail loudly on the first name that is not a column, before anything is dropped.
unknown_names = [name for name in not_interested if name not in df.columns]
if unknown_names:
    raise ValueError(f'column name: {unknown_names[0]} is not valid')

df = df.drop(columns=not_interested)
# df.head()

## Cleaning the consumer post code feature

In [None]:
# manually correct typos in post code
# (np.nan marks entries that cannot be repaired; they are back-filled below)
post_code_fixes = {
    '28501': '2850',
    '2166`1': '2166',
    '414': np.nan,
    'CM144WG': np.nan,
    '4Q53': '4053',
    '40/2': '4012',
    '482O': '4820',
    '500O': '5000',
    '430(': np.nan,
    '48/7': '4817',
}
for bad_code, fixed_code in post_code_fixes.items():
    # single-step .loc[rows, column] assignment: the chained form
    # df[col].loc[rows] = v warns and does nothing under pandas Copy-on-Write
    df.loc[df['consumer_post_code'] == bad_code, 'consumer_post_code'] = fixed_code

# convert NA values to 'unknown', or to the consumer's own post code when
# another of their records carries one
consumerid_list = set(df.loc[df['consumer_post_code'].isna(), 'consumer_id'].values)
print(consumerid_list)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    possible_post_codes = df.loc[is_consumer, 'consumer_post_code'].dropna().values

    if len(possible_post_codes) == 0:
        print(f'consumer: {consumer} has no post code info')
        df.loc[is_consumer, 'consumer_post_code'] = 'unknown'
    else:
        print(f'consumer: {consumer} has the following post code: {possible_post_codes}')
        print(f'    applying post code to consumer: {consumer}')
        # every record of this consumer receives the first known post code
        df.loc[is_consumer, 'consumer_post_code'] = str(int(possible_post_codes[0]))

# rows that could not be matched through consumer_id (e.g. a missing id) would
# otherwise be stringified to the literal 'nan' below
df['consumer_post_code'] = df['consumer_post_code'].fillna('unknown')

# make sure all int and float-type entries are cast to str;
# the raw, anchored pattern strips only a trailing '.0' left by float parsing
df['consumer_post_code'] = df['consumer_post_code'].astype(str).replace(r'\.0$', '', regex=True)
print('Convertion complete.')

In [None]:
# # to check if all instances of the 'consumer_post_code' feature have been converted
# # to string-type
# for _, row in df.iterrows():
#     try: 
#         assert(isinstance(row['consumer_post_code'], str))
#     except: 
#         print(row)

## Cleaning the consumer year of birth feature

In [None]:
# convert NA values to '99/99/9999', or to the consumer's own DoB when another
# of their records carries one
consumerid_list = set(df.loc[df['consumer_year_of_birth'].isna(), 'consumer_id'].values)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    possibleDoB = df.loc[is_consumer, 'consumer_year_of_birth'].dropna().values

    if len(possibleDoB) == 0:
        print(f'consumer: {consumer} has no DoB info')
        # single-step .loc[rows, column] assignment: the chained form
        # df[col].loc[rows] = v warns and does nothing under pandas Copy-on-Write
        df.loc[is_consumer, 'consumer_year_of_birth'] = '99/99/9999'
    else:
        print(f'consumer: {consumer} has the following DoBs: {possibleDoB}')
        print(f'    applying DoB to consumer: {consumer}')
        # every record of this consumer receives the first known DoB
        df.loc[is_consumer, 'consumer_year_of_birth'] = str(possibleDoB[0])

# rows that could not be matched through consumer_id (e.g. a missing id) still
# hold NaN, which would make the int conversion below fail
df['consumer_year_of_birth'] = df['consumer_year_of_birth'].fillna('99/99/9999')

# Convert str-type DoB to int-type year of birth (third '/'-separated field)
df['consumer_year_of_birth'] = df['consumer_year_of_birth'].str.split('/', expand=True)[2].astype(int)
print('Convertion complete.')

In [None]:
# to check if all instances of the 'consumer_year_of_birth' feature have been converted
# to int-type or np.nan
# for _, row in df.iterrows():
#     yob = row['consumer_year_of_birth']
#     if isinstance(yob, int):
#         if yob > 1900 and yob <= 9999:
#             continue
#     else: 
#         print(f'row["consumer_year_of_birth"] = {yob}')
# print('Assertion complete.')

In [None]:
# Relative frequency of each birth year, in chronological order.
yob_share = df['consumer_year_of_birth'].value_counts(normalize=True).sort_index()
x, y = yob_share.index, yob_share.values

fig, ax = plt.subplots()
ax.set_xlim(1900, 2000)  # keeps the 9999 placeholder year out of view
ax.set_title('Consumer Age Distribution')
ax.set_xlabel('Year of Birth')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')


## Converting application_date feature to application_month and application_year

In [None]:
# Convert str-type application date to int-type year of application
df['application_year'] = df['application_date'].str.split('/', expand=True)[2].astype(int)
df['application_month'] = df['application_date'].str.split('/', expand=True)[1].astype(int)


## Converting recent_default_default_date to recent_default_year and recent_default_month

In [None]:
# Contracts without a recorded default get the placeholder date '00/00/0000',
# so both derived fields come out as 0 for them. fillna is the direct way to
# fill missing values (the regex-replace on NaN it supersedes did the same job).
default_date = df['recent_default_default_date'].fillna('00/00/0000')
df['recent_default_default_date'] = default_date

# split once: field 2 -> year, field 1 -> month
default_parts = default_date.str.split('/', expand=True)
df['recent_default_year'] = default_parts[2].astype(int)
df['recent_default_month'] = default_parts[1].astype(int)
# df['recent_default_month'].value_counts()

In [None]:
# Relative frequency of each default month (0 = no recorded default).
month_share = df['recent_default_month'].value_counts(normalize=True).sort_index()
x, y = month_share.index, month_share.values

fig, ax = plt.subplots()
# axis limits zoom in on months 1-12 and cut off the dominant month-0 bucket
ax.set_xlim(1, 12)
ax.set_ylim(0, 0.02)

ax.set_title('Recent default month distribution')
ax.set_xlabel('Month of default')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*-')

## Adding age_of_application feature (integer)

In [None]:
df['age_of_application'] = df['application_year'] - df['consumer_year_of_birth']

# use this "age of application" to validate the "consumer year of birth"
# if "age of application" < 18, the minimum legal age of having a credit account
# the "consumer year of birth" entry must be faulty.
# The mask is computed once and applied with single-step .loc[rows, column]
# assignments; the chained form df[col].loc[rows] = v warns and does nothing
# under pandas Copy-on-Write.
under_age = df['age_of_application'] < 18
df.loc[under_age, 'consumer_year_of_birth'] = 9999   # placeholder year
df.loc[under_age, 'age_of_application'] = -1         # marker for "age unknown"

In [None]:
# Relative frequency of each application age (-1 = age unknown).
age_share = df['age_of_application'].value_counts(normalize=True).sort_index()
x, y = age_share.index, age_share.values

fig, ax = plt.subplots()
ax.set_xlim(-1, 100)
ax.set_title('Consumer Age of Application')
ax.set_xlabel('Consumer Age')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')

## Adding age_of_recent_default feature (integer)

In [None]:
# Consumer age in the year of the most recent default. Where the default year
# or the birth year is a placeholder (0 / 9999) the difference is not a real
# age; those rows are corrected by the validation cell that follows.
df['age_of_recent_default'] = df['recent_default_year'] - df['consumer_year_of_birth']
# df['age_of_recent_default'].value_counts(normalize=True)

In [None]:
# recent default should not happen before the year of application
# the age of application has to be > 18 for age of recent default to be effective

# Both masks are taken BEFORE anything is overwritten. Previously the "invalid"
# rows had their year set to 0 first and were then swept up by the "absent"
# rule, so the -1 marker never survived.
absent = df['recent_default_year'] == 0
invalid = (
    (df['age_of_recent_default'] < df['age_of_application'])
    | (df['age_of_application'] < 18)
) & ~absent

# For invalid entry of recent_default_default_date and consumer_year_of_birth
# (single-step .loc[rows, column] assignment instead of chained df[col].loc[rows] = v,
# which warns and does nothing under pandas Copy-on-Write)
df.loc[invalid, 'age_of_recent_default'] = -1
df.loc[invalid, ['recent_default_year', 'recent_default_month']] = 0

# For absent recent_default_default_date
df.loc[absent, 'age_of_recent_default'] = 0
df.loc[absent, 'recent_default_month'] = 0

print('Convertion complete.')

In [None]:
# Relative frequency of each age at the most recent default.
default_age_share = df['age_of_recent_default'].value_counts(normalize=True).sort_index()
x, y = default_age_share.index, default_age_share.values

fig, ax = plt.subplots()
# axis limits leave the non-positive marker values outside the visible window
ax.set_xlim(20, 100)
ax.set_ylim(0, 0.01)
ax.set_title('Consumer Age of Recent Default')
ax.set_xlabel('Consumer Age')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')

## Cleaning product feature

In [None]:
# replace NaN with 'unknown'
# (fillna states the intent directly; a regex replace is not needed to fill missing values)
df['product'] = df['product'].fillna('unknown')

In [None]:
# Relative frequency of each product, most common first (inspection only;
# the tail itself is shortened further down by convert_tails_to_others).
product_share = df['product'].value_counts(normalize=True)
x, y = product_share.index, product_share.values

## Shorten the features with heavy tails

* 'product'
* 'merchant_name'
* 'merchant_number'

In [None]:

def convert_tails_to_others(dataframe, feature, fracToConvert):
    """Collapse the rare values of a categorical column into the label 'others'.

    Values are ranked by relative frequency (most common first) and kept until
    their cumulative share reaches ``1 - fracToConvert``; every remaining value
    is overwritten with the string ``'others'``. Missing entries are not counted
    and are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify **in place**.
    feature : str
        Name of the column to shorten.
    fracToConvert : float
        Approximate fraction of (non-null) rows allowed to fall into 'others'.

    Returns
    -------
    None
    """
    column = dataframe[feature]
    shares = column.value_counts(normalize=True)  # sorted, most frequent first

    # obtain the list of values to keep
    threshold = 1 - fracToConvert
    covered = 0.0
    keep_list = list()
    for value, share in shares.items():
        if covered >= threshold:
            break
        covered += share
        keep_list.append(value)

    # everything that is present but not kept forms the tail
    in_tail = column.notna() & ~column.isin(keep_list)
    if not in_tail.any():
        return

    # a numeric column cannot hold the 'others' label without an explicit cast
    if pd.api.types.is_numeric_dtype(column.dtype):
        dataframe[feature] = column.astype(object)

    # single-step .loc[rows, column] assignment: the chained form
    # dataframe[feature].loc[rows] = v warns and does nothing under Copy-on-Write
    dataframe.loc[in_tail, feature] = 'others'
    # print()

In [None]:
# Fraction of rows that may be folded into 'others', per heavy-tailed feature.
frac_dict = {'product':0.08, 'merchant_name':0.05, 'merchant_number':0.05}
col_names = list(frac_dict)

for name, frac in frac_dict.items():
    convert_tails_to_others(df, name, frac)

## Cleaning total_balance_outstanding feature

In [None]:
# Parse the balance into floats without mutating an alias of the column in
# place: strip thousands separators from string entries, convert, and only
# then treat missing balances as 0.0 (filling at the float stage avoids mixing
# strings and numbers in one column).
df['total_balance_outstanding'] = (
    df['total_balance_outstanding']
    .replace(',', '', regex=True)
    .astype(float)
    .fillna(0.0)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['total_balance_outstanding'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Cleaning recent_default_default_amt feature

In [None]:
# Parse the default amount into floats without mutating an alias of the column
# in place: strip thousands separators from string entries, convert, and only
# then treat missing amounts as 0.0 (filling at the float stage avoids mixing
# strings and numbers in one column).
df['recent_default_default_amt'] = (
    df['recent_default_default_amt']
    .replace(',', '', regex=True)
    .astype(float)
    .fillna(0.0)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['recent_default_default_amt'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Adding term_run_frac feature
representing the fraction of terms that have been fulfilled. 

In [None]:
# Share of the contract's terms that have already run.
# NOTE(review): a zero or missing total_term produces inf/NaN here — confirm
# the data rules that out before the column reaches the scaler.
df['term_run_frac'] = df['term_run'] / df['total_term']

## Adding total_month feature

In [None]:
# Contract length in months: rows with freq == 'FN' have their term count
# halved (presumably fortnightly instalments — confirm), every other row keeps
# total_term as is. Series.where builds the column in one step, so no float
# halves are written into an integer column through .loc (an incompatible-dtype
# assignment that newer pandas rejects) and no scratch frame is needed.
is_fn = df['freq'] == 'FN'
df['total_month'] = df['total_term'].where(~is_fn, df['total_term'] / 2)

## Adding conditional mean/std features

In [None]:
# Remove the rows flagged with the -1 "age unknown" marker before the
# age-based conditional features are built.
unknown_age_rows = df.index[df['age_of_application'] == -1]
df = df.drop(index=unknown_age_rows)

df['age_of_application'].value_counts().sort_index()

In [None]:
# Conditional features: each value divided by the mean / standard deviation of
# the same quantity within its group. Column names follow
# '<quantity>_<group>_mean' and '<quantity>_<group>_stdev'.
# NOTE(review): a group with a single row has an undefined std (NaN) and a
# constant group has std 0 (inf) — later cells drop or inspect such rows.
cond_quantities = {
    'aop': 'age_of_application',
    'pAmt': 'purchase_amt',
}
cond_groupings = {
    'indName': 'industry_name',
    'pmtTp': 'payment_type',
    'fq': 'freq',
    'hoId': 'homowner_ind',
    'hoCon': 'homowner_consumer',
}

# quantity-major order reproduces the original column order:
# all "age_of_application" features first, then all "purchase_amt" features
for quantity_tag, quantity_col in cond_quantities.items():
    for group_tag, group_col in cond_groupings.items():
        grouped = df.groupby([group_col])[quantity_col]
        df[f'{quantity_tag}_{group_tag}_mean'] = df[quantity_col] / grouped.transform('mean')
        df[f'{quantity_tag}_{group_tag}_stdev'] = df[quantity_col] / grouped.transform('std')

In [None]:
# Rows whose industry-level stdev feature is undefined cannot be used; drop them.
undefined_stdev_rows = df.index[df['aop_indName_stdev'].isna()]
df = df.drop(index=undefined_stdev_rows)

In [None]:
# check for NaN in the conditional features:
cond_names = [
    'aop_indName_mean', 'aop_indName_stdev', 'aop_pmtTp_mean',
    'aop_pmtTp_stdev', 'aop_fq_mean', 'aop_fq_stdev', 'aop_hoId_mean',
    'aop_hoId_stdev', 'aop_hoCon_mean', 'aop_hoCon_stdev', 'pAmt_indName_mean',
    'pAmt_indName_stdev', 'pAmt_pmtTp_mean', 'pAmt_pmtTp_stdev', 'pAmt_fq_mean',
    'pAmt_fq_stdev', 'pAmt_hoId_mean', 'pAmt_hoId_stdev', 'pAmt_hoCon_mean', 'pAmt_hoCon_stdev',
]

# independent copy holding only the conditional columns, in the listed order
df_tmp = df[cond_names].copy()

# df_tmp.head()
msno.bar(df_tmp)

## Define ground truth

In [None]:
# df_recent = df[[col for col in df.columns if 'recent' in col]]
# df_recent['defaultdate'] = df['defaultdate']
# df_recent['consumer_id'] = df['consumer_id']
# df_recent['defaultamount'] = df['defaultamount']
# df_recent['contract_number'] = df['contract_number']
# df_recent['contract_status'] = df['contract_status']
# df_recent['expected_contract_end_date'] = df['expected_contract_end_date']

In [None]:
# for index, row in df_recent.iterrows():
#     if row['recent_default_default_amt'] == 0:
#         if isinstance(row['defaultdate'], str):
#             print(row)
#             print()      

**Test outcome**

* When "recent_default_year" == 0, there are FIVE instances where "recent_default_default_amt" != 0, and all FIVE are marked as DEFAULT by "contract_status".


* When "recent_default_default_amt" == 0, there are TWO instances where "recent_default_year" != 0, and both are marked as PAIDINFULL by "contract_status".

    ==> both "recent_default_year" and "recent_default_default_amt" == 0 means NoDefault

**Suspect bad columns:**

Assuming 'recent_default_default_amt' is the indicator for the ground truth... 

* 'defaultdate'

* 'defaultamount'

* 'total_balance_outstanding'

* 'recent_default_default_date' ==> 'recent_default_year' ==> 'age_of_recent_default'


**Question:**

Should I go back and realign 'recent_default_year' and 'age_of_recent_default' with the assumed ground truth?


In [None]:
# introduce the ground truth according to above analysis:
# a contract is labelled as defaulted when its recent default amount is positive
df['isDefault'] = df['recent_default_default_amt'] > 0

## Finalising data processing and digitising categorical features

In [None]:
# List every column available at this point, before the training frame is assembled.
print(df.columns)

### Collect numeric features

In [None]:
# Numeric model inputs: the raw numeric columns followed by the conditional features.
num_names = [
    'purchase_amt', 'deposit_percent', 'age_of_application',
    'gtee_rate', 'term_run_frac', 'total_month',
] + cond_names

# Fail loudly on the first name that is not a column.
unknown_names = [name for name in num_names if name not in df.columns]
if unknown_names:
    raise ValueError(f'column name: {unknown_names[0]} is not valid')

# independent copy, so later encoding/scaling never touches df
df_train = df[num_names].copy()

df_train.head(10)

### Encode categorical features

In [None]:
def ordered_labels(df, col, order):
    """Replace column ``col`` of ``df`` (in place) by integer codes following ``order``.

    ``order`` must list exactly the distinct values of the column; the value at
    position ``i`` of ``order`` is encoded as ``i``. Returns None.
    """
    as_ordered = (
        df[col]
        .astype('category')
        .cat.reorder_categories(order, ordered=True)
    )
    df[col] = as_ordered.cat.codes.astype(int)

In [None]:
# split the categorical features into...
# ordered: integer codes that follow the sorted order of the observed values
ordered_features = ['application_year', 'application_month']
ordered_dict = {
    elm: df[elm].value_counts().sort_index().index for elm in ordered_features
}

# "one-hot like": expanded into several 0/1 columns with category_encoders'
# BinaryEncoder (not a true one-hot encoding, despite the variable name)
oneHot_features = ['product', 'consumer_post_code', 'industry_name']

# label-encoded: each distinct value (taken as a string) becomes one integer;
# the boolean target 'isDefault' goes through the same path
nominated_features = ['payment_type', 'freq', 'homowner_ind', 'homowner_consumer', 'isDefault']

cate_names = ordered_features + nominated_features + oneHot_features

# initialise encoder
le = LabelEncoder()

# start encoding...
for col in cate_names:
    df_train[col] = df[col].copy()

    if col in ordered_features:
        # the ordered_features
        ordered_labels(df_train, col, ordered_dict[col])
    elif col in oneHot_features:
        # the one-hot like features: append the encoded columns, drop the source
        encoder = ce.BinaryEncoder(cols=[col])
        df_train = pd.concat([df_train, encoder.fit_transform(df_train[col])], axis=1)
        df_train = df_train.drop(columns=[col])
    else:
        # other features
        df_train[col] = le.fit_transform(df_train[col].astype(str))

df_train.head(10)

In [None]:
# (rows, columns) of the training frame once all features are encoded.
df_train.shape

### Rescale everything to (0, 1)

In [None]:
# Scale every column of the training frame to the (0, 1) range.
# The result is rebuilt from the scaled array, so it carries the same column
# names but a fresh 0..n-1 row index.
col_names = df_train.columns

min_max_scaler = MinMaxScaler()
df_train = pd.DataFrame(
    min_max_scaler.fit_transform(df_train.values),
    columns=col_names,
)

df_train.describe()

### Separate input and output

In [None]:
target_name = 'isDefault'

# model input: every column except the target
X = df_train.drop(columns=[target_name])

# expected output: the target column alone
y = df_train[target_name]

for part in (X, y):
    print(part.shape)

### Re-sampling with SMOTE

In [None]:
# initialise SMOTE sampling
sm = SMOTE(random_state=RAND_STATE)

# resample the training set
# fit_resample is the supported imbalanced-learn entry point (the old
# fit_sample alias no longer exists), and to_numpy() replaces the deprecated
# Series.ravel().
# NOTE(review): oversampling before cross-validation lets synthetic points
# derived from validation rows into the training folds, which can inflate the
# CV scores — consider resampling inside each fold instead.
# (the name `input` shadows the builtin but is read by the next cell, so it is kept)
input, target = sm.fit_resample(X, y.to_numpy())
print(target.mean())

In [None]:
# Reassemble the resampled features and target into one training frame.
df_train = pd.DataFrame(input, columns=X.columns).assign(isDefault=target)
df_train.shape

In [None]:
target_name = 'isDefault'

# model input: every column of the resampled frame except the target
X = df_train.drop(columns=[target_name])

# expected output: the resampled target column
y = df_train[target_name]

for part in (X, y):
    print(part.shape)

# Model Training and Validating

## HyperOpt function and parameters space definition

In [None]:
def objective(params):
    '''
    Objective function minimised by hyperopt's fmin().

    Fits a LogisticRegression with the sampled hyper-parameters on every fold
    of a 6-fold stratified, shuffled split of the notebook-level X / y and
    prints the ROC-AUC and accuracy of each fold.

    Parameters
    ----------
    params : dict
        Sampled point of the search space with keys 'penalty', 'C', 'solver'.

    Returns
    -------
    float
        The NEGATIVE mean ROC-AUC over the folds, so that fmin() — which
        minimises — ends up maximising the mean score.
    '''
    time1 = time.time()

    # cast the sampled values to the plain types LogisticRegression expects
    params = {
        'penalty'         : str(params['penalty']),
        'C'               : float(params['C']),
        'solver'          : str(params['solver'])
    }

    df_toprint = pd.DataFrame(params, index=[0])

    print('\n############## New Run ################')
    print(f"params = {df_toprint.transpose()}")

    # total number of folds
    FOLDS = 6

    # stratified, shuffled and seeded K-fold splitter
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RAND_STATE)

    # running sums of the per-fold scores
    score_mean = 0
    score_acc_mean = 0

    print(f'Training set shape: {X.shape}')

    print('\nCV - scores: ')

    # Start the Training and Cross-validation loop
    for counter, (t_idx, v_idx) in enumerate(skf.split(X, y), start=1):

        # fresh logistic-regression classifier for this fold
        clf = LogisticRegression(random_state=RAND_STATE, **params)

        # training / validation partitions of this fold
        X_t, X_v = X.iloc[t_idx, :], X.iloc[v_idx, :]
        y_t, y_v = y.iloc[t_idx], y.iloc[v_idx]

        # Model training
        clf.fit(X_t, y_t)

        # Validation scores, computed directly from the metric functions:
        # make_scorer's needs_proba argument was deprecated and later removed
        # from scikit-learn. Column 1 of predict_proba is the probability of
        # the greater class label, which roc_auc_score treats as positive.
        score = roc_auc_score(y_v, clf.predict_proba(X_v)[:, 1])
        score_mean += score

        score_acc = accuracy_score(y_v, clf.predict(X_v))
        score_acc_mean += score_acc

        print(f'    {counter} :auc = {round(score, 4)}; acc = {round(score_acc, 4)}', end = " ")

    # record the time elapsed (printed in minutes)
    time2 = time.time() - time1

    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect() # garbage collection
    print(f'Mean ROC_AUC : {round((score_mean / FOLDS), 4)}')
    print(f'Mean ACCURACY: {round((score_acc_mean / FOLDS), 4)}')

    return -(score_mean / FOLDS)

In [None]:
'''
Initial guess of the objective function parameters
'''

# Search space handed to fmin(). Every dimension currently offers a single
# option, so the "search" evaluates one fixed configuration; the commented-out
# block underneath is the wider space.
space = {
    'penalty' : hp.choice('penalty', ['l2']),
    'C' : hp.choice('C', [500]), 
    'solver' : hp.choice('solver', ['liblinear'])
}

# space = {
#     'penalty' : hp.choice('penalty', ['l2']),
#     'C' : hp.choice('C', np.logspace(-4, 4, 100)), 
#     'solver' : hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
# }

## Running the optimiser to train the model and obtain the best parameter combination

In [None]:
# Set algorithm parameters: TPE search over `space`, minimising objective()
# (i.e. maximising mean ROC-AUC); max_evals is the number of points evaluated.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1)

# Print best parameters
# space_eval turns the raw fmin result back into actual parameter values
best_params = space_eval(space, best)

print("BEST PARAMS: ", best_params)

# Post Processing -- Feature Importance Summary

## Train a model with the best parameter set

In [None]:
# Refit a single classifier with the best parameters on the full X / y
# (no hold-out here; this model is only used for the coefficient summary below).
clf = LogisticRegression(random_state=RAND_STATE, **best_params)

clf.fit(X, y)

## Obtain the feature importance map

In [None]:
# Importance of a feature = absolute value of its logistic-regression
# coefficient (comparable across features because all inputs were scaled to 0-1).
feat_names = X.columns
importance = np.abs(clf.coef_[0]).tolist()

# ascending order, so the largest bar ends up at the top of the horizontal chart
df_featImp = pd.DataFrame({'score': importance}, index=feat_names).sort_values(by='score', ascending=True)

ax = df_featImp.plot(kind='barh', figsize=(5,10))

In [None]:
# The 20 features with the largest absolute coefficients.
df_featImp.sort_values(by='score', ascending=False).head(20)

In [None]:
# terminates the current run
# (deliberate stop for "Run All": the cells below only hold recorded results and scratch code)
raise SystemExit('Run Terminated.') 

# Training Results

In [None]:
'''
0. Original model
================================================================================
Mean ROC_AUC : 0.6521
Mean ACCURACY: 0.8571

purchase_amt	        3.015827
age_of_application	    2.705475
deposit_percent	        1.746485
gtee_rate	            0.633335
homowner_consumer	    0.504437
total_month	            0.478606
industry_name_1	        0.430812
industry_name_4	        0.367680
application_year	    0.281145
consumer_post_code_2	0.248964
term_run_frac	        0.247221
payment_type	        0.245869
product_2	            0.235614
consumer_post_code_1	0.218545
product_1	            0.185457
product_3	            0.108091
homowner_ind	        0.100841
industry_name_2	        0.096097
industry_name_6	        0.086340
consumer_post_code_3	0.078943



1. SMOTE
================================================================================
Mean ROC_AUC : 0.6683
Mean ACCURACY: 0.6271

purchase_amt	        3.472045
age_of_application	    3.118785
deposit_percent	        2.044111
homowner_consumer	    0.917468
consumer_post_code_1	0.696817
gtee_rate	            0.667458
term_run_frac	        0.616765
total_month	            0.562021
homowner_ind	        0.561687
consumer_post_code_2	0.485889
product_2	            0.397051
product_1	            0.365540
industry_name_4	        0.353007
application_year	    0.231852
product_3	            0.228520
payment_type	        0.188476
consumer_post_code_3	0.146259
industry_name_6	        0.121417
industry_name_2	        0.090612
application_month	    0.077116


2. Conditional features
================================================================================
Mean ROC_AUC : 0.6551
Mean ACCURACY: 0.8571

aop_fq_mean	            5.656338
aop_pmtTp_mean	        4.355559
aop_pmtTp_stdev	        4.110974
aop_hoCon_mean	        3.820768
pAmt_indName_mean	    3.058633
pAmt_fq_mean	        2.996144
aop_indName_stdev	    2.733245
pAmt_pmtTp_stdev	    2.317841
aop_hoCon_stdev	        2.144659
pAmt_fq_stdev	        2.017118
aop_fq_stdev	        1.938017
deposit_percent	        1.836419
age_of_application	    1.533660
aop_hoId_stdev	        1.332799
aop_indName_mean	    1.223487
homowner_consumer	    0.915064
aop_hoId_mean	        0.871274
pAmt_hoId_mean	        0.838960
pAmt_hoCon_stdev	    0.833364
pAmt_hoCon_mean	        0.826123


3. 1 + 2
================================================================================
Mean ROC_AUC : 0.6739
Mean ACCURACY: 0.6312

pAmt_pmtTp_stdev	    10.605212
aop_hoCon_mean	        8.722845
aop_fq_mean	            6.185844
pAmt_pmtTp_mean	        4.232469
pAmt_indName_mean	    3.676728
aop_pmtTp_mean	        3.630854
aop_pmtTp_stdev	        3.543014
aop_hoCon_stdev	        3.335447
aop_hoId_mean	        3.244385
aop_indName_stdev	    2.779993
pAmt_fq_stdev	        2.467831
pAmt_fq_mean	        2.409095
purchase_amt	        2.309612
aop_fq_stdev	        2.277520
deposit_percent	        2.179834
homowner_consumer	    1.794267
pAmt_hoId_stdev	        1.424730
age_of_application	    1.371557
pAmt_hoCon_mean	        1.323430
pAmt_hoId_mean	        1.251162


'''

In [None]:
'''
Fixing input space
==================
space = {
    'penalty' : hp.choice('penalty', ['l2']),
    'C' : hp.choice('C', [1]), 
    'solver' : hp.choice('solver', ['liblinear'])
}

Without conditional features: 
=============================
AUC = 0.6686
ACC = 0.6276

With conditional features: 
==========================
['aop_indName_mean', 'aop_indName_stdev',\
 'pAmt_indName_mean', 'pAmt_indName_stdev']

AUC = 0.6708
ACC = 0.6294

With above conditional features and remove the entries with -1 age_of_application
================================
Mean ROC_AUC : 0.6712
Mean ACCURACY: 0.6297

With additional conditional features: 
===============================
['aop_indName_mean', 'aop_indName_stdev', 'aop_pmtTp_mean',\
'aop_pmtTp_stdev', 'aop_fq_mean', 'aop_fq_stdev', 'aop_hoId_mean',\
'aop_hoId_stdev', 'aop_hoCon_mean', 'aop_hoCon_stdev', 'pAmt_indName_mean',\
'pAmt_indName_stdev', 'pAmt_pmtTp_mean', 'pAmt_pmtTp_stdev', 'pAmt_fq_mean',\
'pAmt_fq_stdev', 'pAmt_hoId_mean', 'pAmt_hoId_stdev', 'pAmt_hoCon_mean', 'pAmt_hoCon_stdev']

Mean ROC_AUC : 0.6739
Mean ACCURACY: 0.6309

Modify input space with above conditional features
==================
space = {
    'penalty' : hp.choice('penalty', ['l2']),
    'C' : hp.choice('C', [500]), 
    'solver' : hp.choice('solver', ['liblinear'])
}

Mean ROC_AUC : 0.674
Mean ACCURACY: 0.6309
'''

In [None]:
'''
BEST PARAMS:  {'C': 545.5594781168514, 'penalty': 'l2', 'solver': 'liblinear'}


CV - scores: 
    1 : 0.6513;
    2 : 0.6523;
    3 : 0.6509;
    4 : 0.6522;
    5 : 0.6554;
    6 : 0.65;
Total Time Run: 0.38


age_of_application	    3.305887
purchase_amt	        3.111321
deposit_percent	        1.750232
gtee_rate	            0.644461
homowner_consumer	    0.505763
total_month	            0.448497
industry_name_1	        0.404635
industry_name_4	        0.357544
term_run_frac	        0.305157
application_year	    0.282223
consumer_post_code_2	0.248970
payment_type	        0.246252
consumer_post_code_1	0.220856
product_2	            0.212898
product_1	            0.182107
product_3	            0.124898
industry_name_2	        0.106117
homowner_ind	        0.104002
industry_name_6 	    0.092173
consumer_post_code_3	0.079214

'''

In [None]:
'''
BEST PARAMS:  {'C': 16.23776739188721, 'penalty': 'l2', 'solver': 'lbfgs'}


CV - scores: 
    1 : 0.6513;
    2 : 0.6523;
    3 : 0.6509;
    4 : 0.6522;
    5 : 0.6553;
    6 : 0.6501;


age_of_application	    3.311348
purchase_amt	        3.095686
deposit_percent	        1.746661
gtee_rate	            0.651738
industry_name_1	        0.527772
homowner_consumer	    0.491427
total_month	            0.452777
industry_name_4	        0.362157
application_year	    0.290661
consumer_post_code_2	0.249201
payment_type	        0.246723
consumer_post_code_1	0.221004
product_2	            0.215359
term_run_frac	        0.215209
product_1	            0.174818
product_3	            0.126728
industry_name_2	        0.102357
industry_name_6	        0.094139
homowner_ind	        0.084542
consumer_post_code_3	0.079851
'''

# Code Dump

## Exploring other features

### Exploring arrears amount feature

In [None]:
# a = df['arrears_amount'].value_counts(dropna=False).sort_index()
# print(a)

In [None]:
# df_tmp = df.loc[ df['arrears_amount'] == 0]
# df_tmp['contract_status'].value_counts(normalize=True, dropna=False)

### Exploring contract_status feature

In [None]:
# '''
# Separate the dataframe by contract status
# '''

# df_paid = df.loc[df['contract_status'] == 'PaidInFull']
# df_default = df.loc[df['contract_status'] == 'Default']
# df_active = df.loc[df['contract_status'] == 'Active']
# print(df['contract_status'].value_counts(normalize=True))

In [None]:
# # '''
# # Look at the Paid-set
# # '''
# print(df_paid['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['age_of_recent_default'].value_counts(dropna=False, normalize=False).sort_index())
# print()
# print(df_paid['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_paid['term_run'] / df_paid['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()


In [None]:
# # '''
# # Look at the Default-set
# # '''
# print(df_default['arrears_amount'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_default['term_run'] / df_default['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

In [None]:
# # '''
# # Look at the active-set
# # '''
# print(df_active['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_active['term_run'] / df_active['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

**My Questions regarding the "contract_status" feature (ground-truth?)**

* All instances in "PaidInFull" subset have 0.0 "arrears_amount", but more than 10% of the instances in this subset have a valid "age_of_recent_default", which suggests default did occur to these instances. All instances in this subset have NaN value for "defaultdate".

* About 3% of the instances in "Default" subset have 0.0 "arrears_amount", which indicates default has never occurred to these instances. 99.7% of this subset have a valid "defaultdate" entry.

* "Active" subset has very similar behaviour when compared against "PaidInFull". All instances in this subset have NaN value for "defaultdate"

**Options for ground-truth**

If **True** ==> has default, **False** ==> has no default: 

1. **True** = "Default" subset and **False** = "PaidInFull" + "Active" subsets

2. **True** = "arrears_amount" != 0, and **False** = "arrears_amount" == 0

3. **True** = "defaultdate" == valid date, and **False** = NaN

**According to the observations, none of the above options fully makes sense...**

**Meeting outcome**


4. use the combined recent group to determine the ground truth. 

    i.e. when "age_of_recent_default", "age_of_recent_default_cure" and "recent_default_amt" are all valid ==> **DEFAULT**

## Exploring the processed features

In [None]:
# Split the processed frame by the binary target so each class can be
# explored separately in the cells below.
# .copy() gives each subset its own data: a later cell adds a column to
# df_pos / df_neg, and writing into a bare .loc slice of df would raise
# pandas' SettingWithCopyWarning.
df_pos = df.loc[df['isDefault'] == 1].copy()
df_neg = df.loc[df['isDefault'] == 0].copy()

# Class sizes (rows, columns) -- a quick look at the class imbalance.
print(df_pos.shape)
print(df_neg.shape)

In [None]:
# Empirical distribution of purchase_amt per class: every distinct amount is
# plotted against the fraction of rows in that class taking that value.
plt.xlim(0,20000)
# Optional zoom on the low-probability region: plt.ylim(0,0.01)
plt.xlabel('purchase_amt')
plt.ylabel('Probability')

# Green = no default, red = default; plotting order matches the legend order.
for class_df, marker in ((df_neg, 'g.'), (df_pos, 'r.')):
    freq = class_df['purchase_amt'].value_counts(normalize=True).sort_index()
    xx, yy = freq.index, freq.values
    plt.plot(xx, yy, marker)

plt.legend(['No Default', 'Default'])

In [None]:
# Strip plot: purchase_amt values placed at x = isDefault, giving one
# vertical column of points per class.
plt.xlim(-1, 2)
# Optional zoom: plt.ylim(0,0.01)
plt.xlabel('isDefault')
plt.ylabel('purchase_amt')

# Green = no default, red = default; plotting order matches the legend order.
for class_df, marker in ((df_neg, 'g.'), (df_pos, 'r.')):
    xx = class_df['isDefault'].values
    yy = class_df['purchase_amt'].values
    plt.plot(xx, yy, marker)

plt.legend(['No Default', 'Default'])

In [None]:
# Empirical distribution of deposit_percent per class: every distinct value
# is plotted against the fraction of rows in that class taking that value.
# Optional zoom: plt.xlim(0,20000) / plt.ylim(0,0.01)
plt.xlabel('deposit_percent')
plt.ylabel('Probability')

# Green = no default, red = default; plotting order matches the legend order.
for class_df, marker in ((df_neg, 'g.'), (df_pos, 'r.')):
    freq = class_df['deposit_percent'].value_counts(normalize=True).sort_index()
    xx, yy = freq.index, freq.values
    plt.plot(xx, yy, marker)

plt.legend(['No Default', 'Default'])

In [None]:
# Strip plot: deposit_percent values placed at x = isDefault, giving one
# vertical column of points per class.
plt.xlim(-1, 2)
# Optional zoom: plt.ylim(0,0.01)
plt.xlabel('isDefault')
plt.ylabel('deposit_percent')

# Green = no default, red = default; plotting order matches the legend order.
for class_df, marker in ((df_neg, 'g.'), (df_pos, 'r.')):
    xx = class_df['isDefault'].values
    yy = class_df['deposit_percent'].values
    plt.plot(xx, yy, marker)

plt.legend(['No Default', 'Default'])

In [None]:
# Strip plot of a single feature against the target: values of `feature`
# placed at x = isDefault, one vertical column of points per class.
feature = 'age_of_application'

plt.xlim(-1, 2)
# Optional zoom: plt.ylim(0,1)
plt.xlabel('isDefault')
plt.ylabel(feature)

# Green = no default, red = default; plotting order matches the legend order.
for class_df, marker in ((df_neg, 'g.'), (df_pos, 'r.')):
    xx = class_df['isDefault'].values
    yy = class_df[feature].values
    plt.plot(xx, yy, marker)

plt.legend(['No Default', 'Default'])

In [None]:
# Side-by-side histograms of `feature`: no-default rows on the left axes,
# default rows on the right.
feature = 'age_of_application'
fig_size = (10,5)
figure, axes = plt.subplots(1, 2, figsize=fig_size)

# Both panels share the same fixed y-range so bar heights are comparable.
for ax, class_df in zip(axes, (df_neg, df_pos)):
    class_df[feature].plot.hist(bins=50, ax=ax)
    ax.set_ylim([0, 16000])

# The x-label doubles as the panel caption.
axes[0].set_xlabel('No Default')
axes[1].set_xlabel('Default')

In [None]:
# Default rate as a function of a single feature.
feature = 'age_of_application'

# Row-normalised crosstab: for every distinct value of `feature`, the
# percentage of rows falling in each isDefault class. The 0 / 1 class columns
# are renamed so the plot below can refer to them by name.
tmp = (
    pd.crosstab(df[feature], df['isDefault'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoDefault', 1: 'Default'})
)

# Use `feature` (not a hard-coded column name) for both the data and the
# label, so changing `feature` above keeps the plot consistent.
plt.plot(tmp[feature].values, tmp['Default'], 'r.-')
# NOTE: the x-range and tick positions below are tuned for an age axis;
# adjust them if `feature` is changed.
plt.xlim(-1, 100)
plt.xlabel(feature)
plt.ylabel('% Default')
plt.xticks((18, 30, 40, 50, 60, 70, 80, 90, 100))

In [None]:
# Side-by-side histograms of `feature`: no-default rows on the left axes,
# default rows on the right.
feature = 'purchase_amt'
n_bins = 100

fig_size = (10,5)
figure, axes = plt.subplots(1, 2, figsize=fig_size)

# Both panels get identical fixed axis ranges so they can be compared directly.
for ax, class_df in zip(axes, (df_neg, df_pos)):
    class_df[feature].plot.hist(bins=n_bins, ax=ax)
    ax.set_ylim([0, 70000])
    ax.set_xlim([0, 20000])

# The x-label doubles as the panel caption.
axes[0].set_xlabel('No Default')
axes[1].set_xlabel('Default')

In [None]:
# Attach to every row the mean age_of_application of its industry, computed
# separately within the no-default and default subsets.
# .assign() returns a new frame rather than writing a column into df_neg /
# df_pos in place; those frames are .loc slices of df, and in-place column
# assignment on such slices raises pandas' SettingWithCopyWarning.
df_neg = df_neg.assign(
    industry_name_conditioned_mean_aop=df_neg.groupby(['industry_name'])['age_of_application'].transform('mean')
)
df_pos = df_pos.assign(
    industry_name_conditioned_mean_aop=df_pos.groupby(['industry_name'])['age_of_application'].transform('mean')
)

# Tall figure: one row on the y-axis per industry name.
plt.figure(figsize=(10,20))
plt.xticks(rotation=90)

# Industry names go on the y-axis and the per-industry mean on the x-axis,
# hence plot(yy, xx). Default rows (red) are drawn first, matching the
# legend order below.
for class_df, marker in ((df_pos, 'r.'), (df_neg, 'g.')):
    xx = class_df['industry_name'].values
    yy = class_df['industry_name_conditioned_mean_aop'].values
    plt.plot(yy, xx, marker)

plt.xlabel('mean age_of_application')
plt.ylabel('industry_name')
plt.legend(['Default', 'No Default'])