<a href="https://colab.research.google.com/github/mrninainaidi/Machine-Learning-Projects/blob/master/personal_loan_default_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

* jupyter notebook theme (optional)
* package
* global variables

In [None]:
# # Optional: setup theme for Jupyter Notebook
# # comment out if running on Colab
# import jupytertheme as jt
# from jupyterthemes.stylefx import set_nb_theme

# set_nb_theme('chesterish')

In [None]:
import datetime
import gc
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

import xgboost
from xgboost import plot_importance

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

!pip install missingno
import missingno as msno

!pip install category_encoders
import category_encoders as ce

!pip install imblearn
from imblearn.over_sampling import SMOTE


# import seaborn as sns
# import altair as alt

In [None]:
# Seed passed as `random_state` to the StratifiedKFold splitter and to the
# XGBClassifier instances built inside the cross-validation loop.
RAND_STATE = 3

# Data Processing and Cleaning

## Import Data

In [None]:
# For colab
# Mount Google Drive at /content/gdrive so the CSV stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')
# Folder that holds the data file (relative path into the mounted drive).
root_path = 'gdrive/My Drive/further_study/machine_learning_projects/personal_loan_rating/'
# Raw data frame used by every cell below.
df = pd.read_csv(root_path+'default_loan_no_quotes.csv')

In [None]:
# # For others (Jupyter Notebook)
# # NOTE: This requires the data file to be saved in the same directory as this notebook.

# df = pd.read_csv('default_loan_no_quotes.csv')

In [None]:
# df.head()

In [None]:
# Normalise the column labels: spaces become underscores, then everything is
# lower-cased.
df.columns = [str.lower(label) for label in df.columns.str.replace(' ', '_')]
df.columns

## Features With Null Values

In [None]:
# Bar chart (missingno) summarising the data frame's missing values per column.
msno.bar(df)

In [None]:
# Count the nulls of every column once, keep the names of the columns that
# have at least one, and report their share and absolute count.
n_rows = df.shape[0]
null_counts = df.isna().sum()
na_names = [col for col in df.columns if null_counts[col] > 0]

for col in na_names:
    print(f'Feature: {col}, has {100 * null_counts[col] / n_rows:.3f}%  or {null_counts[col]} null values.')

na_names

## Checking Feature Types

## Drop the columns of little interest


In [None]:
# Drop the columns of little interest.
not_interested = [
    'entry_date', 'fist_installment_date',
    'id', 'deposit_amt', 'financed_amt', 'term_remaining',
    'instalment_amt', 'amt_paid_to_merchant_nettofmerchfeesandgst',
    'est_fees', 'proc_fees', 'other_fees', 'total_merchant_charges',
    'total_consumer_charges',
]

# Fail on the first listed name that is not a column of the frame.
unknown_names = [name for name in not_interested if name not in df.columns]
if unknown_names:
    raise ValueError(f'column name: {unknown_names[0]} is not valid')

df.drop(columns=not_interested, inplace=True)
# df.head()

## Cleaning the consumer post code feature

In [None]:
# Manually correct typos in the post code column. Entries that cannot be
# repaired are set to NaN so that the fill logic below handles them.
post_code_fixes = {
    '28501': '2850',
    '2166`1': '2166',
    '414': np.nan,
    'CM144WG': np.nan,
    '4Q53': '4053',
    '40/2': '4012',
    '482O': '4820',
    '500O': '5000',
    '430(': np.nan,
    '48/7': '4817',
}
for bad_code, fixed_code in post_code_fixes.items():
    # One .loc call on the frame: the chained form `df[col].loc[mask] = v`
    # is not guaranteed to write back into `df`.
    df.loc[df['consumer_post_code'] == bad_code, 'consumer_post_code'] = fixed_code

# Consumers that have at least one record without a post code.
consumerid_list = df['consumer_id'].loc[df['consumer_post_code'].isna()].values
consumerid_list = set(consumerid_list)
print(consumerid_list)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    known_codes = df.loc[is_consumer, 'consumer_post_code'].dropna().values

    if len(known_codes) == 0:
        # none of this consumer's records carries a post code
        print(f'consumer: {consumer} has no post code info')
        df.loc[is_consumer, 'consumer_post_code'] = 'unknown'
    else:
        # the first known post code is applied to every record of the consumer
        print(f'consumer: {consumer} has the following post code: {known_codes}')
        print(f'    applying post code to consumer: {consumer}')
        df.loc[is_consumer, 'consumer_post_code'] = str(int(known_codes[0]))


# Cast every entry to str and strip a trailing '.0' left over from
# float-typed entries (raw string, anchored to the end of the value).
df['consumer_post_code'] = df['consumer_post_code'].astype(str).replace(r'\.0$', '', regex=True)
print('Convertion complete.')

In [None]:
# # to check if all instances of the 'consumer_post_code' feature have been converted
# # to string-type
# for _, row in df.iterrows():
#     try: 
#         assert(isinstance(row['consumer_post_code'], str))
#     except: 
#         print(row)

## Cleaning the consumer year of birth feature

In [None]:
# Consumers that have at least one record without a date of birth.
consumerid_list = df['consumer_id'].loc[df['consumer_year_of_birth'].isna()].values
consumerid_list = set(consumerid_list)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    known_dobs = df.loc[is_consumer, 'consumer_year_of_birth'].dropna().values

    if len(known_dobs) == 0:
        # no record of this consumer has a DoB: use the placeholder date
        print(f'consumer: {consumer} has no DoB info')
        # .loc on the frame (not chained) so the write reaches `df`
        df.loc[is_consumer, 'consumer_year_of_birth'] = '99/99/9999'
    else:
        # the first known DoB is applied to every record of the consumer
        print(f'consumer: {consumer} has the following DoBs: {known_dobs}')
        print(f'    applying DoB to consumer: {consumer}')
        df.loc[is_consumer, 'consumer_year_of_birth'] = str(known_dobs[0])

# Convert the str-type DoB to an int-type year of birth: the year is the
# third '/'-separated field.
df['consumer_year_of_birth'] = df['consumer_year_of_birth'].str.split('/', expand=True)[2].astype(int)
print('Convertion complete.')

In [None]:
# to check if all instances of the 'consumer_year_of_birth' feature have been converted
# to int-type or np.nan
# for _, row in df.iterrows():
#     yob = row['consumer_year_of_birth']
#     if isinstance(yob, int):
#         if yob > 1900 and yob <= 9999:
#             continue
#     else: 
#         print(f'row["consumer_year_of_birth"] = {yob}')
# print('Assertion complete.')

In [None]:
# Share of records per year of birth, ordered by year.
yob_share = df['consumer_year_of_birth'].value_counts(normalize=True).sort_index()
x = yob_share.index
y = yob_share.values

fig, ax = plt.subplots()
ax.set_xlim(1900, 2000)
ax.set_title('Consumer Age Distribution')
ax.set_xlabel('Year of Birth')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')


## Converting application_date feature to application_month and application_year

In [None]:
# Split the str-type application date on '/' once; field 2 becomes the
# int-type application year and field 1 the int-type application month.
application_date_parts = df['application_date'].str.split('/', expand=True)
df['application_year'] = application_date_parts[2].astype(int)
df['application_month'] = application_date_parts[1].astype(int)


## Converting recent_default_default_date to recent_default_year and recent_default_month

In [None]:
# A missing default date becomes the placeholder '00/00/0000', so the derived
# year and month are 0 for records without a recent default date.
df['recent_default_default_date'] = df['recent_default_default_date'].fillna('00/00/0000')
default_date_parts = df['recent_default_default_date'].str.split('/', expand=True)
df['recent_default_year'] = default_date_parts[2].astype(int)
df['recent_default_month'] = default_date_parts[1].astype(int)
# df['recent_default_month'].value_counts()

In [None]:
# Share of records per month of the most recent default, ordered by month.
month_share = df['recent_default_month'].value_counts(normalize=True).sort_index()
x = month_share.index
y = month_share.values

fig, ax = plt.subplots()
ax.set_xlim(1, 12)
ax.set_ylim(0, 0.02)

ax.set_title('Recent default month distribution')
ax.set_xlabel('Month of default')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*-')

## Adding age_of_application feature (integer)

In [None]:
# Age (in whole years) of the consumer in the year of application.
df['age_of_application'] = df['application_year'] - df['consumer_year_of_birth']

# Use this "age of application" to validate the "consumer year of birth":
# if it is below 18, the minimum legal age for having a credit account,
# the "consumer year of birth" entry must be faulty.
is_underage = df['age_of_application'] < 18

# Flag the faulty records. The assignments go through df.loc because the
# chained form `df[col].loc[mask] = v` is not guaranteed to write into `df`.
df.loc[is_underage, 'consumer_year_of_birth'] = 9999
df.loc[is_underage, 'age_of_application'] = -1

In [None]:
# Remove the records whose age_of_application carries the value -1.
flagged_rows = df.index[df['age_of_application'] == -1]
df.drop(index=flagged_rows, inplace=True)

In [None]:
# Share of records per age at application, ordered by age.
age_share = df['age_of_application'].value_counts(normalize=True).sort_index()
x = age_share.index
y = age_share.values

fig, ax = plt.subplots()
ax.set_xlim(-1, 100)
ax.set_title('Consumer Age of Application')
ax.set_xlabel('Consumer Age')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')

## Adding age_of_recent_default feature (integer)

In [None]:
# Year of the most recent default minus the consumer's year of birth.
df['age_of_recent_default'] = df['recent_default_year'].sub(df['consumer_year_of_birth'])
# df['age_of_recent_default'].value_counts(normalize=True)

In [None]:
# recent default should not happen before the year of application
# the age of application has to be > 18 for age of recent default to be effective

# Invalid entries of recent_default_default_date / consumer_year_of_birth.
# The mask is evaluated once, before any of the columns it reads is changed,
# and every write goes through df.loc so that it reaches `df` (the chained
# form `df[col].loc[mask] = v` is not guaranteed to).
invalid_default = (df['age_of_recent_default'] < df['age_of_application']) \
                  | (df['age_of_application'] < 18)
df.loc[invalid_default, 'age_of_recent_default'] = -1
df.loc[invalid_default, 'recent_default_year'] = 0
df.loc[invalid_default, 'recent_default_month'] = 0

# Absent recent_default_default_date (year == 0).
# NOTE(review): the invalid records above now also have year == 0, so their
# age_of_recent_default marker of -1 is overwritten with 0 here. This matches
# the original cell's outcome; confirm that it is intended.
no_default_date = df['recent_default_year'] == 0
df.loc[no_default_date, 'age_of_recent_default'] = 0
df.loc[no_default_date, 'recent_default_month'] = 0

print('Convertion complete.')

In [None]:
# Share of records per age at the most recent default, ordered by age.
default_age_share = df['age_of_recent_default'].value_counts(normalize=True).sort_index()
x = default_age_share.index
y = default_age_share.values

fig, ax = plt.subplots()
ax.set_xlim(20, 100)
ax.set_ylim(0, 0.01)
ax.set_title('Consumer Age of Recent Default')
ax.set_xlabel('Consumer Age')
ax.set_ylabel('Probability')
ax.plot(x, y, 'g*')

## Cleaning product feature

In [None]:
# Missing product names are labelled 'unknown'.
df['product'] = df['product'].fillna('unknown')

In [None]:
# Relative frequency of each product, most frequent first.
product_share = df['product'].value_counts(normalize=True)
x = product_share.index
y = product_share.values

## Shorten the features with heavy tails

* 'product'
* 'merchant_name'
* 'merchant_number'

In [None]:

def convert_tails_to_others(dataframe, feature, fracToConvert):
    """Relabel the rarest categories of ``feature`` as 'others', in place.

    Categories are ranked by relative frequency (most frequent first) and
    kept until the cumulative share of the kept categories reaches
    ``1 - fracToConvert``; every remaining category is replaced by the string
    'others'. Missing values are not counted and are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that is modified in place.
    feature : str
        Name of the categorical column to shorten.
    fracToConvert : float
        Approximate share of the records (the tail of the distribution) whose
        category is merged into 'others'.

    Returns
    -------
    None
    """
    # relative frequency per category, sorted from most to least frequent
    freqs = dataframe[feature].value_counts(normalize=True)

    threshold = 1 - fracToConvert

    # Cumulative share of the categories ranked *before* each category. A
    # category is kept while that share is still below the threshold, i.e.
    # the category that crosses the threshold is itself kept.
    share_before = freqs.cumsum().shift(1, fill_value=0.0)
    tail_categories = freqs.index[(share_before >= threshold).to_numpy()]

    # Single .loc assignment on the frame: the chained form
    # `dataframe[feature].loc[mask] = ...` is not guaranteed to write back.
    dataframe.loc[dataframe[feature].isin(tail_categories), feature] = 'others'
    # print(dataframe[feature].value_counts(normalize=True))
    # print()

In [None]:
# Share of the records to merge into 'others', per heavy-tailed feature.
frac_dict = {'product':0.08, 'merchant_name':0.05, 'merchant_number':0.05}
col_names = list(frac_dict)

for name, tail_frac in frac_dict.items():
    convert_tails_to_others(df, name, tail_frac)

## Cleaning total_balance_outstanding feature

In [None]:
# Missing balances become '0.0', the commas are stripped from the remaining
# strings, and the column is cast to float.
df['total_balance_outstanding'] = (
    df['total_balance_outstanding']
    .fillna('0.0')
    .replace(',', '', regex=True)
    .astype(float)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['total_balance_outstanding'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Cleaning recent_default_default_amt feature

In [None]:
# Missing default amounts become '0.0', the commas are stripped from the
# remaining strings, and the column is cast to float.
df['recent_default_default_amt'] = (
    df['recent_default_default_amt']
    .fillna('0.0')
    .replace(',', '', regex=True)
    .astype(float)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['recent_default_default_amt'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Adding term_run_frac feature
representing the fraction of terms that have been fulfilled. 

In [None]:
# Fraction of the contract's terms that have already run.
df['term_run_frac'] = df['term_run'].div(df['total_term'])

## Adding total_month feature

In [None]:
# total_month starts as a copy of total_term; for records whose freq is 'FN'
# the term count is halved.
# NOTE(review): 'FN' presumably means fortnightly payments — confirm.
is_fn = df['freq'] == 'FN'

total_month = df['total_term'].copy()
total_month.loc[is_fn] = df.loc[is_fn, 'total_term'] / 2

df['total_month'] = total_month
del total_month

## Adding conditional mean/std features

In [None]:
# For "age_of_application" and "purchase_amt", divide each record's value by
# the mean and by the standard deviation of that value within the record's
# group, for each of the grouping columns below. The new columns are named
# <value tag>_<group tag>_mean and <value tag>_<group tag>_stdev.
value_columns = {
    'aop': 'age_of_application',
    'pAmt': 'purchase_amt',
}
group_columns = {
    'indName': 'industry_name',
    'pmtTp': 'payment_type',
    'fq': 'freq',
    'hoId': 'homowner_ind',
    'hoCon': 'homowner_consumer',
}

for value_tag, value_col in value_columns.items():
    for group_tag, group_col in group_columns.items():
        grouped = df.groupby([group_col])[value_col]
        df[f'{value_tag}_{group_tag}_mean'] = df[value_col] / grouped.transform('mean')
        df[f'{value_tag}_{group_tag}_stdev'] = df[value_col] / grouped.transform('std')

In [None]:
# Remove the records for which aop_indName_stdev could not be computed (NaN).
rows_without_stdev = df.index[df['aop_indName_stdev'].isna()]
df.drop(index=rows_without_stdev, inplace=True)

In [None]:
# Names of the conditional features created above.
cond_names = [
    'aop_indName_mean', 'aop_indName_stdev',
    'aop_pmtTp_mean', 'aop_pmtTp_stdev',
    'aop_fq_mean', 'aop_fq_stdev',
    'aop_hoId_mean', 'aop_hoId_stdev',
    'aop_hoCon_mean', 'aop_hoCon_stdev',
    'pAmt_indName_mean', 'pAmt_indName_stdev',
    'pAmt_pmtTp_mean', 'pAmt_pmtTp_stdev',
    'pAmt_fq_mean', 'pAmt_fq_stdev',
    'pAmt_hoId_mean', 'pAmt_hoId_stdev',
    'pAmt_hoCon_mean', 'pAmt_hoCon_stdev',
]

# Copy just those columns and chart their missing values.
df_tmp = df[cond_names].copy()

# df_tmp.head()
msno.bar(df_tmp)

## Define ground truth

In [None]:
# df_recent = df[[col for col in df.columns if 'recent' in col]]
# df_recent['defaultdate'] = df['defaultdate']
# df_recent['consumer_id'] = df['consumer_id']
# df_recent['defaultamount'] = df['defaultamount']
# df_recent['contract_number'] = df['contract_number']
# df_recent['contract_status'] = df['contract_status']
# df_recent['expected_contract_end_date'] = df['expected_contract_end_date']

In [None]:
# for index, row in df_recent.iterrows():
#     if row['recent_default_default_amt'] == 0:
#         if isinstance(row['defaultdate'], str):
#             print(row)
#             print()      

**Test outcome**

* When "recent_default_year" == 0, there are FIVE instances where "recent_default_default_amt" != 0, and all FIVE are marked as DEFAULT by "contract_status".


* When "recent_default_default_amt" == 0, there are TWO instances where "recent_default_year" != 0, and both are marked as PAIDINFULL by "contract_status".

    ==> both "recent_default_year" and "recent_default_default_amt" == 0 means NoDefault

**Suspect bad columns:**

Assuming 'recent_default_default_amt' is the indicator for the ground truth...

* 'defaultdate'

* 'defaultamount'

* 'total_balance_outstanding'

* 'recent_default_default_date' ==> 'recent_default_year' ==> 'age_of_recent_default'


**Question:**

Should I go back and realign 'recent_default_year' and 'age_of_recent_default' with the assumed ground truth?


In [None]:
# Ground truth according to the analysis above: a record counts as a default
# when its recent default amount is positive.
df['isDefault'] = df['recent_default_default_amt'].gt(0)

## Finalising data processing and digitising categorical features

In [None]:
# List every column available at this point, before the training features are picked.
print(df.columns)

### Collect numeric features

In [None]:
# Numeric features that go into the training frame unchanged.
num_names = ['purchase_amt','deposit_percent', 'age_of_application',
             'gtee_rate','term_run_frac', 'total_month']

# num_names = num_names + cond_names

# Fail on the first listed name that is not a column of the frame.
unknown_names = [name for name in num_names if name not in df.columns]
if unknown_names:
    raise ValueError(f'column name: {unknown_names[0]} is not valid')

# Start the training frame from a copy of the numeric columns.
df_train = df[num_names].copy()

df_train.head(10)

### Digitise categorical features

In [None]:
def ordered_labels(df, col, order):
    """Replace column ``col`` of ``df`` by integer codes that follow ``order``.

    The column is converted to an ordered categorical whose categories are
    arranged as given by ``order``; each value is then replaced, in place, by
    the position of its category in that ordering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten.
    col : str
        Name of the column to encode.
    order : list-like
        The column's categories in the desired order.
    """
    as_ordered = df[col].astype('category').cat.reorder_categories(order, ordered=True)
    df[col] = as_ordered.cat.codes.astype(int)

In [None]:
# Split the categorical features into three groups, each with its own encoding.

# ordered: encoded as integer codes that follow the sorted distinct values
ordered_features = ['application_year', 'application_month']
ordered_dict = {
    elm: df[elm].value_counts().sort_index().index
    for elm in ordered_features
}

# one-hot like: encoded with category_encoders' BinaryEncoder
oneHot_features = ['product', 'consumer_post_code', 'industry_name']

# binary: label-encoded on their string representation
nominated_features = ['payment_type', 'freq', 'homowner_ind', 'homowner_consumer', 'isDefault']

cate_names = ordered_features + nominated_features + oneHot_features

# initialise encoder
le = LabelEncoder()

# start encoding...
for col in cate_names:
    df_train[col] = df[col].copy()

    if col in ordered_features:
        ordered_labels(df_train, col, ordered_dict[col])
    elif col in oneHot_features:
        # append the encoder's output columns, then remove the raw column
        encoder = ce.BinaryEncoder(cols=[col])
        df_train = pd.concat([df_train, encoder.fit_transform(df_train[col])], axis=1)
        df_train.drop(columns=[col], inplace=True)
    else:
        # convert to str once and fit + transform in a single call
        df_train[col] = le.fit_transform(list(df_train[col].astype(str).values))

df_train.head(10)

In [None]:
# (rows, columns) of the training frame after encoding.
df_train.shape

### Rescale everything to (0, 1)

In [None]:
# Min-max scale every column of the training frame (the encoded ones and
# 'isDefault' included) and rebuild the frame with the same column labels.
col_names = df_train.columns
# col_names = num_names + ordered_features

min_max_scaler = MinMaxScaler()
df_train = pd.DataFrame(
    min_max_scaler.fit_transform(df_train.values),
    columns=col_names,
)

df_train.describe()

### Separate input and output

In [None]:
# expected output: the 'isDefault' column
y = df_train['isDefault']

# model input: every other column
X = df_train.loc[:, df_train.columns != 'isDefault']

for part in (X, y):
    print(part.shape)

### Re-sampling with SMOTE

In [None]:
# # initialise SMOTE sampling
# sm = SMOTE(random_state=RAND_STATE)

# # resample the training set
# input, target = sm.fit_sample(X, y.ravel())

In [None]:
# df_tmp = pd.DataFrame(input, columns=X.columns)
# df_tmp['isDefault'] = target
# df_tmp.head()

In [None]:
# df_train = df_tmp
# del df_tmp
# df_train.shape

In [None]:
# # Re-split input/output

# # model input
# X = df_train.drop(columns=['isDefault'])
# # expected output
# y = df_train['isDefault']

# print(X.shape)
# print(y.shape)

# Model Training and Validating

## HyperOpt function and parameters space definition

In [None]:
def objective(params):
    '''
    Objective function for hyperopt's fmin().

    Trains an XGBClassifier with the given parameters on every fold of a
    stratified 6-fold split of the global X / y, prints the per-fold ROC AUC
    and accuracy plus the mean confusion matrix, and returns the NEGATIVE
    mean ROC AUC, so that fmin() can be used to find the parameters that
    maximise the mean score.

    Parameters
    ----------
    params : dict
        One sample of the search space. It must contain the keys 'max_depth',
        'gamma', 'subsample', 'reg_alpha', 'reg_lambda', 'learning_rate',
        'num_leaves', 'colsample_bytree', 'min_child_samples',
        'feature_fraction' and 'bagging_fraction'.

    Returns
    -------
    float
        -(mean ROC AUC over the folds).
    '''
    time1 = time.time()

    # max_depth is cast to int; every other value is formatted to a fixed
    # number of decimals and is therefore handed to the classifier as a string.
    # NOTE(review): 'num_leaves', 'min_child_samples', 'feature_fraction' and
    # 'bagging_fraction' look like LightGBM parameter names; confirm that
    # XGBClassifier actually uses them.
    params = {
        'max_depth'         : int(params['max_depth']),
        'gamma'             : '{:.3f}'.format(params['gamma']),
        'subsample'         : '{:.2f}'.format(params['subsample']),
        'reg_alpha'         : '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda'        : '{:.3f}'.format(params['reg_lambda']),
        'learning_rate'     : '{:.3f}'.format(params['learning_rate']),
        'num_leaves'        : '{:.3f}'.format(params['num_leaves']),
        'colsample_bytree'  : '{:.3f}'.format(params['colsample_bytree']),
        'min_child_samples' : '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction'  : '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction'  : '{:.3f}'.format(params['bagging_fraction'])
    }

    df_toprint = pd.DataFrame(params, index=[0])

    print('\n############## New Run ################')
    print(f"params = {df_toprint.transpose()}")

    # total number of cross-validation folds
    FOLDS = 6

    # stratified, shuffled K-fold splitter
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RAND_STATE)

    # running sums of the per-fold scores and the per-fold confusion matrices
    score_mean = 0
    score_acc_mean = 0
    conf_mtx = []

    print(f'Training set shape: {X.shape}')

    print('\nCV - scores: ')

    # Start the training and cross-validation loop
    for counter, (t_idx, v_idx) in enumerate(skf.split(X, y), start=1):

        # instantiate the XGB classifier
        clf = xgboost.XGBClassifier(
            n_estimators = 200, #600,
            random_state = RAND_STATE,
            verbose = True,
            tree_method = 'hist', #'gpu_hist'
            **params
        )

        # training / validation rows of this fold
        X_t, X_v = X.iloc[t_idx, :], X.iloc[v_idx, :]
        y_t, y_v = y.iloc[t_idx], y.iloc[v_idx]

        # Model training
        clf.fit(X_t, y_t)

        # Validation scores of the fitted model. The metrics are called
        # directly: ROC AUC on the predicted probability of the second class,
        # accuracy on the predicted labels.
        y_pred = clf.predict(X_v)
        score = roc_auc_score(y_v, clf.predict_proba(X_v)[:, 1])
        score_mean += score

        score_acc = accuracy_score(y_v, y_pred)
        score_acc_mean += score_acc

        print(f'    {counter} :auc = {round(score, 4)}; acc = {round(score_acc, 4)}', end = " ")

        # record this fold's confusion matrix as [tn, fp, fn, tp]
        tn, fp, fn, tp = confusion_matrix(y_v, y_pred).ravel()
        conf_mtx.append([tn, fp, fn, tp])

    # record the time elapsed
    time2 = time.time() - time1

    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect() # garbage collection
    print(f'\nMean ROC_AUC : {round((score_mean / FOLDS), 4)}')
    print(f'Mean ACCURACY: {round((score_acc_mean / FOLDS), 4)}')

    # Mean confusion matrix. zip(*conf_mtx) transposes the per-fold rows, so
    # each iteration sees one cell (TN, FP, FN or TP) across all folds.
    print('\nConfusion Matrix: ')
    total = 0
    name_list = ['TN', 'FP', 'FN', 'TP']
    for cell_name, fold_values in zip(name_list, zip(*conf_mtx)):
        mean_count = round((np.mean(fold_values)), 0)
        print(f'Mean {cell_name} = {mean_count}')
        total += mean_count
    print(f'Total instance = {total}, size of validation set = {X_v.shape[0]}. ')

    return -(score_mean / FOLDS)

In [None]:
# Initial guess of the objective function parameters.

# space = {
#     'max_depth': hp.quniform('max_depth', 3, 18, 1),    
#     'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
#     'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
#     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
#     'gamma': hp.uniform('gamma', 0.01, .7),
#     'num_leaves': hp.choice('num_leaves', list(range(8, 100, 2))),
#     'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
#     'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
#     'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
#     'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
# }

# dummy space: every parameter is an hp.choice over exactly one value
fixed_values = {
    'max_depth': 10,
    'reg_alpha': 0.2,
    'reg_lambda': 0.2,
    'learning_rate': 0.15,
    'colsample_bytree': 0.6,
    'gamma': 0.4,
    'num_leaves': 32,
    'min_child_samples': 100,
    'subsample': 0.7,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
}
space = {name: hp.choice(name, [value]) for name, value in fixed_values.items()}

## Running the optimiser to train the model and obtain the best parameter combination

In [None]:
# Run the optimiser over the search space with the TPE algorithm
# (a single evaluation).
best = fmin(objective, space, algo=tpe.suggest, max_evals=1)

# Translate the optimiser's result back into parameter values and print them.
best_params = space_eval(space, best)

print("BEST PARAMS: ", best_params)

# make sure 'max_depth' is an integer value
best_params.update(max_depth=int(best_params['max_depth']))

# Post Processing -- Feature Importance Summary

## Train a model with the best parameter set

In [None]:
# Final classifier: 300 trees, histogram tree method, best parameter set.
clf = xgboost.XGBClassifier(tree_method='hist', n_estimators=300, **best_params)

# Fit on the complete data set (no hold-out).
clf.fit(X, y)

## Feature importance map

In [None]:
# Feature importance ratings of the fitted booster (importance type 'weight').
feature_importance = clf.get_booster().get_score(importance_type='weight')

# Put the ratings into a one-column data frame, highest score first.
df_featImp = (
    pd.Series(feature_importance, name='score')
    .to_frame()
    .sort_values(by='score', ascending=False)
)

ax = df_featImp.plot(kind='barh', figsize=(5,10))

In [None]:
# The 20 features with the highest importance score.
df_featImp.head(20)

In [None]:
# terminates the current run
raise SystemExit('Run Terminated.') 

# Training Results

In [None]:
'''
0. Original model
================================================================================
Mean ROC_AUC : 0.6394
Mean ACCURACY: 0.8534

purchase_amt	        20438
age_of_application	    17522
deposit_percent	        17116
gtee_rate	            15754
application_month	    11291
total_month	            5404
consumer_post_code_6	3355
consumer_post_code_10	3296
consumer_post_code_9	3217
application_year	    3181
consumer_post_code_11	3125
consumer_post_code_4	3088
consumer_post_code_5	3042
consumer_post_code_12	3025
consumer_post_code_8	2949
consumer_post_code_7	2934
consumer_post_code_3	2724
product_12	            2714
payment_type	        2603
product_7	            2580


1. SMOTE re-sampling
================================================================================
Mean ROC_AUC : 0.9429
Mean ACCURACY: 0.9141

purchase_amt	        13607
deposit_percent	        11704
age_of_application	    11437
gtee_rate	            10769
application_month	    7311
total_month	            4150
application_year	    2442
consumer_post_code_10	2408
consumer_post_code_6	2292
consumer_post_code_9	2277
consumer_post_code_12	2204
consumer_post_code_11	2178
consumer_post_code_7	2163
consumer_post_code_4	2157
consumer_post_code_8	2109
consumer_post_code_5	2096
payment_type	        1914
consumer_post_code_3	1911
product_12	            1714
product_7	            1712


2. Additional features
================================================================================
Mean ROC_AUC : 0.6382
Mean ACCURACY: 0.8533

deposit_percent	        8545
gtee_rate	            8387
pAmt_indName_mean	    6506
pAmt_indName_stdev	    6334
aop_indName_mean	    6032
application_month	    5825
aop_indName_stdev	    5271
pAmt_fq_mean	        4901
aop_pmtTp_mean	        4129
aop_fq_mean	            4012
pAmt_pmtTp_stdev	    3978
pAmt_fq_stdev	        3942
aop_pmtTp_stdev	        3652
purchase_amt	        3597
pAmt_hoId_mean	        3295
pAmt_pmtTp_mean	        3180
pAmt_hoCon_mean	        2719
age_of_application	    2236
pAmt_hoId_stdev	        2185
total_month	            2180


3. 1 + 2
================================================================================
Mean ROC_AUC : 0.9427
Mean ACCURACY: 0.9135

deposit_percent	        6336
gtee_rate	            6306
pAmt_indName_mean	    4779
pAmt_indName_stdev	    4549
aop_indName_mean	    4342
application_month	    4104
aop_indName_stdev	    3835
pAmt_fq_mean	        3466
aop_pmtTp_mean	        3162
aop_fq_mean	            2869
pAmt_pmtTp_stdev	    2858
pAmt_fq_stdev	        2757
purchase_amt	        2551
aop_pmtTp_stdev	        2528
pAmt_hoId_mean	        2326
total_month	            2323
pAmt_pmtTp_mean	        2174
age_of_application	    1989
pAmt_hoCon_mean	        1890
pAmt_hoCon_stdev	    1764



'''

In [None]:
'''
BEST PARAMS:  {'bagging_fraction': 0.568470595063193, 'colsample_bytree': 0.8693029173059938, 'feature_fraction': 0.6788843180864463, 'gamma': 0.28179774192830787, 'learning_rate': 0.046935737548640236, 'max_depth': 18.0, 'min_child_samples': 160, 'num_leaves': 80, 'reg_alpha': 0.15556590187576974, 'reg_lambda': 0.30452242043564615, 'subsample': 0.8}

Mean ROC_AUC : 0.9473
Mean ACCURACY: 0.9145

purchase_amt	        98014
gtee_rate	            95500
pAmt_indName_mean	    93867
deposit_percent	        93477
pAmt_indName_stdev	    82348
application_month	    77532
aop_indName_mean	    72501
aop_indName_stdev	    70291
age_of_application	    70053
total_month	            33514
consumer_post_code_12	23304
consumer_post_code_11	22748
consumer_post_code_6	22102
consumer_post_code_9	21838
consumer_post_code_8	21665
consumer_post_code_7	21612
consumer_post_code_10	21418
consumer_post_code_5	20726
consumer_post_code_4	18783
consumer_post_code_3	17790
'''

In [None]:
'''
Original parameter range
For Trials 1 to 3.
---------------------------

space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    # Increasing this value will make the model more conservative. 
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    
    # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly sampled for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # More leaves increase accuracy, but may lead to overfitting.
    # num_leaves: the number of leaf nodes to use. Having a large number 
    # of leaves will improve accuracy, but will also lead to overfitting.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    
    # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
    # randomly select a fraction of the features.
    # feature_fraction: controls the subsampling of features used
    # for training (as opposed to subsampling the actual training data in 
    # the case of bagging). Smaller fractions reduce overfitting.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}
'''
print()

In [None]:
# First trial
'''
Features Considered:
--------------------
num_names = ['purchase_amt','deposit_percent',\
             'gtee_rate','term_run_frac']
             
cate_names = ['product', 'contract_status','total_term',\
              'payment_type', 'freq', 'consumer_post_code', 'merchant_number',\
              'merchant_name', 'industry_name', 'homowner_ind',\
              'homowner_consumer', 'recent_default_year', 'recent_default_month',\
              'age_of_application', 'age_of_recent_default','application_year',\
              'application_month', 'isDefault']
              
Training Results: (first round)
-----------------
############## New Run ################
params = {'max_depth': 12, 'gamma': '0.178', 'subsample': '0.90', 'reg_alpha': '0.135', 'reg_lambda': '0.199', 'learning_rate': '0.146', 'num_leaves': '200.000', 'colsample_bytree': '0.689', 'min_child_samples': '100.000', 'feature_fraction': '0.699', 'bagging_fraction': '0.777'}
1 CV - score: 1.0                                     
2 CV - score: 1.0                                     
3 CV - score: 1.0                                     
4 CV - score: 1.0                                     
5 CV - score: 1.0                                     
6 CV - score: 1.0                                     
Total Time Run: 2.93                                  
Mean ROC_AUC: 0.999993745011824  

'''


# Second trial
'''
Features Considered:
--------------------
The following features have been removed; all other features remain the same as in the first trial:
'recent_default_year'
'recent_default_month'
'age_of_recent_default'

These features were removed because they are only observed after a default has occurred ("post-default").

Training Results: (best round, not finished)
-----------------
############## New Run ################
params = {'max_depth': 9, 'gamma': '0.487', 'subsample': '0.20', 'reg_alpha': '0.274', 'reg_lambda': '0.125', 'learning_rate': '0.018', 'num_leaves': '190.000', 'colsample_bytree': '0.516', 'min_child_samples': '200.000', 'feature_fraction': '0.559', 'bagging_fraction': '0.642'}
1 CV - score: 0.742
2 CV - score: 0.7493
3 CV - score: 0.753
4 CV - score: 0.7566
5 CV - score: 0.7536
6 CV - score: 0.7513
Total Time Run: 1.91
Mean ROC_AUC: 0.7509450477943139
'''


# Third trial
'''
Features Considered:
--------------------
The following feature has been removed; all other features remain the same as in the second trial:
'contract_status'


Training Results: (best, not finished)
-----------------
############## New Run ################
params = {'max_depth': 10, 'gamma': '0.381', 'subsample': '0.90', 'reg_alpha': '0.264', 'reg_lambda': '0.263', 'learning_rate': '0.048', 'num_leaves': '70.000', 'colsample_bytree': '0.626', 'min_child_samples': '200.000', 'feature_fraction': '0.669', 'bagging_fraction': '0.710'}
1 CV - score: 0.6704
2 CV - score: 0.6716
3 CV - score: 0.6732
4 CV - score: 0.6747
5 CV - score: 0.6759
6 CV - score: 0.6718
Total Time Run: 2.45
Mean ROC_AUC: 0.6729110842886276
'''
print()

In [None]:
# 4th Trial
'''
Parameter Range: 
------------------
space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1),    
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}
'''

'''
Features Considered: 
--------------------
Same as 3rd Trial

Training Results: (best, finished)
-----------------
params =                          0
max_depth                6
gamma                0.368
subsample             0.60
reg_alpha            0.270
reg_lambda           0.374
learning_rate        0.076
num_leaves          20.000
colsample_bytree     0.680
min_child_samples  170.000
feature_fraction     0.497
bagging_fraction     0.806

CV - scores: 
    1 : 0.6751;
    2 : 0.674;
    3 : 0.674;
    4 : 0.6774;
    5 : 0.676;
    6 : 0.6731;
Total Time Run: 1.65
Mean ROC_AUC: 0.6749405834057569
'''

'''
Feature Importance: 
-------------------
consumer_post_code	    2400
purchase_amt	        2118
deposit_percent	        1754
age_of_application	    1634
gtee_rate	            1486
merchant_name	        1444
merchant_number	        1406
product	                1383
application_month	    956
industry_name	        871
total_term	            361
payment_type	        328
application_year	    273
freq	                261
homowner_consumer	    125
homowner_ind	        49
term_run_frac	        44
'''
print()

In [None]:
# 5th Trial
'''
Parameter Range: 
------------------
Based on the observation from Trial-4:
* In this attempt, reduce the range of number of leaves to (8, 32)
* Push the upper limit of max_depth to enclose the total number of features (17)
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1),    
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', list(range(8, 32, 1))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}
'''


'''
Features Considered: 
--------------------
Same as 3rd Trial

Training Results: (best, finished)
-----------------
params =                          0
max_depth                5
gamma                0.602
subsample             0.90
reg_alpha            0.289
reg_lambda           0.084
learning_rate        0.173
num_leaves          31.000
colsample_bytree     0.483
min_child_samples  210.000
feature_fraction     0.494
bagging_fraction     0.539

CV - scores: 
    1 : 0.6749;
    2 : 0.6751;
    3 : 0.674;
    4 : 0.6769;
    5 : 0.6771;
    6 : 0.6753;
Total Time Run: 1.52
Mean ROC_AUC: 0.6755379631187354
'''

'''
Feature Importance: 
-------------------
consumer_post_code	    1232
purchase_amt	        1006
deposit_percent	        978
merchant_name	        792
product	                784
age_of_application	    764
merchant_number	        730
gtee_rate	            705
industry_name	        450
application_month	    434
total_term	            198
payment_type	        157
application_year	    140
freq	                132
homowner_consumer	    69
homowner_ind	        37
term_run_frac	        35
'''
print()

In [None]:
# 6th Trial
'''
Parameter Range: 
------------------
Use the parameter range from Trial-5 with the range of num_leaves changed to (8, 100, 2)
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1),    
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', list(range(8, 100, 2))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}
'''

'''
Features Considered: 
-----------------------
remove feature "merchant_name"
# remove merchant_name and merchant_number


Training Results: (best, finished)
-----------------
params =                          0
max_depth                7
gamma                0.401
subsample             0.90
reg_alpha            0.303
reg_lambda           0.099
learning_rate        0.116
num_leaves          94.000
colsample_bytree     0.629
min_child_samples  210.000
feature_fraction     0.702
bagging_fraction     0.821

CV - scores: 
    1 : 0.6715;
    2 : 0.673;
    3 : 0.6723;
    4 : 0.6759;
    5 : 0.6768;
    6 : 0.6705;
Total Time Run: 1.93
Mean ROC_AUC: 0.6733360373745697
'''

'''
Feature Importance: 
-------------------
consumer_post_code	    4730
purchase_amt	        4158
deposit_percent	        3132
age_of_application	    3112
product	                2788
merchant_number	        2766
gtee_rate	            2706
application_month	    1884
industry_name	        1493
total_term	            676
application_year	    575
payment_type	        524
freq	                523
homowner_consumer	    173
homowner_ind	        118
term_run_frac	        61
'''

print()

In [None]:
# 7th Trial
'''
Parameter Range: 
------------------
Use the parameter range from Trial-6
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1),    
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', list(range(8, 100, 2))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}
'''

'''
Features Considered: 
-----------------------
remove merchant_name and merchant_number


Training Results: (best, finished)
-----------------



'''

'''
Feature Importance: 
-------------------




'''

print()

# Code Dump

In [None]:
# # loop through all rows to deal with bad string entries
# # and convert all int and float entries to string
# for index, row in df.iterrows():
#     postCode = row['consumer_post_code']
#     id = row['consumer_id']
#     if isinstance(postCode, str):

#         # Dealing with post codes with incorrect length
#         if len(postCode) != 4:
#             print(f'Bad entry detected... {postCode} -- not 4-digit entry')
#             if postCode == '28501':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '2850'
#             elif postCode == '2166`1':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '2166'
#             elif postCode == '414' or postCode == 'CM144WG':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = np.nan

#         # Dealing with post codes with non-decimal elements
#         elif not postCode.strip().isdecimal():
#             print(f'Bad entry detected... {postCode} -- non-decimal entry')
#             if postCode == '4Q53':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '4053'
#             elif postCode == '40/2':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '4012'
#             elif postCode == '482O':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '4820'
#             elif postCode == '500O':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '5000'               
#             elif postCode == '430(':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = np.nan
#             elif postCode == '48/7':
#                 print(f'    consumer: {id} correcting entry')
#                 df['consumer_post_code'].iloc[index] = '4817'   

#     if isinstance(postCode, float) or isinstance(postCode, int):
#         if pd.notna(postCode):
#             df['consumer_post_code'].iloc[index] = str(int(postCode))

# # Dealing with all nan-value entries
# consumerid_list = df['consumer_id'].loc[df['consumer_post_code'].isna()].values
# consumerid_list = set(consumerid_list)
# print(consumerid_list)
# consumerid_noPost_list = list()

# for id in consumerid_list:
#     if df['consumer_post_code'].loc[df['consumer_id'] == id].isnull().values.all():
#         print(f'consumer: {id} has no post code info')
#         df['consumer_post_code'].loc[df['consumer_id'] == id] = 'unknown'
#         consumerid_noPost_list.append(id)
#     else:
#         possible_post_codes = df["consumer_post_code"].loc[df["consumer_id"] == id].values
#         possible_post_codes = possible_post_codes[pd.notna(possible_post_codes)]
#         print(f'consumer: {id} has the following post code: {possible_post_codes}')
#         print(f'    applying post code to consumer: {id}')
#         df['consumer_post_code'].loc[df['consumer_id'] == id] = str(int(possible_post_codes[0]))

# print(f'Consumers without post code: {consumerid_noPost_list}')

## Exploring other features

### Exploring arrears amount feature

In [None]:
# a = df['arrears_amount'].value_counts(dropna=False).sort_index()
# print(a)

In [None]:
# df_tmp = df.loc[ df['arrears_amount'] == 0]
# df_tmp['contract_status'].value_counts(normalize=True, dropna=False)

### Exploring contract_status feature

In [None]:
# '''
# Separate the dataframe by contract status
# '''

# df_paid = df.loc[df['contract_status'] == 'PaidInFull']
# df_default = df.loc[df['contract_status'] == 'Default']
# df_active = df.loc[df['contract_status'] == 'Active']
# print(df['contract_status'].value_counts(normalize=True))

In [None]:
# # '''
# # Look at the Paid-set
# # '''
# print(df_paid['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['age_of_recent_default'].value_counts(dropna=False, normalize=False).sort_index())
# print()
# print(df_paid['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_paid['term_run'] / df_paid['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()


In [None]:
# # '''
# # Look at the Default-set
# # '''
# print(df_default['arrears_amount'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_default['term_run'] / df_default['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

In [None]:
# # '''
# # Look at the active-set
# # '''
# print(df_active['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_active['term_run'] / df_active['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

**My Questions regarding the "contract_status" feature (ground-truth?)**

* All instances in the "PaidInFull" subset have an "arrears_amount" of 0.0, but more than 10% of the instances in this subset have a valid "age_of_recent_default", which suggests that a default did occur for these instances. All instances in this subset have a NaN value for "defaultdate".

* About 3% of the instances in the "Default" subset have an "arrears_amount" of 0.0, which indicates that a default has never occurred for these instances. 99.7% of this subset have a valid "defaultdate" entry.

* The "Active" subset behaves very similarly to the "PaidInFull" subset. All instances in this subset have a NaN value for "defaultdate".

**Options for ground-truth**

If **True** ==> has default, **False** ==> has no default: 

1. **True** = "Default" subset and **False** = "PaidInFull" + "Active" subsets

2. **True** = "arrears_amount" != 0, and **False** = "arrears_amount" == 0

3. **True** = "defaultdate" is a valid date, and **False** = "defaultdate" is NaN

**According to the observations, none of the above options fully makes sense...**

**Meeting outcome**


4. Use the combined "recent default" group of features to determine the ground truth.

    i.e. when "age_of_recent_default", "age_of_recent_default_cure" and "recent_default_amt" are all valid ==> **DEFAULT**

In [None]:
# # for instances with null DoB, check if DoB of the same consumer is provided elsewhere
# marker = '/'
# for index, row in df.iterrows():
#     # if the current instance has no DoB (null value)
#     if pd.isna(row['consumer_year_of_birth']):
#         id = row['consumer_id']
#         if df['consumer_year_of_birth'].loc[df['consumer_id'] == id].isnull().values.all():
#             print(f'consumer: {id} has no DoB info')
#             df['consumer_year_of_birth'].loc[df['consumer_id'] == id] = '99/99/9999'        
#         else:
#             possibleDoBs = df["consumer_year_of_birth"].loc[df["consumer_id"] == id].values
#             possibleDoBs = possibleDoBs[pd.notna(possibleDoBs)]
#             print(f'consumer: {id} has the following BoDs: {possibleDoBs}')
#             print(f'    applying DoB to consumer: {id}')
#             df['consumer_year_of_birth'].loc[df['consumer_id'] == id] = str(possibleDoBs[0])

# # Convert str-type DoB to int-type year of birth
# df['consumer_year_of_birth'] = df['consumer_year_of_birth'].str.split('/', expand=True)[2].astype(int)

In [None]:
# # recent default should not happen before the year of application
# # the age of application has to be >= 18 for age of recent default to be effective
# for index, row in df.iterrows():

#     # absent recent_default_default_date (recent_default_year == 0): set all derived fields to 0
#     if row['recent_default_year'] == 0: 
#         df['age_of_recent_default'].iloc[index] = int(0)
#         df['recent_default_year'].iloc[index] = int(0)
#         df['recent_default_month'].iloc[index] = int(0)
#     # invalid entry of recent_default_default_date and consumer_year_of_birth
#     elif row['age_of_recent_default'] < row['age_of_application'] or row['age_of_application'] < 18:
#         df['age_of_recent_default'].iloc[index] = int(-1)
#         df['recent_default_year'].iloc[index] = int(0)
#         df['recent_default_month'].iloc[index] = int(0)

# df['age_of_recent_default'].value_counts(normalize=False).sort_index()