<a href="https://colab.research.google.com/github/mrninainaidi/Machine-Learning-Projects/blob/master/personal_loan_default_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

* jupyter notebook theme (optional)
* package

In [None]:
# # Optional: setup theme for Jupyter Notebook
# # comment out if running on Colab
# import jupytertheme as jt
# from jupyterthemes.stylefx import set_nb_theme

# set_nb_theme('chesterish')

In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost
from xgboost import plot_importance

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

import gc
from scipy import stats
import time
import datetime
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

!pip install missingno
import missingno as msno

!pip install category_encoders
import category_encoders as ce

!pip install imblearn
from imblearn.over_sampling import SMOTE


import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
# import seaborn as sns
# import altair as alt

In [None]:
# Single seed shared by every stochastic step in the notebook.
RAND_STATE = 3

# Seed the global NumPy and TensorFlow generators as well, so steps that do not
# take an explicit random_state (weight initialisation, dataset shuffling, ...)
# are repeatable on a fresh kernel.
np.random.seed(RAND_STATE)
tf.random.set_seed(RAND_STATE)

# Data Processing and Cleaning

## Import Data

In [None]:
# Colab only: mount Google Drive and read the raw loan data from it
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/further_study/machine_learning_projects/personal_loan_rating/'
df = pd.read_csv(f'{root_path}default_loan_no_quotes.csv')

In [None]:
# # For others (Jupyter Notebook)
# # NOTE: This requires the data file to be saved in the same directory as this notebook.

# df = pd.read_csv('default_loan_no_quotes.csv')

In [None]:
df.head()

In [None]:
# Normalise the column labels: spaces become underscores, everything lower-case
df.columns = [label.replace(' ', '_').lower() for label in df.columns]
df.columns

## Features With Null Values

In [None]:
msno.bar(df)

In [None]:
# Count the nulls of every column once, then report the columns that have any
null_counts = df.isna().sum()
na_names = []

for col, n_null in null_counts.items():
    if n_null > 0:
        print(f'Feature: {col}, has {100 * n_null / len(df):.3f}%  or {n_null} null values.')
        na_names.append(col)

na_names

## Drop the columns of little interest


In [None]:
# Columns of little interest, removed before any cleaning is done
not_interested = [
    'entry_date', 'fist_installment_date',
    'id', 'deposit_amt', 'financed_amt', 'term_remaining',
    'instalment_amt', 'amt_paid_to_merchant_nettofmerchfeesandgst',
    'est_fees', 'proc_fees', 'other_fees', 'total_merchant_charges',
    'total_consumer_charges',
]

# Fail on the first listed name that is not a column of df
missing = [name for name in not_interested if name not in df.columns]
if missing:
    raise ValueError(f'column name: {missing[0]} is not valid')

df.drop(columns=not_interested, inplace=True)
# df.head()

## Cleaning the consumer post code feature

In [None]:
# Manually correct typos in the post code (np.nan marks entries that cannot be
# repaired). All writes go through df.loc[mask, column]: the chained form
# df[column].loc[mask] = value writes to a temporary Series and is silently
# ignored when pandas copy-on-write is active.
post_code_fixes = {
    '28501': '2850',
    '2166`1': '2166',
    '414': np.nan,
    'CM144WG': np.nan,
    '4Q53': '4053',
    '40/2': '4012',
    '482O': '4820',
    '500O': '5000',
    '430(': np.nan,
    '48/7': '4817',
}
for bad_code, fixed_code in post_code_fixes.items():
    df.loc[df['consumer_post_code'] == bad_code, 'consumer_post_code'] = fixed_code

# Fill missing post codes: reuse a code recorded on another row of the same
# consumer when there is one, otherwise fall back to 'unknown'
consumerid_list = set(df.loc[df['consumer_post_code'].isna(), 'consumer_id'].values)
print(consumerid_list)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    possible_post_codes = df.loc[is_consumer, 'consumer_post_code'].dropna().values

    if len(possible_post_codes) == 0:
        print(f'consumer: {consumer} has no post code info')
        df.loc[is_consumer, 'consumer_post_code'] = 'unknown'
    else:
        print(f'consumer: {consumer} has the following post code: {possible_post_codes}')
        print(f'    applying post code to consumer: {consumer}')
        df.loc[is_consumer, 'consumer_post_code'] = str(int(possible_post_codes[0]))


# make sure all int and float-type entries are cast to str; the pattern is
# anchored so that only a trailing '.0' (left by float formatting) is removed
df['consumer_post_code'] = df['consumer_post_code'].astype(str).replace(r'\.0$', '', regex=True)
print('Convertion complete.')

In [None]:
df['consumer_post_code'].value_counts()

In [None]:
# Check that every 'consumer_post_code' entry is now a str and print each row
# that is not. An explicit test replaces the former assert/bare-except pair,
# which would also have swallowed unrelated errors.
is_str = df['consumer_post_code'].map(lambda code: isinstance(code, str))
for _, row in df[~is_str].iterrows():
    print(row)

## Cleaning the consumer year of birth feature

In [None]:
# Fill missing dates of birth: reuse a DoB recorded on another row of the same
# consumer when there is one, otherwise use the placeholder '99/99/9999'.
# Writes go through df.loc[mask, column] so that they reach df itself rather
# than a temporary Series (chained assignment).
consumerid_list = set(df.loc[df['consumer_year_of_birth'].isna(), 'consumer_id'].values)

for consumer in consumerid_list:
    is_consumer = df['consumer_id'] == consumer
    possibleDoB = df.loc[is_consumer, 'consumer_year_of_birth'].dropna().values

    if len(possibleDoB) == 0:
        print(f'consumer: {consumer} has no DoB info')
        df.loc[is_consumer, 'consumer_year_of_birth'] = '99/99/9999'
    else:
        print(f'consumer: {consumer} has the following DoBs: {possibleDoB}')
        print(f'    applying DoB to consumer: {consumer}')
        df.loc[is_consumer, 'consumer_year_of_birth'] = str(possibleDoB[0])

# Convert str-type DoB to int-type year of birth (third '/'-separated field)
df['consumer_year_of_birth'] = df['consumer_year_of_birth'].str.split('/', expand=True)[2].astype(int)
print('Convertion complete.')

In [None]:
# to check if all instances of the 'consumer_year_of_birth' feature have been converted
# to int-type or np.nan
# for _, row in df.iterrows():
#     yob = row['consumer_year_of_birth']
#     if isinstance(yob, int):
#         if yob > 1900 and yob <= 9999:
#             continue
#     else: 
#         print(f'row["consumer_year_of_birth"] = {yob}')
# print('Assertion complete.')

In [None]:
# Relative frequency of each year of birth, ordered by year
birth_year_share = df['consumer_year_of_birth'].value_counts(normalize=True).sort_index()
x, y = birth_year_share.index, birth_year_share.values

plt.xlim(1900,2000)
plt.title('Consumer Age Distribution')
plt.xlabel('Year of Birth')
plt.ylabel('Probability')
plt.plot(x, y,'g*')


## Converting application_date feature to application_month and application_year

In [None]:
# Split the application date on '/' once: the third field is taken as the
# year and the second as the month
application_parts = df['application_date'].str.split('/', expand=True)
df['application_year'] = application_parts[2].astype(int)
df['application_month'] = application_parts[1].astype(int)


## Converting recent_default_default_date to recent_default_year and recent_default_month

In [None]:
# A missing default date becomes the placeholder '00/00/0000', so that year
# and month both parse to 0 for records without a recorded default
df['recent_default_default_date'] = df['recent_default_default_date'].fillna('00/00/0000')

default_parts = df['recent_default_default_date'].str.split('/', expand=True)
df['recent_default_year'] = default_parts[2].astype(int)
df['recent_default_month'] = default_parts[1].astype(int)
# df['recent_default_month'].value_counts()

In [None]:
# Relative frequency of each default month (0 = no default date recorded)
default_month_share = df['recent_default_month'].value_counts(normalize=True).sort_index()
x, y = default_month_share.index, default_month_share.values

plt.xlim(1,12)
plt.ylim(0,0.02)

plt.title('Recent default month distribution')
plt.xlabel('Month of default')
plt.ylabel('Probability')
plt.plot(x, y,'g*-')

## Adding age_of_application feature (integer)

In [None]:
df['age_of_application'] = df['application_year'] - df['consumer_year_of_birth']

# use this "age of application" to validate the "consumer year of birth"
# if "age of application" < 18, the minimum legal age of having a credit account
# the "consumer year of birth" entry must be faulty.
# The mask is computed once and applied through df.loc so that both writes
# reach df itself (chained df[col].loc[mask] = v assigns to a temporary Series).
under_age = df['age_of_application'] < 18
df.loc[under_age, 'consumer_year_of_birth'] = 9999
df.loc[under_age, 'age_of_application'] = -1

In [None]:
# Discard the records flagged with age_of_application == -1
df.drop(index=df.index[df['age_of_application'] == -1], inplace=True)

In [None]:
# Relative frequency of each age at application, ordered by age
application_age_share = df['age_of_application'].value_counts(normalize=True).sort_index()
x, y = application_age_share.index, application_age_share.values

plt.xlim(-1,100)
plt.title('Consumer Age of Application')
plt.xlabel('Consumer Age')
plt.ylabel('Probability')
plt.plot(x, y, 'g*')

## Adding age_of_recent_default feature (integer)

In [None]:
# Consumer age in the year of the most recent default. recent_default_year is 0
# for records whose default date was missing (placeholder '00/00/0000'), so the
# difference is negative for those rows until the next cell cleans it up.
df['age_of_recent_default'] = df['recent_default_year'] - df['consumer_year_of_birth']
# df['age_of_recent_default'].value_counts(normalize=True)

In [None]:
# recent default should not happen before the year of application
# the age of application has to be > 18 for age of recent default to be effective

# Both masks are frozen BEFORE anything is modified. Previously each mask was
# recomputed after the preceding write, so every invalid row (year reset to 0)
# was then also treated as "absent" and its -1 marker overwritten with 0.
no_default_date = df['recent_default_year'] == 0
invalid_default = ((df['age_of_recent_default'] < df['age_of_application'])
                   | (df['age_of_application'] < 18)) & ~no_default_date

# For invalid entry of recent_default_default_date and consumer_year_of_birth
df.loc[invalid_default, 'age_of_recent_default'] = -1
df.loc[invalid_default, ['recent_default_year', 'recent_default_month']] = 0

# For absent recent_default_default_date
df.loc[no_default_date, 'age_of_recent_default'] = 0
df.loc[no_default_date, ['recent_default_year', 'recent_default_month']] = 0

print('Convertion complete.')

In [None]:
# Relative frequency of each age at the most recent default, ordered by age
default_age_share = df['age_of_recent_default'].value_counts(normalize=True).sort_index()
x, y = default_age_share.index, default_age_share.values

plt.xlim(20,100)
plt.ylim(0,0.01)
plt.title('Consumer Age of Recent Default')
plt.xlabel('Consumer Age')
plt.ylabel('Probability')
plt.plot(x, y, 'g*')

## Cleaning product feature

In [None]:
# Missing product labels are grouped under 'unknown'
df['product'] = df['product'].fillna('unknown')

In [None]:
# shorten the tails
product_share = df['product'].value_counts(normalize=True)
x, y = product_share.index, product_share.values

## Shorten the features with heavy tails

* 'product'
* 'merchant_name'
* 'merchant_number'

In [None]:

def convert_tails_to_others(dataframe, feature, fracToConvert):
    """Collapse the rare categories of ``feature`` into the label 'others', in place.

    Categories are ranked by relative frequency (most frequent first) and kept
    for as long as the cumulative share of the categories ranked before them is
    below ``1 - fracToConvert``. Every remaining category is rewritten to
    ``'others'``. Missing values are neither counted nor rewritten.

    Args:
        dataframe: DataFrame holding the column; modified in place.
        feature: name of the categorical column to shorten.
        fracToConvert: approximate fraction of rows (0..1) allowed to fall
            into the 'others' bucket.

    Returns:
        None.
    """
    shares = dataframe[feature].value_counts(normalize=True)

    # Cumulative share of all categories ranked before each one; a category
    # belongs to the tail once that share has reached the threshold.
    threshold = 1 - fracToConvert
    share_before = shares.cumsum().shift(1, fill_value=0.0)
    drop_list = shares.index[(share_before >= threshold).to_numpy()]

    in_tail = dataframe[feature].isin(drop_list)
    if not in_tail.any():
        return

    # A numeric or categorical column cannot hold the string label, so widen it
    # explicitly rather than relying on an implicit upcast during assignment.
    column = dataframe[feature]
    if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
        dataframe[feature] = column.astype(object)

    # Assign through dataframe.loc so the write reaches the frame itself; the
    # chained form dataframe[feature].loc[...] = ... writes to a temporary.
    dataframe.loc[in_tail, feature] = 'others'

In [None]:
# Fraction of rows allowed to fall into the 'others' bucket, per feature
frac_dict = {'product':0.08, 'merchant_name':0.05, 'merchant_number':0.05}
col_names = list(frac_dict)

for name, tail_frac in frac_dict.items():
    convert_tails_to_others(df, name, tail_frac)

## Cleaning total_balance_outstanding feature

In [None]:
# Parse the amounts into floats: missing -> 0.0, thousands separators removed.
# The column is rebuilt in one expression and assigned back; it is no longer
# mutated in place through a Series pulled out of df.
df['total_balance_outstanding'] = (
    df['total_balance_outstanding']
    .fillna('0.0')
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['total_balance_outstanding'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Cleaning recent_default_default_amt feature

In [None]:
# Parse the amounts into floats: missing -> 0.0, thousands separators removed.
# The column is rebuilt in one expression and assigned back; it is no longer
# mutated in place through a Series pulled out of df.
df['recent_default_default_amt'] = (
    df['recent_default_default_amt']
    .fillna('0.0')
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
print('Convertion complete')

In [None]:
# # check if everything has been converted to float-type
# for index, value in df['recent_default_default_amt'].items():
#     if not isinstance(value, float):
#         print(f'{value} ----- {type(value)}')
# print('Assertion complete.')

## Adding term_run_frac feature
representing the fraction of terms that have been fulfilled. 

In [None]:
# Fraction of the contract's terms already run.
# NOTE(review): a total_term of 0 would produce inf/NaN here — confirm the data has none.
df['term_run_frac'] = df['term_run'] / df['total_term']

## Adding total_month feature

In [None]:
# Contract length in months: total_term is halved for contracts with
# freq == 'FN' (presumably fortnightly instalments — TODO confirm) and used
# unchanged otherwise.
# The column is built as float from the start, so halved values such as 6.5 are
# not written into an integer column (an implicit upcast pandas deprecates).
is_fn = df['freq'] == 'FN'
df['total_month'] = df['total_term'].astype(float).where(~is_fn, df['total_term'] / 2)

## Adding conditional mean/std features

In [None]:
# Conditional features: each record's value divided by the mean / standard
# deviation of that value within the record's group.
# Column names follow '<value prefix>_<group tag>_<mean|stdev>'.
value_prefixes = {
    'aop': 'age_of_application',
    'pAmt': 'purchase_amt',
}
group_tags = {
    'indName': 'industry_name',
    'pmtTp': 'payment_type',
    'fq': 'freq',
    'hoId': 'homowner_ind',
    'hoCon': 'homowner_consumer',
}

for prefix, value_col in value_prefixes.items():
    for tag, group_col in group_tags.items():
        df[f'{prefix}_{tag}_mean'] = df[value_col] / df.groupby([group_col])[value_col].transform('mean')
        df[f'{prefix}_{tag}_stdev'] = df[value_col] / df.groupby([group_col])[value_col].transform('std')

In [None]:
# Drop the rows whose 'aop_indName_stdev' could not be computed (NaN). Only this
# one conditional feature is filtered; the next cell charts the rest for gaps.
df.drop(df[df['aop_indName_stdev'].isna()].index, axis=0, inplace=True)

In [None]:
# check for NaN in the conditional features:
# names are listed in the order in which the columns were created
cond_names = [
    f'{prefix}_{tag}_{stat}'
    for prefix in ('aop', 'pAmt')
    for tag in ('indName', 'pmtTp', 'fq', 'hoId', 'hoCon')
    for stat in ('mean', 'stdev')
]

df_tmp = df[cond_names].copy()

# df_tmp.head()
msno.bar(df_tmp)

## Define ground truth

In [None]:
# df_recent = df[[col for col in df.columns if 'recent' in col]]
# df_recent['defaultdate'] = df['defaultdate']
# df_recent['consumer_id'] = df['consumer_id']
# df_recent['defaultamount'] = df['defaultamount']
# df_recent['contract_number'] = df['contract_number']
# df_recent['contract_status'] = df['contract_status']
# df_recent['expected_contract_end_date'] = df['expected_contract_end_date']

In [None]:
# for index, row in df_recent.iterrows():
#     if row['recent_default_default_amt'] == 0:
#         if isinstance(row['defaultdate'], str):
#             print(row)
#             print()      

**Test outcome**

* when "recent_default_year" == 0, there are FIVE instances that "recent_default_default_amt" != 0. And all FIVE instances are marked as DEFAULT by the "contract_status"


* when "recent_default_default_amt" == 0, there are TWO instances that "recent_default_year" != 0. And all of the TWO instances are marked as PAIDINFULL by the "contract_status".

    ==> both "recent_default_year" and "recent_default_default_amt" == 0 means NoDefault

**Suspect bad columns:**

Assuming 'recent_default_default_amt' is the indicator for the ground truth...

* 'defaultdate'

* 'defaultamount'

* 'total_balance_outstanding'

* 'recent_default_default_date' ==> 'recent_default_year' ==> 'age_of_recent_default'


**Question:**

Do I go back and realign 'recent_default_year' and 'age_of_recent_default' with the assumed ground truth?


In [None]:
# Ground truth per the analysis above: a positive recent default amount marks a default
df['isDefault'] = df['recent_default_default_amt'].gt(0)

In [None]:
# # to check
# for index, row in df.iterrows():
#     if row['recent_default_default_amt'] > 0:
#         if df['isDefault'] is False:
#             print(row)
# print('Assertion complete.')

## Finalising data cleaning

In [None]:
print(df.columns)

In [None]:
df.head(5)

### Collect the input columns of interest

In [None]:
# Numerical inputs (min-max scaled later), including the conditional features
num_cols = ['purchase_amt', 'deposit_percent', 'gtee_rate', 'term_run_frac', 'total_month'] + cond_names

# Inputs that are bucketised
buk_cols = ['age_of_application']

# Categorical inputs that are one-hot encoded
ind_cols = ['application_year', 'application_month', 'payment_type', 'freq', 'homowner_ind', 'homowner_consumer']

# Categorical inputs that are embedded
emb_cols = ['product', 'consumer_post_code', 'industry_name', 'merchant_number']

# Target
tar_cols = ['isDefault']

all_cols = num_cols + buk_cols + ind_cols + emb_cols + tar_cols

# populate df_train with an independent copy of the selected columns
df_train = df[all_cols].copy()

df_train.shape

### Digitise all **'object'** type columns

In [None]:
# object / bool columns become integer category codes; float64 is narrowed to float32
for col in df_train.columns:
    col_dtype = df_train[col].dtypes

    if col_dtype == 'object' or col_dtype == 'bool':
        df_train[col] = pd.Categorical(df_train[col]).codes
    elif col_dtype == 'float64':
        df_train[col] = df_train[col].astype(np.float32)

df_train.dtypes

### Re-sampling with SMOTE

In [None]:
# expected output
y = df_train['isDefault']

# model input: every column except the target
X = df_train.loc[:, df_train.columns != 'isDefault']

print(X.shape)
print(y.shape)

In [None]:
# initialise SMOTE sampling
sm = SMOTE(random_state=RAND_STATE)

# Over-sample the minority class. fit_resample replaces fit_sample, which was
# removed from imbalanced-learn; the target is passed as a plain array.
# The names `input` / `target` are kept because the next cell reads them
# (`input` shadows the builtin of the same name).
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows interpolated from future test rows can end up in the training
# set; consider resampling the training portion only.
input, target = sm.fit_resample(X, y.to_numpy())
print(target.mean())

In [None]:
# Put the resampled features and target back into a single frame
df_tmp = pd.DataFrame(input, columns=X.columns).assign(isDefault=target)
df_tmp.head()

In [None]:
# From here on df_train is the SMOTE-resampled frame; the temporary name is released
df_train = df_tmp
del df_tmp
df_train.shape

In [None]:
# Cast some features to int-type for encoding: categorical inputs become int32,
# numerical and bucketised inputs float32; the target column is left as it is
int_features = set(ind_cols) | set(emb_cols)
float_features = set(num_cols) | set(buk_cols)

for col in df_train.columns:
    if col in int_features:
        df_train[col] = df_train[col].astype(np.int32)
    elif col in float_features:
        df_train[col] = df_train[col].astype(np.float32)

df_train.dtypes

### Split the dataframe into train and test sets (validation folds are drawn from the training set during cross-validation)

In [None]:
# Seeded so that the split, and every score computed from it, is reproducible
train, test = train_test_split(df_train, test_size=0.2, random_state=RAND_STATE)
print(len(train), 'train examples')
print(len(test), 'test examples')

# Neural Network Model

## Input Pipeline Definition

### Feature Columns

In [None]:
'''
Utility functions definition
'''
# normalising numerical features
def get_scal(feature):
  """Build a min-max normaliser for ``feature`` from the training split.

  The minimum and range of ``train[feature]`` are read once, when this factory
  is called, and captured by the returned function.

  Args:
    feature: name of a numerical column of the module-level ``train`` frame.

  Returns:
    A function mapping ``x`` to ``(x - min) / (max - min)``. For a constant
    feature (max == min) it returns ``x - min`` instead of dividing by zero,
    which is 0 for every value seen in training.
  """
  mini = train[feature].min()
  span = train[feature].max() - mini
  if span == 0:
    # constant column: avoid 0/0 -> NaN flowing into the network
    span = 1

  def minmax(x):
    return (x - mini) / span

  return minmax

In [None]:
def _vocabulary_column(feature_name):
  # categorical column whose vocabulary is the set of codes present in df_train
  return tf.feature_column.categorical_column_with_vocabulary_list(
      feature_name, df_train[feature_name].unique())

# Numerical columns, min-max scaled
feature_columns = [
    feature_column.numeric_column(feature_name, normalizer_fn=get_scal(feature_name))
    for feature_name in num_cols
]

# Bucketized columns
# only one feature in the buk_cols ==> age of application
age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105]
for feature_name in buk_cols:
    raw_age = feature_column.numeric_column(feature_name)
    feature_columns.append(feature_column.bucketized_column(raw_age, boundaries=age_boundaries))

# Categorical indicator (one-hot) columns
feature_columns.extend(
    feature_column.indicator_column(_vocabulary_column(feature_name))
    for feature_name in ind_cols
)

# Categorical embedding columns
feature_columns.extend(
    feature_column.embedding_column(_vocabulary_column(feature_name), dimension=50)
    for feature_name in emb_cols
)

len(feature_columns)

### Create the input Pipeline

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a pandas DataFrame into a batched tf.data dataset.

  The 'isDefault' column is split off as the label; the remaining columns are
  passed as a dict of features. The caller's frame is not modified.

  Args:
    dataframe: frame containing the feature columns and 'isDefault'.
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    A dataset of (features dict, labels) batches.
  """
  features = dataframe.copy()
  labels = features.pop('isDefault')
  ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if not shuffle:
    return ds.batch(batch_size)
  return ds.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
# # Testing the input pipeline

# batch_size = 5 # A small batch sized is used for demonstration purposes
# train_ds = df_to_dataset(train, batch_size=batch_size)
# val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
# test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

# for feature_batch, label_batch in train_ds.take(1):
#   print('Every feature:', list(feature_batch.keys()))
#   print('A batch of merchant_number:', feature_batch['merchant_number'])
#   print('A batch of targets:', label_batch )

## Create & Compile the model


In [None]:
# Metrics reported by fit()/evaluate(), in this order, after the loss
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    #   tf.keras.metrics.Precision(name='precision'),
    #   tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'), 
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn')
]

def create_and_compile_nn(feature_columns, optimiser, hl1, hl1_act,  hl2=0, hl2_act=''):
    """Build and compile the binary classifier.

    Architecture: DenseFeatures -> Dense(hl1) -> [Dense(hl2)] -> Dropout(0.1)
    -> Dense(1, sigmoid).

    Args:
        feature_columns: feature columns fed to the DenseFeatures input layer.
        optimiser: optimizer passed to ``model.compile``.
        hl1: number of units of the first hidden layer.
        hl1_act: activation of the first hidden layer.
        hl2: number of units of the second hidden layer; 0 omits the layer.
        hl2_act: activation of the second hidden layer (unused when hl2 == 0).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()

    # Input-feature layer
    model.add(tf.keras.layers.DenseFeatures(feature_columns))
    model.add(tf.keras.layers.Dense(units=hl1, activation=hl1_act))

    if hl2 != 0:
        model.add(tf.keras.layers.Dense(units=hl2, activation=hl2_act))

    model.add(tf.keras.layers.Dropout(.1))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # The output layer already applies a sigmoid, so the loss must receive
    # probabilities: from_logits=True would squash the output a second time.
    # The metrics above also expect probabilities.
    model.compile(optimizer=optimiser,
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics=METRICS)
    return model

## Training & Evaluation
with

* cross validation
* hyperparameter tuning

In [None]:
def objective(params):
    """Hyperopt objective: cross-validate one hyper-parameter combination.

    For each of the K folds of ``train`` a fresh model is trained on the
    training part of the fold, then scored on the fold's validation part and on
    the held-out ``test`` set. The search is driven by the validation score
    only; the test scores are printed for reporting and never fed back into
    the search, so they stay an unbiased estimate.

    Args:
        params: one sample of the search space with the keys 'hl1', 'hl1_act',
            'hl2', 'hl2_act' and 'optimiser'.

    Returns:
        The negative mean validation accuracy over the folds (hyperopt
        minimises the returned value).
    """
    time1 = time.time()

    params = {
        'hl1'       : int(params['hl1']),
        'hl1_act'   : str(params['hl1_act']), 
        'hl2'       : int(params['hl2']),
        'hl2_act'   : str(params['hl2_act']), 
        'optimiser' : str(params['optimiser'])
    }

    df_toprint = pd.DataFrame(params, index=[0])

    print('\n############## New Run ################')
    print(f"params = {df_toprint.transpose()}")

    # set the number of epochs
    n_epochs = 6

    # tmp batch_size
    tmp_batch_size = 2048

    # evaluate() returns [loss, *METRICS]; accuracy and AUC are the first two metrics
    ACC_IDX, AUC_IDX = 1, 2

    # obtain the testing set
    test_ds = df_to_dataset(test, shuffle=False, batch_size=tmp_batch_size)

    # total number of folds
    FOLDS = 3

    # plain (unstratified) K-fold splitter
    kfold = KFold(n_splits=FOLDS, shuffle=True, random_state=RAND_STATE)
    val_accuracy = []
    cv_accuracy = []
    cv_auc = []

    print(f'\nTraining set shape: {train.shape}')

    # Start the Training and Cross-validation loop
    for counter, (t_idx, v_idx) in enumerate(kfold.split(train), start=1):
        print(f'\nRunning Fold No.: {counter}... ')
        # get the split dataframes
        X_t, X_v = train.iloc[t_idx, :], train.iloc[v_idx, :]

        # convert to TF datasets (validation data does not need shuffling)
        train_ds = df_to_dataset(X_t, shuffle=True, batch_size=tmp_batch_size)
        val_ds = df_to_dataset(X_v, shuffle=False, batch_size=tmp_batch_size)

        # Model definition
        model = create_and_compile_nn(feature_columns, 
                                      **params)

        # Model training
        model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=n_epochs,
                    verbose=2)

        # score on the validation fold (drives the search) ...
        val_scores = model.evaluate(val_ds, verbose=0)
        val_accuracy.append(val_scores[ACC_IDX])

        # ... and on the held-out test set (reported only)
        scores = model.evaluate(test_ds, verbose=0)

        print(f"\nModel accuracy = {round((scores[ACC_IDX]*100), 4)}%")
        cv_accuracy.append(scores[ACC_IDX])
        cv_auc.append(scores[AUC_IDX])

    # record the time elapsed
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)} mins")

    gc.collect() # garbage collection
    print('\nValidation results: ')
    print(f"Average model accuracy = {round(np.mean(val_accuracy), 4)}")
    print('\nTesting results: ')
    print(f"Average model accuracy = {round(np.mean(cv_accuracy), 4)}")
    print(f"Average model AUC = {round(np.mean(cv_auc), 4)}")

    # optimise the parameter search with the validation accuracy, not the
    # test accuracy, so the test set is never used for model selection
    return -(np.mean(val_accuracy))

In [None]:
# space = {
#     'hl1' : hp.choice('hl1', list(range(8,32,2))), 
#     'hl1_act' : hp.choice('hl1_act', ['relu']),
#     'hl2' : hp.choice('hl2', [0]), 
#     'hl2_act' : hp.choice('hl2_act', ['relu', 'sigmoid']),  
#     'optimiser' : hp.choice('optimiser', ['adam']) 
# }

# Dummy space: a single combination, enough to run the pipeline end to end
space = dict(
    hl1=hp.choice('hl1', [32]),
    hl1_act=hp.choice('hl1_act', ['relu']),
    hl2=hp.choice('hl2', [0]),
    hl2_act=hp.choice('hl2_act', ['relu', 'sigmoid']),
    optimiser=hp.choice('optimiser', ['adam']),
)

In [None]:
# Run the TPE search over `space`, minimising `objective`
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1,
)

print('Done optimising')

# Map the indices returned by fmin back to the actual parameter values
best_params = space_eval(space, best)

# print("BEST PARAMS: ", best_params)

# Training Results

In [None]:
# Simulation stops HERE
# Raised on purpose so that "Run All" halts before the notes and code dump below.
raise SystemExit('Run Terminated.') 

In [None]:
'''
0. Original model
================================================================================
Testing scores: 
Average model accuracy = 0.8542
Average model AUC = 0.5


1. SMOTE
================================================================================
Testing results: 
Average model accuracy = 0.7824
Average model AUC = 0.8253


2. Conditional features
================================================================================
Testing results: 
Average model accuracy = 0.8571
Average model AUC = 0.5001


3. 1 + 2
================================================================================
Testing results: 
Average model accuracy = 0.7805
Average model AUC = 0.8257
'''

# Code Dump

In [None]:
# def objective(params):
#     time1 = time.time()

#     params = {
#         'hl1'       : int(params['hl1']),
#         'hl1_act'   : str(params['hl1_act']), 
#         'hl2'       : int(params['hl2']),
#         'hl2_act'   : str(params['hl2_act']), 
#         'optimiser' : str(params['optimiser'])
#     }

#     df_toprint = pd.DataFrame(params, index=[0])

#     print('\n############## New Run ################')
#     print(f"params = {df_toprint.transpose()}")

#     # set the number of epochs
#     n_epochs = 6

#     # tmp batch_size
#     tmp_batch_size = 500

#     # obtain the testing set
#     test_ds = df_to_dataset(test, shuffle=False, batch_size=1)
        
#     # declair total number of folds and fold counter
#     FOLDS = 6
#     counter = 1

#     # instantiate the TSS model
#     skf = KFold(n_splits=FOLDS, shuffle=True, random_state=RAND_STATE)
#     cvscores = []

#     print(f'\nTraining set shape: {train.shape}')

#     # Start the Training and Cross-validation loop
#     for t_idx, v_idx in skf.split(train):
#         print(f'\nRunning Fold No.: {counter}... ')
#         # get the split dataframes
#         X_t, X_v = train.iloc[t_idx, :], train.iloc[v_idx, :]

#         # convert to TF datasets
#         train_ds = df_to_dataset(X_t, shuffle=False, batch_size=tmp_batch_size)   
#         val_ds = df_to_dataset(X_v, shuffle=False, batch_size=tmp_batch_size)  

#         # Model definition
#         model = create_and_compile_nn(feature_columns, 
#                                       **params)

#         # Model training
#         model.fit(train_ds,
#                     validation_data=val_ds,
#                     epochs=n_epochs,
#                     verbose=2)

#         # evaluate the model
#         scores = model.evaluate(test_ds)

#         print(f"\nModel accuracy = {model.metrics_names[1], scores[1]*100}")
#         cvscores.append(scores[1] * 100)

#         counter += 1
    
#     # record the time elapsed
#     time2 = time.time() - time1
#     print(f"Total Time Run: {round(time2 / 60,2)} mins")


#     gc.collect() # garbage collection
#     print(f"\nAverage model accuracy = {np.mean(cvscores)} +/- {np.std(cvscores)}")

#     del X_t, X_v, scores

#     return -(np.mean(cvscores))

## Exploring other features

### Exploring arrears amount feature

In [None]:
# a = df['arrears_amount'].value_counts(dropna=False).sort_index()
# print(a)

In [None]:
# df_tmp = df.loc[ df['arrears_amount'] == 0]
# df_tmp['contract_status'].value_counts(normalize=True, dropna=False)

### Exploring contract_status feature

In [None]:
# '''
# Separate the dataframe by contract status
# '''

# df_paid = df.loc[df['contract_status'] == 'PaidInFull']
# df_default = df.loc[df['contract_status'] == 'Default']
# df_active = df.loc[df['contract_status'] == 'Active']
# print(df['contract_status'].value_counts(normalize=True))

In [None]:
# # '''
# # Look at the Paid-set
# # '''
# print(df_paid['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['age_of_recent_default'].value_counts(dropna=False, normalize=False).sort_index())
# print()
# print(df_paid['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_paid['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_paid['term_run'] / df_paid['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()


In [None]:
# # '''
# # Look at the Default-set
# # '''
# print(df_default['arrears_amount'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_values(ascending=False))
# print()
# print(df_default['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_default['term_run'] / df_default['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

In [None]:
# # '''
# # Look at the active-set
# # '''
# print(df_active['arrears_amount'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['age_of_recent_default'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['total_balance_outstanding'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print(df_active['defaultdate'].value_counts(dropna=False, normalize=True).sort_index())
# print()
# print((df_active['term_run'] / df_active['total_term']).value_counts(dropna=False, normalize=True).sort_index())
# print()

**My Questions regarding the "contract_status" feature (ground-truth?)**

* All instances in the "PaidInFull" subset have 0.0 "arrears_amount", but more than 10% of the instances in this subset have a valid "age_of_recent_default", which suggests that a default did occur for these instances. All instances in this subset have a NaN value for "defaultdate".

* About 3% of the instances in the "Default" subset have 0.0 "arrears_amount", which indicates that a default has never occurred for these instances. 99.7% of this subset have a valid "defaultdate" entry.

* The "Active" subset behaves very similarly to the "PaidInFull" subset. All instances in this subset have a NaN value for "defaultdate".

**Options for ground-truth**

If **True** ==> has default, **False** ==> has no default: 

1. **True** = "Default" subset and **False** = "PaidInFull" + "Active" subsets

2. **True** = "arrears_amount" != 0, and **False** = "arrears_amount" == 0

3. **True** = "defaultdate" is a valid date, and **False** = "defaultdate" is NaN

**According to the observations, none of the above options fully makes sense...**

**Meeting outcome**


4. Use the combined "recent" feature group to determine the ground truth.

    i.e. when "age_of_recent_default", "age_of_recent_default_cure" and "recent_default_amt" are all valid ==> **DEFAULT**

In [None]:
# from sklearn.model_selection import KFold
# import tensorflow as tf
# from tensorflow.examples.tutorials.mnist import input_data

# # Parameters
# learning_rate = 0.01
# batch_size = 500

# # TF graph
# x = tf.placeholder(tf.float32, [None, 784])
# y = tf.placeholder(tf.float32, [None, 10])
# W = tf.Variable(tf.zeros([784, 10]))
# b = tf.Variable(tf.zeros([10]))
# pred = tf.nn.softmax(tf.matmul(x, W) + b)
# cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
# optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# init = tf.global_variables_initializer()

# mnist = input_data.read_data_sets("data/mnist-tf", one_hot=True)
# train_x_all = mnist.train.images
# train_y_all = mnist.train.labels
# test_x = mnist.test.images
# test_y = mnist.test.labels

# def run_train(session, train_x, train_y):
#   print "\nStart training"
#   session.run(init)
#   for epoch in range(10):
#     total_batch = int(train_x.shape[0] / batch_size)
#     for i in range(total_batch):
#       batch_x = train_x[i*batch_size:(i+1)*batch_size]
#       batch_y = train_y[i*batch_size:(i+1)*batch_size]
#       _, c = session.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
#       if i % 50 == 0:
#         print "Epoch #%d step=%d cost=%f" % (epoch, i, c)

# def cross_validate(session, split_size=5):
#   results = []
#   kf = KFold(n_splits=split_size)
#   for train_idx, val_idx in kf.split(train_x_all, train_y_all):
#     train_x = train_x_all[train_idx]
#     train_y = train_y_all[train_idx]
#     val_x = train_x_all[val_idx]
#     val_y = train_y_all[val_idx]
#     run_train(session, train_x, train_y)
#     results.append(session.run(accuracy, feed_dict={x: val_x, y: val_y}))
#   return results

# with tf.Session() as session:
#   result = cross_validate(session)
#   print "Cross-validation result: %s" % result
#   print "Test accuracy: %f" % session.run(accuracy, feed_dict={x: test_x, y: test_y})

In [None]:
# n_epochs = 10
# batch_size_train = train.shape[0] // n_epochs

# train_ds = df_to_dataset(train, shuffle=False, batch_size=batch_size_train)

# test_ds = df_to_dataset(test, shuffle=False, batch_size=test.shape[0])

## Train and evaluate the model

In [None]:
# model.fit(train_ds,
#           validation_data=val_ds,
#           epochs=n_epochs)

## Test the model

In [None]:
# test_ds = df_to_dataset(test, shuffle=False, batch_size=test.shape[0])

# loss, accuracy = model.evaluate(test_ds)
# print("Accuracy", accuracy)