# Making Predictions for Tabular Data with Categorical Variables Using FastAI

Publication at arXiv:

### Import required libraries

In [None]:
from fastai.tabular import *  # Quick accesss to tabular functionality
from fastai.utils.mem import *

In [None]:
# Directory that holds the input CSV. The trailing slash matters: the file
# name is appended to BASE with plain string concatenation when loading.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
BASE = '/home/odin/Data Science/Temp/PTP/'

### Load data

In [None]:
# Read the full dataset (UTF-8 encoded CSV) from the BASE directory.
df_eq = pd.read_csv(f"{BASE}df_EQ_dataset.csv", encoding='utf8')

In [None]:
# Preview five randomly chosen rows of the loaded dataset.
df_eq.sample(5)

In [None]:
# List the distinct values of the 'billPaid' column.
df_eq['billPaid'].unique()

In [None]:
# Share of each 'billPaid' value as a percentage of all rows.
df_eq['billPaid'].value_counts()/len(df_eq)*100

### Split Dataset into Train and Test Set

In [None]:
# Positional 80/20 split: the first 80% of the rows become the training set,
# the remaining rows the test set. Rows are not shuffled beforehand.
split_at = int(len(df_eq)*0.8)
df_train, df_test = df_eq.iloc[:split_at], df_eq.iloc[split_at:]
del df_eq
len(df_train), len(df_test)

In [None]:
# Fraction of all rows that ended up in the train and test sets respectively.
n_rows_total = len(df_train) + len(df_test)
len(df_train)/n_rows_total, len(df_test)/n_rows_total

In [None]:
# Preview five random training rows.
df_train.sample(5)

In [None]:
# Preview five random test rows.
df_test.sample(5)

In [None]:
# Number of rows held out for testing.
len(df_test)

### Define a function for undersampling

In [None]:
def undersample(df=None, target='billPaid', random_state=None):
    """Balance a binary-labelled frame by down-sampling the larger class.

    Rows whose ``target`` value is 0 and rows whose value is 1 are separated;
    the larger group is randomly reduced (without replacement) to the size of
    the smaller one. The two groups are then concatenated, shuffled, and
    re-indexed from 0. Rows with any other ``target`` value are dropped.

    Args:
        df: pandas DataFrame containing the ``target`` column.
        target: Name of the 0/1 label column. Defaults to ``'billPaid'``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            result can be reproduced. ``None`` keeps the draw non-deterministic.

    Returns:
        A new, shuffled DataFrame with equally many 0 and 1 rows. The input
        frame is not modified.

    Raises:
        TypeError: If ``df`` is None.
    """
    if df is None:
        raise TypeError("undersample() requires a DataFrame, got None")

    negatives = df[df[target] == 0]
    positives = df[df[target] == 1]

    # Only the larger group is sampled; the smaller one is kept in full.
    n_keep = min(len(negatives), len(positives))
    if len(positives) > n_keep:
        positives = positives.sample(n_keep, replace=False, random_state=random_state)
    elif len(negatives) > n_keep:
        negatives = negatives.sample(n_keep, replace=False, random_state=random_state)

    balanced = pd.concat([negatives, positives])
    # Shuffle so the classes are interleaved, then discard the old index.
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
#df_train = undersample(df=df_train)

### Define a function for oversampling

In [None]:
def oversample(df=None, target='billPaid', random_state=None):
    """Balance a binary-labelled frame by up-sampling the smaller class.

    Rows whose ``target`` value is 0 and rows whose value is 1 are separated;
    the smaller group is replaced by a random draw *with replacement* of the
    same size as the larger group. The two groups are then concatenated,
    shuffled, and re-indexed from 0. Rows with any other ``target`` value are
    dropped.

    Args:
        df: pandas DataFrame containing the ``target`` column.
        target: Name of the 0/1 label column. Defaults to ``'billPaid'``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            result can be reproduced. ``None`` keeps the draw non-deterministic.

    Returns:
        A new, shuffled DataFrame with equally many 0 and 1 rows. The input
        frame is not modified.

    Raises:
        TypeError: If ``df`` is None.
        ValueError: If one class is empty while the other is not, since there
            is nothing to draw from.
    """
    if df is None:
        raise TypeError("oversample() requires a DataFrame, got None")

    negatives = df[df[target] == 0]
    positives = df[df[target] == 1]
    n_target = max(len(negatives), len(positives))

    # Sampling from an empty group fails deep inside pandas with an opaque
    # message; fail early with one that names the actual problem.
    if n_target and not (len(negatives) and len(positives)):
        raise ValueError(
            "cannot oversample: column %r has no rows for one of the classes 0/1" % (target,)
        )

    # Only the smaller group is resampled; the larger one is kept in full.
    if len(positives) < n_target:
        positives = positives.sample(n_target, replace=True, random_state=random_state)
    elif len(negatives) < n_target:
        negatives = negatives.sample(n_target, replace=True, random_state=random_state)

    balanced = pd.concat([negatives, positives])
    # Shuffle so the classes are interleaved, then discard the old index.
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Balance the training set by oversampling the smaller class.
# NOTE(review): oversample() draws with replacement and shuffles, and the
# validation rows are later taken as the last 10% of this frame, so copies of
# the same original row can land in both the training and validation parts.
df_train = oversample(df=df_train)

In [None]:
# Class balance of the training set after resampling, in percent.
df_train['billPaid'].value_counts()/len(df_train)*100

In [None]:
# Class balance of the (unresampled) test set, in percent.
df_test['billPaid'].value_counts()/len(df_test)*100

In [None]:
# Columns available for selecting model inputs.
df_train.columns

### Decide categorical and continuous variables

In [None]:
# Label column the model is trained to predict.
dep_var = 'billPaid'

# Columns treated as categorical. Each name is listed exactly once: the
# previous list repeated 'dueDayW', 'dueMonth', 'billRoute', 'hasMailAddress'
# and 'segment', which would hand the same column to the model twice.
# NOTE(review): 'AverageHhdSize' and 'medianPersonPerBedroom' look numeric by
# name - confirm they are really meant to be categorical.
cat_names = ['billOrder', 'billType', 'dueDayW', 'dueMonth', 'billRoute', 'hasMailAddress', 'segment',
             'dueDayM', 'billDayM', 'billMonth', 'numAccountHolders',
             'city', 'postcode', 'incomeGroup', 'wealthGroup', 'RA_CODE_2016',
             'AverageHhdSize', 'MB_CODE_2016', 'SA1_7DIGITCODE_2016', 'medianPersonPerBedroom']

# Columns treated as continuous.
cont_names = ['dueYear', 'billDuration', 'minDOB', 'medianHhdIncWkly', 'medianMortgageWkly', 'medianRentWkly',
              'setupYear']
# Preprocessing steps passed as `procs=` when the training TabularList is built.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Columns of the training frame that are neither categorical nor continuous
# inputs (i.e. everything left out of the two feature lists).
set(df_train.columns).difference(cat_names, cont_names)

In [None]:
# Sanity check: a non-empty result means some column was listed as both
# categorical and continuous by mistake.
set(cat_names) & set(cont_names)

In [None]:
# Get the ID of the GPU with the most free RAM.
# The result is only displayed; it is not assigned or used to select a device.
gpu_with_max_free_mem()

In [None]:
# Optimise the memory footprint of the training dataframe.
# The trailing semicolon suppresses the notebook's display of the return value.
reduce_mem_usage(df_train);

In [None]:
# Keep an independent copy of the test set taken before reduce_mem_usage runs
# on it; this copy is assigned back to df_test before inference.
df_test1 = df_test.copy(deep=True)

In [None]:
# Optimise the memory footprint of the test dataframe.
# The trailing semicolon suppresses the notebook's display of the return value.
reduce_mem_usage(df_test);

### Create data loaders

In [None]:
# Create the item list for the test data.
# The test data are the rows from index len(df)*0.8 to len(df) of the original dataset.
# NOTE(review): no `procs` are passed here; presumably the training
# preprocessing is applied when this list is attached via add_test - confirm.
test_data = TabularList.from_df(df_test, cat_names=cat_names, cont_names=cont_names)

In [None]:
# Create the data bunch for training.
# The last 10% of the training rows (by position) are held out for validation;
# the test item list is attached so it goes through the same pipeline.
n_train_rows = len(df_train)
valid_idx = list(range(int(n_train_rows*0.9), n_train_rows))
train_items = TabularList.from_df(df_train, cat_names=cat_names, cont_names=cont_names, procs=procs)
labelled = train_items.split_by_idx(valid_idx).label_from_df(cols=dep_var)
data = labelled.add_test(test_data).databunch()

In [None]:
# Percentage of df_train rows that ended up in the training split.
len(data.train_ds)/len(df_train)*100

In [None]:
# Percentage of df_train rows that ended up in the validation split.
len(data.valid_ds)/len(df_train)*100

In [None]:
# Test-set size as a percentage of len(df_train) + len(df_test).
# Note that df_train has been resampled by this point, so this is not the
# share of the original dataset.
len(data.test_ds)/(len(df_train)+len(df_test))*100

In [None]:
# Display ten rows of a batch as a visual check of the prepared data.
data.show_batch(rows=10)

### Train Model

In [None]:
%%time
# Build a tabular learner with layer sizes 512, 1024, 256 and 64 that reports
# accuracy, then call fit with arguments (1, 1e-2).
# `accuracy` must still refer to the fastai metric when this cell runs; the
# evaluation cell further down rebinds that name to a float.
learn = tabular_learner(data, layers=[512,1024, 256,64], metrics=accuracy)
learn.fit(1, 1e-2)

In [None]:
%%time
# Continue training the same learner with another fit(1, 1e-2) call.
learn.fit(1, 1e-2)

In [None]:
#%%time
#learn.fit(5, 1e-2)

## Make Inference

In [None]:
# Switch back to the copy of the test set that was taken before
# reduce_mem_usage was applied to df_test.
df_test = df_test1

In [None]:
# Smoke test: run the model on one randomly drawn test row.
row = df_test.sample().iloc[0]
learn.predict(row)

In [None]:
# Renumber the test rows from 0 so that index labels coincide with positions;
# the prediction loop below feeds values of df_test.index into iloc.
df_test = df_test.reset_index(drop=True)
df_test.head()

In [None]:
# Ground-truth labels of the test rows as a plain list.
y_test = list(df_test[dep_var])

In [None]:
# Number of test rows to score; by default all of them.
limit = len(y_test)

In [None]:
# Peek at the first ten true labels.
y_test[:10]

In [None]:
# Every model input column (categorical and continuous), without duplicates.
all_names = list(set(cat_names) | set(cont_names))

In [None]:
%%time
# Time a single-row prediction using only the model's input columns.
# `predicted` is overwritten by the full prediction run in the next cell.
predicted = learn.predict(df_test[all_names].iloc[0])

In [None]:
# Score the first `limit` test rows, one learn.predict call per row.
# To try a quick run on a subset, set a smaller limit first, e.g.:
#limit = 1000
predicted = []
for idx in df_test.index[:limit]:
    predicted.append(learn.predict(df_test.iloc[idx]))

In [None]:
# Keep only the `.obj` attribute of the first element of each prediction
# result (presumably the predicted class value - confirm against fastai docs).
predicted = [p[0].obj for p in predicted]

In [None]:
# Peek at the first ten predicted labels.
predicted[:10]

### Measure Performance

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np

# Align the hard class predictions with the first `limit` true labels.
predicted = np.array(predicted)
actual = np.array(y_test[:limit])

# Confusion-matrix counts. Each product is non-zero only where both factors
# are non-zero, so this assumes the labels are encoded as 0 and 1.
tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)

print('True Positive\t' + str(tp))
print('True Negative\t' + str(tn))
print('False Positive\t' + str(fp))
print('False Negative\t' + str(fn))

# The counts are plain ints, so an empty denominator (e.g. the model never
# predicts the positive class) would raise ZeroDivisionError. Report 0.0 for
# an undefined ratio instead of aborting the evaluation.
n_scored = tp + fp + fn + tn
# NOTE(review): `accuracy` and `cohen_kappa_score` below rebind names that
# also refer to functions (the training cell passes `accuracy` as a metric;
# `cohen_kappa_score` is the sklearn import above). The names are kept because
# the report cell reads them; re-run the imports/training setup before reusing
# the functions.
accuracy = (tp + tn) / n_scored if n_scored else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
fmeasure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
cohen_kappa_score = cohen_kappa_score(predicted, actual)
# NOTE(review): the ROC/AUC values are computed from hard class labels, not
# from predicted probabilities - confirm this is intended.
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

#print("Average of ROC-AUC score: %.3f" % roc_auc_score(ytest, predictions))

In [None]:
# Performance test: print sklearn's classification report for the test
# predictions.
from sklearn.metrics import classification_report
print(classification_report(actual, predicted))

In [None]:
# Labels for this run; both are written into the report and used to build the
# name of the results file.
model_name = 'DNN'
strategy = 'upsamping'

In [None]:
import datetime
from sklearn.metrics import classification_report

# Timestamp that heads this run's entry in the results file.
now = datetime.datetime.now()

# Label/value pairs, in the order they appear in the report.
n_positive = sum(actual)
summary_rows = [
    ('Total Samples', len(actual)),
    ('Positive Samples', n_positive),
    ('Negative Samples', len(actual) - n_positive),
    ('True Positive', tp),
    ('True Negative', tn),
    ('False Positive', fp),
    ('False Negative', fn),
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F-measure', fmeasure),
    ('Cohen_Kappa_Score', cohen_kappa_score),
    ('AUC', auc_val),
    ('ROC_AUC', roc_auc_val),
]

# Assemble the report: header, one "label:<TAB>value" line per metric, then
# sklearn's classification report surrounded by blank lines.
report_parts = [
    '==========' + str(now) + '==============\n',
    'Strategy:\t' + strategy + '\n',
    'Model Name:\t' + model_name + '\n',
    '-------------------------------------------------' + '\n',
]
report_parts.extend(label + ':\t' + str(value) + '\n' for label, value in summary_rows)
report_parts.append('\n')
report_parts.append(classification_report(actual, predicted))
report_parts.append('\n')
out_string = ''.join(report_parts)

print(out_string)
# Append, so that results of successive runs accumulate in the same file.
with open(model_name + '_' + strategy + '_POP.txt', 'a+') as report_file:
    report_file.write(out_string)