In [13]:
#Uncomment the line below if this is the first time you are running the notebook
!pip install category_encoders

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m


In [14]:
#imports
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 
import category_encoders as ce

from six.moves import urllib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
# Where the raw file lives remotely, and the local directory it is stored in.
CREDIT_DATA_PATH = "datasets/credit-screening"
DOWNLOAD_ROOT = "http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening"
CREDIT_DATA_URL = "".join((DOWNLOAD_ROOT, "/crx.data"))

In [None]:
# fetch data from its origin
def fetch_credit_card_data(credit_data_url=CREDIT_DATA_URL, credit_path=CREDIT_DATA_PATH, force=False):
    """Download crx.data into `credit_path`, reusing an existing local copy.

    Parameters
    ----------
    credit_data_url : str
        URL the file is downloaded from.
    credit_path : str
        Local directory the file is stored in; created if it does not exist.
    force : bool, default False
        Download again even if the local file is already present.

    Returns
    -------
    str
        Path of the local crx.data file.
    """
    if not os.path.isdir(credit_path):
        os.makedirs(credit_path)
    credit_data_path = os.path.join(credit_path, "crx.data")

    # Skip the network round-trip when the file is already there, so re-running
    # the notebook does not depend on the remote server being reachable.
    if force or not os.path.isfile(credit_data_path):
        # Download under a temporary name first: an interrupted transfer must not
        # leave a truncated crx.data that a later run would take for a valid copy.
        partial_path = credit_data_path + ".part"
        urllib.request.urlretrieve(credit_data_url, partial_path)
        if os.path.exists(credit_data_path):
            os.remove(credit_data_path)
        os.rename(partial_path, credit_data_path)

    return credit_data_path

fetch_credit_card_data()

In [None]:
# read the downloaded file into a DataFrame
def load_credit_card_data(credit_data_path=CREDIT_DATA_PATH):
    """Read crx.data from `credit_data_path` into a DataFrame with named columns.

    The file is read without a header row, so the column names are supplied here.
    As a side effect, pandas is configured to display every column.
    """
    pd.set_option('display.max_columns', None)
    column_labels = [
        "Gender", "Age", "Debt", "Married",
        "BankCustomer", "EducationLevel", "Ethnicity", "YearsEmployed",
        "PriorDefault", "Employed", "CreditScore", "DriversLicense",
        "Citizen", "ZipCode", "Income", "ApprovalStatus",
    ]
    source_file = os.path.join(credit_data_path, "crx.data")
    return pd.read_csv(source_file, names=column_labels, header=None)

dataset = load_credit_card_data()
# untouched copy of the data as loaded
dataframe = dataset.copy()

In [None]:
# replace question mark with NaN
# replace + and - with 1 and 0
dataset = dataset.replace("?", np.nan).replace('+', 1).replace('-', 0)

# convert age from object to float
dataset = dataset.astype({"Age": float})

# replace missing numeric values with the column mean
# numeric_only=True keeps the text columns out of the mean; recent pandas
# versions raise a TypeError on them instead of silently skipping them
dataset = dataset.fillna(dataset.mean(numeric_only=True))

# replace missing object values with the most frequent value of the SAME column
# (filling the whole frame at once would push the first object column's most
# frequent value into every other column that still has missing entries)
for col in dataset:
    if dataset[col].dtypes == 'object':
        most_frequent = dataset[col].value_counts().index[0]
        dataset[col] = dataset[col].fillna(most_frequent)

# number of missing values left per column (expected: all zeros)
dataset.isna().sum()

In [None]:
# how many rows carry each ApprovalStatus value
approval_status = dataset['ApprovalStatus']
approval_status.value_counts()

In [None]:
# Bar chart of how many rows carry each ApprovalStatus value.
# Series.value_counts is used because the top-level pd.value_counts is deprecated
# in recent pandas; sort=False keeps the original (unsorted) ordering.
CountStatus = dataset['ApprovalStatus'].value_counts(sort=False)

# Draw on an explicit axes so the labels are guaranteed to land on this chart.
fig, ax = plt.subplots()
CountStatus.plot.bar(ax=ax)
ax.set_ylabel('Number of applications')
ax.set_title('Approval status');

In [None]:
# show the dtype of every column
dataset.dtypes

In [None]:
# Turn every text column into integer codes so it can take part in the correlation
le = preprocessing.LabelEncoder()

# collect the object-typed columns first, then encode them one by one
text_columns = [name for name in dataset if dataset[name].dtypes == 'object']
for name in text_columns:
    encoded = le.fit_transform(dataset[name])
    dataset[name] = encoded

dataset.dtypes

In [None]:
# one histogram per column (20 bins each) on a 20x15 figure
dataset.hist(bins=20, figsize=(20, 15))

In [None]:
#Heatmap of pairwise correlations
def make_corr_heatmap(data):
    """Draw a seaborn heatmap of the pairwise correlations between the columns of `data`."""
    correlations = data.corr()
    tick_labels = correlations.columns.values
    sns.heatmap(correlations, xticklabels=tick_labels, yticklabels=tick_labels)

make_corr_heatmap(dataset)

In [None]:
#Correlation of every column with ApprovalStatus, in ascending order
correlation_matrix = dataset.corr()
approval_correlation = correlation_matrix['ApprovalStatus']
approval_correlation.sort_values()

In [None]:
#Mean absolute deviation (MAD) of every column: the average distance of the
#observations from the column mean. Deviations are not squared, so outliers weigh
#less than in the standard deviation (MAD is never larger than the std).
#Computed explicitly because DataFrame.mad() was removed in pandas 2.0.
(dataset - dataset.mean()).abs().mean()

In [None]:
#remove the columns with the lowest correlation, or that do not make sense as features
columns_to_drop = ['DriversLicense', 'ZipCode', 'Ethnicity', 'Gender']
dataset = dataset.drop(columns_to_drop, axis=1)
dataset.head()

In [None]:
# One-hot encode the categorical features.
# The columns are named explicitly: left to its default, the encoder only picks up
# string (object) columns, and the label-encoding step earlier in the notebook has
# already turned these columns into integer codes, so nothing would be encoded.
# NOTE(review): the list is every remaining non-numeric feature of the raw file;
# adjust it if the set of dropped columns changes.
categorical_cols = ['Married', 'BankCustomer', 'EducationLevel', 'PriorDefault', 'Employed', 'Citizen']
ohe = ce.OneHotEncoder(cols=categorical_cols, handle_unknown='ignore', use_cat_names=True)
dataset = ohe.fit_transform(dataset)
dataset.head()

In [None]:
#separate the features and labels
#The label is selected by name and the features are "every other column", so the
#split does not depend on how many columns the encoding step produced (a fixed
#column index silently picks the wrong column, or fails, when that number changes).
label_col = 'ApprovalStatus'
X = dataset.drop(label_col, axis=1).values
y = dataset[label_col].values
#keep `dataset` as a plain array afterwards, as before
dataset = dataset.values

In [None]:
#two-stage split: first hold out 20% as the test set,
#then hold out 20% of the remainder as the validation set
X_remaining, X_test, y_remaining, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_remaining, y_remaining, test_size=0.2, random_state=1
)

In [None]:
#rescale the training, validation and testing sets
#The scaler is fitted on the training set only. The validation and test sets are
#transformed with those same training min/max values, so all three sets share one
#scale and nothing from the held-out data influences the preprocessing.
#(Re-fitting on the validation set would scale it differently from the data the
#models are trained on.)
scaler = MinMaxScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_val = scaler.transform(X_val)
rescaledX_test = scaler.transform(X_test)

In [None]:
#create the three classifiers that are compared below
logreg = LogisticRegression(solver='liblinear')
gnb = GaussianNB()
rfc = RandomForestClassifier(random_state=42)

In [None]:
print("Random Forrest")

# 5-fold cross-validated scores on the training set, averaged over the folds
roc_rfc = cross_val_score(rfc, rescaledX_train, y_train, scoring='roc_auc', cv = 5).mean()
f1_rfc = cross_val_score(rfc, rescaledX_train, y_train, scoring='f1', cv = 5).mean()
precision_rfc = cross_val_score(rfc, rescaledX_train, y_train, scoring='precision', cv = 5).mean()
rec_rfc = cross_val_score(rfc, rescaledX_train, y_train, scoring='recall', cv = 5).mean()
acc_rfc = cross_val_score(rfc, rescaledX_train, y_train, scoring='accuracy', cv = 5).mean()

# out-of-fold predictions for the confusion matrix
y_pred_rfc = cross_val_predict(rfc, rescaledX_train, y_train, cv=5)
# confusion_matrix is reached through the imported `metrics` module (the bare name
# is not imported); for two classes ravel() flattens the 2x2 matrix into tn, fp, fn, tp
tn, fp, fn, tp = metrics.confusion_matrix(y_train, y_pred_rfc).ravel()

# Youden's index = sensitivity + specificity - 1
yi_rfc = ((tp/(tp+fn)-(1-(tn/(fp+tn)))))

print("AUC: " , roc_rfc)
print("F1: " , f1_rfc)
print("Precission: " , precision_rfc)
print("Recall: " , rec_rfc)
print("Accuracy: " , acc_rfc)
print("Youden's index", yi_rfc)
print("True Negatives: ",tn)
print("False Positives: ",fp)
print("False Negatives: ",fn)
print("True Positives: ",tp)

In [None]:
print("Naive Bayes")

# 5-fold cross-validated scores on the training set, averaged over the folds
roc_gnb = cross_val_score(gnb, rescaledX_train, y_train, scoring='roc_auc', cv = 5).mean()
f1_gnb = cross_val_score(gnb, rescaledX_train, y_train, scoring='f1', cv = 5).mean()
precision_gnb = cross_val_score(gnb, rescaledX_train, y_train, scoring='precision', cv = 5).mean()
rec_gnb = cross_val_score(gnb, rescaledX_train, y_train, scoring='recall', cv = 5).mean()
acc_gnb= cross_val_score(gnb, rescaledX_train, y_train, scoring='accuracy', cv = 5).mean()

# out-of-fold predictions for the confusion matrix
y_pred_gnb = cross_val_predict(gnb, rescaledX_train, y_train, cv=5)
# ravel() flattens the 2x2 matrix into tn, fp, fn, tp; without it the unpacking
# receives two rows instead of four numbers and raises a ValueError
tn, fp, fn, tp = metrics.confusion_matrix(y_train,y_pred_gnb).ravel()

# Youden's index = sensitivity + specificity - 1, computed from the confusion
# matrix exactly as in the random forest cell so the models are comparable
yi_gnb = ((tp/(tp+fn)-(1-(tn/(fp+tn)))))

print("AUC" , roc_gnb)
print("F1: " , f1_gnb)
print("Precission: " , precision_gnb)
print("Recall: " , rec_gnb)
print("Accuracy: " , acc_gnb)
print("Youden's index", yi_gnb)
print("True Negatives: ",tn)
print("False Positives: ",fp)
print("False Negatives: ",fn)
print("True Positives: ",tp)

In [None]:
print("Logistic regression")

# 5-fold cross-validated scores on the training set, averaged over the folds
roc_logreg = cross_val_score(logreg, rescaledX_train, y_train, scoring='roc_auc', cv = 5).mean()
f1_logreg = cross_val_score(logreg, rescaledX_train, y_train, scoring='f1', cv = 5).mean()
precision_logreg = cross_val_score(logreg, rescaledX_train, y_train, scoring='precision', cv = 5).mean()
rec_logreg = cross_val_score(logreg, rescaledX_train, y_train, scoring='recall', cv = 5).mean()
acc_logreg = cross_val_score(logreg, rescaledX_train, y_train, scoring='accuracy', cv = 5).mean()

# out-of-fold predictions for the confusion matrix
y_pred_logreg = cross_val_predict(logreg, rescaledX_train, y_train, cv=5)
# ravel() flattens the 2x2 matrix into tn, fp, fn, tp; without it the unpacking
# receives two rows instead of four numbers and raises a ValueError
tn, fp, fn, tp = metrics.confusion_matrix(y_train, y_pred_logreg).ravel()

# Youden's index = sensitivity + specificity - 1, computed from the confusion
# matrix exactly as in the random forest cell so the models are comparable
yi_logreg = ((tp/(tp+fn)-(1-(tn/(fp+tn)))))

print("AUC of Logistic Regression is: " , roc_logreg)
print("F1: " , f1_logreg)
print("Precission: " , precision_logreg)
print("Recall: " , rec_logreg)
# report this model's own accuracy (not the naive Bayes one)
print("Accuracy: " , acc_logreg)
print("Youden's index", yi_logreg)
print("True Negatives: ",tn)
print("False Positives: ",fp)
print("False Negatives: ",fn)
print("True Positives: ",tp)