# Feature Description

## Status
0: 1-29 days past due \
1: 30-59 days past due \
2: 60-89 days overdue \
3: 90-119 days overdue \
4: 120-149 days overdue \
5: More than 150 days overdue, bad debt, or written off \
C: paid off that month \
X: No loan for the month

## Target
0: Normal Borrower \
1: Risky Borrower

# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import psycopg2 as pg
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', palette = 'Paired')
#plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']
np.set_printoptions(suppress=True) # Suppress scientific notation where possible
from ipywidgets import interactive, FloatSlider

from sklearn.inspection import permutation_importance
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix,\
roc_auc_score, roc_curve, precision_recall_curve, f1_score, fbeta_score, recall_score,\
precision_recall_fscore_support

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)
from sklearn.metrics import accuracy_score, make_scorer, log_loss

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from mlxtend.plotting import plot_decision_regions

import xgboost as xgb

# Import Data

In [2]:
# Load the raw credit-card approval data.
# The CSV location defaults to the original absolute path; set the
# LOAN_DATA_CSV environment variable to read the file from somewhere else
# without editing the notebook.
import os

DATA_CSV = os.environ.get(
    'LOAN_DATA_CSV',
    '/Users/dominguez/Documents/Loan_Decision_Bot/data/credit_card_approval.csv',
)
df = pd.read_csv(DATA_CSV, low_memory=False)

# Data Exploration

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# List the column names available in the raw data.
df.columns

In [None]:
# Summarise the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Distribution of CODE_GENDER, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['CODE_GENDER'])
ax.set(title='Distribution of CODE_GENDER', xlabel='CODE_GENDER', ylabel='Number of rows');

In [None]:
# Distribution of FLAG_OWN_CAR, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['FLAG_OWN_CAR'])
ax.set(title='Distribution of FLAG_OWN_CAR', xlabel='FLAG_OWN_CAR', ylabel='Number of rows');

In [None]:
# Distribution of CNT_CHILDREN, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['CNT_CHILDREN'])
ax.set(title='Distribution of CNT_CHILDREN', xlabel='CNT_CHILDREN', ylabel='Number of rows');

In [None]:
# Distribution of AMT_INCOME_TOTAL (50 bins), with a title and axis labels so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['AMT_INCOME_TOTAL'], bins=50)
ax.set(title='Distribution of AMT_INCOME_TOTAL', xlabel='AMT_INCOME_TOTAL', ylabel='Number of rows');

In [None]:
# Distribution of DAYS_EMPLOYED, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['DAYS_EMPLOYED'])
ax.set(title='Distribution of DAYS_EMPLOYED', xlabel='DAYS_EMPLOYED', ylabel='Number of rows');

In [None]:
# Count how many rows fall under each JOB value.
df['JOB'].value_counts()

In [None]:
# Distribution of BEGIN_MONTHS, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['BEGIN_MONTHS'])
ax.set(title='Distribution of BEGIN_MONTHS', xlabel='BEGIN_MONTHS', ylabel='Number of rows');

In [None]:
# Distribution of STATUS codes, with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['STATUS'])
ax.set(title='Distribution of STATUS', xlabel='STATUS', ylabel='Number of rows');

In [None]:
# Exact row count for each STATUS code.
df['STATUS'].value_counts()

# Feature Selection

There are a lot of features that have no impact on the target variable. Let's keep only the features that have the highest correlation with the target variable, along with the target itself:

1. AMT_INCOME_TOTAL
2. DAYS_EMPLOYED
3. BEGIN_MONTHS
4. STATUS
5. TARGET

In [5]:
# Keep only the modelling columns (three numeric features, STATUS and TARGET).
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
selected_columns = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'BEGIN_MONTHS', 'STATUS', 'TARGET']
df = df[selected_columns].copy()
df.head()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,BEGIN_MONTHS,STATUS,TARGET
0,270000.0,-2300,-6,C,0
1,81000.0,-377,-4,0,0
2,270000.0,-1028,0,C,0
3,112500.0,-1956,-3,0,0
4,139500.0,-5578,-29,0,0


DAYS_EMPLOYED and BEGIN_MONTHS are stored as negative numbers, which is awkward to interpret. Let's flip their sign so the values are positive.

In [6]:
# Negate both columns so the negative values shown above become positive.
# Note: running this cell a second time flips the sign back again.
for column in ('DAYS_EMPLOYED', 'BEGIN_MONTHS'):
    df[column] = -df[column]

In [20]:
# Check that DAYS_EMPLOYED and BEGIN_MONTHS are now positive.
df.head()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,BEGIN_MONTHS,STATUS,TARGET
0,270000.0,2300,6,C,0
1,81000.0,377,4,0,0
2,270000.0,1028,0,C,0
3,112500.0,1956,3,0,0
4,139500.0,5578,29,0,0


In [23]:
# Replace STATUS values with human-readable labels.
#
# The whole mapping is applied in one non-in-place replace() and assigned back
# to the column. Calling replace(..., inplace=True) on df['STATUS'] mutates a
# temporary Series; with pandas Copy-on-Write that change is never written
# back to df, so the labels would silently not be applied.
status_labels = {
    '0': '1-29 days',
    '1': '30-59 days',
    '2': '60-89 days',
    '3': '90-119 days',
    '4': '120-149 days',
    '5': '150+ days',
    'C': 'paid off',
    'X': 'no loan',
}
df['STATUS'] = df['STATUS'].replace(status_labels)

In [24]:
# Display the frame to check the relabelled STATUS column (the notebook shows a truncated view).
df

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,BEGIN_MONTHS,STATUS,TARGET
0,270000.0,2300,6,paid off,0
1,81000.0,377,4,1-29 days,0
2,270000.0,1028,0,paid off,0
3,112500.0,1956,3,1-29 days,0
4,139500.0,5578,29,1-29 days,0
...,...,...,...,...,...
537662,166500.0,5401,8,1-29 days,0
537663,135000.0,4635,8,1-29 days,0
537664,180000.0,2462,7,1-29 days,0
537665,220500.0,3847,1,1-29 days,0


In [32]:
# Rows whose STATUS label is exactly 'paid off'.
df.loc[df['STATUS'].eq('paid off')]

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,BEGIN_MONTHS,STATUS,TARGET
0,270000.0,2300,6,paid off,0
2,270000.0,1028,0,paid off,0
15,270000.0,1117,12,paid off,0
17,202500.0,8375,7,paid off,0
19,67500.0,3072,35,paid off,0
...,...,...,...,...,...
537657,135000.0,4447,35,paid off,0
537658,247500.0,875,13,paid off,0
537659,495000.0,3880,8,paid off,0
537660,315000.0,1268,5,paid off,0


Are paid off loans relevant?

In [35]:
# Pairwise correlation between the numeric columns only.
# STATUS holds string labels; pandas >= 2.0 raises on non-numeric columns in
# corr() instead of silently dropping them, so select the numeric columns
# explicitly (this form also works on older pandas releases).
df.select_dtypes(include='number').corr()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,BEGIN_MONTHS,TARGET
AMT_INCOME_TOTAL,1.0,0.0095,0.001829,0.002887
DAYS_EMPLOYED,0.0095,1.0,0.044414,-0.01923
BEGIN_MONTHS,0.001829,0.044414,1.0,0.001646
TARGET,0.002887,-0.01923,0.001646,1.0


# Base Model

In [None]:
# Establish features and target variables

# Target: the TARGET label column.
y = df['TARGET']

# Features: every column from AMT_INCOME_TOTAL through STATUS
# (label-based slices include both end points).
X = df.loc[:, slice('AMT_INCOME_TOTAL', 'STATUS')]

In [None]:
# Encoding categorical variables
#
# NOTE: this cell overwrites X, so it must be run exactly once after X is
# defined; a second run fails because STATUS is no longer a column of X.

categoricals = ['STATUS']
numericals = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'BEGIN_MONTHS']

# Dense one-hot encoder; drop='first' removes the first level of each feature.
# scikit-learn 1.2 renamed the `sparse` flag to `sparse_output` and later
# removed the old name, so try the new spelling and fall back to the old one.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')
cat_matrix = ohe.fit_transform(X.loc[:, categoricals])

# get_feature_names() was replaced by get_feature_names_out() (added in
# scikit-learn 1.0, old name removed in 1.2); use whichever is available.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(categoricals)
else:
    ohe_columns = ohe.get_feature_names(categoricals)

X_ohe = pd.DataFrame(cat_matrix,
                     columns=ohe_columns,  # create meaningful column names
                     index=X.index)        # keep the same index values
X = pd.concat([X.loc[:, numericals], X_ohe], axis=1)

In [None]:
# First split: reserve 20% of the rows as the final test set.
# X and y are rebound to the remaining 80%.
X, X_test, y, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Second split: take 25% of the remaining rows as the validation set;
# the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.25)

In [42]:
# Scale features

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the training and validation splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Validate

# Candidate classifiers. Each keeps its own top-level name because later
# cells refer to the fitted models directly.
knn = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(penalty='none')
rf = RandomForestClassifier()
gbm = xgb.XGBClassifier()
svc = SVC(probability=True)

# Fit each model on the scaled training split and report its score on the
# scaled validation split, in the same order as the definitions above.
for label, model in (
    ('KNN', knn),
    ('Logistic Regression', lr),
    ('Random Forest', rf),
    ('XGBoost', gbm),
    ('SVC', svc),
):
    model.fit(X_train_scaled, y_train)
    print(f'{label} score: {model.score(X_val_scaled, y_val)}')

In [None]:
# Test

# Put the held-out features on the scale learned from the training split.
X_test_scaled = scaler.transform(X_test)

# Report each already-fitted model's score on the scaled test set.
for label, model in (
    ('KNN', knn),
    ('Logistic Regression', lr),
    ('Random Forest', rf),
    ('XGBoost', gbm),
    ('SVC', svc),
):
    print(f'{label} test score: {model.score(X_test_scaled, y_test)}')

# Feature Importance

In [41]:
# Permutation importance of the random forest on the held-out test set.
#
# rf was fitted on StandardScaler-transformed features, so the test features
# must pass through the same fitted scaler before being handed to the model.
# Passing the raw X_test would score the forest on inputs at a different scale
# from the one it was trained on, which makes the importances meaningless.
# Requires `scaler` and `rf` to have been fitted by the earlier cells.
X_test_for_importance = scaler.transform(X_test)

result = permutation_importance(rf, X_test_for_importance, y_test, n_repeats=10,
                                random_state=42, n_jobs=2)
# Order features from least to most important (by mean importance).
sorted_idx = result.importances_mean.argsort()

fig, ax = plt.subplots()
# The column labels still come from X_test; scaling does not reorder columns.
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=X_test.columns[sorted_idx])
ax.set_title("Permutation Importances (test set)")
fig.tight_layout()
plt.show()

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.