# Home Credit Default Risk

We have a dataframe that records a large amount of information about a bank's clients.

The bank wants to know if they should give them a loan or not. The machine learning model needs to predict if the client will repay the loan or not.

If the target is equal to 0: the loan was repaid, if it's equal to 1: the loan was not repaid.

The models will determine if the loan will be repaid or not according to the given features.

# Summary

* Data observation
* Data cleanup
* Boruta 

Machine Learning Models: 
* KNeighbors Classifier using Grid Search
* Logistic Regression
* Decision Tree
* Random Forest
* XGBoost

* Comparing models predictions on application test

# Library used

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import graphviz

# Undersampling
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Machine Learning Library
from sklearn import metrics
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import joblib

# Grid Search
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Boruta
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy as bp
from sklearn.datasets import load_boston

# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Decision Tree
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# XGBoost
import xgboost as xgb

from warnings import filterwarnings
filterwarnings('ignore')

## Data

In [None]:
# Load the training and test application tables.
app_train = pd.read_csv("./resources/application_train.csv", encoding='utf-8', sep=',')
app_test = pd.read_csv("./resources/application_test.csv", encoding='utf-8', sep=',')

# drop_duplicates() returns a new frame instead of modifying in place, so the
# result has to be assigned back for the duplicates to actually be removed.
app_train = app_train.drop_duplicates()
app_test = app_test.drop_duplicates()

## Aligning the two datasets

In [None]:
# TARGET only exists in the training frame, so the inner column alignment
# would drop it: set it aside first and attach it again afterwards.
train_labels = app_train['TARGET']
app_train, app_test = app_train.align(app_test, join='inner', axis=1)
app_train['TARGET'] = train_labels

for heading, frame in (('Training Features shape: ', app_train),
                       ('Testing Features shape: ', app_test)):
    print(heading, frame.shape)

# Data Analysis

In [None]:
# Number of columns per dtype in the training frame.
app_train.dtypes.value_counts()

## Is data unbalanced ? 

In [None]:
# Bar chart of the two TARGET classes, followed by the exact counts.
fig = sns.countplot(data=app_train, x="TARGET")
plt.title('Balance of target')
plt.show()

target_counts = app_train['TARGET'].value_counts()
print("Unbalanced data on the TARGET column:")
print(target_counts)

The data is very unbalanced.

## Gender distribution

In [None]:
# Pie chart of the number of clients per CODE_GENDER value.
gender_group = app_train.groupby(['CODE_GENDER'])
gender_counts = gender_group.size()
gender_counts.plot.pie(
    ylabel='Gender',
    colors=['pink', 'steelblue', 'pink'],
    title='Gender distribution',
    autopct='%.0f%%',
)
plt.show()

### Contract type distribution

In [None]:
# Pie chart of the number of applications per NAME_CONTRACT_TYPE value.
contract_group = app_train.groupby(['NAME_CONTRACT_TYPE'])
contract_counts = contract_group.size()
contract_counts.plot.pie(
    ylabel='',
    colors=['green', 'steelblue'],
    title='Contract type distribution',
    autopct='%.0f%%',
)
plt.show()

## Days Birth feature

### Information

In [None]:
# Summary statistics of DAYS_BIRTH divided by -365, i.e. expressed in years.
# NOTE(review): presumably DAYS_BIRTH is stored as negative day counts - confirm.
(app_train['DAYS_BIRTH'] / -365).describe()

### Minimum and maximum value

In [None]:
# Smallest and largest absolute DAYS_BIRTH values.
# NOTE(review): presumably the column holds negative day counts, which is why
# the column maximum gives the smallest absolute value - confirm.
mini = abs(app_train['DAYS_BIRTH'].max())
if mini > 365:
    # The value is divided by 365 here, so the unit is years, not days.
    print("Days birth min :", mini/365, "years" )
else:
    print("Days birth min :", mini, "days" )

maxi = abs(app_train['DAYS_BIRTH'].min())
print("Days birth max :", maxi/365, "years" )

### Boxplots

In [None]:
# Box plot of the absolute DAYS_BIRTH values.
days_birth_abs = app_train['DAYS_BIRTH'].abs()
ax = sns.boxplot(x=days_birth_abs)
plt.title('Boxplot of Days Birth before cleaning')
plt.show()

## Missing values

In [None]:
# Percentage of missing values per column, largest first (top 30 shown).
print("30 first columns filled with the most NaN values with their percentage:")
(
    app_train.isnull()
    .sum()
    .div(app_train.shape[0])
    .mul(100)
    .sort_values(ascending=False)
    .head(30)
)

# Data cleanup 

## Label encoding

In [None]:
# Replace every text column by integer codes, using the same mapping for the
# training and the test frame.
le = LabelEncoder()
le_count = 0

for col in app_train:
    if app_train[col].dtype == 'object' or app_train[col].dtype == 'string':
        # Fit on the values of both frames: an encoder fitted on the training
        # column alone raises in transform() as soon as the test column
        # contains a category it has never seen.
        le.fit(pd.concat([app_train[col], app_test[col]]))
        app_train[col] = le.transform(app_train[col])
        app_test[col] = le.transform(app_test[col])
        le_count += 1
        print(col)

# The former reset_index() calls were dropped: their return value was
# discarded, so they never changed either frame.
print('%d columns were label encoded.' % le_count)

## Missing and infinite values

In [None]:
# Replacing Infinite values with NaN values
# Turn +/- infinity into NaN in both frames so that the imputation step
# treats them as missing values.
for frame in (app_train, app_test):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# Fill the NaN values of every training column with that column's median.
# fit_transform() already fits the imputer, so the extra fit() call was
# redundant; the imputed array also gets its own name instead of rebinding
# `imputer`.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
imputed_train = imputer.fit_transform(app_train)
app_train = pd.DataFrame(imputed_train, columns=app_train.columns.values.tolist())

In [None]:
# Impute the test frame with medians learned from the training data, so both
# frames are filled with the same statistics. The training frame also holds
# TARGET, which the test frame lacks, hence the column selection.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
imputer.fit(app_train[app_test.columns])
app_test = pd.DataFrame(imputer.transform(app_test), columns=app_test.columns.values.tolist())

## Handling unbalanced data

In [None]:
# Undersampling
# Random undersampling: resample the training rows so both TARGET classes are
# equally represented, then attach the resampled labels again.
Y = np.array(app_train['TARGET'])
X = app_train
X.drop(columns='TARGET', inplace=True)

rus = RandomUnderSampler(random_state=0)
app_train, y_resampled = rus.fit_resample(X, Y)
app_train['TARGET'] = y_resampled

class_counts = sorted(Counter(y_resampled).items())
print(class_counts, y_resampled.shape)

In [None]:
# Same bar chart as before, now on the undersampled training frame.
fig = sns.countplot(data=app_train, x="TARGET")
plt.title('Balance of target')
plt.show()

The target column is now balanced, which should help the models learn both classes instead of favouring the majority class.

## Days Employed feature

### Information

In [None]:
# Summary statistics of the raw DAYS_EMPLOYED column.
app_train['DAYS_EMPLOYED'].describe()

### Minimum and maximum values

In [None]:
# Absolute value of the column maximum and minimum; the first one is shown in
# years only when it exceeds one year.
days_employed = app_train['DAYS_EMPLOYED']

mini = abs(days_employed.max())
shown_value, shown_unit = (mini/365, "years") if mini > 365 else (mini, "days")
print("Days employed min :", shown_value, shown_unit)

maxi = abs(days_employed.min())
print("Days employed max :", maxi/365, "years")

We can see an anomaly : the biggest 'Days Employed' value is around 1000 years !

### Boxplots

In [None]:
# Box plot of the raw DAYS_EMPLOYED values, anomalies included.
days_employed_raw = app_train['DAYS_EMPLOYED']
ax = sns.boxplot(x=days_employed_raw)
plt.title('Boxplot of Days Employed before cleaning')
plt.show()

### Removing anomalies

In [None]:
# Remove, in both frames, every row whose DAYS_EMPLOYED exceeds 12000.
for frame in (app_train, app_test):
    anomalous_rows = frame.index[frame["DAYS_EMPLOYED"] > 12000]
    frame.drop(index=anomalous_rows, inplace=True)

In [None]:
# Box plot of the absolute DAYS_EMPLOYED values once the anomalies are gone.
days_employed_abs = app_train['DAYS_EMPLOYED'].abs()
ax = sns.boxplot(x=days_employed_abs)
plt.title('Boxplot of Days Employed after cleaning')
plt.show()

## Correlations

In [None]:
# Correlation of every column with TARGET, sorted ascending: the tail holds
# the most positive values, the head the most negative ones.
correlations = app_train.corr()['TARGET'].sort_values()

for heading, extreme_values in (
    ('Most Positive Correlations:\n', correlations.tail(15)),
    ('\nMost Negative Correlations:\n', correlations.head(15)),
):
    print(heading)
    print(extreme_values)

# Boruta

In [None]:
# Feature matrix (every column except TARGET) and label vector for Boruta.
# drop() without inplace returns a new frame, so app_train is left untouched.
Xdf = app_train.drop(columns='TARGET')
X_boruta = Xdf

y = app_train["TARGET"]

In [None]:
# Shallow random forest used by Boruta as its feature-importance estimator.
forest = RandomForestRegressor(n_jobs=-1, max_depth=5)

# Boruta feature selection: 20 trees per run, at most 100 trials.
boruta = bp(estimator=forest, n_estimators=20, max_iter=100)

# Boruta is fed plain arrays rather than pandas objects.
boruta.fit(X_boruta.to_numpy(), y.to_numpy())

In [None]:
# Features to keep
green_area = X_boruta.columns[boruta.support_].to_list()
blue_area = X_boruta.columns[boruta.support_weak_].to_list()
print("features in the green area", green_area)
print("features in the blue area", blue_area)

# Splitting data into train and test sets

In [None]:
# Restrict the training features and the test frame to the selected columns.
features = [*green_area, *blue_area]
X = X_boruta.loc[:, features]
app_test = app_test.loc[:, features]
X

In [None]:
# Hold out 25 % of the rows for evaluation (recommended test sizes for
# cross-validation: 20, 25 or 30 %), keeping the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
)

# KNeighbors Classifier Model

## Kneighbors - Hyperparameters

In [None]:
# Hyperparameter candidates: 1 to 4 neighbours, two distance metrics.
param_grid = dict(
    n_neighbors=np.arange(1, 5),
    metric=['euclidean', 'manhattan'],
)

In [None]:
# Grid search over param_grid for a k-nearest-neighbours classifier, using
# 5-fold cross-validation.
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

## Kneighbors - Model training

In [None]:
# Run the search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score as a percentage. Scale first and round last:
# rounding before the multiplication exposes float artefacts such as
# 56.99999999999999.
round(grid.best_score_ * 100, 2)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

## Kneighbors - Saving the best model

In [None]:
# Keep the best estimator found by the search and score it on the held-out split.
KN = grid.best_estimator_
score = KN.score(X_test, y_test)
score_pct = round(score*100, 2)
print("KNeighbors classifier score :", score_pct, '%\n')

# KNeighbors - Model Testing

In [None]:
# Predict the held-out split with the KNN model and show the predictions
# next to the true labels.
y_pred = KN.predict(X_test)
print("Predictions:\n\n", y_pred, '\n')
print("Real values:\n\n", y_test)

## KNeighbors - Confusion Matrix

In [None]:
# Confusion matrix of the KNN predictions: rows are the actual classes,
# columns the predicted ones.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every count into its cell of the heat map.
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# scikit-learn lays a binary matrix out as [[TN, FP], [FN, TP]], the positive
# class being TARGET = 1. The labels were previously attached to the wrong cells.
tn, fp, fn, tp = conf_matrix.ravel()
print('The confusion matrix shows us the number of :\n')
print('* True positives :', tp ,'\n')
print('* True negatives :', tn,'\n')
print('* False positives:', fp ,'\n')
print('* False negatives:', fn ,'\n')

## Kneighbors - Cross Validation Accuracy Score

In [None]:
# Mean accuracy of the KNN model over a 3-fold cross-validation on the
# training split, shown as a percentage.
kn_cv_scores = cross_val_score(KN, X_train, y_train, cv=3, scoring='accuracy')
kn_cv_accuracy = round((kn_cv_scores.mean())*100, 2)
print("Accuracy score using cross validation:", kn_cv_accuracy, '%\n')

## KNeighbors - Learning Curve

In [None]:
# Train/validation scores of the KNN model for ten training-set sizes
# (10 % to 100 % of the training split), 5-fold cross-validated.
N, train_score, val_score = learning_curve(
    estimator=KN,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

In [None]:
# Plot the fold-averaged score of each curve against the training-set size.
for fold_scores, curve_label in ((train_score, 'train'), (val_score, 'validation')):
    plt.plot(N, fold_scores.mean(axis=1), label=curve_label)
plt.xlabel('train_sizes')
plt.legend()
plt.show()

# Logistic regression

Our problem is a very binary one : will someone repay their credit or won't they ? 

This is why we use logistic regression as our machine learning model.

In [None]:
# Logistic regression with scikit-learn's default settings.
LR = LogisticRegression()

## LR - Model training

In [None]:
# Train the logistic regression on the training split.
LR.fit(X_train, y_train)

## LR - Model testing

In [None]:
# Predict the held-out split with the logistic regression and show the
# predictions next to the true labels.
y_pred = LR.predict(X_test)
print("Predictions:\n\n", y_pred, '\n')
print("Real values:\n\n", y_test)

## LR - Model evaluation

### LR - Confusion Matrix

In [None]:
# Confusion matrix of the logistic-regression predictions: rows are the
# actual classes, columns the predicted ones.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every count into its cell of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# scikit-learn lays a binary matrix out as [[TN, FP], [FN, TP]], the positive
# class being TARGET = 1. The labels were previously attached to the wrong cells.
tn, fp, fn, tp = conf_matrix.ravel()
print('The confusion matrix shows us the number of :\n')
print('* True positives :', tp ,'\n')
print('* True negatives :', tn,'\n')
print('* False positives:', fp ,'\n')
print('* False negatives:', fn ,'\n')

### LR - Classification Report

In [None]:
# Per-class precision, recall and F1 of the logistic-regression predictions.
print(classification_report(y_test, y_pred))

### LR - Accuracy Score

In [None]:
# Accuracy on the held-out split, then the mean accuracy of a 3-fold
# cross-validation on the training split (both as percentages).
lr_accuracy = round((accuracy_score(y_test, y_pred)*100), 3)
print("Accuracy score:", lr_accuracy, '%\n')

lr_cv_scores = cross_val_score(LR, X_train, y_train, cv=3, scoring='accuracy')
lr_cv_accuracy = round((lr_cv_scores.mean())*100, 2)
print("Accuracy score using cross validation:", lr_cv_accuracy, '%\n')

Model accuracy is a machine learning model performance metric that is defined as the ratio of true positives and true negatives to all positive and negative observations.

The accuracy rate is great but it doesn’t tell us anything about the errors our machine learning models make on new data we haven’t seen before.

Mathematically, it represents the ratio of the sum of true positive and true negatives out of all the predictions.

### LR - Precision Score

In [None]:
# Precision of the positive class (TARGET = 1): TP / (TP + FP). The default
# binary averaging is used so this figure is computed the same way as the
# recall and F1 scores reported next to it.
print("Precision score:", round((precision_score(y_test, y_pred)*100), 2), '%\n')

The precision score is a useful measure of the success of prediction when the classes are very imbalanced.

Mathematically, it represents the ratio of true positive to the sum of true positive and false positive.

### LR - Recall Score

In [None]:
# Recall of the logistic-regression predictions, as a percentage.
lr_recall = metrics.recall_score(y_test, y_pred)
print("Recall score:", round((lr_recall*100), 2), '%\n')

Model recall score represents the model’s ability to correctly predict the positives out of actual positives. This is unlike precision which measures how many predictions made by models are actually positive out of all positive predictions made.

Recall score is a useful measure of success of prediction when the classes are very imbalanced. 

Mathematically, it represents the ratio of true positive to the sum of true positive and false negative.

### LR - F1 Score

In [None]:
# F1 score as a percentage with two decimals. The closing parenthesis of
# round() used to sit before the `2`, which rounded to an integer and printed
# a stray "2" as a separate argument.
print("F1 Score:", round(metrics.f1_score(y_test, y_pred)*100, 2), '%\n')

The F1 score is the harmonic mean of the precision and recall scores. It is the metric to use when optimising for precision alone or for recall alone would be a poor compromise, because the first tends to let through many false negatives and the second many false positives.

### LR - ROC Curve

In [None]:
# Probability of the positive class (second column of predict_proba) for
# every row of the held-out split.
prediction_prob = LR.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, prediction_prob)
fpr, tpr, _ = metrics.roc_curve(y_test, prediction_prob)

# Draw the ROC curve, with the AUC shown in the legend.
auc_label = "AUC=" + str(auc)
plt.title("Receiver Operating Characteristic curve")
plt.plot(fpr, tpr, label=auc_label)
plt.legend(loc=4)
plt.show()

This curve plots the true positive rate against the false positive rate as the prediction probability cutoff is lowered from 1 to 0.

The higher the AUC (area under the curve), the more accurately our model is able to predict outcomes.

## LR - Learning Curve

In [None]:
# Train/validation scores of the logistic regression for ten training-set
# sizes (10 % to 100 % of the training split), 5-fold cross-validated.
N, train_score, val_score = learning_curve(
    estimator=LR,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

In [None]:
# Plot the fold-averaged score of each curve against the training-set size.
mean_train_score = train_score.mean(axis=1)
mean_val_score = val_score.mean(axis=1)
plt.plot(N, mean_train_score, label='train')
plt.plot(N, mean_val_score, label='validation')
plt.xlabel('train_sizes')
plt.legend()
plt.show()

## Using our LR model on application test

In [None]:
# Work on a copy so the predictions added later do not alter app_test itself.
app_test_LR = app_test.copy()

Application test doesn't have a TARGET column. 

That's why after the prediction, we can not see if our model finds the right value.

In [None]:
# Predict the application test set with the logistic regression and store
# the predictions as an integer TARGET column.
y_pred_test = LR.predict(app_test_LR)
app_test_LR = app_test_LR.assign(TARGET=y_pred_test.astype(int))
print(app_test_LR['TARGET'])

# Decision Tree

In [None]:
# Train a Gini-based decision tree on the training split (fit() returns the
# estimator itself) and draw the fitted tree.
DT = DecisionTreeClassifier(criterion='gini').fit(X_train, y_train)

plt.figure(figsize=(20, 20))
tree.plot_tree(DT)
plt.show()

## DT - Model testing

In [None]:
# Predict the held-out split with the decision tree and show the predictions
# next to the true labels.
y_pred = DT.predict(X_test)
print("Predictions:\n\n", y_pred, '\n')
print("Real values:\n\n", y_test)

## DT - Model evaluation

### DT - Confusion Matrix

In [None]:
# Confusion matrix of the decision-tree predictions: rows are the actual
# classes, columns the predicted ones.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every count into its cell of the heat map.
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# scikit-learn lays a binary matrix out as [[TN, FP], [FN, TP]], the positive
# class being TARGET = 1. The labels were previously attached to the wrong cells.
tn, fp, fn, tp = conf_matrix.ravel()
print('The confusion matrix shows us the number of :\n')
print('* True positives :', tp ,'\n')
print('* True negatives :', tn,'\n')
print('* False positives:', fp ,'\n')
print('* False negatives:', fn ,'\n')

### DT - Classification report

In [None]:
# Per-class precision, recall and F1 of the decision-tree predictions.
print("Classification report:\n\n", classification_report(y_test, y_pred))

### DT - Accuracy Score

In [None]:
# Accuracy on the held-out split, then the mean accuracy of a 3-fold
# cross-validation on the training split (both as percentages).
dt_accuracy = round((accuracy_score(y_test, y_pred)*100), 2)
print("Accuracy score:", dt_accuracy, '%\n')

dt_cv_scores = cross_val_score(DT, X_train, y_train, cv=3, scoring='accuracy')
dt_cv_accuracy = round((dt_cv_scores.mean())*100, 2)
print("Accuracy score using cross validation:", dt_cv_accuracy, '%\n')

## DT - Learning Curve

In [None]:
# Train/validation scores of the decision tree for ten training-set sizes
# (10 % to 100 % of the training split), 5-fold cross-validated.
N, train_score, val_score = learning_curve(
    estimator=DT,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

In [None]:
# Plot the fold-averaged score of each curve against the training-set size.
for fold_scores, curve_label in ((train_score, 'train'), (val_score, 'validation')):
    plt.plot(N, fold_scores.mean(axis=1), label=curve_label)
plt.xlabel('train_sizes')
plt.legend()
plt.show()

## Using our DT model on application test

In [None]:
# Work on a copy so the predictions added later do not alter app_test itself.
app_test_DT = app_test.copy()

In [None]:
# Predict the application test set with the decision tree and store the
# predictions as an integer TARGET column.
y_pred_test = DT.predict(app_test_DT)
app_test_DT = app_test_DT.assign(TARGET=y_pred_test.astype(int))
print(app_test_DT['TARGET'])

# Random Forest

In [None]:
# Random forest classifier with scikit-learn's default settings.
RF = RandomForestClassifier()

In [None]:
# Train the random forest on the training split.
RF.fit(X_train, y_train)

## RF - Model testing

In [None]:
# Predict the held-out split with the random forest and show the predictions
# next to the true labels.
y_pred = RF.predict(X_test)
print("Predictions:\n\n", y_pred, '\n')
print("Real values:\n\n", y_test)

## RF - Model evaluation

### RF - Confusion Matrix

In [None]:
# Confusion matrix of the random-forest predictions: rows are the actual
# classes, columns the predicted ones.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every count into its cell of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# scikit-learn lays a binary matrix out as [[TN, FP], [FN, TP]], the positive
# class being TARGET = 1. The labels were previously attached to the wrong cells.
tn, fp, fn, tp = conf_matrix.ravel()
print('The confusion matrix shows us the number of :\n')
print('* True positives :', tp ,'\n')
print('* True negatives :', tn,'\n')
print('* False positives:', fp ,'\n')
print('* False negatives:', fn ,'\n')

### RF - Classification Report

In [None]:
# Per-class precision, recall and F1 of the random-forest predictions. The
# heading is followed by a blank line (as in the decision-tree section) so the
# report's header row is not pushed out of alignment by the prefix.
print("Classification report:\n\n", classification_report(y_test, y_pred))

### RF -  Accuracy Score

In [None]:
# Accuracy on the held-out split, then the mean accuracy of a 3-fold
# cross-validation on the training split (both as percentages).
rf_accuracy = round((accuracy_score(y_test, y_pred)*100), 2)
print("Accuracy score:", rf_accuracy, '%\n')

rf_cv_scores = cross_val_score(RF, X_train, y_train, cv=3, scoring='accuracy')
rf_cv_accuracy = round((rf_cv_scores.mean())*100, 2)
print("Accuracy score using cross validation:", rf_cv_accuracy, '%\n')

## RF - Learning Curve

In [None]:
# Train/validation scores of the random forest for ten training-set sizes
# (10 % to 100 % of the training split), 5-fold cross-validated.
N, train_score, val_score = learning_curve(
    estimator=RF,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

In [None]:
# Plot the fold-averaged score of each curve against the training-set size.
mean_train_score = train_score.mean(axis=1)
mean_val_score = val_score.mean(axis=1)
plt.plot(N, mean_train_score, label='train')
plt.plot(N, mean_val_score, label='validation')
plt.xlabel('train_sizes')
plt.legend()
plt.show()

## Using our RF model on application test

In [None]:
# Work on a copy so the predictions added later do not alter app_test itself.
app_test_RF = app_test.copy()

In [None]:
# Predict the application test set with the random forest and store the
# predictions as an integer TARGET column.
y_pred_test = RF.predict(app_test_RF)
app_test_RF = app_test_RF.assign(TARGET=y_pred_test.astype(int))
print(app_test_RF['TARGET'])

# XGBoost

In [None]:
# Wrap both splits, together with their labels, in XGBoost's DMatrix container.
dtrain, dtest = (
    xgb.DMatrix(split_features, label=split_labels)
    for split_features, split_labels in ((X_train, y_train), (X_test, y_test))
)

## XGBoost - Model training

In [None]:
# Booster settings. The objective returns one probability per class for each
# row, which is the 2-D output the prediction step takes the argmax of.
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the removed 'silent' flag)
    'objective': 'multi:softprob',  # one probability per class for every row
    'num_class': 2}  # the number of classes in this dataset: TARGET is 0 or 1
num_round = 20  # the number of training iterations
XGBst = xgb.train(param, dtrain, num_round)

In [None]:
# Disabled snippet: the model dump below is wrapped in a string literal, so
# none of it is executed.
"""
XGBst.dump_model('dump.raw.txt')
f = open('dump.raw.txt', 'r')
print(f.read())
"""

## XGBoost - Model Testing

In [None]:
# The booster returns one row of class probabilities per sample; the
# predicted class is the column holding the highest probability.
probs_predictions = XGBst.predict(dtest)
y_pred = np.argmax(probs_predictions, axis=1)
print("Predictions:\n\n", y_pred, '\n')
print("Real values:\n\n", y_test)

## XGBoost - Model Evaluation

### XGBoost - Classification Report

In [None]:
# Per-class precision, recall and F1 of the XGBoost predictions. The heading
# is followed by a blank line (as in the decision-tree section) so the
# report's header row is not pushed out of alignment by the prefix.
print("Classification report:\n\n", classification_report(y_test, y_pred))

### XGBoost - Confusion Matrix

In [None]:
# Confusion matrix of the XGBoost predictions: rows are the actual classes,
# columns the predicted ones.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every count into its cell of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# scikit-learn lays the matrix out as [[TN, FP], [FN, TP]] for classes 0 and 1,
# the positive class being TARGET = 1. The labels were previously attached to
# the wrong cells. Cells are read by index rather than unpacked, because the
# booster may be configured with more than two classes.
print('The confusion matrix shows us the number of :\n')
print('* True positives :', conf_matrix[1][1] ,'\n')
print('* True negatives :', conf_matrix[0][0],'\n')
print('* False positives:', conf_matrix[0][1] ,'\n')
print('* False negatives:', conf_matrix[1][0] ,'\n')

# Comparing models predictions on application test

In [None]:
# TARGET predictions of the three models on the application test set.
LR_target = app_test_LR['TARGET']
DT_target = app_test_DT['TARGET']
RF_target = app_test_RF['TARGET']

# Series.compare() keeps only the rows on which the two series disagree.
one_dif = LR_target.compare(DT_target)
two_dif = LR_target.compare(RF_target)
three_dif = DT_target.compare(RF_target)

print('Difference between LR and DT on app_test:\nNumber of differences:', len(one_dif), '\n', one_dif)
print('Difference between LR and RF on app_test:\nNumber of differences:', len(two_dif), '\n', two_dif)
print('Difference between DT and RF on app_test:\nNumber of differences:', len(three_dif), '\n', three_dif)

if DT_target.equals(RF_target):
    print("Decision Tree and Random Forest found the same target values on application test")

# Both equalities are tested in one condition: with the former nested ifs the
# else branch belonged to the outer test only, so nothing was printed when LR
# matched DT but differed from RF.
if LR_target.equals(DT_target) and LR_target.equals(RF_target):
    print("All three models found the same target values on application test.")
else:
    print("All three models did not find the same target values on application test.")

# Conclusion

The data given was very unbalanced, we had to use undersampling to balance it to get accurate models.
From the bank's point of view, it is not a serious problem if the model predicts too many false positives, i.e. flags as defaulters clients who would in fact have repaid: the bank simply does not grant a loan to someone who could have repaid it. This isn't so great for the clients.