## Imports :

In [1]:
# Notebook-wide setup: silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn diagnostics;
# consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Data Processing :

In [3]:
# Retrieving the data: read the lending CSV (path is relative to the working directory).
df = pd.read_csv(Path('lending_data.csv'))

In [67]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [68]:
# Setting features: every column except the last one (loan_status, the target).
# .copy() makes X an independent frame, so the column added to / dropped from X
# in the following cells never writes into a slice of df (the situation pandas
# reports as SettingWithCopyWarning).
X = df.iloc[:, :-1].copy()

In [69]:
# Encoding the non-numeric feature: create the encoder that will turn the
# text-valued `homeowner` column into integer codes.
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [70]:
# Store the integer-coded homeowner categories in a new `home_owner` column.
# NOTE(review): LabelEncoder assigns codes in sorted class order, which imposes an
# artificial ordering on a nominal feature; one-hot encoding may be more appropriate.
X['home_owner'] = label.fit_transform(y=X['homeowner'])

In [71]:
# The text column is no longer needed once its encoded counterpart exists.
X = X.drop('homeowner', axis=1)

In [72]:
# Preview the feature matrix after encoding.
X.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,home_owner
0,10700.0,7.672,52800,0.431818,5,1,22800,1
1,8400.0,6.692,43600,0.311927,3,0,13600,1
2,9000.0,6.963,46100,0.349241,3,0,16100,2
3,10700.0,7.664,52700,0.43074,5,1,22700,1
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [73]:
# Setting the target: the loan_status label column.
y = df.loc[:, 'loan_status']

In [74]:
# Display the target Series (loan_status labels).
y

0         low_risk
1         low_risk
2         low_risk
3         low_risk
4         low_risk
           ...    
77531    high_risk
77532    high_risk
77533    high_risk
77534    high_risk
77535    high_risk
Name: loan_status, Length: 77536, dtype: object

In [75]:
# Summary statistics for every feature column.
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,home_owner
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.606144
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.667811
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,1.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,2.0


In [76]:
# Class balance of the target; the counts show low_risk heavily outnumbering high_risk.
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

## Splitting Data :

In [77]:
# Splitting the data set into training and testing sets (no test_size is given,
# so train_test_split uses its default proportion).
# NOTE(review): the classes are heavily imbalanced; consider stratify=y so both
# splits keep the same class ratio (this would change the recorded results).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

## Scaling :

In [78]:
# Scaling the features to prepare them for the model:
# create the standardiser that is fitted and applied in the next cells.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [79]:
# Fit the scaler on the training split only, so the test split does not influence it.
x_scaler = scaler.fit(X=X_train)

In [80]:
# Apply the training-fitted scaler to both splits.
x_train_scaled = x_scaler.transform(X=X_train)
x_test_scaled = x_scaler.transform(X=X_test)

## Logistic Regression :

In [81]:
# Instantiating the model: a logistic-regression classifier with a fixed seed.
# The same `model` object is refitted on each resampled training set further down.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)

In [82]:
# Baseline: fit on the scaled training data without any resampling.
model.fit(X=x_train_scaled, y=y_train)

LogisticRegression(random_state=1)

In [83]:
# Making predictions on the scaled test split with the baseline fit.
predictions = model.predict(X=x_test_scaled)

#### Evaluation of model :

In [21]:
# Balanced accuracy of the baseline model on the test split.
from sklearn.metrics import balanced_accuracy_score , confusion_matrix
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.9889115309798473

In [22]:
# Confusion matrix of the baseline model (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[  609    10]
 [  113 18652]]


In [111]:
# Per-class report (precision, recall, specificity, f1, geometric mean, iba, support)
# for the baseline model.
from imblearn.metrics import classification_report_imbalanced
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=predictions,
        digits=4,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk     0.8435    0.9838    0.9940    0.9083    0.9889    0.9769       619
   low_risk     0.9995    0.9940    0.9838    0.9967    0.9889    0.9789     18765

avg / total     0.9945    0.9937    0.9842    0.9939    0.9889    0.9788     19384



## Over Sampling:

In [25]:
# Naive Random Over Sampling 
from imblearn.over_sampling import RandomOverSampler

In [84]:
# Oversampling the training set only (the test split is left untouched)
# with a seeded RandomOverSampler.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X=x_train_scaled, y=y_train)

In [85]:
# Class counts after random oversampling; both classes should now be the same size.
Counter(y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [86]:
# Fitting the model with the randomly oversampled training set
# (this replaces the previous fit held by `model`).
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [87]:
# Making predictions on the (unresampled) scaled test split.
pred_resampled = model.predict(X=x_test_scaled)

#### Evaluation :

In [88]:
# Evaluating the performance of the model trained on randomly oversampled data.
balanced_accuracy_score(y_true=y_test, y_pred=pred_resampled)

0.9934649587814939

In [89]:
# Confusion matrix for the random-oversampling model.
confusion_matrix(y_true=y_test, y_pred=pred_resampled)

array([[  615,     4],
       [  124, 18641]], dtype=int64)

In [112]:
# Imbalanced-classification report for the random-oversampling model.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pred_resampled,
        digits=4,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk     0.8322    0.9935    0.9934    0.9057    0.9935    0.9870       619
   low_risk     0.9998    0.9934    0.9935    0.9966    0.9935    0.9870     18765

avg / total     0.9944    0.9934    0.9935    0.9937    0.9935    0.9870     19384



## SMOTE Oversampling

In [91]:
from imblearn.over_sampling import SMOTE

In [92]:
# SMOTE oversampling of the training set; sampling_strategy=1.0 requests a 1:1
# minority-to-majority ratio after resampling.
X_resampled1, y_resampled1 = SMOTE(
    sampling_strategy=1.0,
    random_state=1,
).fit_resample(X=x_train_scaled, y=y_train)
Counter(y_resampled1)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [97]:
# Fitting the model with the SMOTE-oversampled training set
# (this replaces the previous fit held by `model`).
model.fit(X=X_resampled1, y=y_resampled1)
# Making predictions on the scaled test split.
pred_resampled1 = model.predict(X=x_test_scaled)

#### Evaluation :

In [37]:
# Balanced accuracy of the SMOTE model on the test split.
balanced_accuracy_score(y_true=y_test, y_pred=pred_resampled1)

0.9934116680807212

In [38]:
# Confusion matrix for the SMOTE model.
confusion_matrix(y_true=y_test, y_pred=pred_resampled1)

array([[  615,     4],
       [  126, 18639]], dtype=int64)

In [113]:
# Imbalanced-classification report for the SMOTE model.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pred_resampled1,
        digits=4,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk     0.8300    0.9935    0.9933    0.9044    0.9934    0.9869       619
   low_risk     0.9998    0.9933    0.9935    0.9965    0.9934    0.9868     18765

avg / total     0.9944    0.9933    0.9935    0.9936    0.9934    0.9868     19384



## Under Sampling

In [41]:
# Undersampler based on cluster centroids, seeded for reproducibility.
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)

In [99]:
# Undersampling the training set only, then showing the resulting class counts.
X_resampled2, y_resampled2 = cc.fit_resample(X=x_train_scaled, y=y_train)
Counter(y_resampled2)

Counter({'high_risk': 1881, 'low_risk': 1881})

In [100]:
# Fitting the model with the undersampled training set
# (this replaces the previous fit held by `model`).
model.fit(X=X_resampled2, y=y_resampled2)

LogisticRegression(random_state=1)

In [101]:
# Making predictions on the scaled test split with the undersampling fit.
pred_undersampling = model.predict(X=x_test_scaled)

#### Evaluation :

In [108]:
# Balanced accuracy of the undersampling model on the test split.
balanced_accuracy_score(y_true=y_test, y_pred=pred_undersampling)

0.9921425487504449

In [49]:
# Confusion matrix for the undersampling model.
confusion_matrix(y_true=y_test, y_pred=pred_undersampling)

array([[  613,     6],
       [  113, 18652]], dtype=int64)

In [114]:
# Imbalanced-classification report for the undersampling model.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pred_undersampling,
        digits=4,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk     0.8444    0.9903    0.9940    0.9115    0.9921    0.9840       619
   low_risk     0.9997    0.9940    0.9903    0.9968    0.9921    0.9847     18765

avg / total     0.9947    0.9939    0.9904    0.9941    0.9921    0.9847     19384



## Combination of Over and UnderSampling

In [52]:
from imblearn.combine import SMOTEENN

In [53]:
# Combined over-/under-sampler (SMOTEENN), seeded for reproducibility.
sm = SMOTEENN(random_state=1)

In [102]:
# Resampling the training set with a combination of over- and undersampling,
# then showing the resulting class counts.
X_resampled3, y_resampled3 = sm.fit_resample(X=x_train_scaled, y=y_train)
Counter(y_resampled3)

Counter({'high_risk': 55551, 'low_risk': 55922})

In [103]:
# Fitting the model with the SMOTEENN-resampled training set
# (this replaces the previous fit held by `model`).
model.fit(X=X_resampled3, y=y_resampled3)

LogisticRegression(random_state=1)

In [104]:
# Making predictions on the scaled test split with the SMOTEENN fit.
pred_combine = model.predict(X=x_test_scaled)

#### Evaluation :

In [105]:
# Balanced accuracy of the SMOTEENN model on the test split.
balanced_accuracy_score(y_true=y_test, y_pred=pred_combine)

0.9934649587814939

In [106]:
# Confusion matrix for the SMOTEENN model.
confusion_matrix(y_true=y_test, y_pred=pred_combine)

array([[  615,     4],
       [  124, 18641]], dtype=int64)

In [115]:
# Imbalanced-classification report for the SMOTEENN model.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pred_combine,
        digits=4,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk     0.8322    0.9935    0.9934    0.9057    0.9935    0.9870       619
   low_risk     0.9998    0.9934    0.9935    0.9966    0.9935    0.9870     18765

avg / total     0.9944    0.9934    0.9935    0.9937    0.9935    0.9870     19384



## Findings :

#### Q-1: Best accuracy score ?

In [110]:
# Models trained on oversampled data perform better than the models trained without oversampling.
# Best balanced accuracy score: 0.9935 (99.35%), reached by both naive random oversampling and SMOTEENN.

#### Q-2: Best recall ?

In [116]:
# According to the classification reports, the best average (avg / total) recall is achieved when the data was undersampled.
# Recall score = 0.9939 (99.39%).

#### Q-3: Best geometric mean score ?

In [117]:
# The best geometric mean score was achieved when the data was oversampled (naive random oversampling and SMOTEENN).
# Geo score = 0.9935 (99.35%).