## CS50x 2024 - FINAL PROJECT

Made by: Pedro Nícollas Pereira Leon Lopes

A machine-learning classification model that predicts the approval status of bank loans.

In [1]:
# Dataset provenance note. This bare string literal is the cell's last
# expression, so it is echoed back as the cell output.
'''
DATASET -> loan_data (data/loan_data.csv)

Available at: https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data
'''

'\nDATASET -> loan_data (data/loan_data.csv)\n\nAvailable at: https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data\n'

In [2]:
# Collection of Imports
import pandas as pd
import numpy as np

In [3]:
# Load the loan dataset; the path is relative to the notebook's working directory
loan_csv_path = "../data/loan_data.csv"
data = pd.read_csv(loan_csv_path)

data.head(n=5)          # Preview the first five rows

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [4]:
# MAPPING CATEGORICAL DATA TO NUMERICAL DATA
categorical_columns = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]

# Every column that must be present for a row to be usable
required_columns = ['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file', 'loan_status']

# Drop incomplete rows BEFORE encoding: pd.factorize codes missing values as -1
# (not NaN), so a dropna() placed after the encoding can never remove rows whose
# categorical fields are missing. `.copy()` yields an independent frame so the
# column assignments below do not write into a slice of the loaded data.
data = data.dropna(subset=required_columns).copy()

# Keep the code -> label lookup of every encoded column, keyed by column name.
# Reusing one variable for all five calls would keep only the last lookup.
category_mappings = {}
for col in categorical_columns:
    data[col], category_mappings[col] = pd.factorize(data[col])

# Backward compatibility: `bank_mapping` still ends up holding the labels of the
# last encoded column ('previous_loan_defaults_on_file').
bank_mapping = category_mappings[categorical_columns[-1]]

data.head()            # Entirely Numeric Representation

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,0,0,71948.0,0,0,35000.0,0,16.02,0.49,3.0,561,0,1
1,21.0,0,1,12282.0,0,1,1000.0,1,11.14,0.08,2.0,504,1,0
2,25.0,0,1,12438.0,3,2,5500.0,2,12.87,0.44,3.0,635,0,1
3,23.0,0,2,79753.0,0,0,35000.0,2,15.23,0.44,2.0,675,0,1
4,24.0,1,0,66135.0,1,0,35000.0,2,14.27,0.53,4.0,586,0,1


Normalizing the Data (0-1)

In [5]:
from sklearn.preprocessing import MinMaxScaler          # Importing "sklearn" to use MinMax normalizer
# Scaler instance configured to map fitted values onto the 0-1 range
scaled_range = (0, 1)
normalizer = MinMaxScaler(feature_range=scaled_range)

In [6]:
# Normalizing the Data (0-1)
columns_to_normalize = ['person_age', 'person_income', 'loan_amnt', 'loan_int_rate', 'credit_score']

# Fit the scaler ONCE on all selected columns. MinMaxScaler rescales each feature
# independently, so the resulting values match a column-by-column fit, but the
# fitted `normalizer` now retains the min/max of every column instead of only the
# last one it was refitted on. That makes it reusable for transforming new inputs.
scaled_values = normalizer.fit_transform(data[columns_to_normalize].to_numpy())

# Write each scaled feature back to its own column
for position, col in enumerate(columns_to_normalize):
    data[col] = scaled_values[:, position]

# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling of
# the training features. Consider fitting on the training portion only.

data.head()            # Columns after normalizations

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,0.016129,0,0,0.008891,0,0,1.0,0,0.727023,0.49,3.0,0.371739,0,1
1,0.008065,0,1,0.000595,0,1,0.014493,1,0.392318,0.08,2.0,0.247826,1,0
2,0.040323,0,1,0.000617,3,2,0.144928,2,0.510974,0.44,3.0,0.532609,0,1
3,0.024194,0,2,0.009976,0,0,1.0,2,0.67284,0.44,2.0,0.619565,0,1
4,0.032258,1,0,0.008082,1,0,1.0,2,0.606996,0.53,4.0,0.426087,0,1


Splitting the data into Training and Test Samples

In [7]:
from sklearn.model_selection import train_test_split            # Importing "train_test_split" from sklearn to train the model

In [8]:
# Feature matrix: every column except the target 'loan_status'
X = data.drop(columns=['loan_status']).to_numpy()

# Target vector: the 'loan_status' column
Y = data['loan_status'].to_numpy()

In [9]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# The fixed random_state keeps the split reproducible between runs.
X_treino, X_teste, Y_treino, Y_teste = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=0,
)

### Using *Random Forest Classifier Model*

In [10]:
from sklearn.ensemble import RandomForestClassifier                                     # Importing the RandomForestClassifier() model
# Random forest instance: fixed seed for reproducibility, 'balanced' class weighting
loan_model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
)

In [11]:
# Fit the classifier on the training features and their labels
loan_model.fit(X=X_treino, y=Y_treino)

In [12]:
# Predicted labels for the held-out test features
Y_previsto = loan_model.predict(X=X_teste)

## RESULTS

In [13]:
# Imports for evaluation Metrics
from sklearn.metrics import (accuracy_score, f1_score)          # Accuracy and F1-score

In [14]:
# ACCURACY SCORE:
accuracy = accuracy_score(Y_teste, Y_previsto)
print(f"ACCURACY: {accuracy:.2f}")
# F1-SCORE:
# The result is stored under its own name. Assigning it to `f1_score` would
# overwrite the imported function with a float, and re-running this cell would
# then fail with "TypeError: ... object is not callable".
f1 = f1_score(Y_teste, Y_previsto)
print(f"F1-SCORE: {f1:.2f}")

ACCURACY: 0.93
F1-SCORE: 0.82


### Creating a ".pkl" file for use in the Flask application

In [15]:
import joblib           # Importing joblib to save the model (.pkl)

In [16]:
# Persist the trained classifier to disk with joblib (.pkl).
# As the last expression, the return value of dump() is shown as the cell output.
joblib.dump(value=loan_model, filename='../ML_model/loan_model.pkl')

['../ML_model/loan_model.pkl']