__Machine Learning__

This section details the differnet models I tried out to classify the credit as Fully Paid or Charged Off.


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt

In [33]:
# Set display options to avoid line breaks
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)  # Set the display width to avoid line breaks
pd.set_option('display.float_format', '{:.0f}'.format)#silence scientific displays

In [99]:
#Load the Clean Data
import os
# Change the working directory to where the data file is located
os.chdir('C:/Users/moger/Downloads/Loan_Prediction_ML')

# Specify the absolute path to the data file
file_path = 'C:/Users/moger/Downloads/Loan_Prediction_ML/Clean_Data/credit_train_final_clean_v1.csv'

# Load the raw data
credit_train = pd.read_csv(file_path)

In [101]:
#drop unnecessary columns
credit_train.drop(['Loan ID', 'Customer ID'], axis=1, inplace=True)


In [111]:
#Create a new calculated field loan to income ratio
credit_train['Loan_to_Income_Ratio'] = credit_train['Current Loan Amount'] / credit_train['Annual Income']

In [113]:
#Convert the loan status variable to a dummy
credit_train['Loan Status'] = credit_train['Loan Status'].map({'Charged Off': 0, 'Fully Paid': 1})

In [55]:
print(credit_train.head(10))

   Loan Status  Current Loan Amount        Term  Credit Score  Annual Income Home Ownership             Purpose  Monthly Debt  Years of Credit History  Months since last delinquent  Number of Open Accounts  Number of Credit Problems  Current Credit Balance  Maximum Open Credit  Tax Liens
0   Fully Paid               217646  Short Term           730        1184194  Home Mortgage  Debt Consolidation         10855                       20                            10                       13                          1                  122170               272052          0
1   Fully Paid               548746  Short Term           678        2559110           Rent  Debt Consolidation         18660                       23                            33                        4                          0                  437171               555038          0
2   Fully Paid               234124  Short Term           727         693234           Rent  Debt Consolidation         14211        

In [115]:
# Define features and target
X = credit_train.drop('Loan Status', axis=1)
y = credit_train['Loan Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=87)

In [117]:
#Encode categorical variables
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align the train and test data to ensure they have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=87)
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.09      0.15      4556
           1       0.76      0.97      0.85     13401

    accuracy                           0.75     17957
   macro avg       0.64      0.53      0.50     17957
weighted avg       0.70      0.75      0.67     17957

Random Forest Accuracy: 0.748287575875703
Random Forest Confusion Matrix:
 [[  410  4146]
 [  374 13027]]


In [123]:
#Train a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Assuming X_train, X_test, y_train, y_test are already defined and preprocessed

# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(random_state=87)
lr_model.fit(X_train, y_train)

# Make predictions
lr_predictions = lr_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_predictions))
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, lr_predictions))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.01      0.01      4556
           1       0.75      1.00      0.86     13401

    accuracy                           0.75     17957
   macro avg       0.73      0.50      0.43     17957
weighted avg       0.74      0.75      0.64     17957

Logistic Regression Accuracy: 0.7471738040875424
Logistic Regression Confusion Matrix:
 [[   26  4530]
 [   10 13391]]


In [125]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the model
lr_model = LogisticRegression(random_state=87)

# Define the hyperparameters and their values
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'solver': ['liblinear', 'lbfgs']  # Different solvers
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=5, scoring='accuracy')
# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

Best Parameters: {'C': 0.1, 'solver': 'liblinear'}
Best Cross-Validation Score: 0.7482179706609938


In [120]:
#Try XG Boost model to compare
import xgboost as xgb

# Initialize and train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=49)
xgb_model.fit(X_train, y_train)

# Make predictions
xgb_predictions = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions))
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, xgb_predictions))



Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.13      0.20      4556
           1       0.76      0.95      0.85     13401

    accuracy                           0.74     17957
   macro avg       0.62      0.54      0.52     17957
weighted avg       0.69      0.74      0.68     17957

XGBoost Accuracy: 0.7421061424514117
XGBoost Confusion Matrix:
 [[  585  3971]
 [  660 12741]]


In [127]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the SVM model
svm_model = SVC(kernel='linear', random_state=92)
svm_model.fit(X_train_scaled, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test_scaled)

# Evaluate the model
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))
print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, svm_predictions))


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4556
           1       0.75      1.00      0.85     13401

    accuracy                           0.75     17957
   macro avg       0.37      0.50      0.43     17957
weighted avg       0.56      0.75      0.64     17957

SVM Accuracy: 0.746282786657014
SVM Confusion Matrix:
 [[    0  4556]
 [    0 13401]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=87)
gb_model.fit(X_train, y_train)

# Make predictions
gb_predictions = gb_model.predict(X_test)

# Evaluate the model
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, gb_predictions))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_predictions))
print("Gradient Boosting Confusion Matrix:\n", confusion_matrix(y_test, gb_predictions))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.05      0.09      4556
           1       0.75      0.99      0.85     13401

    accuracy                           0.75     17957
   macro avg       0.66      0.52      0.47     17957
weighted avg       0.70      0.75      0.66     17957

Gradient Boosting Accuracy: 0.7490672161274154
Gradient Boosting Confusion Matrix:
 [[  236  4320]
 [  186 13215]]


In [131]:
from sklearn.neural_network import MLPClassifier

# Initialize and train the Neural Network model
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=87)
nn_model.fit(X_train_scaled, y_train)

# Make predictions
nn_predictions = nn_model.predict(X_test_scaled)

# Evaluate the model
print("Neural Network Classification Report:")
print(classification_report(y_test, nn_predictions))
print("Neural Network Accuracy:", accuracy_score(y_test, nn_predictions))
print("Neural Network Confusion Matrix:\n", confusion_matrix(y_test, nn_predictions))

Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.11      0.17      4556
           1       0.76      0.95      0.84     13401

    accuracy                           0.73     17957
   macro avg       0.58      0.53      0.51     17957
weighted avg       0.67      0.73      0.67     17957

Neural Network Accuracy: 0.7345324942919196
Neural Network Confusion Matrix:
 [[  491  4065]
 [  702 12699]]


In [134]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train_encoded)
nb_predictions = nb_model.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test_encoded, nb_predictions))
print("Naive Bayes Accuracy:", accuracy_score(y_test_encoded, nb_predictions))
print("Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test_encoded, nb_predictions))

# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train_encoded)
knn_predictions = knn_model.predict(X_test)
print("KNN Classification Report:")
print(classification_report(y_test_encoded, knn_predictions))
print("KNN Accuracy:", accuracy_score(y_test_encoded, knn_predictions))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test_encoded, knn_predictions))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.03      0.06      4556
           1       0.75      0.97      0.84     13401

    accuracy                           0.73     17957
   macro avg       0.49      0.50      0.45     17957
weighted avg       0.62      0.73      0.64     17957

Naive Bayes Accuracy: 0.7287408809934844
Naive Bayes Confusion Matrix:
 [[  144  4412]
 [  459 12942]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.17      0.22      4556
           1       0.76      0.88      0.81     13401

    accuracy                           0.70     17957
   macro avg       0.54      0.52      0.52     17957
weighted avg       0.65      0.70      0.66     17957

KNN Accuracy: 0.7008965862894693
KNN Confusion Matrix:
 [[  760  3796]
 [ 1575 11826]]


I will need to chose between Random Forest model and Logistic regression model as they tend to tie in accuracy and precision, with the logistic regression model having a better precsion for correctly classifying charged off laons (72%) versus random forest's 52%. 

Also I will compare the models with the test data.