# Credit Score Modeling

## 1. Problem Definition

🎯 Our data is now clean, scaled, and encoded. We will test several models to select the one with the best performance.

## 2. Data Collection

In [1]:
# 1. DATA MANIPULATION
import pandas as pd
import numpy as np

# 2. DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# 3. STATISTICS
from statsmodels.graphics.gofplots import qqplot

# 4. MACHINE LEARNING

## 4.1. Preprocessing

### 4.1.1. Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

### 4.1.2. Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

### 4.1.3. Crossvalidation, Training, Model
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### 4.1.4. Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics  import ConfusionMatrixDisplay

In [2]:
# Import train data from KAGGLE
data_train_preprocessed = pd.read_csv('raw_data/df_preprocessed_23082024.csv', low_memory=False)
data_train_prep = data_train_preprocessed.copy()

# Import test data from KAGGLE
data_test_kaggle = pd.read_csv('raw_data/test_kaggle.csv', low_memory=False)
data_test = data_test_kaggle.copy()

# Working with the training dataset
df = data_train_prep

In [3]:
# Display all columns
pd.set_option('display.max_columns', None)

## 3. Data Exploration

### 3.1. Reminder about our dataset

In [4]:
df.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,sin_Month,cos_Month,Credit_History_Age_Months,Customer_ID_infrequent_sklearn,Occupation_Accountant,Occupation_Architect,Occupation_Developer,Occupation_Doctor,Occupation_Engineer,Occupation_Entrepreneur,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_MediaManager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard,Payment_of_Min_Amount_NM,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_HighspentLargevaluepayments,Payment_Behaviour_HighspentMediumvaluepayments,Payment_Behaviour_HighspentSmallvaluepayments,Payment_Behaviour_LowspentLargevaluepayments,Payment_Behaviour_LowspentMediumvaluepayments,Payment_Behaviour_LowspentSmallvaluepayments,Credit_Score
0,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.833333,-0.777778,0.196635,-0.333333,-0.258118,-0.649349,-0.150282,-0.292425,-0.111821,0.0,1.378396,0.291755,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-1.0,-0.888889,0.196635,-0.333333,-0.258118,-0.042731,-0.150282,-0.101779,-0.251368,0.36934,1.009056,0.312896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.833333,-0.777778,0.091483,-0.333333,-0.258118,-0.437753,-0.150282,-0.285959,-0.018093,0.504528,0.5045279,0.30444,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.722222,-1.111111,-0.329127,-0.333333,-0.258118,-0.10989,-0.150282,0.306942,-0.557745,0.36934,2.800693e-16,0.310782,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.666667,-0.888889,0.196635,-0.333333,-0.258118,-0.889194,-0.150282,-0.488761,0.033386,0.0,-0.3693401,0.317125,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0


In [5]:
df.shape

(100000, 47)

### 3.2. Defining the features X_preprocessed and the target y_encoded

In [6]:
y_encoded = df['Credit_Score']
y_encoded.head()

0    0
1    0
2    0
3    0
4    0
Name: Credit_Score, dtype: int64

In [7]:
y_encoded.value_counts()

Credit_Score
2    53174
1    28998
0    17828
Name: count, dtype: int64

In [8]:
y = y_encoded

In [9]:
X_preprocessed = df.drop(columns=['Credit_Score'])
X_preprocessed.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,sin_Month,cos_Month,Credit_History_Age_Months,Customer_ID_infrequent_sklearn,Occupation_Accountant,Occupation_Architect,Occupation_Developer,Occupation_Doctor,Occupation_Engineer,Occupation_Entrepreneur,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_MediaManager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard,Payment_of_Min_Amount_NM,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_HighspentLargevaluepayments,Payment_Behaviour_HighspentMediumvaluepayments,Payment_Behaviour_HighspentSmallvaluepayments,Payment_Behaviour_LowspentLargevaluepayments,Payment_Behaviour_LowspentMediumvaluepayments,Payment_Behaviour_LowspentSmallvaluepayments
0,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.833333,-0.777778,0.196635,-0.333333,-0.258118,-0.649349,-0.150282,-0.292425,-0.111821,0.0,1.378396,0.291755,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-1.0,-0.888889,0.196635,-0.333333,-0.258118,-0.042731,-0.150282,-0.101779,-0.251368,0.36934,1.009056,0.312896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.833333,-0.777778,0.091483,-0.333333,-0.258118,-0.437753,-0.150282,-0.285959,-0.018093,0.504528,0.5045279,0.30444,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.722222,-1.111111,-0.329127,-0.333333,-0.258118,-0.10989,-0.150282,0.306942,-0.557745,0.36934,2.800693e-16,0.310782,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.555556,-0.346209,-0.75,-0.333333,-0.833333,0.25,-0.666667,-0.888889,0.196635,-0.333333,-0.258118,-0.889194,-0.150282,-0.488761,0.033386,0.0,-0.3693401,0.317125,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
X_preprocessed.shape

(100000, 46)

In [11]:
X = X_preprocessed

## 4. Logistic Regression

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((70000, 46), (30000, 46), (70000,), (30000,))

In [13]:
# Instantiate a Logistic Regression model
logreg = LogisticRegression(multi_class='multinomial', max_iter=10000)

# Train the model
logreg.fit(X_train, y_train)

# Evaluate the model
logreg.score(X_test, y_test)



0.6248666666666667

## 5. Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate the model
decision_tree = DecisionTreeClassifier()

# Train the model
decision_tree.fit(X_train, y_train)

# Evaluate the model
decision_tree.score(X_test, y_test)

0.7131666666666666

## 6. Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate the model
random_forest = RandomForestClassifier()

# Train the model
random_forest.fit(X_train, y_train)

# Evaluate the model
random_forest.score(X_test, y_test)

0.7950333333333334

## 7. Linear SVM

In [22]:
from sklearn.svm import LinearSVC

# Instantiate the model
linear_svm = LinearSVC(C=1, max_iter=10000)

# Train the model
linear_svm.fit(X_train, y_train)

# Evaluate the model
linear_svm.score(X_test, y_test)

0.6320666666666667

## 8. Polynomial SVM

In [15]:
from sklearn.svm import SVC

# Instantiate the model
svm_poly = SVC(kernel='poly', degree=2, C=1)

# Train the model
svm_poly.fit(X_train, y_train)

# Evaluate the model
svm_poly.score(X_test, y_test)

0.5336333333333333

## 9. CatBoost

In [13]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-5.23.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly->catboost)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-macosx_11_0_universal2.whl (26.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.2/26.2 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Downloading plotly-5.23.0-py3-none-any.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, graphviz, plotly, c

In [14]:
from catboost import CatBoostClassifier

# Instantiate the model
catboost_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, verbose=0)

# Train the model
catboost_model.fit(X_train, y_train)

# Evaluate the model
catboost_model.score(X_test, y_test)

0.7456666666666667

## 10. XGBoost

In [16]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [18]:
import xgboost as xgb
from xgboost import XGBClassifier

# Instantiate the model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Evaluate the model
xgb_model.score(X_test, y_test)

0.7108666666666666

## 11. AdaBoost

In [35]:
from sklearn.ensemble import AdaBoostClassifier

# Instantiate the model
ada_boost = AdaBoostClassifier()

# Train the model
ada_boost.fit(X_train, y_train)

# Evaluate the model
ada_boost.score(X_test, y_test)



0.6528

## 12. Extra Trees Classifier (Extremely Randomized Trees)

In [37]:
from sklearn.ensemble import ExtraTreesClassifier

# Instantiate the model
extra_trees = ExtraTreesClassifier()

# Train the model
extra_trees.fit(X_train, y_train)

# Evaluate the model
extra_trees.score(X_test, y_test)

0.7556666666666667

## 13. Voting Classifier

In [42]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Instantiate individual models
extra_trees = ExtraTreesClassifier()
rf = RandomForestClassifier()
cb = CatBoostClassifier()

# Instantiate the Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('extra_trees', extra_trees),
    ('rf', rf),
    ('cb', cb)
], voting='soft')  # or 'hard'

# Train the model
voting_clf.fit(X_train, y_train)

# Evaluate the model
voting_clf.score(X_test, y_test)

Learning rate set to 0.098271
0:	learn: 1.0301641	total: 15ms	remaining: 15s
1:	learn: 0.9767045	total: 22.6ms	remaining: 11.3s
2:	learn: 0.9327783	total: 34ms	remaining: 11.3s
3:	learn: 0.8960366	total: 47.8ms	remaining: 11.9s
4:	learn: 0.8666921	total: 56.5ms	remaining: 11.2s
5:	learn: 0.8423598	total: 67ms	remaining: 11.1s
6:	learn: 0.8200999	total: 75.4ms	remaining: 10.7s
7:	learn: 0.8020749	total: 85.1ms	remaining: 10.6s
8:	learn: 0.7863088	total: 96.4ms	remaining: 10.6s
9:	learn: 0.7723316	total: 109ms	remaining: 10.8s
10:	learn: 0.7604584	total: 118ms	remaining: 10.6s
11:	learn: 0.7495794	total: 127ms	remaining: 10.4s
12:	learn: 0.7401943	total: 135ms	remaining: 10.2s
13:	learn: 0.7316936	total: 143ms	remaining: 10.1s
14:	learn: 0.7236382	total: 149ms	remaining: 9.82s
15:	learn: 0.7168987	total: 156ms	remaining: 9.58s
16:	learn: 0.7109210	total: 163ms	remaining: 9.45s
17:	learn: 0.7056452	total: 172ms	remaining: 9.37s
18:	learn: 0.7004624	total: 181ms	remaining: 9.36s
19:	learn:

0.7843333333333333

## 14. Deep Learning

In [None]:
# pip install tensorflow

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [20]:
X_train.shape

(70000, 46)

In [21]:
# Define the model
model = Sequential()
model.add(layers.Dense(25, activation='relu', input_dim=46))  # Input layer with 25 neurons and input_dim=46 (features in X_train)
model.add(layers.Dense(10, activation='relu'))  # Hidden layer with 10 neurons
model.add(layers.Dense(1, activation='sigmoid'))  # Binary because our target has been encoded and its shape is 1

# Compile the model
model.compile(
    optimizer=Adam(),
    loss=CategoricalCrossentropy(),  # Correct usage
    metrics=['accuracy']
)

# Define EarlyStopping callback
es = EarlyStopping(
    patience=30,
    restore_best_weights = True
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.3,
    epochs = 20, # Try increasing to 500 later
    batch_size = 16, 
    verbose = 1, 
    callbacks = [es]
)

# Evaluate the model
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print(f'Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m2999/3063[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 200us/step - accuracy: 0.2923 - loss: 0.0000e+00

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282us/step - accuracy: 0.2923 - loss: 0.0000e+00 - val_accuracy: 0.2887 - val_loss: 0.0000e+00
Epoch 2/20
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264us/step - accuracy: 0.2872 - loss: 0.0000e+00 - val_accuracy: 0.2887 - val_loss: 0.0000e+00
Epoch 3/20
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264us/step - accuracy: 0.2895 - loss: 0.0000e+00 - val_accuracy: 0.2887 - val_loss: 0.0000e+00
Epoch 4/20
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264us/step - accuracy: 0.2906 - loss: 0.0000e+00 - val_accuracy: 0.2887 - val_loss: 0.0000e+00
Epoch 5/20
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264us/step - accuracy: 0.2851 - loss: 0.0000e+00 - val_accuracy: 0.2887 - val_loss: 0.0000e+00
Epoch 6/20
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 269us/step - accuracy: 0.2903 - loss: 0.0000e+00 - val_accurac

## 15. Random Forest Tuning

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import accuracy_score

# Define the search space for hyperparameters
param_distributions = {
    'n_estimators': randint(100, 300),  # Number of trees in the forest
    'max_depth': [None, 10, 20],        # Maximum depth of trees
    'min_samples_split': randint(2, 10), # Minimum number of samples to split a node
    'min_samples_leaf': randint(1, 5),   # Minimum number of samples at a leaf node
}

# Instantiate RandomForest and RandomizedSearchCV
rf = RandomForestClassifier(random_state=42)
search = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=10,                   # Number of random combinations to try
    cv=5,                       # Number of cross-validation folds
    scoring='accuracy',         # Metric for evaluating the model
    n_jobs=-1,                  # Use all available cores
    random_state=42,
    verbose=1
)

# Train the model with hyperparameter search
search.fit(X_train, y_train)

# Display the best parameters
print("Best hyperparameters:", search.best_params_)

# Evaluate the best model
best_rf = search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 157}
Accuracy: 0.7972


## 16. Feature Importance

In [33]:
from sklearn.inspection import permutation_importance

In [43]:
'''# Perform the permutation
permutation_score = permutation_importance(search, X_train, y_train, n_repeats=2)

# Unstack results showing the decrease in performance after shuffling features
importance_df = pd.DataFrame(np.vstack((X_train.columns, permutation_score.importances_mean)).T)
importance_df.columns=['feature','score importance']

# Show the important features
importance_df.sort_values(by="score importance", ascending = False)
'''

'# Perform the permutation\npermutation_score = permutation_importance(search, X_train, y_train, n_repeats=2)\n\n# Unstack results showing the decrease in performance after shuffling features\nimportance_df = pd.DataFrame(np.vstack((X_train.columns, permutation_score.importances_mean)).T)\nimportance_df.columns=[\'feature\',\'score importance\']\n\n# Show the important features\nimportance_df.sort_values(by="score importance", ascending = False)\n'

In [None]:
! git add .
! git commit -m "Modeling_24082024"
! git push origin greg_branch