In [22]:
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier


# Constants

In [14]:
RANDOM_STATE = 42
TEST_SIZE = 0.2


# Drop the unuseful columns

In [15]:
train_data = pd.read_csv("../data/processed/train_processed.csv")
columns_to_drop = ["ID", "Customer_ID", "Month", "Name", "SSN"] # Nominal columns that don't matter
train_data.drop(columns_to_drop, axis=1, inplace=True)



# Label Encoding

In [16]:
categorical_columns = ['Occupation','Type_of_Loan','Credit_Mix','Payment_of_Min_Amount','Payment_Behaviour','Credit_Score']

label_encoder = LabelEncoder()
for column in categorical_columns:
    train_data[column] = label_encoder.fit_transform(train_data[column])

train_data.head()


Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,12,19114.12,1824.843333,3.0,4,3,4.0,128,3.0,...,3,809.98,26.82262,265.0,1,49.574949,80.415295,2,312.494089,0
1,23.0,12,19114.12,3093.745,3.0,4,3,4.0,128,18.0,...,1,809.98,31.94496,,1,49.574949,118.280222,3,284.629162,0
2,33.0,12,19114.12,3093.745,3.0,4,3,4.0,128,3.0,...,1,809.98,28.609352,267.0,1,49.574949,81.699521,4,331.209863,0
3,23.0,12,19114.12,3093.745,3.0,4,3,4.0,128,5.0,...,1,809.98,31.377862,268.0,1,49.574949,199.458074,5,223.45131,0
4,23.0,12,19114.12,1824.843333,3.0,4,3,4.0,128,6.0,...,1,809.98,24.797347,269.0,1,49.574949,41.420153,1,341.489231,0


In [17]:
X = train_data.drop("Credit_Score", axis=1)
y = train_data["Credit_Score"]


In [18]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)


# Models building

## Split data


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE,stratify=y)


## Helper function


In [20]:
def evaluate_model(y_true, y_pred, labels=None, title="Confusion Matrix", cmap="Greens"):
    print("Classification Report")
    print(classification_report(y_true, y_pred))

    print("\n---------------------------------------------\n")
    cm = confusion_matrix(y_true, y_pred)

    ax = sns.heatmap(
        cm,
        annot=True,
        cmap=cmap,
        fmt=".0f"
    )

    ax.set_xlabel("Predicted labels")
    ax.set_ylabel("True labels")
    ax.set_title(title)


## Models


In [25]:
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('CatBoost', CatBoostClassifier(verbose=0))
]

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_macro',
    'recall': 'recall_macro'
}

for name, clf in classifiers:
    scores = cross_validate(
        clf,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
        n_jobs=-1
    )

    avg_accuracy = scores['test_accuracy'].mean()
    avg_precision = scores['test_precision'].mean()
    avg_recall = scores['test_recall'].mean()

    print(f'Classifier: {name}')
    print(f'Average Accuracy: {avg_accuracy:.4f}')
    print(f'Average Precision: {avg_precision:.4f}')
    print(f'Average Recall: {avg_recall:.4f}')
    print('-----------------------')


Classifier: Decision Tree
Average Accuracy: 0.6734
Average Precision: 0.6500
Average Recall: 0.6515
-----------------------
Classifier: Random Forest
Average Accuracy: 0.7813
Average Precision: 0.7674
Average Recall: 0.7659
-----------------------
Classifier: CatBoost
Average Accuracy: 0.7434
Average Precision: 0.7259
Average Recall: 0.7226
-----------------------
