* Problem: Predict whether a customer will default on a loan based on their financial and demographic info. 
* ML Framework: scikit-learn
* Link to dataset: https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset?resource=download 

In [None]:
#=========IMPORT LIBRARIES=========
# pandas / numpy: tabular data loading and numerical helpers.
import pandas as pd
import numpy as np
# scikit-learn pieces used below:
#   * train_test_split      -> hold out a test set
#   * StandardScaler        -> scale numeric columns
#   * OneHotEncoder         -> expand categorical columns into indicator columns
#   * ColumnTransformer     -> apply different transformers to different columns
#   * Pipeline              -> chain preprocessing and the classifier together
#   * RandomForestClassifier-> the model itself
#   * metrics               -> evaluation of the predictions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
#=========LOAD DATA=========
# Read the dataset from a local CSV; the path is relative to the current
# working directory.
data = pd.read_csv("data/UCI_Credit_Card.csv")

# 'ID' is not needed for training or testing, so remove it.
# DataFrame.drop returns a NEW frame (it does not modify `data` in place),
# so the result must be assigned back or the column silently stays.
data = data.drop(columns=['ID'])

# Quick look at the data.
# head() is printed explicitly: a bare expression that is not the last line of
# a notebook cell (or any line of a plain script) displays nothing.
print(data.head())
# info() writes its summary to stdout on its own.
data.info()
# Left as the trailing expression so a notebook renders the summary table.
data.describe()

In [None]:
#=========PREPROCESSING=========
# What this cell sets up:
#   * Encode categorical features
#   * Scale numerical features
# NOTE: no missing-value handling (imputation) is configured here; add an
# imputer step if the data turns out to contain NaNs.

# Columns treated as categories.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# Columns that must never be model inputs: the prediction target and the
# 'ID' row identifier. Excluding 'ID' here keeps it out of the features even
# if it is still present in `data`.
non_feature_columns = ['default.payment.next.month', 'ID']

# Everything that is neither categorical nor excluded is treated as numeric.
numeric_features = [
    col for col in data.columns
    if col not in categorical_features and col not in non_feature_columns
]

# preprocessor - a preprocessor is basically a *set of instructions that tells the pipeline how to
# transform raw data before training*. Real-world data is usually not ready to go straight into a
# machine learning model; the preprocessor is the cleaner + translator that makes it usable.
# For example:
#   * numbers may be on very different scales (income in thousands, age in years,
#     a three-digit credit score)
#   * some features are categorical (like marital status) and should not be read as quantities
#   * some columns might have missing values
#
# To address this, a preprocessor can:
#   * scale numeric features    -> so features are comparable in scale
#   * encode categorical features -> turning categories into indicator (0/1) columns
#   * handle missing data, feature selection, etc. (not done in this notebook)

# StandardScaler is applied to the numeric columns: each is shifted and scaled
# to mean 0 and standard deviation 1.
# OneHotEncoder is applied to the categorical columns: each category value
# becomes its own binary column. handle_unknown='ignore' makes a category that
# was not seen during fit encode as all zeros instead of raising at predict time.
# Columns not listed in either group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

In [None]:
#=========SPLIT DATA=========

# Name of the column we are trying to predict.
target_column = 'default.payment.next.month'

# X = every column except the target (the input features).
X = data.drop(columns=[target_column])
# y = only the target column.
y = data[target_column]

# train_test_split divides the dataset into two parts:
#    * Training set (what the model learns from)
#    * Test set (unseen data to check whether the model generalizes)
#
# Parameters:
#   * X, y              -> input features and target
#   * test_size=0.2     -> 20% of the rows go to test, 80% to train
#   * random_state=42   -> the same split every run (reproducibility)
#   * stratify=y        -> keep the proportion of each target class the same in
#                          train and test, so precision/recall are not skewed
#                          by an unlucky split
#
# Result:
#   * X_train = training input features
#   * X_test  = test input features
#   * y_train = training target values
#   * y_test  = test target values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
#=========BUILD MODEL=========

# A RandomForestClassifier wrapped in a Pipeline together with the preprocessor.
#
# RandomForestClassifier is an ensemble machine learning model for
# classification tasks, i.e. predicting discrete categories such as yes/no or
# 0/1. "Forest" refers to the many decision trees it combines
# (n_estimators=100 trees here).
#
# About random_state:
#   * It seeds the random number generator used by anything with randomness
#     in it (train_test_split, shuffling, RandomForest, KMeans, ...).
#   * Fixing it makes runs reproducible: the same code with the same
#     random_state gives the same result every time. Without it, each run
#     could produce slightly different output.
#   * 42 is just a fixed starting point for the generator, so the "random"
#     choices are made the same way on every run.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Pipeline steps run in order: transform the columns first, then classify.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', forest),
]
clf = Pipeline(steps=pipeline_steps)

# Fits the preprocessor on the training data and trains the forest on the
# transformed result.
clf.fit(X_train, y_train)

In [None]:
#=========EVALUATE MODEL=========

# clf.predict() on the pipeline:
#   * runs each test row through the already-fitted preprocessor
#   * passes the transformed row to the trained forest
#   * the forest averages its trees' class-probability estimates
#   * the class with the highest average probability is returned
#     (for a 0/1 target this amounts to a 0.5 threshold)
y_pred = clf.predict(X_test)

# Label -> metric function; each is called as metric(y_true, y_pred) and
# printed in this order.
scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

# Rows are the true classes, columns are the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
#=========SAVE MODEL=========
# Persist the whole fitted pipeline (preprocessor + classifier) so it can be
# reloaded later with joblib.load and applied directly to raw rows.
# NOTE: joblib files are pickle-based; only load model files from sources you trust.

import os

import joblib

model_path = 'models/credit_default_model.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (no error if it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(clf, model_path)