# **Classification**

## Objectives
+ Fit and evaluate a classification model to predict loan default.

## Inputs
+ outputs/datasets/collection/LoanDefault.csv
+ Instructions on which variables to use for data cleaning and feature engineering.

## Outputs
+ Train set (features and target)
+ Test set (features and target)
+ Modeling pipeline
+ Feature importance plot

---


Start by changing the working directory to the parent directory.

In [None]:
import os

# Remember the directory the notebook process is currently running in; the
# next cell derives the directory to switch to from this value.
current_dir = os.getcwd()
current_dir  # last expression of the cell, so the notebook displays it

In [None]:
# Switch to the parent of the directory captured in `current_dir`, then
# report where we ended up.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("The directory you are in is:", os.getcwd())

## Load Data

In [None]:
import numpy as np
import pandas as pd

# Read the collected loan data, then discard the "ID" and "year" columns.
raw_df = pd.read_csv("outputs/datasets/collection/LoanDefault.csv")
df = raw_df.drop(columns=["ID", "year"])

df.head()

## ML Pipeline with data

### Pipeline for Data Cleaning and Feature Engineering

A custom transformer was created to impute missing values in the LTV feature based on the relationship between loan amount and property value. Then, the pipeline for data cleaning and engineering is created.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class LTVImputer(BaseEstimator, TransformerMixin):
    """
    Fill gaps in the ``LTV`` column from two other columns of the same row.

    Rows whose ``LTV`` is missing receive
    ``loan_amount / property_value * 100``; rows that already have a value
    are left untouched.
    """
    def fit(self, X, y=None):
        """
        Nothing is learned from the data; return the transformer itself.
        """
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the missing ``LTV`` entries filled in.

        The input frame is not modified.
        """
        result = X.copy()
        missing_ltv = result["LTV"].isnull()
        computed_ltv = (result["loan_amount"] / result["property_value"]) * 100
        # Assignment through .loc aligns on the index, so only the rows
        # selected by the mask take their value from `computed_ltv`.
        result.loc[missing_ltv, "LTV"] = computed_ltv
        return result

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer, ArbitraryNumberImputer
from feature_engine.outliers import OutlierTrimmer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import LogTransformer, YeoJohnsonTransformer

def PipelineDataCleaningAndEngineering():
    """
    Build the (unfitted) data cleaning and feature engineering pipeline.

    Steps, in order: categorical imputation ("frequent"), median, mean and
    constant (360.0 for "term") numerical imputation, LTV imputation,
    quantile-based trimming of "LTV", arbitrary ordinal encoding, log
    transformation and Yeo-Johnson transformation.

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline instance on every call.
    """
    # Columns handled by each step
    categorical_variables = [
        'loan_limit', 'Gender', 'approv_in_adv',
        'loan_type', 'loan_purpose', 'Credit_Worthiness',
        'open_credit', 'business_or_commercial',
        'Neg_ammortization', 'interest_only',
        'lump_sum_payment', 'construction_type',
        'occupancy_type', 'Secured_by', 'total_units',
        'credit_type', 'co-applicant_credit_type',
        'age', 'submission_of_application', 'Region',
        'Security_Type',
    ]
    median_variables = [
        "Upfront_charges",
        "rate_of_interest",
        "property_value",
        "income",
    ]
    mean_variables = ["Interest_rate_spread", "dtir1"]
    var_to_log = ["loan_amount", "property_value"]
    var_to_yeo = ["rate_of_interest"]

    steps = [
        # Missing values: categorical features (no explicit variable list)
        ("CategoricalImputer",
         CategoricalImputer(imputation_method="frequent")),
        # Missing values: numerical features
        ("NumericalImputerMedian",
         MeanMedianImputer(imputation_method="median",
                           variables=median_variables)),
        ("NumericalImputerMean",
         MeanMedianImputer(imputation_method="mean",
                           variables=mean_variables)),
        ("NumericalImputerMax",
         ArbitraryNumberImputer(arbitrary_number=360.0, variables=["term"])),
        # Runs after the median imputer above, which covers "property_value"
        ("LTVImputer", LTVImputer()),
        # Outlier removal
        ("OutlierTrimmer",
         OutlierTrimmer(capping_method="quantiles", fold=0.05,
                        variables=["LTV"])),
        # Encoding
        ("Encoder",
         OrdinalEncoder(encoding_method="arbitrary",
                        variables=categorical_variables)),
        # Variable transformations (log and Yeo-Johnson)
        ("LogTransformer", LogTransformer(variables=var_to_log)),
        ("YeoJohnsonTransformer", YeoJohnsonTransformer(variables=var_to_yeo)),
    ]

    return Pipeline(steps)

PipelineDataCleaningAndEngineering()

## ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier


def PipelineClf(model):
    """
    Wrap ``model`` in a scale -> select features -> classify pipeline.

    The same ``model`` object is handed to ``SelectFromModel`` and used as
    the final "model" step.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

Using a custom function provided by Code Institute, we performed a grid search across several algorithms with standard hyperparameters to identify the best-performing model for our case. Both tree-based models and logistic regression were selected, as this is a binary classification problem based on structured tabular data.

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """
    Run one ``GridSearchCV`` per candidate model and tabulate the scores.

    ``models`` maps a label to an estimator and ``params`` maps the same
    labels to the parameter grid searched for that estimator. Each estimator
    is wrapped with ``PipelineClf`` before it is searched.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # label -> fitted GridSearchCV, filled in by fit()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """
        Fit a grid search for every model and store it in ``grid_searches``.

        Args:
            X: training features passed to ``GridSearchCV.fit``.
            y: training target passed to ``GridSearchCV.fit``.
            cv: cross-validation setting forwarded to ``GridSearchCV``.
            n_jobs: parallelism setting forwarded to ``GridSearchCV``.
            verbose: verbosity forwarded to ``GridSearchCV``.
            scoring: scoring forwarded to ``GridSearchCV``.
            refit: accepted for signature compatibility only.

        Raises:
            KeyError: if a model label has no entry in ``params``.
        """
        # Fail before any (potentially long) search starts rather than after
        # the earlier models have already been fitted.
        missing = [key for key in self.keys if key not in self.params]
        if missing:
            raise KeyError(f"No parameter grid supplied for: {missing}")

        # NOTE(review): `refit` is not forwarded to GridSearchCV. Forwarding
        # its default (False) would leave the fitted searches without
        # `best_estimator_`, which users of `grid_searches` rely on.
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = PipelineClf(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, )
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """
        Summarise the per-split test scores of every fitted grid search.

        Args:
            sort_by: summary column used to sort the rows, descending.

        Returns:
            tuple: ``(summary, grid_searches)`` where ``summary`` is a
            DataFrame with one row per (estimator, parameter combination)
            holding the min / mean / max / std of the split test scores,
            and ``grid_searches`` is the dict of fitted searches.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            candidates = results['params']
            # `n_splits_` is the number of folds actually used, so this also
            # works when `cv` was a splitter object instead of an integer.
            n_splits = getattr(gs, 'n_splits_', gs.cv)
            # One row per parameter combination, one column per split.
            all_scores = np.column_stack([
                results["split{}_test_score".format(i)]
                for i in range(n_splits)
            ])
            for candidate, scores in zip(candidates, all_scores):
                stats = {
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }
                rows.append(pd.Series({**candidate, **stats}))

        if not rows:
            raise ValueError(
                "No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        # Statistics first, then whatever hyperparameter columns exist.
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split
# "Status" is the target; every other column is a feature. 20% of the rows
# are held out as the test set, with a fixed seed for the shuffle.
features = df.drop(columns=['Status'])
target = df['Status']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

After splitting the dataset into training and test sets, we applied fitting and transformation steps. Since some rows were removed during preprocessing (due to outlier removal), we used `.loc` to realign the target variables with their corresponding feature sets, ensuring that each feature row is correctly matched to its target.

In [None]:
# Fit the cleaning/engineering pipeline on the training features only, then
# apply the already-fitted pipeline to the test features.
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndEngineering()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)
# Keep only the targets whose index labels are still present in the
# transformed features, so features and target stay row-aligned.
y_train = y_train.loc[X_train.index]
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
y_test = y_test.loc[X_test.index]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In this dataset, no specific techniques to handle class imbalance were applied, as the chosen model is already capable of making accurate predictions without the need for additional adjustments.

---

## GridSearch CV

Use standard hyperparameters to find the best algorithm.

In [None]:
# Candidate algorithms. Each one is registered under its class name and
# seeded with random_state=0.
candidate_classes = (
    LogisticRegression,
    XGBClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
models_quick_search = {
    clf_class.__name__: clf_class(random_state=0)
    for clf_class in candidate_classes
}

# One empty parameter grid per candidate: nothing is tuned in this search.
params_quick_search = {name: {} for name in models_quick_search}

Run the custom function and check the results for each algorithm

In [None]:
from sklearn.metrics import make_scorer, recall_score
# Score every candidate on the recall of class 1, using 5-fold CV.
recall_scorer = make_scorer(recall_score, pos_label=1)
search = HyperparameterOptimizationSearch(
    models=models_quick_search,
    params=params_quick_search,
)
search.fit(X_train, y_train, scoring=recall_scorer, n_jobs=-1, cv=5)

In [None]:
# score_summary returns the score table and the dict of fitted searches;
# the table is sorted by mean score, best first.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

All models, except for logistic regression, performed excellently. The next step to finalize the model choice is to analyze the most important features and their contributions.

### Assess feature importance

The ExtraTreesClassifier has emerged as the best estimator, as it utilizes a wider range of features. Upon reviewing the confusion matrix, it showed the fewest incorrect predictions compared to other models.

In [None]:
import matplotlib.pyplot as plt

pipeline_clf = grid_search_pipelines["ExtraTreesClassifier"].best_estimator_

# Columns kept by the feature-selection step, paired with the importances
# reported by the final model.
selected_mask = pipeline_clf['feat_selection'].get_support()
selected_columns = X_train.columns[selected_mask]
importances = pipeline_clf['model'].feature_importances_

df_feature_importance = pd.DataFrame(
    data={'Feature': selected_columns, 'Importance': importances}
)
df_feature_importance = df_feature_importance.sort_values(
    by='Importance', ascending=False
)

# Feature names ordered from most to least important
best_features = df_feature_importance['Feature'].to_list()

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

Cross-validation was also applied to determine the values for recall and F1 score, as these are the most crucial metrics for this study. Both metrics achieved a perfect score of 1, which is excellent.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated recall and F1 of the selected pipeline on the
# training data (recall is computed first, then F1).
cv_scores = {
    metric: cross_val_score(pipeline_clf, X_train, y_train, cv=5,
                            scoring=metric)
    for metric in ("recall", "f1")
}
scores_recall = cv_scores["recall"]
scores_f1 = cv_scores["f1"]

report_template = (
    "%0.2f recall with a standard deviation of %0.2f \n"
    "and %0.2f F1 with standard deviation of %0.2f"
)
print(report_template % (
    scores_recall.mean(), scores_recall.std(),
    scores_f1.mean(), scores_f1.std(),
))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Create one confusion matrix per data set (train, then test) from the
# predictions of the selected pipeline.

sets = [(X_train, y_train, "Train set"), (X_test, y_test, "Test set")]

for X_set, y_set, name in sets:
    predictions = pipeline_clf.predict(X_set)
    cm = confusion_matrix(y_set, predictions)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=["Not Defaulted", "Defaulted"])
    # values_format='d' prints the cell counts as plain integers, matching
    # the confusion matrices drawn for the optimized pipeline further down.
    disp.plot(cmap="gist_stern", values_format='d')
    plt.title(f"Confusion Matrix for {name}")
    plt.show()

There is no need for a detailed report from the confusion matrix, as the model is predicting with near-perfect accuracy.

Now, we will simplify the pipeline by selecting only the most important features, which will help optimize computational efficiency.

In [None]:
best_features

## Refit using only best features

In [None]:
def PipelineOptimizedModel():
    """
    Build the final (unfitted) pipeline: scaling followed by extra trees.

    The classifier is seeded with ``random_state=0``, the same seed used for
    the candidates in the quick search, so refitting gives reproducible
    results.

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline instance on every call.
    """
    pipeline_base = Pipeline([
        ("scaler", StandardScaler()),
        ("ExtraTreesClassifier", ExtraTreesClassifier(random_state=0)),
    ])

    return pipeline_base

## Impute and Split dataset

Imputation had to be performed separately, as integrating it into the pipeline was causing unexplainable issues.

In [None]:
# Keep only the selected features plus the target.
# NOTE(review): every transformer below is fitted on the full dataset, i.e.
# before the train/test split in the following cells, so statistics learned
# here (most frequent category, median, mean) also reflect test rows.
# Consider fitting them on the training rows only.
df_cleaned = df.copy().filter(items=(best_features+["Status"]))

# No variable list is passed to the categorical imputer or the encoder, so
# the columns they act on are chosen by the transformers themselves.
impute_categorical = CategoricalImputer(imputation_method="frequent")
df_cleaned = impute_categorical.fit_transform(df_cleaned)

ordinal_encode = OrdinalEncoder(encoding_method="arbitrary")
df_cleaned = ordinal_encode.fit_transform(df_cleaned)

# NOTE(review): the column names below are hard-coded; this assumes all four
# are part of `best_features` - confirm whenever the selection changes.
impute_meadian = MeanMedianImputer(
                                imputation_method="median",
                                variables=["Upfront_charges",
                                           "rate_of_interest"])
df_cleaned = impute_meadian.fit_transform(df_cleaned)

impute_mean = MeanMedianImputer(
                                imputation_method="mean",
                                variables=["Interest_rate_spread", "dtir1"])
df_cleaned = impute_mean.fit_transform(df_cleaned)

In [None]:
# Count the remaining missing values per column to check the imputation.
df_cleaned.isnull().sum()

In [None]:
# Same split settings as before (20% test, seed 0), now on the reduced and
# imputed dataset.
features = df_cleaned.drop(columns=['Status'])
target = df_cleaned["Status"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Build a fresh optimized pipeline and fit it on the reduced training set.
pipeline_optimized_model = PipelineOptimizedModel()
pipeline_optimized_model.fit(X_train, y_train)

As anticipated, the model's performance remained unchanged, as these were the features utilized even when others were available.

In [None]:
# Confusion matrix of the optimized pipeline for the train set, then the
# test set.
class_labels = ["Not Defaulted", "Defaulted"]
evaluation_sets = {
    "Train set": (X_train, y_train),
    "Test set": (X_test, y_test),
}

for name, (X_set, y_set) in evaluation_sets.items():
    predicted = pipeline_optimized_model.predict(X_set)
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_set, predicted),
        display_labels=class_labels,
    )
    matrix_display.plot(cmap="gist_stern", values_format='d')
    plt.title(f"Confusion Matrix for {name}")
    plt.show()

## Push to repo


We will generate the following files

+ Train set
+ Test set
+ Modeling pipeline
+ Feature importance plot

In [None]:
import joblib

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_status/{version}'

# exist_ok=True makes this cell safe to re-run when the folder is already
# there. Any other failure (for example a permission error) is raised
# instead of being printed and ignored, because the cells that write files
# into this folder cannot succeed without it.
os.makedirs(file_path, exist_ok=True)

In [None]:
# Write each split to its own CSV file, without the index column.
datasets_to_save = [
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
    (X_test, "X_test.csv"),
    (y_test, "y_test.csv"),
]
for dataset, filename in datasets_to_save:
    dataset.to_csv(f"{file_path}/{filename}", index=False)

In [None]:
pipeline_optimized_model 

In [None]:
# Serialize the fitted optimized pipeline into the versioned output folder.
joblib.dump(value=pipeline_optimized_model, filename=f"{file_path}/pipeline_optimized_model.pkl")

In [None]:
# Redraw the importance bar chart and save the figure it was drawn on.
importance_ax = df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
importance_ax.figure.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')