# Model Training

## Import required packages

In [3]:
# import xgboost
import xgboost as xgb
# import lightgbm
import lightgbm as lgb
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# sklearn packages
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
# from feature_engine.selection import DropCorrelatedFeatures
from sklearn.svm import LinearSVC
# miscellaneous
import os.path
from pprint import pprint
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Switch to project directory

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd drive/MyDrive/COMSW4995_32 AML/AML Final Project/home-credit-default-risk
%cd home-credit-default-risk

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1Aq4j1LDPosstk7lY4v_Ck2m1_6P4wfJb/AML Final Project/home-credit-default-risk


## Create directories if necessary

In [4]:
# Make sure the output directories for cached correlation matrices and
# trained models exist. exist_ok=True makes the call idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
for directory in ('correlation_matrices', 'models'):
    os.makedirs(directory, exist_ok=True)

## Define model and search space


When trying a new model, the code cell below is the only thing you need to change:

- Model name (what should the model be saved as?)
- Correlation threshold (correlation threshold used to filter features)
- Number of search trials (for hyperparameter tuning)
- Define the search space (for hyperparameter tuning):
    - Hyperparameters to search over
    - Range of values to search over for each hyperparameter
- Define the model pipeline (what machine learning method(s) to use (e.g. XGBoost, Logistic Regression, etc.))
- For this problem set, we trained Logistic Regression, XGBoost, Random Forest, and LightGBM as shown below.

In [5]:
# # XGBoost
# model_name = 'xgboost_wo_pca'
# num_search_trials = 50
# correlation_threshold = 0

# def define_search_space():
#     return {
#         # 'n_components': hp.quniform('n_components', 5, X_train.shape[1], 1),
#         'max_depth': hp.quniform('max_depth', 3, 30, 1),
#         'gamma': hp.uniform ('gamma', 1, 9),
#         'reg_alpha' : hp.quniform('reg_alpha', 40, 180, 1),
#         'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
#         'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1),
#         'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#         'seed': 0
#     }

# def define_model(inp):
#     return Pipeline([
#             # ('pca', PCA(n_components=int(inp['n_components']), random_state=42)),
#             ('xgb', xgb.XGBClassifier(n_estimators=180, max_depth = int(inp['max_depth']), gamma = inp['gamma'],
#                                     reg_alpha = int(inp['reg_alpha']),min_child_weight=int(inp['min_child_weight']),
#                                     colsample_bytree=inp['colsample_bytree'], random_state=42))#, eval_metric=roc_auc_score, early_stopping_rounds=10)  )
#     ])

# XGBoost (default hyperparameters, no tuning)
# model_name is used to build the pickle path 'models/<model_name>_models.pickle'.
model_name = 'xgboost_plain'
# Passed to hyperopt's fmin as max_evals.
num_search_trials = 1
# Correlation filtering only runs when this is > 0, so 0 keeps every feature.
correlation_threshold = 0

def define_search_space():
    """Return the hyperparameter search space for this model.

    The plain configuration tunes nothing: the space holds only the constant
    ``seed`` entry.
    """
    search_space = dict(seed=0)
    return search_space

def define_model(inp):
    """Build the single-step pipeline wrapping a default XGBoost classifier.

    ``inp`` is the dict of sampled hyperparameters; this plain configuration
    does not read it and only fixes ``random_state=42``.
    """
    # Not enabled: eval_metric=roc_auc_score, early_stopping_rounds=10
    classifier = xgb.XGBClassifier(random_state=42)
    steps = [('xgb', classifier)]
    return Pipeline(steps)



# # Logistic Regression
# model_name = 'logistic_regression_wo_pca'
# num_search_trials = 10
# correlation_threshold = 0

# def define_search_space():
#     return {
#         # 'n_components': hp.quniform('n_components', 5, X.shape[1], 1),
#         'C': hp.uniform('C', 0.001, 100)
#     }

# def define_model(inp):
#     return Pipeline([
#             # ('pca', PCA(n_components=int(inp['n_components']), random_state=42)),
#             ('logistic', LogisticRegression(penalty='l1', solver='saga', C=inp['C'], random_state=42))
#     ])

# # Logistic Regression
# model_name = 'logistic_regression_plain'
# num_search_trials = 1
# correlation_threshold = 0

# def define_search_space():
#     return {'seed': 0}

# def define_model(inp):
#     return Pipeline([
#             ('logistic', LogisticRegression(penalty='l1', solver='saga', random_state=42))
#     ])

# # Random Forest
# model_name = 'random_forest_plain'
# num_search_trials = 1
# correlation_threshold = 0

# def define_search_space():
#     return {'seed': 0}

# def define_model(inp):
#     return Pipeline([
#         ('random_forest', RandomForestClassifier(
#             n_estimators=100,  # Number of trees
#             max_depth=10,
#             random_state=42,
#             class_weight='balanced'  # Handle class imbalance
#         ))
#     ])

# # Random Forest with hyperparameter tuning
# model_name = 'random_forest_tuned'
# num_search_trials = 5
# correlation_threshold = 0

# def define_search_space():
#     return {
#         'n_estimators': hp.quniform('n_estimators', 50, 500, 10),
#         'max_depth': hp.quniform('max_depth', 3, 30, 1),
#         'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
#         'seed': 0
#     }

# def define_model(inp):
#     return Pipeline([
#         ('random_forest', RandomForestClassifier(
#             n_estimators=int(inp['n_estimators']),  # Number of trees
#             max_depth=int(inp['max_depth']),  # Maximum tree depth
#             min_samples_split=int(inp['min_samples_split']),  # Minimum samples to split a node
#             random_state=42,
#             class_weight='balanced'  # Handle class imbalance
#         ))
#     ])

# # LightGBM
# model_name = 'lightgbm_tuned'
# num_search_trials = 10
# correlation_threshold = 0

# def define_search_space():
#     return {
#         'max_depth': hp.quniform('max_depth', 3, 30, 1),
#         'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
#         'num_leaves': hp.quniform('num_leaves', 20, 200, 1),
#         'min_child_samples': hp.quniform('min_child_samples', 1, 50, 1),
#         'subsample': hp.uniform('subsample', 0.5, 1),
#         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
#         'reg_alpha': hp.uniform('reg_alpha', 0, 1),
#         'reg_lambda': hp.uniform('reg_lambda', 0, 1),
#         'seed': 0
#     }

# def define_model(inp):
#     return Pipeline([
#         ('lgb', lgb.LGBMClassifier(
#             n_estimators=180,
#             max_depth=int(inp['max_depth']),
#             learning_rate=inp['learning_rate'],
#             num_leaves=int(inp['num_leaves']),
#             min_child_samples=int(inp['min_child_samples']),
#             subsample=inp['subsample'],
#             colsample_bytree=inp['colsample_bytree'],
#             reg_alpha=inp['reg_alpha'],
#             reg_lambda=inp['reg_lambda'],
#             random_state=42
#         ))
#     ])


In [6]:
# # lightGBM model
# model_name = 'lightgbm_tuned'
# num_search_trials = 10
# correlation_threshold = 0

# def define_search_space():
#     return {
#         'max_depth': hp.quniform('max_depth', 3, 30, 1),
#         'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
#         'num_leaves': hp.quniform('num_leaves', 20, 200, 1),
#         'min_child_samples': hp.quniform('min_child_samples', 1, 50, 1),
#         'subsample': hp.uniform('subsample', 0.5, 1),
#         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
#         'reg_alpha': hp.uniform('reg_alpha', 0, 1),
#         'reg_lambda': hp.uniform('reg_lambda', 0, 1),
#         'seed': 0
#     }

# def define_model(inp):
#     return Pipeline([
#         ('lgb', lgb.LGBMClassifier(
#             n_estimators=180,
#             max_depth=int(inp['max_depth']),
#             learning_rate=inp['learning_rate'],
#             num_leaves=int(inp['num_leaves']),
#             min_child_samples=int(inp['min_child_samples']),
#             subsample=inp['subsample'],
#             colsample_bytree=inp['colsample_bytree'],
#             reg_alpha=inp['reg_alpha'],
#             reg_lambda=inp['reg_lambda'],
#             random_state=42
#         ))
#     ])


## Correlation thresholding helper function

If `correlation_threshold` > 0, then for each pair of features whose absolute correlation is greater than `correlation_threshold`, the second (later) feature of the pair is removed. A threshold of 0 disables this filtering.

In [7]:
def remove_correlated_features(X, thresh, corr_matrix):
    """Drop the later feature of every highly correlated feature pair.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; it is wrapped in ``pd.DataFrame`` to obtain labels.
    thresh : float
        A column is dropped when its absolute correlation with any earlier
        column is strictly greater than this value.
    corr_matrix : DataFrame or None
        Precomputed correlation matrix. When ``None`` it is computed from
        ``X`` with ``DataFrame.corr()``.

    Returns
    -------
    (cols, corr_matrix)
        ``cols`` is the index of column labels of ``X`` that survive, in their
        original order; ``corr_matrix`` is the matrix that was used (the one
        passed in, or the freshly computed one).
    """
    df = pd.DataFrame(X)
    if corr_matrix is None:
        corr_matrix = df.corr()

    # A matrix reloaded from a CSV written with index=False has string headers
    # ('0', '1', ...) while the frame built from an array is labelled with
    # integers. Map such headers back to the frame's own labels; otherwise the
    # membership test below never matches and nothing is dropped.
    known = set(df.columns)
    by_text = {str(label): label for label in df.columns}
    labels = [
        label if label in known else by_text.get(str(label), label)
        for label in corr_matrix.columns
    ]

    # Entry (i, j) with i < j marks column j as redundant given column i, so
    # only the strict upper triangle matters. NaN correlations compare False.
    strength = np.abs(corr_matrix.to_numpy(dtype=float))
    redundant = np.triu(strength > thresh, k=1).any(axis=0)
    to_drop = {
        label
        for label, is_redundant in zip(labels, redundant)
        if is_redundant and label in known
    }

    cols = df.columns.drop(list(to_drop)) if to_drop else df.columns
    return cols, corr_matrix

## Train models on each dataset

In [8]:
# Train one model per data source and persist everything needed for inference.
# `models` maps an artefact kind to a dict keyed by dataset name:
#   hyperparameters - best values returned by hyperopt's fmin
#   model           - pipeline refitted on train + validation rows
#   auroc_scores    - best validation AUROC seen during the search
#   IDs             - SK_ID_CURR values present in the dataset
#   preprocessor    - ColumnTransformer fitted on the training split
#   columns         - feature positions kept after correlation filtering
models = {
    'hyperparameters': {},
    'model': {},
    'auroc_scores': {},
    'IDs': {},
    'preprocessor': {},
    'columns': {},
}

application_train = pd.read_csv('train_val_data/train.csv')
id_and_target = application_train[['SK_ID_CURR', 'TARGET']]

# Resume from a previous run when a pickle for this model name already exists.
model_file = f'models/{model_name}_models.pickle'
if os.path.exists(model_file):
    # NOTE(security): pickle.load can execute arbitrary code. Only resume from
    # pickle files produced by this notebook.
    with open(model_file, 'rb') as handle:
        models = pickle.load(handle)

files = ['application_train.csv','bureau.csv','previous_application.csv','POS_CASH_balance.csv','installments_payments.csv','credit_card_balance.csv']
for file in files:
    name = file.split('.')[0]
    print(name)
    if name in models['auroc_scores']:
        # A score is only stored after a dataset finished training.
        print('    Already trained')
        continue

    if name == 'application_train':
        models['IDs'][name] = list(application_train['SK_ID_CURR'].unique())

        X = application_train.drop(columns=['SK_ID_CURR','TARGET'])
        y = application_train['TARGET']

        # Identify numeric and categorical columns
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object']).columns

        # Numeric columns: median imputation, then standardisation
        numeric_transformer = Pipeline([
            # ('imputer', KNNImputer()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical columns: explicit 'missing' level, then one-hot encoding
        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # sparse_threshold=0 forces a dense array: the column indexing and
        # np.concatenate calls below do not work on a sparse matrix.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            sparse_threshold=0
        )
    else:
        df = pd.read_csv(f'transformed/{name}_transformed.csv')
        models['IDs'][name] = list(df['SK_ID_CURR'].unique())
        # Inner join keeps only rows whose SK_ID_CURR has a known TARGET.
        df = df.merge(id_and_target, on='SK_ID_CURR', how='inner')

        X = df.drop(columns=['SK_ID_CURR','TARGET'])
        y = df['TARGET']

        # Every column gets zero imputation followed by standardisation
        transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('transformer', transformer, X.columns)
            ],
            sparse_threshold=0
        )

    # Split the data into training and validation sets; the preprocessor is
    # fitted on the training split only.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train = preprocessor.fit_transform(X_train)
    X_val = preprocessor.transform(X_val)

    if correlation_threshold > 0:
        corr_file = f'correlation_matrices/{name}_correlation_matrix.csv'
        if os.path.exists(corr_file):
            corr_matrix = pd.read_csv(corr_file)
            # The matrix is saved with index=False, so read_csv returns string
            # headers ('0', '1', ...). Restore positional integer labels so
            # they match the columns of pd.DataFrame(X_train).
            corr_matrix.columns = pd.RangeIndex(corr_matrix.shape[1])
            models['columns'][name], _ = remove_correlated_features(X_train, correlation_threshold, corr_matrix)
        else:
            models['columns'][name], corr_matrix = remove_correlated_features(X_train, correlation_threshold, None)
            corr_matrix.to_csv(corr_file, index=False)
    else:
        # No filtering: keep every column position.
        models['columns'][name] = pd.RangeIndex(X_train.shape[1])
    X_train = X_train[:, models['columns'][name]]
    X_val = X_val[:, models['columns'][name]]

    def objective(space):
        """Fit a candidate on the training split; return negated validation AUROC."""
        pipe = define_model(space)
        pipe.fit(X_train, y_train)
        pred = pipe.predict_proba(X_val)[:, 1]
        auroc = roc_auc_score(y_val, pred)
        # fmin minimises the loss, so AUROC is negated.
        return {'loss': -auroc, 'status': STATUS_OK }

    space = define_search_space()

    trials = Trials()

    models['hyperparameters'][name] = fmin(fn = objective,
                                             space = space,
                                             algo = tpe.suggest,
                                             max_evals = num_search_trials,
                                             trials = trials)
    # pprint(models['hyperparameters'][name])

    models['preprocessor'][name] = preprocessor
    # Refit the best configuration on train + validation rows for the final model.
    models['model'][name] = define_model(models['hyperparameters'][name])
    models['model'][name].fit(np.concatenate((X_train, X_val), axis=0), np.concatenate((y_train, y_val), axis=0))
    models['auroc_scores'][name] = -trials.best_trial['result']['loss']
    # Checkpoint after every dataset so an interrupted run can resume.
    with open(model_file, 'wb') as handle:
        pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Record the run configuration alongside the trained artefacts.
# models['space'] = space
models['num_search_trials'] = num_search_trials
models['name'] = model_name
models['correlation_threshold'] = correlation_threshold
# pprint(models)
with open(model_file, 'wb') as handle:
    pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

application_train
100%|██████████| 1/1 [00:16<00:00, 16.07s/trial, best loss: -0.742553648811635]
bureau
100%|██████████| 1/1 [00:06<00:00,  6.16s/trial, best loss: -0.647920596705094]
previous_application
100%|██████████| 1/1 [00:20<00:00, 20.39s/trial, best loss: -0.6628085425174505]
POS_CASH_balance
100%|██████████| 1/1 [00:06<00:00,  6.09s/trial, best loss: -0.5820431775692597]
installments_payments
100%|██████████| 1/1 [00:04<00:00,  4.49s/trial, best loss: -0.617462813416681]
credit_card_balance
100%|██████████| 1/1 [00:03<00:00,  3.28s/trial, best loss: -0.6348568691029306]
