In [20]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score
import joblib
import loguru
from scripts.helpers import *

logger = loguru.logger

ModuleNotFoundError: No module named 'scripts.helpers'

In [21]:
import pandas as pd

def map_loan_status(status):
    status_map = {
        'did not default': False,
        'defaulted': True
    }
    return status_map.get(str(status).lower(), None)

# Binning functions
def bin_age(age):
    bins = [0, 18, 30, 40, 50, float('inf')]
    labels = ['0-18', '19-30', '31-40', '41-50', '51+']
    return pd.cut(age, bins=bins, labels=labels, right=False).astype(str)

def bin_salary(salary):
    salary_bins = [0, 2273.93, 2665.44, 3146.58, 10000]
    salary_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    return pd.cut(salary, bins=salary_bins, labels=salary_labels, right=False).astype(str)

# Custom transformer for age binning
def age_bin_transformer(X):
    return pd.DataFrame(X).apply(lambda col: bin_age(col)).values

# Custom transformer for salary binning
def salary_bin_transformer(X):
    return pd.DataFrame(X).apply(lambda col: bin_salary(col)).values


In [22]:
path = '../pipelines/data_processing_pipeline_20240710_170133.pkl'
pipeline = joblib.load(path)

In [38]:
X_train = pd.read_csv('../data/train/X_train.csv')
X_test = pd.read_csv('../data/test/X_test.csv')
y_train = pd.read_csv('../data/train/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/test/y_test.csv').values.ravel()

In [39]:
X_train = pipeline.transform(X_train)
X_test= pipeline.transform(X_test)

In [41]:

# Define models and hyperparameters
models_and_parameters = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    })
}

In [36]:
# Perform hyperparameter tuning using GridSearchCV

best_model = None
best_accuracy = 0

In [42]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Train models using GridSearchCV
for name, (model, params) in models_and_parameters.items():
    logger.info(f"Training {name} with GridSearchCV.")
    grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=2, error_score='raise')
    try:
        # Validate parameter grid
        if not params:
            raise ValueError(f"Parameter grid for {name} is empty.")

        grid_search.fit(X_train, y_train)
        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        logger.info(f'Model: {name}')
        logger.info(f'Best Estimator: {best_estimator}')
        logger.info(f'Accuracy: {accuracy}')
        logger.info(f'Classification Report:\n{report}')
    except ValueError as e:
        logger.error(f"Failed to train {name} due to ValueError: {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred while training {name}: {e}")


[32m2024-07-10 17:30:26.015[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTraining Logistic Regression with GridSearchCV.[0m


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[32m2024-07-10 17:30:39.573[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [31m[1mAn unexpected error occurred while training Logistic Regression: Unable to allocate 310. MiB for an array with shape (26667, 1524) and data type float64[0m
[32m2024-07-10 17:30:39.573[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTraining Decision Tree with GridSearchCV.[0m


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[32m2024-07-10 17:30:52.503[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [31m[1mAn unexpected error occurred while training Decision Tree: Unable to allocate 310. MiB for an array with shape (26667, 1524) and data type float64[0m
[32m2024-07-10 17:30:52.503[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTraining Random Forest with GridSearchCV.[0m


Fitting 3 folds for each of 216 candidates, totalling 648 fits


[32m2024-07-10 17:31:04.154[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [31m[1mAn unexpected error occurred while training Random Forest: Unable to allocate 310. MiB for an array with shape (26667, 1524) and data type float64[0m
