In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [13]:
X_train = pd.read_csv('../data/train/X_train.csv')
X_test = pd.read_csv('../data/test/X_test.csv')
y_train = pd.read_csv('../data/train/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/test/y_test.csv').values.ravel()

In [14]:
# Define models and their hyperparameter grids
models_and_parameters = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    'Gradient Boosting': (GradientBoostingClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 4, 5, 6],
        'min_child_weight': [1, 2, 4],
        'gamma': [0, 0.1, 0.2, 0.3],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    })
}

In [15]:
# Perform hyperparameter tuning using GridSearchCV
best_model = None
best_accuracy = 0

In [16]:
for name, (model, params) in models_and_parameters.items():
    print(f"Training {name} with GridSearchCV...")
    grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_estimator = grid_search.best_estimator_
    y_pred = best_estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Model: {name}')

Training Logistic Regression with GridSearchCV...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '  (0, 0)\t0.9990554172999345\n  (0, 1)\t-0.6421347788775653\n  (0, 2)\t-1.2112601094741615\n  (0, 3)\t-0.5573018614298109\n  (0, 4)\t-0.11733573938926847\n  (0, 5)\t-1.5753568786214338\n  (0, 6)\t-1.0716302190411113\n  (0, 7)\t1.0\n  (0, 409)\t1.0\n  (0, 1471)\t1.0\n  (0, 1472)\t1.0\n  (0, 1474)\t1.0\n  (0, 1483)\t1.0\n  (0, 1506)\t1.0\n  (0, 1510)\t1.0\n  (0, 1513)\t1.0\n  (0, 1517)\t1.0\n  (0, 1520)\t1.0'

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\sklearn\utils\_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\python\Predicting-Probability-of-Default\claxon-env\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '  (0, 0)\t1.124881036607231\n  (0, 1)\t-0.6421347788775653\n  (0, 2)\t0.2375272128106946\n  (0, 3)\t-0.023252508812940203\n  (0, 4)\t1.7330938362181858\n  (0, 5)\t-0.40730539321770665\n  (0, 6)\t-0.30652749043229954\n  (0, 9)\t1.0\n  (0, 1241)\t1.0\n  (0, 1471)\t1.0\n  (0, 1472)\t1.0\n  (0, 1473)\t1.0\n  (0, 1484)\t1.0\n  (0, 1495)\t1.0\n  (0, 1511)\t1.0\n  (0, 1514)\t1.0\n  (0, 1518)\t1.0\n  (0, 1522)\t1.0'
