In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [19]:
X_train = pd.read_csv('../data/train/X_train.csv')
X_test = pd.read_csv('../data/test/X_test.csv')
y_train = pd.read_csv('../data/train/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/test/y_test.csv').values.ravel()

In [20]:

# Define models and hyperparameters
models_and_parameters = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    })
}

In [21]:
# Perform hyperparameter tuning using GridSearchCV

best_model = None
best_accuracy = 0

In [22]:
# Train models using GridSearchCV
for name, (model, params) in models_and_parameters.items():
    print(f"Training {name} with GridSearchCV.")
    grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=2, error_score='raise')
    try:
        grid_search.fit(X_train, y_train)
        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f'Model: {name}')
        print(f'Best Estimator: {best_estimator}')
        print(f'Accuracy: {accuracy}')
        print(f'Classification Report: {report}')
    except ValueError as e:
        print(f"Failed to train {name} due to ValueError: {e}")

Training Logistic Regression with GridSearchCV.
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Failed to train Logistic Regression due to ValueError: could not convert string to float: '  (0, 0)\t1.124881036607231\n  (0, 1)\t-0.6421347788775653\n  (0, 2)\t0.2375272128106946\n  (0, 3)\t-0.023252508812940203\n  (0, 4)\t1.7330938362181858\n  (0, 5)\t-0.40730539321770665\n  (0, 6)\t-0.30652749043229954\n  (0, 9)\t1.0\n  (0, 1241)\t1.0\n  (0, 1471)\t1.0\n  (0, 1472)\t1.0\n  (0, 1473)\t1.0\n  (0, 1484)\t1.0\n  (0, 1495)\t1.0\n  (0, 1511)\t1.0\n  (0, 1514)\t1.0\n  (0, 1518)\t1.0\n  (0, 1522)\t1.0'
Training Decision Tree with GridSearchCV.
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Failed to train Decision Tree due to ValueError: could not convert string to float: '  (0, 0)\t0.9990554172999345\n  (0, 1)\t-0.6421347788775653\n  (0, 2)\t-1.2112601094741615\n  (0, 3)\t-0.5573018614298109\n  (0, 4)\t-0.11733573938926847\n  (0, 5)\t-1.5753568786214338\n  (0, 6)\t-1.0