In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import  SMOTE
import loguru
from collections import Counter
from scripts.helpers import *
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, make_scorer

logger = loguru.logger
random_state=42

In [13]:
def map_loan_status(status):
    status_map = {
        'did not default': False,
        'defaulted': True
    }
    return status_map.get(str(status).lower(), None)

# Binning functions
def bin_age(age):
    bins = [0, 18, 30, 40, 50, float('inf')]
    labels = ['0-18', '19-30', '31-40', '41-50', '51+']
    return pd.cut(age, bins=bins, labels=labels, right=False).astype(str)

def bin_salary(salary):
    salary_bins = [0, 2273.93, 2665.44, 3146.58, 10000]
    salary_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    return pd.cut(salary, bins=salary_bins, labels=salary_labels, right=False).astype(str)

# Custom transformer for age binning
def age_bin_transformer(X):
    return pd.DataFrame(X).apply(lambda col: bin_age(col)).values

# Custom transformer for salary binning
def salary_bin_transformer(X):
    return pd.DataFrame(X).apply(lambda col: bin_salary(col)).values


In [14]:
path = '../pipelines/data_processing_pipeline_20240710_170133.pkl'
pipeline = joblib.load(path)

In [15]:
# Load data
df = pd.read_csv('../data/processed/cleaned_data_20240621_143909.csv')
X = df.drop(columns=['target'])
y = df['target']

In [16]:
X_train = pd.read_csv('../data/train/X_train.csv')
X_test = pd.read_csv('../data/test/X_test.csv')
y_train = pd.read_csv('../data/train/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/test/y_test.csv').values.ravel()

In [17]:
X_train = pipeline.transform(X_train)
X_test= pipeline.transform(X_test)

In [18]:


# Assuming X_train and y_train are your features and labels
sm = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Checking the new distribution
print('Original dataset shape %s' % Counter(y_train))
print('Resampled dataset shape %s' % Counter(y_train_resampled))


Original dataset shape Counter({False: 68066, True: 11934})
Resampled dataset shape Counter({False: 68066, True: 68066})


In [19]:
import random

# Assuming x_train_resampled and y_train_resampled are your resampled features and labels
# Set the desired sample size
sample_size = 4000

# Create a list of indices
indices = list(range(len(x_train_resampled)))

# Randomly select the indices for your sample
random_indices = random.sample(indices, sample_size)

# Obtain the random sample
x_train_sampled = x_train_resampled[random_indices]
y_train_sampled = y_train_resampled[random_indices]

# Checking the new distribution
print('Original dataset shape %s' % Counter(y_train_resampled))
print('Sampled dataset shape %s' % Counter(y_train_sampled))

Original dataset shape Counter({False: 68066, True: 68066})
Sampled dataset shape Counter({False: 2046, True: 1954})


In [20]:
# Define random state for reproducibility
random_state = 42
n_estimators = [30, 50, 80]
# Define models and hyperparameters
models_and_parameters = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, class_weight='balanced', random_state=random_state), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }),
    'Decision Tree': (DecisionTreeClassifier(class_weight='balanced', random_state=random_state), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    }),
    'Random Forest': (RandomForestClassifier(class_weight='balanced', random_state=random_state), {
        'n_estimators': n_estimators,
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=random_state), {
        'n_estimators': n_estimators,
        'max_depth': [3, 4, 5, 6, 7],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
    }),
    'Support Vector Machine': (SVC(class_weight='balanced', random_state=random_state), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'degree': [2, 3, 4],
        'gamma': ['scale', 'auto']
    }),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=random_state), {
        'n_estimators': n_estimators,
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'min_samples_split': [2, 5, 10], 
        'min_samples_leaf': [1, 2, 4]
    })
}

The rationale behind the chosen parameters for each model:

### General Parameters
- **random_state = 42**: 
  - This parameter ensures reproducibility by setting a seed for random number generation. Using a fixed `random_state` value like 42 ensures that the results are consistent across different runs.

### Logistic Regression
- **max_iter=1000**: 
  - Sets the maximum number of iterations for the solver to converge. A higher value like 1000 ensures the solver has enough iterations to converge, especially important for complex datasets.
- **class_weight='balanced'**: 
  - Adjusts the weights inversely proportional to class frequencies. This helps in handling imbalanced datasets by giving more weight to the minority class.
- **C**: 
  - Inverse of regularization strength. Lower values specify stronger regularization. The range [0.01, 0.1, 1, 10, 100] helps find the optimal balance between underfitting and overfitting.
- **penalty='l2'**: 
  - Specifies the norm used in the penalization. L2 (Ridge) regularization is a common choice as it tends to produce more stable models.
- **solver='lbfgs'**: 
  - An optimization algorithm suitable for small datasets and supports L2 regularization. Chosen for its efficiency.

### Decision Tree
- **criterion**: 
  - Measures the quality of a split. 'gini' and 'entropy' are the two common criteria, allowing the model to choose the best split strategy.
- **max_depth**: 
  - Limits the depth of the tree. None means the nodes are expanded until all leaves are pure or until they contain less than `min_samples_split` samples. The range [10, 20, 30, 40, 50] helps prevent overfitting.
- **min_samples_split**: 
  - Minimum number of samples required to split an internal node. Values [2, 5, 10] control how sensitive the model is to splits, helping to manage overfitting.

### Random Forest
- **n_estimators**: 
  - Number of trees in the forest. More trees usually improve performance but also increase computation. Values [30, 50, 80] provide a range for balancing performance and computation cost.
- **criterion**: 
  - Same as Decision Tree, it defines the function to measure the quality of a split.
- **max_depth**: 
  - Similar to Decision Tree, it limits the depth of each tree.
- **min_samples_split**: 
  - Same as Decision Tree.
- **min_samples_leaf**: 
  - Minimum number of samples required to be at a leaf node. Values [1, 2, 4] help ensure leaves have enough samples, which can reduce overfitting.

### XGBoost
- **n_estimators**: 
  - Same as Random Forest.
- **max_depth**: 
  - Limits the depth of each tree, similar to Decision Tree.
- **learning_rate**: 
  - Step size shrinkage used to prevent overfitting. Values [0.01, 0.1, 0.2, 0.3] help find the right balance between learning speed and accuracy.
- **subsample**: 
  - Fraction of samples used to fit each tree. Values [0.7, 0.8, 0.9, 1.0] help in reducing overfitting by introducing randomness.
- **colsample_bytree**: 
  - Fraction of features used for each tree. Values [0.7, 0.8, 0.9, 1.0] ensure some features are always sampled, adding diversity to the model.

### Support Vector Machine (SVM)
- **C**: 
  - Regularization parameter. The range [0.1, 1, 10] helps find the optimal margin between classes.
- **kernel**: 
  - Specifies the kernel type to be used in the algorithm. 'linear', 'rbf', and 'poly' provide a variety of transformation functions to try.
- **degree**: 
  - Degree of the polynomial kernel function (‘poly’). Relevant only for 'poly' kernel, values [2, 3, 4] allow testing the complexity of the polynomial.
- **gamma**: 
  - Kernel coefficient for ‘rbf’, ‘poly’, and ‘sigmoid’. Values ['scale', 'auto'] help control the influence of individual training examples.

### Gradient Boosting
- **n_estimators**: 
  - Same as Random Forest.
- **max_depth**: 
  - Limits the depth of each tree, same as Decision Tree.
- **learning_rate**: 
  - Same as XGBoost.
- **subsample**: 
  - Same as XGBoost.
- **min_samples_split**: 
  - Same as Decision Tree.
- **min_samples_leaf**: 
  - Same as Random Forest.

These parameters are chosen based on common practices and the need to balance between model complexity and computational efficiency. The ranges provided allow sufficient flexibility to find the optimal hyperparameters during the tuning process.

In [21]:
# Perform hyperparameter tuning using GridSearchCV

best_model = None
best_accuracy = 0

In [22]:
# Train models using GridSearchCV
for name, (model, params) in models_and_parameters.items():
    logger.info(f"Training {name} with GridSearchCV.")
    grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=2, error_score='raise')
    
    try:
        # Validate parameter grid
        if not params:
            raise ValueError(f"Parameter grid for {name} is empty.")
        
        # Use joblib to specify the backend
        with joblib.parallel_backend('threading'):
            grid_search.fit(x_train_sampled, y_train_sampled)
        
        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        # Perform cross-validation
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        cv_results = cross_val_score(best_estimator, X_train, y_train, cv=skf, scoring=make_scorer(roc_auc_score))

        logger.info(f'Model: {name}')
        logger.info(f'Best Estimator: {best_estimator}')
        logger.info(f'Accuracy: {accuracy}')
        logger.info(f'Classification Report:\n{report}')
        logger.info(f'Cross-Validation ROC-AUC scores: {cv_results}')
        logger.info(f'Mean ROC-AUC: {np.mean(cv_results)}')
        logger.info(f'Standard Deviation of ROC-AUC: {np.std(cv_results)}')
    
    
    except ValueError as e:
        logger.error(f"Failed to train {name} due to ValueError: {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred while training {name}: {e}")

[32m2024-07-19 08:47:11.086[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mTraining Logistic Regression with GridSearchCV.[0m


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.2s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   0.2s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   0.3s
[CV] END ....................C=100, penalty=l2, s

[32m2024-07-19 08:47:26.607[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: Logistic Regression[0m
[32m2024-07-19 08:47:26.614[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: LogisticRegression(C=0.01, class_weight='balanced', max_iter=1000,
                   random_state=42)[0m
[32m2024-07-19 08:47:26.618[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAccuracy: 1.0[0m
[32m2024-07-19 08:47:26.621[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mClassification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     17068
        True       1.00      1.00      1.00      2932

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
[0m
[32m2024-07-19 08:47:26.634[0m | [1mINF

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END criterion=gini, max_depth=None, min_samples_split=2; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=5; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=10; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=5; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=10; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=2; total time=   0.3s
[CV] END criterion=gini, max_depth=None, min_samples_split=5; total time=   0.4s
[CV] END criterion=gini, max_depth=None, min_samples_split=2; total time=   0.4s
[CV] END criterion=gini, max_depth=None, min_samples_split=10; total time=   0.2s
[CV] END .criterion=gini, max_depth=10, min_samples_split=10; total time=   0.2s
[CV] END ..criterion=gini, max_depth=10, min_samples_split=5; total time=   0.2s
[CV] END ..criterion=gini, max_depth=10, min

[32m2024-07-19 08:48:05.927[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: Decision Tree[0m
[32m2024-07-19 08:48:05.928[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: DecisionTreeClassifier(class_weight='balanced', random_state=42)[0m
[32m2024-07-19 08:48:05.929[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAccuracy: 1.0[0m
[32m2024-07-19 08:48:05.931[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mClassification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     17068
        True       1.00      1.00      1.00      2932

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
[0m
[32m2024-07-19 08:48:05.932[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=   1.4s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=   1.5s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=   1.5s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.1s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.3s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30; total time=   1.1s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30; total time= 

[32m2024-07-19 08:52:32.621[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: Random Forest[0m
[32m2024-07-19 08:52:32.622[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: RandomForestClassifier(class_weight='balanced', n_estimators=30,
                       random_state=42)[0m
[32m2024-07-19 08:52:32.623[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAccuracy: 1.0[0m
[32m2024-07-19 08:52:32.624[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mClassification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     17068
        True       1.00      1.00      1.00      2932

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
[0m
[32m2024-07-19 08:52:32.625[0m | [1mINFO   

Fitting 3 folds for each of 960 candidates, totalling 2880 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.8; total time=   5.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.8; total time=   5.8s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.7; total time=   6.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.9; total time=   6.5s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.9; total time=   6.5s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.7; total time=   7.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=30, subsample=0.7; total time=   7.0s
[CV] END colsamp

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[32m2024-07-19 09:43:28.353[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: XGBoost[0m
[32m2024-07-19 09:43:28.356[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  14.8s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=  15.6s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  16.6s
[CV] END ........C=0.1, degree=2, gamma=scale, kernel=linear; total time=  17.0s
[CV] END ...........C=0.1, degree=2, gamma=scale, kernel=rbf; total time=  16.5s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=  17.6s
[CV] END ...........C=0.1, degree=2, gamma=scale, kernel=rbf; total time=  19.4s
[CV] END ...........C=0.1, degree=2, gamma=scale, kernel=rbf; total time=  18.9s
[CV] END .........C=0.1, degree=2, gamma=auto, kernel=linear; total time=  13.1s
[CV] END .........C=0.1, degree=2, gamma=auto, kernel=linear; total time=  16.6s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=  17.8s
[CV] END .........C=0.1, degree=2, gamma=auto, 

[32m2024-07-19 09:54:18.669[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: Support Vector Machine[0m
[32m2024-07-19 09:54:18.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: SVC(C=0.1, class_weight='balanced', degree=2, kernel='linear', random_state=42)[0m
[32m2024-07-19 09:54:18.671[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAccuracy: 1.0[0m
[32m2024-07-19 09:54:18.673[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mClassification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     17068
        True       1.00      1.00      1.00      2932

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
[0m
[32m2024-07-19 09:54:18.675[0m | [1mINFO    [0m | [36m__m

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.7; total time=   4.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.7; total time=   5.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.8; total time=   5.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.7; total time=   5.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.8; total time=   6.0s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.8; total time=   6.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=30, subsample=0.9; total time=   6.

[32m2024-07-19 13:59:33.499[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mModel: Gradient Boosting[0m
[32m2024-07-19 13:59:33.501[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mBest Estimator: GradientBoostingClassifier(learning_rate=0.01, n_estimators=30, random_state=42,
                           subsample=0.7)[0m
[32m2024-07-19 13:59:33.503[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mAccuracy: 1.0[0m
[32m2024-07-19 13:59:33.505[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mClassification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     17068
        True       1.00      1.00      1.00      2932

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
[0m
[32m2024-07-19 13:59:33