# Data Loading and Exploration


In [6]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  # Import Pipeline
from sklearn.impute import SimpleImputer



import warnings
warnings.filterwarnings('ignore')



# Read the dataset


In [7]:

df = pd.read_csv('../datasets/PCOS_data.csv')


# Display basic information and the first few rows


In [8]:
print(df.info())
display(df.head())

# Generate descriptive statistics
display(df.describe())

# Check for missing values
print(df.isnull().sum())

# Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_')
print(df.columns.tolist())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1,1.0,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0,0.0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0,0.0,0,120,80,3,4,16.0,14.0,7.0


Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
count,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,...,541.0,540.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0
mean,271.0,271.0,0.327172,31.430684,59.637153,156.484835,24.307579,13.802218,73.247689,19.243993,...,0.489834,0.514815,0.247689,114.661738,76.927911,6.12939,6.641405,15.018115,15.451701,8.475915
std,156.317519,156.317519,0.469615,5.411006,11.028287,6.033545,4.055129,1.840812,4.430285,1.688629,...,0.500359,0.500244,0.43207,7.384556,5.574112,4.229294,4.436889,3.566839,3.318848,2.165381
min,1.0,1.0,0.0,20.0,31.0,137.0,12.4,11.0,13.0,16.0,...,0.0,0.0,0.0,12.0,8.0,0.0,0.0,0.0,0.0,0.0
25%,136.0,136.0,0.0,28.0,52.0,152.0,21.6,13.0,72.0,18.0,...,0.0,0.0,0.0,110.0,70.0,3.0,3.0,13.0,13.0,7.0
50%,271.0,271.0,0.0,31.0,59.0,156.0,24.2,14.0,72.0,18.0,...,0.0,1.0,0.0,110.0,80.0,5.0,6.0,15.0,16.0,8.5
75%,406.0,406.0,1.0,35.0,65.0,160.0,26.6,15.0,74.0,20.0,...,1.0,1.0,0.0,120.0,80.0,9.0,10.0,18.0,18.0,9.8
max,541.0,541.0,1.0,48.0,108.0,180.0,38.9,18.0,82.0,28.0,...,1.0,1.0,1.0,140.0,100.0,22.0,20.0,24.0,24.0,18.0


Sl. No                    0
Patient File No.          0
PCOS (Y/N)                0
 Age (yrs)                0
Weight (Kg)               0
Height(Cm)                0
BMI                       0
Blood Group               0
Pulse rate(bpm)           0
RR (breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle length(days)        0
Marraige Status (Yrs)     1
Pregnant(Y/N)             0
No. of abortions          0
  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip Ratio           0
TSH (mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit D3 (ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight gain(Y/N)          0
hair growth(Y/N)          0
Skin darkening (Y/N)      0
Hair loss(Y/N)            0
Pimples(Y/N)              0
Fast food (Y/N)     

# Data Preprocessing


In [9]:
# Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_')

# Handle numeric columns (coerce non-numeric to NaN)
numeric_columns = ['BMI', 'Age_(yrs)', 'Weight_(Kg)', 'Waist:Hip_Ratio', 'I___beta-HCG(mIU/mL)', 
                   'II____beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'AMH(ng/mL)', 
                   'Cycle_length(days)', 'Endometrium_(mm)', 'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)']

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col].replace(r'^\.+$', '', regex=True), errors='coerce')
    
# Impute missing values in numeric columns with the median
imputer = SimpleImputer(strategy='median')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Fill the single missing value in 'Marraige_Status_(Yrs)' with the median
df['Marraige_Status_(Yrs)'].fillna(df['Marraige_Status_(Yrs)'].median(), inplace=True)

# Handle categorical columns and fill missing values with mode
categorical_columns = ['PCOS_(Y/N)', 'Pregnant(Y/N)', 'Weight_gain(Y/N)', 'hair_growth(Y/N)', 
                       'Skin_darkening_(Y/N)', 'Hair_loss(Y/N)', 'Pimples(Y/N)', 
                       'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 'Blood_Group']

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encode categorical columns
labelencoder = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(labelencoder.fit_transform)


# Split the dataset


In [10]:
X = df.drop('PCOS_(Y/N)', axis=1)
y = df['PCOS_(Y/N)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Define preprocessing pipeline (imputation + scaling)


In [11]:
# Define preprocessing pipeline (imputation + scaling)
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())                   # Standard scaling of features
])

# Preprocess the training and test sets


In [12]:
# Preprocess the training and test sets
X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)


# Apply SMOTE only on the preprocessed training data


In [13]:
# Apply SMOTE only on the preprocessed training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_preprocessed, y_train)


In [14]:
# Display the class distribution after SMOTE
print(f"Class distribution after SMOTE:\n{pd.Series(y_resampled).value_counts()}")


Class distribution after SMOTE:
PCOS_(Y/N)
0    291
1    291
Name: count, dtype: int64


# Model Training 

In [15]:
# Define classifiers
classifiers = {
    'KNN (k=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (k=5)': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree (gini)': DecisionTreeClassifier(criterion='gini'),
    'Decision Tree (entropy)': DecisionTreeClassifier(criterion='entropy'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Dummy Classifier': DummyClassifier(strategy='most_frequent'),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}


In [16]:
# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    clf.fit(X_resampled, y_resampled)  # Use resampled training data
    y_pred = clf.predict(X_test_preprocessed)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f'{name} - Accuracy: {accuracy:.4f}')


KNN (k=3) - Accuracy: 0.8532
KNN (k=5) - Accuracy: 0.8807
Decision Tree (gini) - Accuracy: 0.8349
Decision Tree (entropy) - Accuracy: 0.8807
Random Forest - Accuracy: 0.9266
Naive Bayes - Accuracy: 0.7890
Dummy Classifier - Accuracy: 0.6697
XGBoost - Accuracy: 0.9266
[LightGBM] [Info] Number of positive: 291, number of negative: 291
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3611
[LightGBM] [Info] Number of data points in the train set: 582, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM - Accuracy: 0.9174
CatBoost - Accuracy: 0.9174


# Display the results


In [17]:
# Display the results
results_df = pd.DataFrame(list(results.items()), columns=['Classifier', 'Accuracy'])
print(results_df.sort_values(by='Accuracy', ascending=False))


                Classifier  Accuracy
4            Random Forest  0.926606
7                  XGBoost  0.926606
8                 LightGBM  0.917431
9                 CatBoost  0.917431
1                KNN (k=5)  0.880734
3  Decision Tree (entropy)  0.880734
0                KNN (k=3)  0.853211
2     Decision Tree (gini)  0.834862
5              Naive Bayes  0.788991
6         Dummy Classifier  0.669725


In [21]:
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Define cross-validation method


In [22]:
# Define cross-validation method
cv = StratifiedKFold(n_splits=5)

# Hyperparameter tuning


Decision Tree

In [23]:
# Hyperparameter tuning for Decision Tree
def dt_objective(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }
    dt = DecisionTreeClassifier(**params, random_state=42)
    score = cross_val_score(dt, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
    return score

study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(dt_objective, n_trials=50)
best_dt_params = study_dt.best_trial.params
print(f"Best Decision Tree params: {best_dt_params}")

[I 2024-10-15 17:03:22,533] A new study created in memory with name: no-name-3ce2dc9c-3381-4ca1-8634-392214ef5774
[I 2024-10-15 17:03:22,559] Trial 0 finished with value: 0.848894783377542 and parameters: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.848894783377542.
[I 2024-10-15 17:03:22,585] Trial 1 finished with value: 0.873017978190392 and parameters: {'criterion': 'entropy', 'max_depth': 14, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.873017978190392.
[I 2024-10-15 17:03:22,611] Trial 2 finished with value: 0.8712938402593575 and parameters: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.873017978190392.
[I 2024-10-15 17:03:22,636] Trial 3 finished with value: 0.8576186265841439 and parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with va

Best Decision Tree params: {'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 1}


XGBoost

In [24]:
# Hyperparameter tuning for XGBoost
def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    xgb = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42)
    score = cross_val_score(xgb, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
    return score

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(xgb_objective, n_trials=50)
best_xgb_params = study_xgb.best_trial.params
print(f"Best XGBoost params: {best_xgb_params}")

[I 2024-10-15 17:03:34,067] A new study created in memory with name: no-name-d9bae865-f65b-4292-b61b-82a2f7ee5eac
[I 2024-10-15 17:03:34,790] Trial 0 finished with value: 0.9159593280282936 and parameters: {'n_estimators': 320, 'max_depth': 6, 'learning_rate': 0.037439734530466534, 'subsample': 0.5463890901293114, 'colsample_bytree': 0.5226242419435136}. Best is trial 0 with value: 0.9159593280282936.
[I 2024-10-15 17:03:35,937] Trial 1 finished with value: 0.922826407309166 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.014809595173335584, 'subsample': 0.9116657018238024, 'colsample_bytree': 0.9733492812515372}. Best is trial 1 with value: 0.922826407309166.
[I 2024-10-15 17:03:36,269] Trial 2 finished with value: 0.9125405246094902 and parameters: {'n_estimators': 226, 'max_depth': 3, 'learning_rate': 0.1620805701415618, 'subsample': 0.5366956796796892, 'colsample_bytree': 0.7658875218515628}. Best is trial 1 with value: 0.922826407309166.
[I 2024-10-15 17:0

Best XGBoost params: {'n_estimators': 301, 'max_depth': 7, 'learning_rate': 0.1339132196456812, 'subsample': 0.7307670243057154, 'colsample_bytree': 0.8481798842456956}


LightGBM

In [25]:
# Hyperparameter tuning for LightGBM
def lgbm_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    lgbm = LGBMClassifier(**params, random_state=42)
    score = cross_val_score(lgbm, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
    return score

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(lgbm_objective, n_trials=50)
best_lgbm_params = study_lgbm.best_trial.params
print(f"Best LightGBM params: {best_lgbm_params}")

[I 2024-10-15 17:03:59,794] A new study created in memory with name: no-name-f4a79b84-2649-4279-919e-01ff5a6b1df3


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:00,107] Trial 0 finished with value: 0.9227969348659004 and parameters: {'n_estimators': 491, 'max_depth': 4, 'learning_rate': 0.2463802831204772, 'num_leaves': 148, 'subsample': 0.5671355579303808}. Best is trial 0 with value: 0.9227969348659004.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:00,415] Trial 1 finished with value: 0.9331122900088417 and parameters: {'n_estimators': 434, 'max_depth': 5, 'learning_rate': 0.21177786800557696, 'num_leaves': 101, 'subsample': 0.8450801631022764}. Best is trial 1 with value: 0.9331122900088417.
[I 2024-10-15 17:04:00,604] Trial 2 finished with value: 0.9194076038903625 and parameters: {'n_estimators': 155, 'max_depth': 7, 'learning_rate': 0.23656625160099828, 'num_leaves': 70, 'subsample': 0.9059163980405882}. Best is trial 1 with value: 0.9331122900088417.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:00,769] Trial 3 finished with value: 0.9211170055997642 and parameters: {'n_estimators': 218, 'max_depth': 3, 'learning_rate': 0.05997895332628173, 'num_leaves': 131, 'subsample': 0.8841455711344229}. Best is trial 1 with value: 0.9331122900088417.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:01,426] Trial 4 finished with value: 0.9296934865900383 and parameters: {'n_estimators': 376, 'max_depth': 9, 'learning_rate': 0.0453765102879574, 'num_leaves': 80, 'subsample': 0.7656288991462747}. Best is trial 1 with value: 0.9331122900088417.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:01,718] Trial 5 finished with value: 0.9331417624521071 and parameters: {'n_estimators': 486, 'max_depth': 6, 'learning_rate': 0.2736965825794872, 'num_leaves': 89, 'subsample': 0.5199193409395533}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:01,914] Trial 6 finished with value: 0.9228558797524314 and parameters: {'n_estimators': 102, 'max_depth': 8, 'learning_rate': 0.08034248478788046, 'num_leaves': 108, 'subsample': 0.5372212521171067}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:02,184] Trial 7 finished with value: 0.926259946949602 and parameters: {'n_estimators': 494, 'max_depth': 3, 'learning_rate': 0.2292421533472183, 'num_leaves': 74, 'subsample': 0.6912843748941573}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:02,518] Trial 8 finished with value: 0.93140288829944 and parameters: {'n_estimators': 189, 'max_depth': 8, 'learning_rate': 0.09192180409327189, 'num_leaves': 141, 'subsample': 0.5189755997964665}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:02,948] Trial 9 finished with value: 0.9279988211022692 and parameters: {'n_estimators': 256, 'max_depth': 10, 'learning_rate': 0.04681485790659807, 'num_leaves': 53, 'subsample': 0.7082707085726225}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:03,182] Trial 10 finished with value: 0.926259946949602 and parameters: {'n_estimators': 337, 'max_depth': 6, 'learning_rate': 0.2990139929218922, 'num_leaves': 24, 'subsample': 0.62690540945443}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:03,514] Trial 11 finished with value: 0.9331270262304745 and parameters: {'n_estimators': 420, 'max_depth': 6, 'learning_rate': 0.16683170710572792, 'num_leaves': 105, 'subsample': 0.8040264756641288}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:03,871] Trial 12 finished with value: 0.926259946949602 and parameters: {'n_estimators': 411, 'max_depth': 6, 'learning_rate': 0.13381487994118454, 'num_leaves': 107, 'subsample': 0.8040579397712095}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:04,197] Trial 13 finished with value: 0.9245505452402005 and parameters: {'n_estimators': 444, 'max_depth': 5, 'learning_rate': 0.1511705255897364, 'num_leaves': 118, 'subsample': 0.9968616507721226}. Best is trial 5 with value: 0.9331417624521071.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:04,481] Trial 14 finished with value: 0.9348364279398762 and parameters: {'n_estimators': 312, 'max_depth': 7, 'learning_rate': 0.17579173555875183, 'num_leaves': 91, 'subsample': 0.6345451899732333}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:04,705] Trial 15 finished with value: 0.9313734158561745 and parameters: {'n_estimators': 302, 'max_depth': 7, 'learning_rate': 0.28592347971567084, 'num_leaves': 56, 'subsample': 0.6165228206912345}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:05,006] Trial 16 finished with value: 0.9296934865900383 and parameters: {'n_estimators': 340, 'max_depth': 8, 'learning_rate': 0.18215502024736158, 'num_leaves': 86, 'subsample': 0.614697438892397}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:05,285] Trial 17 finished with value: 0.9296787503684054 and parameters: {'n_estimators': 253, 'max_depth': 5, 'learning_rate': 0.11916832442759231, 'num_leaves': 92, 'subsample': 0.6609996812509722}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:05,828] Trial 18 finished with value: 0.9279693486590039 and parameters: {'n_estimators': 372, 'max_depth': 7, 'learning_rate': 0.01489692359331693, 'num_leaves': 60, 'subsample': 0.5067659055547333}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:06,104] Trial 19 finished with value: 0.9331270262304743 and parameters: {'n_estimators': 288, 'max_depth': 9, 'learning_rate': 0.183429370109202, 'num_leaves': 38, 'subsample': 0.5679372523614199}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:06,383] Trial 20 finished with value: 0.9279546124373711 and parameters: {'n_estimators': 471, 'max_depth': 4, 'learning_rate': 0.2716382978963291, 'num_leaves': 119, 'subsample': 0.5694082715370572}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:06,681] Trial 21 finished with value: 0.9194076038903625 and parameters: {'n_estimators': 403, 'max_depth': 6, 'learning_rate': 0.19263803737169574, 'num_leaves': 95, 'subsample': 0.7451127097674318}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:07,022] Trial 22 finished with value: 0.9331270262304745 and parameters: {'n_estimators': 462, 'max_depth': 6, 'learning_rate': 0.16743257893673016, 'num_leaves': 120, 'subsample': 0.7903566156819485}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:07,404] Trial 23 finished with value: 0.9296640141467728 and parameters: {'n_estimators': 402, 'max_depth': 7, 'learning_rate': 0.11447092486692366, 'num_leaves': 90, 'subsample': 0.7254204844348342}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:07,642] Trial 24 finished with value: 0.917683465959328 and parameters: {'n_estimators': 342, 'max_depth': 5, 'learning_rate': 0.25911623897868663, 'num_leaves': 70, 'subsample': 0.6814652719240084}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:07,917] Trial 25 finished with value: 0.9194076038903626 and parameters: {'n_estimators': 435, 'max_depth': 4, 'learning_rate': 0.21969324136620144, 'num_leaves': 107, 'subsample': 0.8239344632134827}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:08,297] Trial 26 finished with value: 0.9279546124373711 and parameters: {'n_estimators': 499, 'max_depth': 6, 'learning_rate': 0.15741922590639187, 'num_leaves': 130, 'subsample': 0.6607758773469193}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:08,604] Trial 27 finished with value: 0.9228411435307986 and parameters: {'n_estimators': 371, 'max_depth': 8, 'learning_rate': 0.2082595950426544, 'num_leaves': 99, 'subsample': 0.9290016890090895}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info]

[I 2024-10-15 17:04:08,947] Trial 28 finished with value: 0.9297082228116709 and parameters: {'n_estimators': 303, 'max_depth': 7, 'learning_rate': 0.1396619406394946, 'num_leaves': 83, 'subsample': 0.5997669721913393}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:09,237] Trial 29 finished with value: 0.9228411435307986 and parameters: {'n_estimators': 469, 'max_depth': 5, 'learning_rate': 0.2563348743386129, 'num_leaves': 78, 'subsample': 0.5482571694549058}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:09,564] Trial 30 finished with value: 0.9296787503684056 and parameters: {'n_estimators': 428, 'max_depth': 9, 'learning_rate': 0.19800168073908198, 'num_leaves': 63, 'subsample': 0.7707546949107996}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:09,901] Trial 31 finished with value: 0.9279840848806366 and parameters: {'n_estimators': 472, 'max_depth': 6, 'learning_rate': 0.17246038139916806, 'num_leaves': 118, 'subsample': 0.7931079331278976}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:10,244] Trial 32 finished with value: 0.9296934865900383 and parameters: {'n_estimators': 455, 'max_depth': 6, 'learning_rate': 0.16956111781121422, 'num_leaves': 130, 'subsample': 0.8562470417115161}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:10,569] Trial 33 finished with value: 0.9245652814618331 and parameters: {'n_estimators': 466, 'max_depth': 7, 'learning_rate': 0.24099305865428622, 'num_leaves': 113, 'subsample': 0.8483861032076669}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:10,954] Trial 34 finished with value: 0.9296787503684056 and parameters: {'n_estimators': 419, 'max_depth': 5, 'learning_rate': 0.11349443739472226, 'num_leaves': 102, 'subsample': 0.7434534055156522}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[Li

[I 2024-10-15 17:04:11,284] Trial 35 finished with value: 0.9331417624521071 and parameters: {'n_estimators': 379, 'max_depth': 6, 'learning_rate': 0.1532680262733664, 'num_leaves': 143, 'subsample': 0.8809122638676553}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:11,628] Trial 36 finished with value: 0.9227969348659004 and parameters: {'n_estimators': 355, 'max_depth': 4, 'learning_rate': 0.09020419535327241, 'num_leaves': 145, 'subsample': 0.9537530147769395}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:11,975] Trial 37 finished with value: 0.9279840848806366 and parameters: {'n_estimators': 394, 'max_depth': 7, 'learning_rate': 0.14335723008451834, 'num_leaves': 137, 'subsample': 0.9066812414605164}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:12,259] Trial 38 finished with value: 0.9313881520778071 and parameters: {'n_estimators': 322, 'max_depth': 8, 'learning_rate': 0.22000440843016597, 'num_leaves': 46, 'subsample': 0.8870183740622145}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:12,497] Trial 39 finished with value: 0.9210875331564987 and parameters: {'n_estimators': 272, 'max_depth': 5, 'learning_rate': 0.20448932308113496, 'num_leaves': 149, 'subsample': 0.8625877962529994}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:12,834] Trial 40 finished with value: 0.9262452107279694 and parameters: {'n_estimators': 222, 'max_depth': 6, 'learning_rate': 0.06531715116959613, 'num_leaves': 125, 'subsample': 0.8224757625301089}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:13,159] Trial 41 finished with value: 0.9279693486590037 and parameters: {'n_estimators': 386, 'max_depth': 6, 'learning_rate': 0.15821712292633952, 'num_leaves': 98, 'subsample': 0.7879638984829007}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:13,542] Trial 42 finished with value: 0.9296640141467728 and parameters: {'n_estimators': 488, 'max_depth': 6, 'learning_rate': 0.13066172446832183, 'num_leaves': 136, 'subsample': 0.8221001351414243}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:13,897] Trial 43 finished with value: 0.9245505452402003 and parameters: {'n_estimators': 447, 'max_depth': 7, 'learning_rate': 0.1697008636611905, 'num_leaves': 124, 'subsample': 0.7144642582251914}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-10-15 17:04:14,085] Trial 44 finished with value: 0.9279840848806366 and parameters: {'n_estimators': 115, 'max_depth': 6, 'learning_rate': 0.09868894860745006, 'num_leaves': 112, 'subsample': 0.5338988870993214}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

[I 2024-10-15 17:04:14,402] Trial 45 finished with value: 0.9245358090185677 and parameters: {'n_estimators': 426, 'max_depth': 7, 'learning_rate': 0.1876535951799126, 'num_leaves': 103, 'subsample': 0.7674972554705426}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [In

[I 2024-10-15 17:04:14,687] Trial 46 finished with value: 0.9279693486590039 and parameters: {'n_estimators': 452, 'max_depth': 5, 'learning_rate': 0.23073819360996212, 'num_leaves': 88, 'subsample': 0.6411728832948704}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:14,937] Trial 47 finished with value: 0.9245505452402003 and parameters: {'n_estimators': 362, 'max_depth': 3, 'learning_rate': 0.1520702575394053, 'num_leaves': 78, 'subsample': 0.9571264971418689}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:15,369] Trial 48 finished with value: 0.9296787503684056 and parameters: {'n_estimators': 485, 'max_depth': 8, 'learning_rate': 0.12320893525827502, 'num_leaves': 141, 'subsample': 0.5828924967335557}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2024-10-15 17:04:15,637] Trial 49 finished with value: 0.9296787503684056 and parameters: {'n_estimators': 319, 'max_depth': 5, 'learning_rate': 0.1647083397921142, 'num_leaves': 112, 'subsample': 0.8871232450727171}. Best is trial 14 with value: 0.9348364279398762.


[LightGBM] [Info] Number of positive: 233, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2749
[LightGBM] [Info] Number of data points in the train set: 466, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best LightGBM params: {'n_estimators': 312, 'max_depth': 7, 'learning_rate': 0.17579173555875183, 'num_leaves': 91, 'subsample': 0.6345451899732333}


CatBoost

In [26]:
# Hyperparameter tuning for CatBoost
def catboost_objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 500),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10),
    }
    catboost = CatBoostClassifier(**params, verbose=0, random_state=42)
    score = cross_val_score(catboost, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
    return score

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(catboost_objective, n_trials=50)
best_catboost_params = study_catboost.best_trial.params
print(f"Best CatBoost params: {best_catboost_params}")

[I 2024-10-15 17:04:52,473] A new study created in memory with name: no-name-42aa1145-7216-420e-9e20-bbb957f0bea7
[I 2024-10-15 17:04:53,677] Trial 0 finished with value: 0.922811671087533 and parameters: {'iterations': 271, 'depth': 3, 'learning_rate': 0.2548115805989677, 'l2_leaf_reg': 6.987159567332204}. Best is trial 0 with value: 0.922811671087533.
[I 2024-10-15 17:04:54,801] Trial 1 finished with value: 0.9245063365753021 and parameters: {'iterations': 212, 'depth': 4, 'learning_rate': 0.0870348933142444, 'l2_leaf_reg': 2.3302598026919554}. Best is trial 1 with value: 0.9245063365753021.
[I 2024-10-15 17:04:55,791] Trial 2 finished with value: 0.9313881520778071 and parameters: {'iterations': 197, 'depth': 4, 'learning_rate': 0.27777813671265655, 'l2_leaf_reg': 2.8046743133751897}. Best is trial 2 with value: 0.9313881520778071.
[I 2024-10-15 17:04:56,795] Trial 3 finished with value: 0.9313734158561744 and parameters: {'iterations': 186, 'depth': 4, 'learning_rate': 0.1219756391

Best CatBoost params: {'iterations': 210, 'depth': 5, 'learning_rate': 0.23782103801233156, 'l2_leaf_reg': 6.494610445280222}


 Random Forest

In [27]:
# Hyperparameter tuning for Random Forest
def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    rf = RandomForestClassifier(**params, random_state=42)
    score = cross_val_score(rf, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
    return score

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(rf_objective, n_trials=50)
best_rf_params = study_rf.best_trial.params
print(f"Best Random Forest params: {best_rf_params}")

[I 2024-10-15 17:07:10,416] A new study created in memory with name: no-name-b964e635-374f-4b5a-9d05-40b68bb17a9f
[I 2024-10-15 17:07:11,866] Trial 0 finished with value: 0.9297082228116711 and parameters: {'n_estimators': 326, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9297082228116711.
[I 2024-10-15 17:07:12,864] Trial 1 finished with value: 0.917683465959328 and parameters: {'n_estimators': 235, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 7, 'bootstrap': False}. Best is trial 0 with value: 0.9297082228116711.
[I 2024-10-15 17:07:14,132] Trial 2 finished with value: 0.9142499263188919 and parameters: {'n_estimators': 301, 'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 7, 'bootstrap': False}. Best is trial 0 with value: 0.9297082228116711.
[I 2024-10-15 17:07:14,745] Trial 3 finished with value: 0.9176539935160625 and parameters: {'n_estimators': 160, 'max_depth': 13, 'min_samples_spl

Best Random Forest params: {'n_estimators': 136, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}


In [28]:
# Final model training with the best parameters from hyperparameter tuning
dt_best = DecisionTreeClassifier(**best_dt_params, random_state=42)
xgb_best = XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss', random_state=42)
lgbm_best = LGBMClassifier(**best_lgbm_params, random_state=42)
catboost_best = CatBoostClassifier(**best_catboost_params, verbose=0, random_state=42)
rf_best = RandomForestClassifier(**best_rf_params, random_state=42)


# Cross-validation for evaluation with tuned models

In [29]:
# Cross-validation for evaluation with tuned models
dt_score = cross_val_score(dt_best, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
xgb_score = cross_val_score(xgb_best, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
lgbm_score = cross_val_score(lgbm_best, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
catboost_score = cross_val_score(catboost_best, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()
rf_score = cross_val_score(rf_best, X_resampled, y_resampled, cv=cv, scoring='accuracy').mean()


[LightGBM] [Info] Number of positive: 233, number of negative: 232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3089
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501075 -> initscore=0.004301
[LightGBM] [Info] Start training from score 0.004301
[LightGBM] [Info] Number of positive: 232, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3087
[LightGBM] [Info] Number of data points in the train set: 465, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498925 -> initscore=-0.004301
[LightGBM] [Info] Start training from score -0.004301
[LightGBM] [Info] Number o

# final results

In [30]:

print(f"Decision Tree Final Cross-Validation Accuracy: {dt_score:.4f}")
print(f"XGBoost Final Cross-Validation Accuracy: {xgb_score:.4f}")
print(f"LightGBM Final Cross-Validation Accuracy: {lgbm_score:.4f}")
print(f"CatBoost Final Cross-Validation Accuracy: {catboost_score:.4f}")
print(f"Random Forest Final Cross-Validation Accuracy: {rf_score:.4f}")

Decision Tree Final Cross-Validation Accuracy: 0.8833
XGBoost Final Cross-Validation Accuracy: 0.9297
LightGBM Final Cross-Validation Accuracy: 0.9348
CatBoost Final Cross-Validation Accuracy: 0.9349
Random Forest Final Cross-Validation Accuracy: 0.9314


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Initialize dictionary to store all metrics
evaluation_results = []

# Evaluate each classifier and calculate metrics
for name, clf in classifiers.items():
    start_time = time.perf_counter()  # Start timer
    clf.fit(X_resampled, y_resampled)  # Train the model
    y_pred = clf.predict(X_test_preprocessed)  # Predict on the test set
    end_time = time.perf_counter()  # End timer

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    time_taken = end_time - start_time

    # Append results to the list
    evaluation_results.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Time Taken (seconds)': time_taken
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(evaluation_results)

# Sort by Accuracy (or any other metric)
results_df = results_df.sort_values(by='Accuracy', ascending=False)

# Display the results
print(results_df)


                Classifier  Accuracy  Precision    Recall  F1 Score  \
4            Random Forest  0.926606   0.929789  0.902968  0.914510   
7                  XGBoost  0.926606   0.929789  0.902968  0.914510   
8                 LightGBM  0.917431   0.922870  0.889079  0.903034   
9                 CatBoost  0.917431   0.915072  0.896119  0.904581   
1                KNN (k=5)  0.880734   0.864276  0.903919  0.873539   
3  Decision Tree (entropy)  0.862385   0.845946  0.840944  0.843346   
0                KNN (k=3)  0.853211   0.832867  0.862253  0.842029   
2     Decision Tree (gini)  0.853211   0.837059  0.827055  0.831660   
5              Naive Bayes  0.788991   0.784535  0.821347  0.781639   
6         Dummy Classifier  0.669725   0.334862  0.500000  0.401099   

   Time Taken (seconds)  
4              0.123040  
7              0.046196  
8              0.049433  
9              1.783315  
1              0.011086  
3              0.006524  
0              0.012690  
2              0.006006  
5              0.000555  
6              0.000133

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Initialize dictionary for cross-validation results
cv_results = []

# Cross-validation setup
cv = StratifiedKFold(n_splits=5)

# Perform cross-validation for each classifier
for name, clf in classifiers.items():
    start_time = time.perf_counter()
    scores = cross_val_score(clf, X_resampled, y_resampled, cv=cv, scoring='accuracy')
    end_time = time.perf_counter()

    # Calculate mean accuracy and time taken
    mean_accuracy = scores.mean()
    time_taken = end_time - start_time

    # Append results to the list
    cv_results.append({
        'Classifier': name,
        'CV Mean Accuracy': mean_accuracy,
        'Time Taken (seconds)': time_taken
    })

# Convert to DataFrame and display results
cv_results_df = pd.DataFrame(cv_results).sort_values(by='CV Mean Accuracy', ascending=False)
print("\nCross-Validation Results:")
print(cv_results_df)


Cross-Validation Results:
                Classifier  CV Mean Accuracy  Time Taken (seconds)
4            Random Forest          0.931418              0.457998
8                 LightGBM          0.926260              0.213678
9                 CatBoost          0.926260              8.211299
7                  XGBoost          0.910816              0.242083
0                KNN (k=3)          0.893634              0.045759
3  Decision Tree (entropy)          0.878146              0.026417
1                KNN (k=5)          0.874786              0.039173
2     Decision Tree (gini)          0.864486              0.027069
5              Naive Bayes          0.828235              0.005399
6         Dummy Classifier          0.498291              0.003393

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

# Define hyperparameter grids and distributions
param_grid = {
    'KNN (k=3)': {'n_neighbors': [3, 5, 7, 10]},
    'KNN (k=5)': {'n_neighbors': [3, 5, 7, 10]},
    'Decision Tree (gini)': {'max_depth': [None, 10, 20, 30], 'criterion': ['gini']},
    'Decision Tree (entropy)': {'max_depth': [None, 10, 20, 30], 'criterion': ['entropy']},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'LightGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'num_leaves': [31, 62]},
    'CatBoost': {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'depth': [4, 6, 8]}
}

param_dist = {
    'KNN (k=3)': {'n_neighbors': randint(1, 20)},
    'KNN (k=5)': {'n_neighbors': randint(1, 20)},
    'Decision Tree (gini)': {'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    'Decision Tree (entropy)': {'max_depth': [None, 10, 20, 30], 'criterion': ['gini', 'entropy']},
    'Random Forest': {'n_estimators': randint(10, 300), 'max_depth': [None, 10, 20, 30]},
    'XGBoost': {'n_estimators': randint(50, 300), 'learning_rate': uniform(0.01, 0.2), 'max_depth': randint(3, 10)},
    'LightGBM': {'n_estimators': randint(50, 300), 'learning_rate': uniform(0.01, 0.2), 'num_leaves': randint(20, 100)},
    'CatBoost': {'iterations': randint(10, 300), 'learning_rate': uniform(0.01, 0.2), 'depth': randint(3, 10)}
}

# Initialize results for both searches
grid_search_results = []
random_search_results = []

# Perform hyperparameter tuning
for name, clf in classifiers.items():
    if name in param_grid:
        # Grid Search
        grid_search = GridSearchCV(clf, param_grid[name], cv=3, scoring='accuracy', verbose=1)
        grid_search.fit(X_resampled, y_resampled)
        grid_search_results.append({
            'Classifier': name,
            'Best Params': grid_search.best_params_,
            'Best Accuracy': grid_search.best_score_
        })

        # Randomized Search
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist[name], n_iter=20, cv=3, scoring='accuracy', random_state=42, verbose=1)
        random_search.fit(X_resampled, y_resampled)
        random_search_results.append({
            'Classifier': name,
            'Best Params': random_search.best_params_,
            'Best Accuracy': random_search.best_score_
        })

# Convert to DataFrames and display
grid_search_df = pd.DataFrame(grid_search_results)
random_search_df = pd.DataFrame(random_search_results)

print("\nGrid Search Results:")
print(grid_search_df)

print("\nRandomized Search Results:")
print(random_search_df)


Grid Search Results:
                Classifier                                        Best Params  \
0                KNN (k=3)                                 {'n_neighbors': 3}   
1                KNN (k=5)                                 {'n_neighbors': 3}   
2     Decision Tree (gini)             {'criterion': 'gini', 'max_depth': 10}   
3  Decision Tree (entropy)          {'criterion': 'entropy', 'max_depth': 30}   
4            Random Forest             {'max_depth': 10, 'n_estimators': 100}   
5                  XGBoost  {'learning_rate': 0.1, 'max_depth': 3, 'n_esti...   
6                 LightGBM  {'learning_rate': 0.1, 'n_estimators': 50, 'nu...   
7                 CatBoost  {'depth': 6, 'iterations': 200, 'learning_rate...   

   Best Accuracy  
0       0.883162  
1       0.883162  
2       0.871134  
3       0.883162  
4       0.924399  
5       0.915808  
6       0.915808  
7       0.934708  

Randomized Search Results:
                Classifier                                        Best Params  \
0                KNN (k=3)                                 {'n_neighbors': 1}   
1                KNN (k=5)                                 {'n_neighbors': 1}   
2     Decision Tree (gini)          {'max_depth': 20, 'criterion': 'entropy'}   
3  Decision Tree (entropy)          {'max_depth': 30, 'criterion': 'entropy'}   
4            Random Forest              {'max_depth': 20, 'n_estimators': 97}   
5                  XGBoost  {'learning_rate': 0.15639878836228102, 'max_de...   
6                 LightGBM  {'learning_rate': 0.08327236865873834, 'n_esti...   
7                 CatBoost  {'depth': 6, 'iterations': 280, 'learning_rate...   

   Best Accuracy  
0       0.903780  
1       0.903780  
2       0.893471  
3       0.884880  
4       0.926117  
5       0.915808  
6       0.919244  
7       0.927835  

In [None]:
final_results = []

for model_info in grid_search_results + random_search_results:
    name = model_info['Classifier']
    params = model_info['Best Params']

    # Create a new instance of the model with the best parameters
    if "CatBoost" in name:
        clf = CatBoostClassifier(**params, verbose=0, random_state=42)
    elif "XGBoost" in name:
        clf = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42)
    elif "LightGBM" in name:
        clf = LGBMClassifier(**params, random_state=42)
    elif "Random Forest" in name:
        clf = RandomForestClassifier(**params, random_state=42)
    elif "Decision Tree" in name:
        clf = DecisionTreeClassifier(**params, random_state=42)
    elif "KNN" in name:
        clf = KNeighborsClassifier(**params)
    elif "Naive Bayes" in name:
        clf = GaussianNB()
    elif "Dummy" in name:
        clf = DummyClassifier(**params)
    else:
        continue

    # Train and evaluate the model
    start_time = time.perf_counter()
    clf.fit(X_resampled, y_resampled)
    y_pred = clf.predict(X_test_preprocessed)
    end_time = time.perf_counter()

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    time_taken = end_time - start_time

    final_results.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Time Taken (seconds)': time_taken
    })

# Convert to DataFrame and display
final_results_df = pd.DataFrame(final_results).sort_values(by='Accuracy', ascending=False)
print("\nFinal Results:")
print(final_results_df)


Final Results:
                 Classifier  Accuracy  Precision    Recall  F1 Score  \
6                  LightGBM  0.926606   0.922549  0.910008  0.915830   
12            Random Forest  0.926606   0.929789  0.902968  0.914510   
13                  XGBoost  0.926606   0.929789  0.902968  0.914510   
4             Random Forest  0.917431   0.915072  0.896119  0.904581   
7                  CatBoost  0.917431   0.915072  0.896119  0.904581   
5                   XGBoost  0.908257   0.907670  0.882230  0.893137   
14                 LightGBM  0.908257   0.901176  0.889269  0.894788   
15                 CatBoost  0.899083   0.888031  0.882420  0.885120   
3   Decision Tree (entropy)  0.871560   0.852483  0.861872  0.856794   
8                 KNN (k=3)  0.871560   0.852483  0.861872  0.856794   
9                 KNN (k=5)  0.871560   0.852483  0.861872  0.856794   
10     Decision Tree (gini)  0.871560   0.852483  0.861872  0.856794   
11  Decision Tree (entropy)  0.871560   0.852483  0.861872  0.856794   
0                 KNN (k=3)  0.853211   0.832867  0.862253  0.842029   
1                 KNN (k=5)  0.853211   0.832867  0.862253  0.842029   
2      Decision Tree (gini)  0.853211   0.832283  0.841134  0.836336   

    Time Taken (seconds)  
6               0.027233  
12              0.111890  
13              0.054412  
4               0.119388  
7               0.391472  
5               0.038196  
14              0.111108  
15              0.496564  
3               0.007931  
8               0.009547  
9               0.011342  
10              0.006300  
11              0.006185  
0               0.011697  
1               0.011528  
2               0.006733  