## Example of wrong treatment of train and test set
The data for this example is taken from here: https://www.kaggle.com/datasets/wenruliu/adult-income-dataset

In [35]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) 
import pandas as pd
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)
# pd.set_option('display.max_colwidth', -1)
import numpy as np
import re

# Unblanaced dataset
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SMOTEN, SVMSMOTE, BorderlineSMOTE, ADASYN

# modeling utilities
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer # enables sklearn.impute.IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.compose import make_column_transformer, ColumnTransformer

# Additional models
from lightgbm import LGBMClassifier

In [36]:
columns = ["age","workclass" ,"final-weight","education","education-num","marital-status","occupation", "relationship", "race", "sex", "capital-gain",
            "capital-loss", "hours-per-week", "native-country", "salary" ]

# Reading datasets
train_df = pd.read_csv('../data/adult.data', names=columns, header=None, skipinitialspace = True)
test_df = pd.read_csv('../data/adult.test', names=columns, header=None, skipinitialspace = True)
test_df["salary"] = test_df["salary"].apply(lambda x: x.replace('.', ''))

print("Training set:") #, train_df.columns)
print(train_df.sample(frac=1).tail(3))

print("Test set:") #, test_df.columns)
print(test_df.sample(frac=1).tail(3))

Training set:
       age workclass  final-weight education  education-num marital-status     occupation   relationship   race     sex  capital-gain  capital-loss  hours-per-week native-country salary
32504   50   Private        208630   Masters             14       Divorced          Sales  Not-in-family  White  Female             0             0              50  United-States   >50K
21760   41   Private        392167      10th              6       Divorced          Sales  Not-in-family  White    Male             0             0              48  United-States  <=50K
1040    90   Private        137018   HS-grad              9  Never-married  Other-service  Not-in-family  White  Female             0             0              40  United-States  <=50K
Test set:
       age workclass  final-weight   education  education-num      marital-status         occupation relationship   race   sex  capital-gain  capital-loss  hours-per-week native-country salary
5814    48   Private        323798  Ass

### some of the categorical variables are amended
The following procedures have to be profen to be beneficial in many example notebooks in the internet

In [37]:
education_map = {'Preschool': "elem_school", '1st-4th':"elem_school",'5th-6th':"elem_school", 
                 "mid_school":'7th-8th', 
                 '9th':"high_school_lower",'10th':"high_school_lower", '11th':"high_school_lower", '12th':"high_school_lower",
                 'HS-grad':"high_school_upper", 'Assoc-voc':"high_school_upper", 'Assoc-acdm':"high_school_upper", 'Some-college':"high_school_upper", 
                  'Bachelors':"graduate", 'Masters':'graduate', 'Prof-school':"post_grad", 'Doctorate':'post_grad'}
train_df['education'] = train_df['education'].map(education_map)
test_df['education'] = test_df['education'].map(education_map)

In [38]:
employed_map =  {'Self-emp-inc': "self_employed_expert", 'Self-emp-not-inc':"self_employed", 'Private':"self_employed", 
                 'Local-gov':"public_servant", 'State-gov':'public_servant','Federal-gov': 'public_servant',
                 "Withoug-pay":"unemployed", 'Never-worked':'unemployed'}
train_df['workclass'] = train_df['workclass'].map(employed_map)
test_df['workclass'] = test_df['workclass'].map(employed_map)

In [39]:
occupation_map = {"Prof-specialty":"executives", "Exec-managerial":"executives", 
                  "Protective-serv":"experts", "Tech-support":"experts", "Sales":"experts", "Craft-repair":"experts", "Transport-moving":"experts",
                  "Adm-clerical":"technicians", "Machine-op-inspct":"technicians", "Farming-fishing":"technicians", "Armed-Forces":"technicians",
                  "Priv-house-serv":"services", "Other-service":"services", "Handlers-cleaners":"services"}
train_df["occupation"] = train_df["occupation"].map(occupation_map)
test_df["occupation"] = test_df["occupation"].map(occupation_map)

In [40]:
marital_map = {'Married-civ-spouse':"couple",'Married-AF-spouse':"couple", 
               'Separated':"single",'Divorced':"single", 'Married-spouse-absent':"single", 'Widowed':"single", 'Never-married':"single"}
train_df["marital-status"] = train_df["marital-status"].map(marital_map)
test_df["marital-status"] = test_df["marital-status"].map(marital_map)


In [41]:
family_map = {"Wife":"family_strong", "Husband":"family_strong", 
              "Other-relative":"family_weak", "Own-child":"family_weak", 
              "Unmarried":"family_none", "Not-in-family":"family_none"}
train_df["relationship"] =  train_df["relationship"].map(family_map)
test_df["relationship"] =  test_df["relationship"].map(family_map)

In [42]:
gdp_map = {"France":"GDP_high",  "Ireland":"GDP_high",  "United-States":"GDP_high",  "Holand-Netherlands":"GDP_high",  "Canada":"GDP_high",  "Germany":"GDP_high",  "Hong":"GDP_high",  "England":"GDP_high",  "Japan":"GDP_high",  "Scotland":"GDP_high", 
           "Italy":"GDP_mid",  "South":"GDP_mid",  "Puerto-Rico":"GDP_mid",  "Taiwan":"GDP_mid",  "Portugal":"GDP_mid", "Greece":"GDP_mid", "Hungary":"GDP_mid", "Poland":"GDP_mid", "Trinadad&Tobago":"GDP_mid", "China":"GDP_mid", 
           "Mexico":"GDP_low", "Dominican-Republic":"GDP_low", "Thailand":"GDP_low", "Peru":"GDP_low", "Columbia":"GDP_low", 
           "Ecuador":"GDP_low", "Jamaica":"GDP_low", "Guatemala":"GDP_low", "El-Salvador":"GDP_low", "Vietnam":"GDP_low", "Philippines":"GDP_low",
           "Laos":"GDP_low", "Honduras":"GDP_low", "India":"GDP_low", 
           "Nicaragua":"GDP_low", "Haiti":"GDP_low", "Cambodia":"GDP_low", "Iran":"GDP_low", "Yugoslavia":"GDP_low", 
           "Outlying-US(Guam-USVI-etc)":"GDP_low", "Cuba":"GDP_low"}
train_df["native-country"] =  train_df["native-country"].map(gdp_map)
test_df["native-country"] =  test_df["native-country"].map(gdp_map)


### Now, a pipeline is built for applying some transformation to the data

In [43]:
from imblearn.pipeline import Pipeline

num_feats_stdscale = ['hours-per-week', 'capital-gain', 'capital-loss']
ordinal_feats = ['education']
education_rank = [["not_known","elem_school", "mid_school", "high_school_lower", "high_school_upper", "graduate", "post_grad"]]
cat_feats_ohe = ['workclass', 'marital-status', 'occupation', 'relationship', 'native-country','race', 'sex']
num_feats_min_max = ['age']

numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer()),
    ('scaler', StandardScaler())
])

ohe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
])


ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('ordinal', OrdinalEncoder(categories=education_rank, handle_unknown='use_encoded_value', unknown_value=-1))
])

min_max_transformer = MinMaxScaler()

pipeline = Pipeline(steps =[ 
    ('columnT', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_feats_stdscale),
            ('cat', ohe, cat_feats_ohe),
            ('ordinal', ordinal_transformer, ordinal_feats),
            ('min_max_standardizer', min_max_transformer, num_feats_min_max)
            ]))
])


## these are the two first mistakes:
### 1. The pipeline is fit on the whole train data and not within the cross-validation framework on the training-folds only. 
 - This is against the rule that train-folds and hold-out-folds should be independent. The pipline should be passed to a cross-validation method that handles its correct application to the train and hold-out folds.
 - test data should be just transformed with the fitted pipeline by calling `pipeline.transform()` .

In [44]:
processed_train = pipeline.fit_transform(train_df.drop("salary", axis=1))
processed_test = pipeline.fit_transform(test_df.drop("salary", axis=1))

In [45]:
categorical_feature_mask = np.array([False, False, False,  
                                     True, True, True, True, True, True, True, True, True, True, True, True, True, 
                                     True, True, True, True, True, True, True, True, True, True, True, True, True, 
                                     True, 
                                    False])
categorical_feature_mask.shape

(31,)

In [46]:
processed_train.shape

(32561, 31)

### 2. Upsampling is done before splitting data into train and hold-out folds:
  - up-sampline (`SMOTENC`) is applied to all data - not only the trainings-fold within the cross-validation. We generate dependencies between data records. This will give a too good a result in the cross-validation.

In [47]:
#categorical_feature_mask = (train_df.drop("salary", axis=1).dtypes == object).values

smote = SMOTENC(categorical_features = categorical_feature_mask, sampling_strategy="minority")
X_balanced_train_df, y_balanced_train_df = smote.fit_resample(processed_train, train_df["salary"])
train_y = (y_balanced_train_df == '>50K')
test_y = (test_df['salary'] == '>50k')


In [48]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


models_dict_list = [
            {"name": "Lgbm", "model": LGBMClassifier(objective='binary', boosting_type='gbdt',
                                                    ),                      # WORKS
                    "params": {'n_estimators' : Integer(200,1000),
                                'learning_rate': Real(1e-3, 0.1, prior='log-uniform'),
                                "subsample_freq" : Integer(0, 4),
                               "num_leaves": Integer(15,25),
                                "max_depth": Integer(2, 10),
                               "subsample": Real(0.7,1.0),
                               "colsample_bytree": Real(0.4, 1.0),
                              'min_child_samples':  Integer(2,20)
                              } },            
            {"name": "LogisticReg", "model": LogisticRegression(max_iter=500, solver = 'liblinear'), # WORKS
                    "params": {'C': Real(1e-6, 1e+6, prior='log-uniform'), 
                               'penalty' : Categorical(['l1', 'l2']) } }
            ]

### Finding the best model with the best parameters:
 - a Grid-Search is run over the training-data to find the best parameters and the best model
 - the final result is evaluated on the test-set

Instead of passing a single algorithm to the BayesSearchCV class, the pipeline should be passed. The pipline should include the preprocessing steps, the upsampling and the final algorithm.

In [49]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, precision_recall_curve,  confusion_matrix, roc_curve, auc, precision_recall_fscore_support
results_df = pd.DataFrame(columns=['Model', 'precision', 'recall', 'f1-score', 'accuracy', 'AUC score', 'Best params'])

N_folds = 5

for model_dict in models_dict_list:
    
    model_name = model_dict["name"]    
    if model_dict["params"] == {}:
        continue
    
    # Define search grid
    search = BayesSearchCV(estimator = model_dict["model"],
                           search_spaces = model_dict["params"],
                           n_iter = 10,
                           scoring = 'f1', # 'f1', f1_weighted', 'roc_auc', 'accuracy'
                           n_jobs = 6, # change based on how many cpus available (8 for me)
                           n_points = 6, # change based on how many cpus available (8 for me)
                           cv = N_folds, # default: StratifiedKFold for binary labels
                           refit = True,
                           verbose = 0,
                           error_score = 'raise', 
                           return_train_score  = True,        
                            )
    
    # Training model
    print("========== Running model: "+model_name, end='')
    _ = search.fit(X_balanced_train_df, train_y)
    
    # Print results
    print("Best validation score: ", round(search.best_score_, 3))
    print("Best search parameters: ", search.best_params_)
    
    best_model = search.best_estimator_
    cv_results = search.cv_results_
    
    y_true = test_y.to_numpy()
    y_pred = best_model.predict(processed_test) 
    # Evauate metrics for weighted support
    accuracy = accuracy_score(y_true, y_pred)
    P, R, F1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    # Evaluate probabilities and auc_score

    y_predict_proba = best_model.predict_proba(processed_test)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba)    
    auc_score = auc(fpr, tpr)
    
    # Store parameters
    results_df.loc[len(results_df.index)] = [model_name, round(P, 3), round(R, 3),  round(F1, 3), round(accuracy, 3), round(auc_score, 3), search.best_params_]
    
print("=== BEST MODEL RESULTS SUMMARY ===")
results_df = results_df.set_index('Model')
results_df.sort_values("f1-score", inplace = True, ascending=False)
results_df  

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1069
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1065
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0





[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 24720, number of negative: 24720
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 49440, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best validation score:  0.882
Best search parameters:  OrderedDict([('colsample_bytree', 0.4966087588802698), ('learning_rate', 0.04165910577900045), ('max_depth', 8), ('min_child_samples', 7), ('n_estimators', 824), ('num_leaves', 23), ('subsample', 0.919203598391497), ('subsample_freq', 0)])

  _warn_prf(average, modifier, msg_start, len(result))


Best validation score:  0.823
Best search parameters:  OrderedDict([('C', 199.88305921964735), ('penalty', 'l1')])
=== BEST MODEL RESULTS SUMMARY ===


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,precision,recall,f1-score,accuracy,AUC score,Best params
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogisticReg,1.0,0.635,0.777,0.635,,"{'C': 199.88305921964735, 'penalty': 'l1'}"
Lgbm,1.0,0.374,0.544,0.374,,"{'colsample_bytree': 0.4966087588802698, 'lear..."


### see the result: logistic regression is better than Light-GBM



### next, the mistakes from above are not repeated:
  - up-sampling (`SMOTENC`) is part of the pipeline right now - it gets trained on training folds and applied to the hold-out fold
  - the whole pipeline is fit within the cross-validation framework and applied to the hold-out set
  - the same is true for the test set: just the `.predict()`-method of the pipeline is applied; not `.fit_transform()`

In [50]:
models_dict_list = [
            {"name": "Lgbm", "model": LGBMClassifier(objective='binary', boosting_type='gbdt',
                                                    ),                      # WORKS
                    "params": {'clf__n_estimators' : Integer(200,1000),
                                'clf__learning_rate': Real(1e-3, 0.1, prior='log-uniform'),
                                "clf__subsample_freq" : Integer(0, 4),
                               "clf__num_leaves": Integer(15,25),
                                "clf__max_depth": Integer(2, 10),
                               "clf__subsample": Real(0.7,1.0),
                               "clf__colsample_bytree": Real(0.4, 1.0),
                              'clf__min_child_samples':  Integer(2,20)
                              } },            
            {"name": "LogisticReg", "model": LogisticRegression(max_iter=500, solver = 'liblinear'), # WORKS
                    "params": {'clf__C': Real(1e-6, 1e+6, prior='log-uniform'), 
                               'clf__penalty' : Categorical(['l1', 'l2']) } }
            ]

In [51]:
pipelines = []
params = []
names = []
for m in models_dict_list:
    pipelines.append(Pipeline(steps =[ 
    ('columnT', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_feats_stdscale),
            ('cat', ohe, cat_feats_ohe),
            ('ordinal', ordinal_transformer, ordinal_feats),
            ('min_max_standardizer', min_max_transformer, num_feats_min_max)
            ])),
    ('smotenc', SMOTENC(categorical_features=categorical_feature_mask, sampling_strategy="minority")),
    ("clf", m['model'])
    ])
                    )
    params.append(m["params"])
    names.append(m["name"])

In [52]:
from sklearn.metrics import f1_score
results_df = pd.DataFrame(columns=['Model', 'precision', 'recall', 'f1-score', 'accuracy', 'AUC score', 'Best params'])

N_folds = 5

for pipe, name, par in zip(pipelines, names, params):
    
    model_name = name
    
    # Define search grid
    search = BayesSearchCV(estimator = pipe,
                           search_spaces = par,
                           n_iter = 10,
                           scoring = 'f1', # 'f1', f1_weighted', 'roc_auc', 'accuracy'
                           n_jobs = 6, # change based on how many cpus available (8 for me)
                           n_points = 6, # change based on how many cpus available (8 for me)
                           cv = N_folds, # default: StratifiedKFold for binary labels
                           refit = True,
                           verbose = 0,
                           error_score = 'raise', 
                           return_train_score  = True,        
                            )
    
    # Training model
    print("========== Running model: "+model_name, end='')
    train_y = (train_df['salary'] == '>50K')
    _ = search.fit(train_df.drop("salary", axis=1), train_y)
    
    # Print results
    print("Best validation score: ", round(search.best_score_, 3))
    print("Best search parameters: ", search.best_params_)
    
    best_model = search.best_estimator_
    cv_results = search.cv_results_
    
    y_true = test_y.to_numpy()
    y_pred = best_model.predict(test_df.drop('salary', axis=1)) 
    # Evauate metrics for weighted support
    accuracy = accuracy_score(y_true, y_pred)
    P, R, F1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    # Evaluate probabilities and auc_score

    y_predict_proba = best_model.predict_proba(test_df.drop('salary', axis=1))[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba)    
    auc_score = auc(fpr, tpr)
    
    # Store parameters
    results_df.loc[len(results_df.index)] = [model_name, round(P, 3), round(R, 3),  round(F1, 3), round(accuracy, 3), round(auc_score, 3), search.best_params_]
    
print("=== BEST MODEL RESULTS SUMMARY ===")
results_df = results_df.set_index('Model')
results_df.sort_values("f1-score", inplace = True, ascending=False)
results_df  

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1065
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1072
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1071
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1069
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0



[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1069
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029033 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1069
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1069
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":


[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":


Best validation score:  0.713
Best search parameters:  OrderedDict([('clf__colsample_bytree', 0.7191449614860558), ('clf__learning_rate', 0.07422470985028515), ('clf__max_depth', 7), ('clf__min_child_samples', 16), ('clf__n_estimators', 403), ('clf__num_leaves', 20), ('clf__subsample', 0.9668932926538398), ('clf__subsample_freq', 1)])

  _warn_prf(average, modifier, msg_start, len(result))


[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19776, number of negative: 19776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1068
[LightGBM] [Info] Number of data points in the train set: 39552, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0

  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == "auto":
  if self.categorical_features == 

Best validation score:  0.665
Best search parameters:  OrderedDict([('clf__C', 91.77536999208886), ('clf__penalty', 'l2')])
=== BEST MODEL RESULTS SUMMARY ===


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,precision,recall,f1-score,accuracy,AUC score,Best params
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lgbm,1.0,0.719,0.837,0.719,,"{'clf__colsample_bytree': 0.7191449614860558, ..."
LogisticReg,1.0,0.638,0.779,0.638,,"{'clf__C': 91.77536999208886, 'clf__penalty': ..."


## The order of the models is reversed right now
### But why did the above mistakes affect the performance of the boosting algorithm in a negative way?

I assume that the following happened:<br>
 - the training data was split into the train and hold-out folds during the Grid-CV-Search. However, training and hold-out folds were dependent on each other (same statistics for preprocessing and the upsampled cases)
 - GradientBoosting is prone to overfitting and is considered to be the model with the higher capacity
 - Hence, GradientBoosting overfitted the training data and had worse results when applied to the test-data (it did not generalized to the test-data)
 - Logistic Regression has lower capacity, it could not fit to all intricacies of the training data and generalized better to the test-data
 - when preprocessing and upsampling was done within the pipeline within the Grid-CV-Search, the GradientBoosting algorithm was regularised properly and did not overfit. Then we saw the expected result: GradientBoosting outperformed Logistic Regression