In [17]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) 
import pandas as pd
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', -1)
import numpy as np
import re

# Unblanaced dataset
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, SMOTEN, SVMSMOTE, BorderlineSMOTE, ADASYN

# modeling utilities
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer # enables sklearn.impute.IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.compose import make_column_transformer, ColumnTransformer

# Additional models
from lightgbm import LGBMClassifier

In [19]:
columns = ["age","workclass" ,"final-weight","education","education-num","marital-status","occupation", "relationship", "race", "sex", "capital-gain",
            "capital-loss", "hours-per-week", "native-country", "salary" ]

# Reading datasets
train_df = pd.read_csv('../data/adult.data', names=columns, header=None, skipinitialspace = True)
test_df = pd.read_csv('../data/adult.test', names=columns, header=None, skipinitialspace = True)
test_df["salary"] = test_df["salary"].apply(lambda x: x.replace('.', ''))

print("Training set:") #, train_df.columns)
print(train_df.sample(frac=1).tail(3))

print("Test set:") #, test_df.columns)
print(test_df.sample(frac=1).tail(3))

Training set:
       age  workclass  final-weight     education  education-num marital-status      occupation   relationship   race     sex  capital-gain  capital-loss  hours-per-week native-country salary
23855  42   State-gov  39239         Masters       14             Never-married  Prof-specialty  Not-in-family  White  Male    0             0             70              United-States  <=50K
20950  38   Private    69306         Some-college  10             Divorced       Craft-repair    Unmarried      White  Female  0             0             40              United-States  <=50K
160    68   ?          38317         1st-4th       2              Divorced       ?               Not-in-family  White  Female  0             0             20              United-States  <=50K
Test set:
       age  workclass  final-weight     education  education-num      marital-status       occupation   relationship   race     sex  capital-gain  capital-loss  hours-per-week native-country salary
1028   27 

In [20]:
education_map = {'Preschool': "elem_school", '1st-4th':"elem_school",'5th-6th':"elem_school", 
                 "mid_school":'7th-8th', 
                 '9th':"high_school_lower",'10th':"high_school_lower", '11th':"high_school_lower", '12th':"high_school_lower",
                 'HS-grad':"high_school_upper", 'Assoc-voc':"high_school_upper", 'Assoc-acdm':"high_school_upper", 'Some-college':"high_school_upper", 
                  'Bachelors':"graduate", 'Masters':'graduate', 'Prof-school':"post_grad", 'Doctorate':'post_grad'}
train_df['education'] = train_df['education'].map(education_map)
test_df['education'] = test_df['education'].map(education_map)

In [21]:
employed_map =  {'Self-emp-inc': "self_employed_expert", 'Self-emp-not-inc':"self_employed", 'Private':"self_employed", 
                 'Local-gov':"public_servant", 'State-gov':'public_servant','Federal-gov': 'public_servant',
                 "Withoug-pay":"unemployed", 'Never-worked':'unemployed'}
train_df['workclass'] = train_df['workclass'].map(employed_map)
test_df['workclass'] = test_df['workclass'].map(employed_map)

In [22]:
occupation_map = {"Prof-specialty":"executives", "Exec-managerial":"executives", 
                  "Protective-serv":"experts", "Tech-support":"experts", "Sales":"experts", "Craft-repair":"experts", "Transport-moving":"experts",
                  "Adm-clerical":"technicians", "Machine-op-inspct":"technicians", "Farming-fishing":"technicians", "Armed-Forces":"technicians",
                  "Priv-house-serv":"services", "Other-service":"services", "Handlers-cleaners":"services"}
train_df["occupation"] = train_df["occupation"].map(occupation_map)
test_df["occupation"] = test_df["occupation"].map(occupation_map)

In [23]:
marital_map = {'Married-civ-spouse':"couple",'Married-AF-spouse':"couple", 
               'Separated':"single",'Divorced':"single", 'Married-spouse-absent':"single", 'Widowed':"single", 'Never-married':"single"}
train_df["marital-status"] = train_df["marital-status"].map(marital_map)
test_df["marital-status"] = test_df["marital-status"].map(marital_map)


In [24]:
family_map = {"Wife":"family_strong", "Husband":"family_strong", 
              "Other-relative":"family_weak", "Own-child":"family_weak", 
              "Unmarried":"family_none", "Not-in-family":"family_none"}
train_df["relationship"] =  train_df["relationship"].map(family_map)
test_df["relationship"] =  test_df["relationship"].map(family_map)

In [25]:
gdp_map = {"France":"GDP_high",  "Ireland":"GDP_high",  "United-States":"GDP_high",  "Holand-Netherlands":"GDP_high",  "Canada":"GDP_high",  "Germany":"GDP_high",  "Hong":"GDP_high",  "England":"GDP_high",  "Japan":"GDP_high",  "Scotland":"GDP_high", 
           "Italy":"GDP_mid",  "South":"GDP_mid",  "Puerto-Rico":"GDP_mid",  "Taiwan":"GDP_mid",  "Portugal":"GDP_mid", "Greece":"GDP_mid", "Hungary":"GDP_mid", "Poland":"GDP_mid", "Trinadad&Tobago":"GDP_mid", "China":"GDP_mid", 
           "Mexico":"GDP_low", "Dominican-Republic":"GDP_low", "Thailand":"GDP_low", "Peru":"GDP_low", "Columbia":"GDP_low", 
           "Ecuador":"GDP_low", "Jamaica":"GDP_low", "Guatemala":"GDP_low", "El-Salvador":"GDP_low", "Vietnam":"GDP_low", "Philippines":"GDP_low",
           "Laos":"GDP_low", "Honduras":"GDP_low", "India":"GDP_low", 
           "Nicaragua":"GDP_low", "Haiti":"GDP_low", "Cambodia":"GDP_low", "Iran":"GDP_low", "Yugoslavia":"GDP_low", 
           "Outlying-US(Guam-USVI-etc)":"GDP_low", "Cuba":"GDP_low"}
train_df["native-country"] =  train_df["native-country"].map(gdp_map)
test_df["native-country"] =  test_df["native-country"].map(gdp_map)


In [26]:
from imblearn.pipeline import Pipeline

num_feats_stdscale = ['hours-per-week', 'capital-gain', 'capital-loss']
ordinal_feats = ['education']
education_rank = [["not_known","elem_school", "mid_school", "high_school_lower", "high_school_upper", "graduate", "post_grad"]]
cat_feats_ohe = ['workclass', 'marital-status', 'occupation', 'relationship', 'native-country','race', 'sex']
num_feats_min_max = ['age']

numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer()),
    ('scaler', StandardScaler())
])

ohe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
])


ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('ordinal', OrdinalEncoder(categories=education_rank, handle_unknown='use_encoded_value', unknown_value=-1))
])

min_max_transformer = MinMaxScaler()

pipeline = Pipeline(steps =[ 
    ('columnT', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_feats_stdscale),
            ('cat', ohe, cat_feats_ohe),
            ('ordinal', ordinal_transformer, ordinal_feats),
            ('min_max_standardizer', min_max_transformer, num_feats_min_max)
            ]))
])


### these are the two first mistakes:
  - pipeline is fit on whole data and not in cross-validation framework on the training-folds only
  - test data shouldl be just transformed with `pipeline.transform()` and not fitted again

In [27]:
processed_train = pipeline.fit_transform(train_df.drop("salary", axis=1))
processed_test = pipeline.fit_transform(test_df.drop("salary", axis=1))

### here is the second mistake:
  - up-sampline (`SMOTENC`) is applied to all data - not only the trainings-fold within the cross-validation

In [28]:
categorical_feature_mask = train_df.drop("salary", axis=1).dtypes == object

smote = SMOTENC(categorical_features = categorical_feature_mask, sampling_strategy="minority")
X_balanced_train_df, y_balanced_train_df = smote.fit_resample(processed_train, train_df["salary"])
train_y = (y_balanced_train_df == '>50K')
test_y = (test_df['salary'] == '>50k')


In [29]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


models_dict_list = [
            {"name": "Lgbm", "model": LGBMClassifier(objective='binary', boosting_type='gbdt',
                                                    ),                      # WORKS
                    "params": {'n_estimators' : Integer(200,1000),
                                'learning_rate': Real(1e-3, 0.1, prior='log-uniform'),
                                "subsample_freq" : Integer(0, 4),
                               "num_leaves": Integer(15,25),
                                "max_depth": Integer(2, 10),
                               "subsample": Real(0.7,1.0),
                               "colsample_bytree": Real(0.4, 1.0),
                              'min_child_samples':  Integer(2,20)
                              } },            
            {"name": "LogisticReg", "model": LogisticRegression(max_iter=500, solver = 'liblinear'), # WORKS
                    "params": {'C': Real(1e-6, 1e+6, prior='log-uniform'), 
                               'penalty' : Categorical(['l1', 'l2']) } }
            ]

In [30]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, precision_recall_curve,  confusion_matrix, roc_curve, auc, precision_recall_fscore_support
results_df = pd.DataFrame(columns=['Model', 'precision', 'recall', 'f1-score', 'accuracy', 'AUC score', 'Best params'])

N_folds = 5

for model_dict in models_dict_list:
    
    model_name = model_dict["name"]    
    if model_dict["params"] == {}:
        continue
    
    # Define search grid
    search = BayesSearchCV(estimator = model_dict["model"],
                           search_spaces = model_dict["params"],
                           n_iter = 10,
                           scoring = 'f1', # 'f1', f1_weighted', 'roc_auc', 'accuracy'
                           n_jobs = 6, # change based on how many cpus available (8 for me)
                           n_points = 6, # change based on how many cpus available (8 for me)
                           cv = N_folds, # default: StratifiedKFold for binary labels
                           refit = True,
                           verbose = 0,
                           error_score = 'raise', 
                           return_train_score  = True,        
                            )
    
    # Training model
    print("========== Running model: "+model_name, end='')
    _ = search.fit(X_balanced_train_df, train_y)
    
    # Print results
    print("Best validation score: ", round(search.best_score_, 3))
    print("Best search parameters: ", search.best_params_)
    
    best_model = search.best_estimator_
    cv_results = search.cv_results_
    
    y_true = test_y.to_numpy()
    y_pred = best_model.predict(processed_test) 
    # Evauate metrics for weighted support
    accuracy = accuracy_score(y_true, y_pred)
    P, R, F1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    # Evaluate probabilities and auc_score

    y_predict_proba = best_model.predict_proba(processed_test)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba)    
    auc_score = auc(fpr, tpr)
    
    # Store parameters
    results_df.loc[len(results_df.index)] = [model_name, round(P, 3), round(R, 3),  round(F1, 3), round(accuracy, 3), round(auc_score, 3), search.best_params_]
    
print("=== BEST MODEL RESULTS SUMMARY ===")
results_df = results_df.set_index('Model')
results_df.sort_values("f1-score", inplace = True, ascending=False)
results_df  

Best search parameters:  OrderedDict([('colsample_bytree', 0.5165616293938133), ('learning_rate', 0.06838141928599331), ('max_depth', 4), ('min_child_samples', 9), ('n_estimators', 655), ('num_leaves', 23), ('subsample', 0.908930436038046), ('subsample_freq', 4)])


  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters:  OrderedDict([('C', 4704.313930677073), ('penalty', 'l2')])
=== BEST MODEL RESULTS SUMMARY ===


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,precision,recall,f1-score,accuracy,AUC score,Best params
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogisticReg,1.0,0.62,0.766,0.62,,"{'C': 4704.313930677073, 'penalty': 'l2'}"
Lgbm,1.0,0.569,0.725,0.569,,"{'colsample_bytree': 0.5165616293938133, 'learning_rate': 0.06838141928599331, 'max_depth': 4, 'min_child_samples': 9, 'n_estimators': 655, 'num_leaves': 23, 'subsample': 0.908930436038046, 'subsample_freq': 4}"


### see the result: logistic regression is better than Light-GBM



### next, the mistakes from above are not repeated:
  - up-sampling (`SMOTENC`) is part of the pipeline right now - it gets trained on training folds and applied to hold-out fold
  - the whole pipeline is fit within the cross-validation framework and applied to the hold-out set
  - the same is true for the test set: just the `.predict()`-method of the pipeline is applied; not `.fit_transform()`

In [31]:
models_dict_list = [
            {"name": "Lgbm", "model": LGBMClassifier(objective='binary', boosting_type='gbdt',
                                                    ),                      # WORKS
                    "params": {'clf__n_estimators' : Integer(200,1000),
                                'clf__learning_rate': Real(1e-3, 0.1, prior='log-uniform'),
                                "clf__subsample_freq" : Integer(0, 4),
                               "clf__num_leaves": Integer(15,25),
                                "clf__max_depth": Integer(2, 10),
                               "clf__subsample": Real(0.7,1.0),
                               "clf__colsample_bytree": Real(0.4, 1.0),
                              'clf__min_child_samples':  Integer(2,20)
                              } },            
            {"name": "LogisticReg", "model": LogisticRegression(max_iter=500, solver = 'liblinear'), # WORKS
                    "params": {'clf__C': Real(1e-6, 1e+6, prior='log-uniform'), 
                               'clf__penalty' : Categorical(['l1', 'l2']) } }
            ]

In [32]:
pipelines = []
params = []
names = []
for m in models_dict_list:
    pipelines.append(Pipeline(steps =[ 
    ('columnT', ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_feats_stdscale),
            ('cat', ohe, cat_feats_ohe),
            ('ordinal', ordinal_transformer, ordinal_feats),
            ('min_max_standardizer', min_max_transformer, num_feats_min_max)
            ])),
    ('smotenc', SMOTENC(categorical_features=categorical_feature_mask, sampling_strategy="minority")),
    ("clf", m['model'])
    ])
                    )
    params.append(m["params"])
    names.append(m["name"])

In [33]:
from sklearn.metrics import f1_score
results_df = pd.DataFrame(columns=['Model', 'precision', 'recall', 'f1-score', 'accuracy', 'AUC score', 'Best params'])

N_folds = 5

for pipe, name, par in zip(pipelines, names, params):
    
    model_name = name
    
    # Define search grid
    search = BayesSearchCV(estimator = pipe,
                           search_spaces = par,
                           n_iter = 10,
                           scoring = 'f1', # 'f1', f1_weighted', 'roc_auc', 'accuracy'
                           n_jobs = 6, # change based on how many cpus available (8 for me)
                           n_points = 6, # change based on how many cpus available (8 for me)
                           cv = N_folds, # default: StratifiedKFold for binary labels
                           refit = True,
                           verbose = 0,
                           error_score = 'raise', 
                           return_train_score  = True,        
                            )
    
    # Training model
    print("========== Running model: "+model_name, end='')
    train_y = (train_df['salary'] == '>50K')
    _ = search.fit(train_df.drop("salary", axis=1), train_y)
    
    # Print results
    print("Best validation score: ", round(search.best_score_, 3))
    print("Best search parameters: ", search.best_params_)
    
    best_model = search.best_estimator_
    cv_results = search.cv_results_
    
    y_true = test_y.to_numpy()
    y_pred = best_model.predict(test_df.drop('salary', axis=1)) 
    # Evauate metrics for weighted support
    accuracy = accuracy_score(y_true, y_pred)
    P, R, F1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    # Evaluate probabilities and auc_score

    y_predict_proba = best_model.predict_proba(test_df.drop('salary', axis=1))[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba)    
    auc_score = auc(fpr, tpr)
    
    # Store parameters
    results_df.loc[len(results_df.index)] = [model_name, round(P, 3), round(R, 3),  round(F1, 3), round(accuracy, 3), round(auc_score, 3), search.best_params_]
    
print("=== BEST MODEL RESULTS SUMMARY ===")
results_df = results_df.set_index('Model')
results_df.sort_values("f1-score", inplace = True, ascending=False)
results_df  

Best search parameters:  OrderedDict([('clf__colsample_bytree', 0.43804816661770124), ('clf__learning_rate', 0.01931522132340677), ('clf__max_depth', 8), ('clf__min_child_samples', 14), ('clf__n_estimators', 711), ('clf__num_leaves', 23), ('clf__subsample', 0.8649509640467155), ('clf__subsample_freq', 0)])


  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters:  OrderedDict([('clf__C', 50728.02008140517), ('clf__penalty', 'l1')])
=== BEST MODEL RESULTS SUMMARY ===


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,precision,recall,f1-score,accuracy,AUC score,Best params
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lgbm,1.0,0.696,0.821,0.696,,"{'clf__colsample_bytree': 0.43804816661770124, 'clf__learning_rate': 0.01931522132340677, 'clf__max_depth': 8, 'clf__min_child_samples': 14, 'clf__n_estimators': 711, 'clf__num_leaves': 23, 'clf__subsample': 0.8649509640467155, 'clf__subsample_freq': 0}"
LogisticReg,1.0,0.621,0.766,0.621,,"{'clf__C': 50728.02008140517, 'clf__penalty': 'l1'}"


### the order of the models is reversed right now