In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [16]:
train_path = '../data/adult/adult.data'
test_path = '../data/adult/adult.test'

column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income'
]


df_train = pd.read_csv(train_path, names=column_names, na_values=' ?', skipinitialspace=True)
df_test = pd.read_csv(test_path, names=column_names, na_values=' ?', skipinitialspace=True, skiprows=1)

df_train.drop(columns=['fnlwgt'], inplace=True)
df_test.drop(columns=['fnlwgt'], inplace=True)

df_test['income'] = df_test['income'].str.rstrip('.')

target_map = {"<=50K": 0, ">50K": 1}
df_train["income"] = df_train["income"].map(target_map)
df_test["income"] = df_test["income"].map(target_map)

In [17]:
X_train = df_train.drop('income',axis=1)
y_train = df_train['income']
X_test  = df_test.drop('income',axis=1)
y_test  = df_test['income']

cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), num_cols),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_cols)
])

In [18]:
def evaluate(model, X_test, y_test):
    pred = model.predict(X_test)
    proba = model.predict_proba(X_test)[:,1]
    return {
        "accuracy": accuracy_score(y_test,pred),
        "precision": precision_score(y_test,pred,zero_division=0),
        "recall": recall_score(y_test,pred,zero_division=0),
        "f1": f1_score(y_test,pred),
        "roc_auc": roc_auc_score(y_test,proba),
        "pr_auc": average_precision_score(y_test,proba)
    }

In [19]:
pos_weight = (y_train==0).sum()/(y_train==1).sum()

xgb = XGBClassifier(
    tree_method='hist',
    eval_metric='logloss',
    random_state=42
)

xgb_params = {
    'clf__n_estimators': [300,500,800,1000],
    'clf__learning_rate': [0.001,0.01,0.03,0.05,0.1],
    'clf__max_depth': [3,4,5,6,8],
    'clf__subsample': [0.7,0.8,1.0],
    'clf__colsample_bytree': [0.7,0.8,1.0],
    'clf__min_child_weight': [1,3,5,7],
    'clf__gamma': [0,0.1,0.3,1],
    'clf__scale_pos_weight': [pos_weight]
}

pipe_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', xgb)
])

search_xgb = RandomizedSearchCV(
    pipe_xgb,
    xgb_params,
    n_iter=40,
    cv=StratifiedKFold(5,shuffle=True,random_state=42),
    scoring='f1',
    verbose=2,
    n_jobs=1,
    random_state=42
)

search_xgb.fit(X_train,y_train)
best_xgb = search_xgb.best_estimator_
xgb_metrics = evaluate(best_xgb, X_test, y_test)

print(search_xgb.best_params_)
print(xgb_metrics)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END clf__colsample_bytree=0.8, clf__gamma=0.3, clf__learning_rate=0.001, clf__max_depth=4, clf__min_child_weight=3, clf__n_estimators=1000, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=  14.0s
[CV] END clf__colsample_bytree=0.8, clf__gamma=0.3, clf__learning_rate=0.001, clf__max_depth=4, clf__min_child_weight=3, clf__n_estimators=1000, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   2.4s
[CV] END clf__colsample_bytree=0.8, clf__gamma=0.3, clf__learning_rate=0.001, clf__max_depth=4, clf__min_child_weight=3, clf__n_estimators=1000, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   2.8s
[CV] END clf__colsample_bytree=0.8, clf__gamma=0.3, clf__learning_rate=0.001, clf__max_depth=4, clf__min_child_weight=3, clf__n_estimators=1000, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   4.2s
[CV] END clf__colsample_bytree=0.8, cl

In [20]:
# LightGBM model with verbosity off
lgb = LGBMClassifier(objective='binary', random_state=42, verbosity=-1)

lgb_params = {
    'clf__n_estimators':[300,500,800,1000],
    'clf__learning_rate':[0.001,0.01,0.03,0.05,0.1],
    'clf__num_leaves':[15,31,63,127],
    'clf__subsample':[0.7,0.8,1.0],
    'clf__colsample_bytree':[0.7,0.8,1.0],
    'clf__reg_alpha':[0,0.1,0.5,1],
    'clf__reg_lambda':[0,0.1,0.5,1],
    'clf__scale_pos_weight':[pos_weight]
}

pipe_lgb = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', lgb)
])

search_lgb = RandomizedSearchCV(
    pipe_lgb,
    lgb_params,
    n_iter=40,
    cv=StratifiedKFold(5,shuffle=True,random_state=42),
    scoring='f1',
    verbose=2, 
    n_jobs=1,
    random_state=42
)

search_lgb.fit(X_train,y_train)

best_lgb = search_lgb.best_estimator_
lgb_metrics = evaluate(best_lgb, X_test, y_test)

print(search_lgb.best_params_)
print(lgb_metrics)


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__n_estimators=500, clf__num_leaves=127, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   3.3s
[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__n_estimators=500, clf__num_leaves=127, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   2.5s
[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__n_estimators=500, clf__num_leaves=127, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   3.0s
[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__n_estimators=500, clf__num_leaves=127, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__scale_pos_weight=3.152659099604642, clf__subsample=0.8; total time=   3.1s
[CV] END clf__colsample_bytree=0.8, clf__learn

In [21]:
df = pd.DataFrame([
    {"model":"XGBoost", **xgb_metrics},
    {"model":"LightGBM", **lgb_metrics}
])

df

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,pr_auc
0,XGBoost,0.833364,0.603888,0.856214,0.708248,0.927182,0.825005
1,LightGBM,0.836128,0.609398,0.853094,0.710943,0.927885,0.826196


In [22]:
param_distributions = {
    "num_leaves": np.arange(20, 200, 5),
    "max_depth": [-1, 3, 4, 5, 6, 7, 8, 9, 10],
    "learning_rate": np.linspace(0.005, 0.2, 30),
    "n_estimators": np.arange(200, 2000, 100),
    "subsample": np.linspace(0.6, 1.0, 10),
    "colsample_bytree": np.linspace(0.6, 1.0, 10),
    "min_child_samples": np.arange(5, 100, 5),
}

base_lgb = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X_train = X_train.copy()
X_test = X_test.copy()

for col in cat_cols:
    X_train[col] = X_train[col].astype("category")
    X_test[col] = X_test[col].astype("category")


In [23]:
search_lgb = RandomizedSearchCV(
    estimator=base_lgb,
    param_distributions=param_distributions,
    n_iter=50,
    scoring="f1",
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search_lgb.fit(X_train, y_train)

best_lgb = search_lgb.best_estimator_
search_lgb.best_params_


Fitting 5 folds for each of 50 candidates, totalling 250 fits


{'subsample': np.float64(0.7333333333333333),
 'num_leaves': np.int64(155),
 'n_estimators': np.int64(300),
 'min_child_samples': np.int64(40),
 'max_depth': 5,
 'learning_rate': np.float64(0.07224137931034483),
 'colsample_bytree': np.float64(0.6444444444444444)}

In [24]:
y_pred = best_lgb.predict(X_test)
y_proba = best_lgb.predict_proba(X_test)[:, 1]

results = {
    "model": ["LightGBM (tuned)"],
    "accuracy": [accuracy_score(y_test, y_pred)],
    "precision": [precision_score(y_test, y_pred)],
    "recall": [recall_score(y_test, y_pred)],
    "f1": [f1_score(y_test, y_pred)],
    "roc_auc": [roc_auc_score(y_test, y_proba)],
    "pr_auc": [average_precision_score(y_test, y_proba)],
}

results_df = pd.DataFrame(results)

benchmark_df = pd.concat([df, results_df], ignore_index=True)
benchmark_df

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,pr_auc
0,XGBoost,0.833364,0.603888,0.856214,0.708248,0.927182,0.825005
1,LightGBM,0.836128,0.609398,0.853094,0.710943,0.927885,0.826196
2,LightGBM (tuned),0.872797,0.771656,0.655486,0.708843,0.928415,0.827824
