In [13]:
from collections import Counter
import pandas as pd
from sklearn.model_selection import (train_test_split)
from sklearn.metrics import (
    mean_squared_error as MSE,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    average_precision_score,
    log_loss,

)
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
import numpy as np
import optuna
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

scaler = StandardScaler()
data = pd.read_csv("data/employee_data.csv")
oversample = SMOTE()
encoder = OrdinalEncoder()



In [14]:
data = data.drop(["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"], axis=1)
data["first_company"] = np.where(data.NumCompaniesWorked == 0, 1, 0)
data["first_role"] = np.where(data.YearsAtCompany == data.YearsInCurrentRole, 1, 0)
data["first_manager"] = np.where(data.YearsWithCurrManager == data.YearsAtCompany, 1, 0)

In [15]:
data = pd.DataFrame(encoder.fit_transform(data), columns=encoder.get_feature_names_out())

categorical_cols = data.select_dtypes(include="object").columns.tolist()
categorical_cols.append("Education")
categorical_cols.append("EnvironmentSatisfaction")
categorical_cols.append("JobInvolvement")
categorical_cols.append("JobLevel")
categorical_cols.append("JobSatisfaction")
categorical_cols.append("PerformanceRating")
categorical_cols.append("RelationshipSatisfaction")
categorical_cols.append("WorkLifeBalance")
categorical_cols.append("StockOptionLevel")
categorical_cols.append("first_company")
categorical_cols.append("first_role")
categorical_cols.append("first_manager")
categorical_cols.append("Gender")

for col in categorical_cols:
    data[col] = data[col].astype("category")
    
X = data.drop(columns=["Staying?"])
y = data["Staying?"]
    
X_train, X_leftover, y_train, y_leftover = train_test_split(X, y, test_size=0.3, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_leftover, y_leftover, test_size=0.5, stratify=y_leftover)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       1470 non-null   float64 
 1   Staying?                  1470 non-null   float64 
 2   BusinessTravel            1470 non-null   float64 
 3   DailyRate                 1470 non-null   float64 
 4   Department                1470 non-null   float64 
 5   DistanceFromHome          1470 non-null   float64 
 6   Education                 1470 non-null   category
 7   EducationField            1470 non-null   float64 
 8   EnvironmentSatisfaction   1470 non-null   category
 9   Gender                    1470 non-null   category
 10  HourlyRate                1470 non-null   float64 
 11  JobInvolvement            1470 non-null   category
 12  JobLevel                  1470 non-null   category
 13  JobRole                   1470 non-null   float6

In [17]:
def objective(trial):
    test_metric = "auc"
    
    param = {
        # "verbosity": 0,
        "objective": "binary:logistic",
        "tree_method": "hist",
        "eval_metric": f"{test_metric}",
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0, 6),
        # "scale_pos_weight": Counter(y)[0] / Counter(y)[1],
        # "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        "booster": trial.suggest_categorical("booster", ["gbtree","dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0),
        "alpha": trial.suggest_float("alpha", 0, 10.0, ),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0), 
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1.0), 
        "sampling_method": "uniform",  # uniform, gradient_based
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 50),
        "max_bin": trial.suggest_int("max_bin", 10, 2000),
        # "num_parallel_tree": trial.suggest_int("num_parallel_tree", 0, 500),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 10.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise","lossguide"]),
    }

    if param["grow_policy"] == "lossguide":
        param["max_leaves"] = trial.suggest_int("max_leaves", 1, 100)

    if param["booster"] == "dart":
        param["sample_type"] = "uniform"
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree","forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0)
        param["one_drop"] = trial.suggest_categorical("one_drop", [True,False])
        
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f"validate-{test_metric}")
    clf = xgb.XGBClassifier(**param, n_estimators=10000, verbosity=0, enable_categorical=True)
    selector = RFECV(clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10), cv=10, scoring="f1")
    
    selector.fit(X_train, y_train)
    y_pred = selector.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    
    return f1


# study.optimize(objective, n_trials=1000, n_jobs=1, show_progress_bar=True, gc_after_trial=True)

In [18]:
clf = xgb.XGBClassifier(tree_method="hist" ,n_estimators=10000, enable_categorical=True, verbosity=1)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
# clf.set_params(early_stopping_rounds=10)
rfe = RFECV(clf, cv=3, scoring="f1")


[0]	validation_0-logloss:0.56276
[1]	validation_0-logloss:0.49069
[2]	validation_0-logloss:0.44087
[3]	validation_0-logloss:0.41620
[4]	validation_0-logloss:0.38980
[5]	validation_0-logloss:0.36962
[6]	validation_0-logloss:0.36216
[7]	validation_0-logloss:0.35921
[8]	validation_0-logloss:0.35553
[9]	validation_0-logloss:0.35563
[10]	validation_0-logloss:0.36001
[11]	validation_0-logloss:0.35455
[12]	validation_0-logloss:0.35625
[13]	validation_0-logloss:0.35429
[14]	validation_0-logloss:0.35556
[15]	validation_0-logloss:0.35759
[16]	validation_0-logloss:0.35869
[17]	validation_0-logloss:0.35492
[18]	validation_0-logloss:0.35844
[19]	validation_0-logloss:0.36370
[20]	validation_0-logloss:0.36637
[21]	validation_0-logloss:0.36726
[22]	validation_0-logloss:0.36623
[23]	validation_0-logloss:0.37186
[24]	validation_0-logloss:0.37112




[25]	validation_0-logloss:0.37145
[26]	validation_0-logloss:0.37595
[27]	validation_0-logloss:0.37598
[28]	validation_0-logloss:0.38001
[29]	validation_0-logloss:0.38199
[30]	validation_0-logloss:0.38516
[31]	validation_0-logloss:0.38879
[32]	validation_0-logloss:0.39063
[33]	validation_0-logloss:0.39122
[34]	validation_0-logloss:0.39326
[35]	validation_0-logloss:0.39106
[36]	validation_0-logloss:0.38926
[37]	validation_0-logloss:0.39394
[38]	validation_0-logloss:0.39712
[39]	validation_0-logloss:0.40019
[40]	validation_0-logloss:0.40122
[41]	validation_0-logloss:0.40232
[42]	validation_0-logloss:0.40397
[43]	validation_0-logloss:0.40465
[44]	validation_0-logloss:0.40776
[45]	validation_0-logloss:0.41037
[46]	validation_0-logloss:0.41201
[47]	validation_0-logloss:0.41353
[48]	validation_0-logloss:0.41196
[49]	validation_0-logloss:0.41128
[50]	validation_0-logloss:0.41428
[51]	validation_0-logloss:0.41820
[52]	validation_0-logloss:0.42110
[53]	validation_0-logloss:0.42128
[54]	validatio

In [19]:
rfe.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
study = optuna.create_study(
    # storage=storage,
    # directions=["maximize", "minimize"],
    direction="maximize",
    study_name="employee_data_f1_weighted",
    load_if_exists=True,
    # sampler=optuna.samplers.MOTPESampler(),
    # pruner=optuna.pruners.HyperbandPruner(),
    # sampler=optuna.samplers.TPESampler(),
    pruner=optuna.pruners.MedianPruner(),
    sampler=optuna.samplers.RandomSampler()
)

In [None]:
xgb.cv()

In [None]:
study.optimize(objective, n_trials=10000, show_progress_bar=True)