##  Employee Attrition Prediction with DNN
In this notebook, I create Employee Attrition prediction models using a DNN, trained on the Employee Attrition dataset from the [Playground Series Season 3, Episode 3 Competition](https://www.kaggle.com/competitions/playground-series-s3e3) together with the [IBM HR Analytics Employee Attrition & Performance](https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset) dataset. I perform Exploratory Data Analysis to divide the features into categorical and numerical features and to select the features correlated with the target. I train models for 5 folds using the StratifiedKFold split strategy with different random seeds. To maximize the CV score, I search for the best model with KerasTuner on each fold, retrain that model for more epochs, and select the best model among them.

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
import keras_tuner as kt
import tensorflow as tf
from datetime import datetime
import math
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import Normalization
warnings.filterwarnings("ignore")

## Configuration

In [None]:
class CFG:
    """Notebook-wide settings: input files, column roles and training budget."""

    # Competition input files.
    TRAIN_DATA_URL = "../input/playground-series-s3e3/train.csv"
    TEST_DATA_URL = "../input/playground-series-s3e3/test.csv"

    # Column roles.
    id_field = "id"
    target_field = "Attrition"

    # Number of StratifiedKFold splits per seed.
    n_folds = 5

    # Set to True for a shorter run (fewer tuner trials and fewer epochs).
    quick_experiment = False

    # Epochs per tuner trial; the LR scheduler also keeps the rate constant
    # for epoch indices up to this value.
    tuning_epochs = 20
    # Number of hyperparameter configurations the tuner tries per fold.
    max_trials = 10 if not quick_experiment else 5
    # Epochs used when the best configuration is retrained.
    epochs = 100 if not quick_experiment else 30

    # True: train only on features correlated with the target;
    # False: train on every encoded feature.
    use_correlated_columns = False

## Utils

In [None]:
def get_cosine_decay_learning_rate_scheduler(epochs, lr_start=0.001, lr_end=1e-6):
    """Create a Keras callback that holds the learning rate, then cosine-decays it.

    For epoch indices up to and including ``CFG.tuning_epochs`` the rate is
    ``lr_start``. Afterwards the rate is a cosine blend between ``lr_start`` and
    ``lr_end`` whose position is computed over the whole run, ``epoch / (epochs - 1)``.

    Args:
        epochs: Total number of training epochs the schedule spans.
        lr_start: Learning rate used during the constant phase and as the
            upper end of the blend.
        lr_end: Learning rate reached when ``epoch == epochs - 1``.

    Returns:
        A ``LearningRateScheduler`` callback wrapping the schedule.
    """
    def cosine_decay(epoch):
        # Constant phase: keep the starting rate.
        if epoch <= CFG.tuning_epochs:
            return lr_start
        # Blend weight goes from 1 (epoch 0) to 0 (last epoch); a run of at
        # most one epoch has no range to decay over, so the weight stays at 1.
        blend = 1 if epochs <= 1 else (1 + math.cos(epoch / (epochs - 1) * math.pi)) / 2
        return blend * lr_start + (1 - blend) * lr_end

    return LearningRateScheduler(cosine_decay, verbose=0)

## Load data

In [None]:
# Competition training data.
train = pd.read_csv(CFG.TRAIN_DATA_URL)

# External IBM HR dataset: rename its employee number column to the id column
# name and turn the "Yes"/other target strings into 1/0.
external_train = pd.read_csv(
    "../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
).rename(columns={"EmployeeNumber": "id"})
external_train[CFG.target_field] = external_train[CFG.target_field].eq("Yes").astype("int64")

# Stack both sources and renumber the rows 0..n-1.
train = pd.concat([train, external_train], ignore_index=True)
train.head()

In [None]:
# Read the competition test split and preview its first rows.
test = pd.read_csv(filepath_or_buffer=CFG.TEST_DATA_URL)
test.head(n=5)

## EDA

### Overall label distribution

In [None]:
# Pass the column through the `data`/`x` keywords: recent seaborn releases treat
# the first positional argument as `data`, not as `x`, so a positional Series
# no longer produces the per-class count plot.
ax = sns.countplot(data=train, x=CFG.target_field)
ax.set_title(f"{CFG.target_field} label distribution")
plt.show()

### Infer categorical columns and numeric columns automatically
As we can see, StandardHours, Over18 and EmployeeCount each have only one unique value, so they carry no information for the model. These columns can be removed.

In [None]:
# Split the columns of `train` into categorical and numeric features.
#   * text (object) columns          -> categorical
#   * numeric columns with <= 10
#     distinct values                -> categorical
#   * remaining numeric columns      -> numeric
# Columns with a single distinct value are reported and dropped; the numeric
# target and id columns are never treated as features.
categorical_columns = []
numeric_columns = []
for column in train.columns:
    is_text = train[column].dtype == object
    if not is_text and (column == CFG.target_field or column == CFG.id_field):
        continue
    n_distinct = len(train[column].unique())
    if n_distinct == 1:
        print(f"{column} only have one unique value, omit this feature.")
    elif is_text or n_distinct <= 10:
        categorical_columns.append(column)
    else:
        numeric_columns.append(column)

feature_columns = numeric_columns + categorical_columns
all_columns = feature_columns + [CFG.target_field]
print(f"Categorical columns:{categorical_columns}.")
print(f"Numerical columns:{numeric_columns}.")

### Distribution of target in different categorical features

In [None]:
# One count plot per categorical feature, with bars split by the target label.
for column in categorical_columns:
    sns.countplot(x=column, hue=CFG.target_field, data=train)
    plt.show()

### Distribution of target in different numerical features

In [None]:
# One KDE plot per numeric feature, with one curve per target label.
for column in numeric_columns:
    # pairplot creates its own figure, so its size is set through `height`.
    # Calling plt.figure(figsize=...) afterwards would not resize it; it would
    # only open an additional, empty figure.
    sns.pairplot(
        train[[column, CFG.target_field]],
        hue=CFG.target_field,
        diag_kind="kde",
        height=5,
    )
    plt.show()

In [None]:
# Give the test set a placeholder target so train and test share the same
# columns and can be one-hot encoded together (identical dummy columns in both).
test[CFG.target_field] = 0.0
data = pd.concat([train[all_columns], test[all_columns]])
# Pin the dummy dtype to uint8: newer pandas versions default to bool dummies,
# and a frame mixing bool and integer columns converts to an object array,
# which cannot be fed to TensorFlow.
data_dummy = pd.get_dummies(data, columns=categorical_columns, dtype=np.uint8)
# The first len(train) rows are the training rows, the remainder the test rows.
train_dummy = data_dummy.iloc[:len(train)]
test_dummy = data_dummy.iloc[len(train):]
train_dummy.head()

### Correlated features

In [None]:
# Keep every encoded feature whose absolute correlation with the target
# exceeds 0.05; the target itself and the id column are not features.
corr = train_dummy.corr()
target_corr = corr[CFG.target_field]
strong_mask = target_corr.abs() > 0.05
correlated_columns = [
    name
    for name in target_corr[strong_mask].index
    if name != CFG.target_field and name != CFG.id_field
]
print(f"Correlated features:{correlated_columns}")

In [None]:
# Correlation matrix of the selected features plus the target.
corr = train_dummy[correlated_columns + [CFG.target_field]].corr()
plt.figure(figsize=(20, 20))
# Correlations lie in [-1, 1]. Use a symmetric range centred on 0 so that the
# diverging colormap shows negative correlations instead of clipping them and
# its neutral colour corresponds to "no correlation".
sns.heatmap(corr, vmin=-1.0, vmax=1.0, center=0, cmap="PiYG")
plt.show()

In [None]:
# Correlation of each selected feature with the target, strongest (by absolute value) first.
corr[CFG.target_field].sort_values(ascending=False, key=lambda values: values.abs())

In [None]:
# Model inputs: either only the correlated features or every encoded column
# except the target.
if not CFG.use_correlated_columns:
    columns = [name for name in train_dummy.columns if name != CFG.target_field]
else:
    columns = correlated_columns
print(columns)

## Create Normalization Layer

In [None]:
# Fit the normalization statistics on the training features. The layer object
# is reused by build_model under the name `nomalization`.
training_features = train_dummy[columns]
nomalization = Normalization()
with tf.device("cpu"):
    nomalization.adapt(training_features)

## Hyperparameter Tuning

In [None]:
def build_model(hp):
    """Build and compile the attrition classifier for one set of hyperparameters.

    Architecture: input -> shared ``nomalization`` layer -> 5 Dense layers
    (widest first) -> optional Dropout -> 1-unit sigmoid output.

    Hyperparameters read from ``hp``:
        use_dropout: whether a Dropout layer follows the last hidden layer.
        dropout: dropout rate in [0.1, 0.5].
        learning_rate: Adam learning rate in [1e-5, 1e-3], log-sampled.
        unit_0 .. unit_4: hidden layer widths in [16, 128], step 16; the five
            values are sorted in descending order before use.
        activation: "relu" or "swish" for all hidden layers.
        l2: L2 factor in [1e-6, 1e-4], log-sampled, applied to the kernel of
            the last hidden layer only.

    Args:
        hp: KerasTuner hyperparameter container.

    Returns:
        A compiled ``tf.keras.Model`` (binary cross-entropy loss, Adam
        optimizer, metrics ``auc`` and ``accuracy``).
    """
    use_dropout = hp.Choice("use_dropout", [True, False])
    dropout_value = hp.Float("dropout", min_value=0.1, max_value=0.5)
    learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-3, sampling="log")
    depth = 5
    units = sorted(
        (hp.Int(f"unit_{i}", min_value=16, max_value=128, step=16) for i in range(depth)),
        reverse=True,
    )
    activation = hp.Choice("activation", ["relu", "swish"])
    l2_factor = hp.Float("l2", min_value=1e-6, max_value=1e-4, sampling="log")

    # `shape` must be a tuple; `(len(columns))` without the trailing comma is a
    # plain int.
    inputs = tf.keras.Input(shape=(len(columns),), dtype=tf.float32)
    vector = nomalization(inputs)
    for i in range(depth):
        # Only the last hidden layer is L2-regularized.
        regularizer = tf.keras.regularizers.l2(l2_factor) if i == depth - 1 else None
        vector = tf.keras.layers.Dense(
            units[i], activation=activation, kernel_regularizer=regularizer
        )(vector)
    if use_dropout:
        vector = tf.keras.layers.Dropout(dropout_value)(vector)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(vector)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    auc = tf.keras.metrics.AUC(name="auc")
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=[auc, "accuracy"],
    )
    return model

## Modeling

Now train the model using StratifiedKFold with 5 folds. I will search for the best model with KerasTuner on each fold. Since this dataset is very small, the whole procedure is repeated with different random seeds to stabilize the result.

In [None]:
%%time
seeds = [42, 997]
models = []
scores = []
for seed in seeds:
    kfold =StratifiedKFold(CFG.n_folds, shuffle=True, random_state=seed)
    for fold, (train_indices, valid_indices) in enumerate(kfold.split(train_dummy, train_dummy[CFG.target_field])):
        print("=" * 100)
        print(f"Fold {fold}")
        print("=" * 100)
        X_train = train_dummy.iloc[train_indices]
        y_train = X_train.pop(CFG.target_field)
        X_val = train_dummy.iloc[valid_indices]
        y_val = X_val.pop(CFG.target_field)
        train_ds = tf.data.Dataset.from_tensor_slices((X_train[columns], y_train)).shuffle(1024).batch(128).cache().prefetch(tf.data.AUTOTUNE)
        valid_ds = tf.data.Dataset.from_tensor_slices((X_val[columns], y_val)).batch(128).cache().prefetch(tf.data.AUTOTUNE)
        
        tuner = kt.BayesianOptimization(
            build_model,
            objective=kt.Objective("val_auc", direction="max"),
            max_trials=CFG.max_trials,
            overwrite=True
        )
        tuner.search(train_ds, epochs=CFG.tuning_epochs, validation_data=valid_ds, verbose=2)
        tuner.results_summary()
        best_hps = tuner.get_best_hyperparameters()
        model = build_model(best_hps[0])
        model_name = f"model_seed_{seed}_fold_{fold}.tf"
        checkpoints = tf.keras.callbacks.ModelCheckpoint(
            model_name, 
            monitor="val_auc",
            mode="max",
            save_best_only=True,
            restore_best_weights=True
        )
        epochs = CFG.epochs
        learning_rate = model.optimizer.learning_rate.numpy()
        scheduler = get_cosine_decay_learning_rate_scheduler(epochs=epochs, lr_start=learning_rate, lr_end=learning_rate * 0.01)
        
        history = model.fit(train_ds, epochs=epochs, validation_data=valid_ds, callbacks=[checkpoints, scheduler], verbose=2)
        model = tf.keras.models.load_model(model_name)
        score = model.evaluate(valid_ds)[1]

        best_model = tuner.get_best_models()[0]
        score1 = best_model.evaluate(valid_ds)[1]
        if score > score1:
            models.append(model)
            scores.append(score)
        else:
            models.append(best_model)
            scores.append(score1)
print("Average AUC: %.5f"%(np.mean(scores)))

## Submission 

In [None]:
# Blend the per-fold models: each model's test prediction is weighted by its
# validation AUC, and the weights are normalized to sum to one.
fold_weights = np.array(scores)
fold_predictions = np.array(
    [fitted.predict(test_dummy[columns]).reshape(-1) for fitted in models]
)
test[CFG.target_field] = np.dot(fold_weights, fold_predictions) / np.sum(scores)

submission = test[[CFG.id_field, CFG.target_field]]
submission.to_csv("submission.csv", index=False)
submission.head()

In [None]:
# Histogram of the predicted attrition probabilities.
submission[CFG.target_field].plot.hist()