## #Importing Modules and Predefined Functions#

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 6)

from skimpy import clean_columns

import plotly.io as pio
import plotly.express as px
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
pio.renderers.default = "colab" #'vscode','notebook', 'notebook_connected', 'kaggle', 'azure', 'colab','jpg', 'svg',
                                 #'pdf', 'browser', 'firefox', 'chrome', 'chromium', 'iframe',

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, roc_auc_score, roc_curve,\
                            average_precision_score, precision_recall_curve

import pickle

In [10]:
df0=pd.read_csv('HR_Dataset.csv')
df0.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [11]:
df=df0.copy()

In [12]:
df = clean_columns(df)

In [13]:
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident', 'left',
       'promotion_last_5years', 'departments', 'salary'],
      dtype='object')

In [14]:
df.drop_duplicates(keep='first', inplace=True)

In [15]:
X = df.drop('left',axis=1)
y = df['left']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify = y, random_state=101)

# XGBoost

In [17]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier


In [18]:
cat = X_train.select_dtypes("object").columns
cat

Index(['departments', 'salary'], dtype='object')

In [19]:
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

column_trans = make_column_transformer((ord_enc, cat),
                                        remainder='passthrough',
                                        verbose_feature_names_out=False).set_output(transform="pandas")

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

operations = [("OrdinalEncoder", column_trans),
              ("scaler", StandardScaler()),
              ("xgb", XGBClassifier(random_state=101))]


xgb_model = Pipeline(steps=operations)

xgb_model.fit(X_train, y_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [21]:
xgb_model = Pipeline(steps=operations)

scores = cross_validate(xgb_model,
                        X_train,
                        y_train,
                        scoring = ['accuracy', 'precision','recall','f1'],
                        cv = 10,
                        return_train_score=True)

df_scores = pd.DataFrame(scores, index = range(1, 11))
df_scores.mean()[2:]

test_accuracy      0.983080
train_accuracy     0.998054
test_precision     0.977859
train_precision    0.998152
test_recall        0.918905
train_recall       0.990116
test_f1            0.947385
train_f1           0.994117
dtype: float64

In [22]:
xgb_model = Pipeline(steps=operations)


param_grid = {
    "xgb__n_estimators": [50, 100, 200],
    "xgb__max_depth": [3, 4, 5],
    "xgb__learning_rate": [0.1, 0.2],
    "xgb__subsample": [0.5, 0.8, 1],
    "xgb__colsample_bytree": [0.5, 0.7, 1]
}

xgb_grid_model = GridSearchCV(estimator=xgb_model,
                          param_grid=param_grid,
                          cv=10,
                          scoring = "recall",
                          n_jobs = -1,
                          return_train_score=True)

In [23]:
xgb_grid_model.fit(X_train, y_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [24]:
xgb_grid_model.best_estimator_

####  Prediction

In [25]:
X = df.drop('left',axis=1)
y = df['left']

In [26]:
operations = [("OrdinalEncoder", column_trans),
              ("scaler", StandardScaler()),
              ("xgb", XGBClassifier(random_state=101))]

final_model = Pipeline(steps=operations)

final_model.fit(X, y)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [27]:
new_obs={"satisfaction_level":[0.65],
           "last_evaluation": [0.68],
           "number_project":[5],
           'average_montly_hours':[220],
           'time_spend_company': [4],
           "work_accident":[0],
           "promotion_last_5years":[0],
            "departments": ['sales'],
            "salary": ['medium']}

In [28]:
samples = pd.DataFrame(new_obs)
samples

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,promotion_last_5years,departments,salary
0,0.65,0.68,5,220,4,0,0,sales,medium


In [29]:
prediction=final_model.predict(samples)
prediction

array([0])

In [30]:
predictions_proba = final_model.predict_proba(samples)
predictions_proba

array([[0.98269826, 0.01730172]], dtype=float32)

### Save and Export the Best Model

In [31]:
with open('xgb_model_with_transformer', 'wb') as file:
    pickle.dump(final_model, file)