In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

import os

import warnings
warnings.filterwarnings('ignore') # pandas had annoyting warnings below 

In [2]:
df = pd.read_csv('HR-Employee-Attrition.csv')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


**Feature Integer Information from Kaggle Datacard:**  
Education:    
1 'Below College'
2 'College'
3 'Bachelor'
4 'Master'
5 'Doctor'

EnvironmentSatisfaction:  
1 'Low'
2 'Medium'
3 'High'
4 'Very High'

JobInvolvement:  
1 'Low'
2 'Medium'
3 'High'
4 'Very High'

JobSatisfaction:  
1 'Low'
2 'Medium'
3 'High'
4 'Very High'

PerformanceRating:  
1 'Low'
2 'Good'
3 'Excellent'
4 'Outstanding'

RelationshipSatisfaction:  
1 'Low'
2 'Medium'
3 'High'
4 'Very High'

WorkLifeBalance:  
1 'Bad'
2 'Good'
3 'Better'
4 'Best'

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

### **Data Cleaning**  
Below columns found to be superfluous 

In [4]:
drop_cols = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours', 'MonthlyRate', 'HourlyRate', 'DailyRate']
df_good_cols = df.drop(drop_cols, axis=1)

In [5]:
df_good_cols

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,Research & Development,23,2,Medical,3,Male,4,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,Research & Development,6,1,Medical,4,Male,2,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,Research & Development,4,3,Life Sciences,2,Male,4,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,Sales,2,3,Medical,4,Male,2,...,3,4,0,17,3,2,9,6,0,8


In [6]:
# Performing Integer/binary encoding
df_good_cols['Gender'].replace({'Male': 1, 'Female': 0}, inplace=True)
df_good_cols['Attrition'].replace({'Yes': 1, 'No': 0}, inplace=True)
df_good_cols['BusinessTravel'].replace({'Travel_Rarely': 1, 'Travel_Frequently': 2, 'Non-Travel': 0}, inplace=True)
df_good_cols['OverTime'].replace({'Yes': 1, 'No': 0}, inplace=True)

# need this for scaling features later (don't scale One-hot encoded or binary features)
feat_before_enc = list(df_good_cols.columns)
feat_before_enc.remove('Attrition')
feat_before_enc.remove('OverTime')
feat_before_enc.remove('Gender')

In [7]:
# Perforiming one-hot encoding for remaining String features
encoded_cols = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
#drop_first=False; slightly betterfor any tree based algorithm
df_encoded = pd.get_dummies(df_good_cols, columns=encoded_cols, drop_first=True)

# include binary encoding variables (will be excluded from standard scaling)
encoded_cols.extend(['Gender', 'OverTime'])
all_features = df_encoded.columns

In [8]:
df_encoded

Unnamed: 0,Age,Attrition,BusinessTravel,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single
0,41,1,1,1,2,2,0,3,2,4,...,False,False,False,False,False,False,True,False,False,True
1,49,0,2,8,1,3,1,2,2,2,...,False,False,False,False,False,True,False,False,True,False
2,37,1,1,2,2,4,1,2,1,3,...,False,True,False,False,False,False,False,False,False,True
3,33,0,2,3,4,4,0,3,1,3,...,False,False,False,False,False,True,False,False,True,False
4,27,0,1,2,1,1,1,3,1,2,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,2,23,2,3,1,4,2,4,...,False,True,False,False,False,False,False,False,True,False
1466,39,0,1,6,1,4,1,2,3,1,...,False,False,False,False,False,False,False,False,True,False
1467,27,0,1,4,3,2,1,4,2,2,...,False,False,False,True,False,False,False,False,True,False
1468,49,0,2,2,3,4,1,2,2,2,...,False,False,False,False,False,False,True,False,True,False


In [9]:
# seperate numerical features from categorical
cat_feat = list(set(all_features) - set(feat_before_enc))
cat_feat.remove('Attrition')
numerical_feat = list(set(feat_before_enc) - set(encoded_cols))

numerical_feat.sort()
cat_feat.sort()

In [10]:
# keep X and y as dataframe so that ColumnTransformer can use column names
y = df_encoded['Attrition']
X = df_encoded.drop('Attrition', axis=1)
#added stratification to the train-test split
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#perfrom standard scaling 
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_feat),
    ('cat', 'passthrough', cat_feat)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

all_features2 = list(all_features)
all_features2.remove('Attrition')

In [11]:
# MLFlow setup (local directory to read experiment data):
mlflow.set_tracking_uri("file:./mlruns")

### **Comparing three model types:**

#### **1.) Logistic Regression**

In [12]:
with mlflow.start_run(run_name="Log_Reg_HR_analytic"):
    # default parameters used 
    mlflow.log_param("reg_strength", 1.0)
    mlflow.log_param("solver", 'lbfgs')
    mlflow.log_param("max_iter", 100)
    

    LR = LogisticRegression(random_state=42)
    LR.fit(X_train_processed, y_train)
    y_pred = LR.predict(X_test_processed)
    y_pred_tr = LR.predict(X_train_processed)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Simple Accuracy LR:", accuracy)
    print("Precision LR:", precision)
    print("Recall LR:", recall)
    print("FI LR:", f1)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    signature = infer_signature(X_train_processed, y_pred_tr)
        
        # Log the model with signature
    mlflow.sklearn.log_model(
        sk_model=LR,
        name="model",
        signature=signature,
        input_example=X_train_processed[:1],  # Log sample input for reference (have ability to store more than 1 exp)
        registered_model_name="LR_classifier"  # Optional: register to model registry
    )

Simple Accuracy LR: 0.8469387755102041
Precision LR: 0.8292440965207969
Recall LR: 0.8469387755102041
FI LR: 0.8348490478868713


Registered model 'LR_classifier' already exists. Creating a new version of this model...
Created version '3' of model 'LR_classifier'.


In [13]:
print("Classification Report: Logistic Regression\n")
print(classification_report(y_test, y_pred))

Classification Report: Logistic Regression

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       986
           1       0.54      0.37      0.44       190

    accuracy                           0.85      1176
   macro avg       0.71      0.65      0.67      1176
weighted avg       0.83      0.85      0.83      1176



#### **2.) K Nearest Neighbors:**

In [14]:
with mlflow.start_run(run_name="KNN_HR_analytic"):
    # default parameters used 
    mlflow.log_param("n_neighbors", 5)
    mlflow.log_param("leaf_size", 30)
    mlflow.log_param("dist_metric", 'minkowski')
    
    knn = KNeighborsClassifier()
    knn.fit(X_train_processed, y_train)
    y_pred = knn.predict(X_test_processed)
    y_pred_tr = knn.predict(X_train_processed)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Simple Accuracy KNN:", accuracy)
    print("Precision KNN:", precision)
    print("Recall KNN:", recall)
    print("FI KNN:", f1)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    signature = infer_signature(X_train_processed, y_pred_tr)
        
        # Log the model with signature
    mlflow.sklearn.log_model(
        sk_model=knn,
        name="model",
        signature=signature,
        input_example=X_train_processed[:1],  # Log sample input for reference (have ability to store more than 1 exp)
        registered_model_name="knn_classifier" 
    )

Simple Accuracy KNN: 0.8367346938775511
Precision KNN: 0.7784670575196112
Recall KNN: 0.8367346938775511
FI KNN: 0.7757796574427704


Registered model 'knn_classifier' already exists. Creating a new version of this model...
Created version '2' of model 'knn_classifier'.


In [15]:
print("Classification Report: KNN\n")
print(classification_report(y_test, y_pred))

Classification Report: KNN

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       986
           1       0.44      0.04      0.08       190

    accuracy                           0.84      1176
   macro avg       0.64      0.52      0.49      1176
weighted avg       0.78      0.84      0.78      1176



#### **3.) XGBoost:**

In [16]:
with mlflow.start_run(run_name="XGB_HR_analytic"):
    # default parameters used 
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("learning_rate", 0.3)
    mlflow.log_param("max_bin", 256)
   
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train_processed, y_train)
    y_pred = xgb.predict(X_test_processed)
    y_pred_tr = xgb.predict(X_train_processed)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Simple Accuracy XGB:", accuracy)
    print("Precision XGB:", precision)
    print("Recall XGB:", recall)
    print("FI XGB:", f1)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    signature = infer_signature(X_train_processed, y_pred_tr)
        
        # Log the model with signature
    mlflow.xgboost.log_model(
        xgb_model=xgb,
        name="model",
        signature=signature,
        input_example=X_train_processed[:1],  # Log sample input for reference (have ability to store more than 1 exp)
        registered_model_name="xgb_classifier" 
    )

Simple Accuracy XGB: 0.8443877551020408
Precision XGB: 0.8206815782209981
Recall XGB: 0.8443877551020408
FI XGB: 0.8263796406357059


Registered model 'xgb_classifier' already exists. Creating a new version of this model...
Created version '2' of model 'xgb_classifier'.


In [17]:
print("Classification Report: XGBoost\n")
print(classification_report(y_test, y_pred))

Classification Report: XGBoost

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       986
           1       0.53      0.31      0.39       190

    accuracy                           0.84      1176
   macro avg       0.70      0.63      0.65      1176
weighted avg       0.82      0.84      0.83      1176



In [18]:
 !mlflow ui --port 5002

^C


Link: 
http://localhost:5002/