In [21]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('Project2.csv')

In [9]:
df.head()

Unnamed: 0,EmployeeID,Age,Gender,JobRole,MonthlyIncome,YearsAtCompany,EnvironmentSatisfaction,Attrition,TenureBucket,NormSatisfaction,NormTenure,NormIncome,EngagementScore
0,304eefc9,46,Female,HR,96370,5,2,No,3-6 years,0.5,0.263158,0.751923,0.504524
1,a525db8f,22,Male,Manager,86770,0,3,No,0-2 years,0.75,0.0,0.644767,0.49343
2,b0acb00e,39,Male,Sales Executive,57072,6,3,Yes,3-6 years,0.75,0.315789,0.313275,0.488719
3,91015b9b,40,Male,HR,97978,1,1,No,0-2 years,0.25,0.052632,0.769871,0.346751
4,656dd1c5,38,Female,HR,103437,18,3,No,15+ years,0.75,0.947368,0.830805,0.833452


Tenure bucket and engagement score feature engineering

In [5]:
def tenure_bucket(years):
    if years < 3:
        return '0-2 years'
    elif years < 7:
        return '3-6 years'
    elif years < 15:
        return '7-14 years'
    else:
        return '15+ years'

df['TenureBucket'] = df['YearsAtCompany'].apply(tenure_bucket)

In [8]:
scaler = MinMaxScaler()

df['NormSatisfaction'] = df['EnvironmentSatisfaction'] / 4.0
df['NormTenure'] = scaler.fit_transform(df[['YearsAtCompany']])
df['NormIncome'] = scaler.fit_transform(df[['MonthlyIncome']])

df['EngagementScore'] = (
    0.4 * df['NormSatisfaction'] +
    0.3 * df['NormTenure'] +
    0.3 * df['NormIncome']
)

In [None]:
Encoding Categorical columns

In [10]:
df['AttritionFlag'] = df['Attrition'].map({'Yes': 1, 'No': 0})

In [11]:
features = ['Age', 'MonthlyIncome', 'EnvironmentSatisfaction', 'TenureBucket', 'Gender', 'JobRole', 'EngagementScore']


In [12]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df[features], drop_first=True)


In [13]:
#X-Y Split
X = df_encoded
y = df['AttritionFlag']

In [15]:
X.head()

Unnamed: 0,Age,MonthlyIncome,EnvironmentSatisfaction,EngagementScore,TenureBucket_15+ years,TenureBucket_3-6 years,TenureBucket_7-14 years,Gender_Male,JobRole_HR,JobRole_Manager,JobRole_Sales Executive
0,46,96370,2,0.504524,False,True,False,False,True,False,False
1,22,86770,3,0.49343,False,False,False,True,False,True,False
2,39,57072,3,0.488719,False,True,False,True,False,False,True
3,40,97978,1,0.346751,False,False,False,True,True,False,False
4,38,103437,3,0.833452,True,False,False,False,True,False,False


In [18]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
#build model for Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)



In [23]:
y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:, 1]

print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression ROC-AUC: 0.4285714285714286
Confusion Matrix:
 [[2 1]
 [4 3]]
              precision    recall  f1-score   support

           0       0.33      0.67      0.44         3
           1       0.75      0.43      0.55         7

    accuracy                           0.50        10
   macro avg       0.54      0.55      0.49        10
weighted avg       0.62      0.50      0.52        10



In [24]:
#Build DT model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

y_pred_dt = dtree.predict(X_test)
y_proba_dt = dtree.predict_proba(X_test)[:, 1]

print("Decision Tree ROC-AUC:", roc_auc_score(y_test, y_proba_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Decision Tree ROC-AUC: 0.6904761904761906
Confusion Matrix:
 [[2 1]
 [2 5]]
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.83      0.71      0.77         7

    accuracy                           0.70        10
   macro avg       0.67      0.69      0.67        10
weighted avg       0.73      0.70      0.71        10



Logistic Regression's ROC-AUC is 42% meaning the model is no better than random guessing. Overall Decision Tree performs better.
Based on Excel analysis:
The lesser rating on Employee satisfaction means the more employees are encouraged to leave the organization. Once these surveys are receieved,
we need to take measure according to the feedback provided so that there wont be attrition.
The number of attrition is independent of age, gender, years at company. And slightly dependent on job role as HR and Sales Executives are among the top roles for attribution and on salary as well.
To overcome attrition, we also need to look at the employees who have a lower rating but are still with the company. This number is huge and measures need to be taken like accumulating their feedback and working on areas of concerns. 
