In [13]:
#pip install imbalanced-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [14]:
df = pd.read_csv("Cleaned_Employee_Attrition.csv")

In [15]:
# Ensure all column names are lowercase (safety check)
df.columns = df.columns.str.lower()

# Quick check (optional)
print("Columns in data:\n", df.columns)

Columns in data:
 Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeenumber',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager'],
      dtype='object')


In [16]:
# 1) Tenure category from yearsatcompany
#    0 = New, 1 = Intermediate, 2 = Experienced, 3 = Loyal
tenure_bins = [0, 2, 5, 10, df['yearsatcompany'].max()]
tenure_labels = [0, 1, 2, 3]

df['tenure_category'] = pd.cut(
    df['yearsatcompany'],
    bins=tenure_bins,
    labels=tenure_labels,
    include_lowest=True
).astype(int)


In [17]:
# 2) Engagement score = sum of satisfaction-related columns
df['engagement_score'] = (
    df['jobsatisfaction'] +
    df['relationshipsatisfaction'] +
    df['environmentsatisfaction'] +
    df['worklifebalance']
)

In [18]:
# 3) Performance metric: combine performance rating and job satisfaction
df['performance_score'] = df['performancerating'] * df['jobsatisfaction']

In [19]:
# Optional: flag high performers (1 = high performer, 0 = others)
median_perf = df['performance_score'].median()
df['high_performer'] = (df['performance_score'] >= median_perf).astype(int)

print("\nNew engineered columns added:",
      ['tenure_category', 'engagement_score', 'performance_score', 'high_performer'])


New engineered columns added: ['tenure_category', 'engagement_score', 'performance_score', 'high_performer']


In [20]:
df

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,...,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,tenure_category,engagement_score,performance_score,high_performer
0,41,1,2,1102,2,1,2,1,1,2,...,0,1,6,4,0,5,2,8,12,1
1,49,0,1,279,1,8,1,1,2,3,...,3,3,10,7,1,7,2,12,8,0
2,37,1,2,1373,1,2,2,4,4,4,...,3,3,0,0,0,0,0,12,9,1
3,33,0,1,1392,1,3,4,1,5,4,...,3,3,8,7,3,0,2,13,9,1
4,27,0,2,591,1,2,1,3,7,1,...,3,3,2,2,2,2,0,10,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,1,884,1,23,2,3,2061,3,...,3,3,5,2,0,3,1,13,12,1
1466,39,0,2,613,1,6,1,3,2062,4,...,5,3,7,7,1,7,2,9,3,0
1467,27,0,2,155,1,4,3,1,2064,2,...,0,3,6,2,0,3,2,9,8,0
1468,49,0,1,1023,2,2,3,3,2065,4,...,3,2,9,6,0,8,2,12,6,0


In [21]:
df.shape

(1470, 36)

In [22]:
# ------------------ DEFINE FEATURES (X) AND TARGET (y) ------------------
# 'attrition' is the target variable (already numeric 0/1 after your preprocessing)
X = df.drop('attrition', axis=1)
y = df['attrition']

In [23]:
#attrition count
attrition_count=y.value_counts()
attrition_count

attrition
0    1233
1     237
Name: count, dtype: int64

In [24]:
# ------------------ TRAIN–TEST SPLIT ------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # keeps class ratio similar in train & test
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)



Train shape: (1176, 35)
Test shape: (294, 35)


In [25]:
# ------------------ SCALING NUMERICAL FEATURES ------------------
# At this stage, all columns in X are numeric (after preprocessing)
numeric_cols = X_train.columns  # all columns are numeric now

scaler = StandardScaler()

# Fit on training data only, then transform train and test
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

# Convert back to DataFrame (optional, but useful for later)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index=X_test.index)

print("\nScaling complete.")



Scaling complete.


In [26]:
# ------------------ BALANCING TRAIN DATA USING SMOTE ------------------
# Apply SMOTE only on the training set to avoid data leakage
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("\nSMOTE applied successfully on training data!")
print("Before SMOTE (train):")
print(y_train.value_counts())
print("\nAfter SMOTE (train):")
print(y_train_resampled.value_counts())

print("\nFinal datasets ready for model training:")
print("X_train_resampled shape:", X_train_resampled.shape)
print("y_train_resampled shape:", y_train_resampled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_test shape:", y_test.shape)
# The final datasets are:
# - X_train_resampled, y_train_resampled (balanced training set)
# - X_test_scaled, y_test (original test set for unbiased evaluation)


SMOTE applied successfully on training data!
Before SMOTE (train):
attrition
0    986
1    190
Name: count, dtype: int64

After SMOTE (train):
attrition
0    986
1    986
Name: count, dtype: int64

Final datasets ready for model training:
X_train_resampled shape: (1972, 35)
y_train_resampled shape: (1972,)
X_test_scaled shape: (294, 35)
y_test shape: (294,)
