In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [40]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Print the shape
print('Data set shape', df.shape)

Data set shape (1470, 35)


# 1. Preprocessing

In [41]:
# Handling missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurre

In [42]:
#dropping irrelevant and constant features
cols_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df.drop(columns=cols_to_drop, inplace=True)

# Outlier handling
# Using log transformation to reduce skewness
df['MonthlyIncomeLog'] = df['MonthlyIncome'].apply(lambda x: np.log1p(x))

# cap extream outliers for MonthlyIncome
income_99 = df['MonthlyIncome'].quantile(0.99)

# Encoding categorical variables
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
df['Gender']   = df['Gender'].map({'Male': 1, 'Female': 0})
df['OverTime'] = df['OverTime'].map({'Yes': 1, 'No': 0})

# One hot encode multi class categories
df = pd.get_dummies(df, columns=['BusinessTravel','Department','EducationField','JobRole','MaritalStatus'], drop_first=True)

print("Shape after encoding:", df.shape)
# Separate features and target
X = df.drop('Attrition', axis=1)
y = df['Attrition']

Shape after encoding: (1470, 46)


In [43]:
# Feature Scaling (standardise numercal features)
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
# remove created dummy colums and Attrition from numeric cols
numeric_cols = [col for col in numeric_cols if col not in ['Attrition'] and '_' not in col]
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Save preprocessed dataset for future use
processed_df = pd.concat([X, y], axis=1)
processed_df.to_csv('processed_attrition_data.csv', index=False)
print("Preprocessing done. Processed data saved to CSV.")

Preprocessing done. Processed data saved to CSV.


# 2. Feature Engineering

In [44]:
# Create a copy of preprocessed data set for feature engineering
df_fe = processed_df.copy()

# Interaction between jobRole and MonthlyIncome (via one-hot roles * income)
jobrole_cols = [col for col in df_fe.columns if col.startswith('JobRole_')]
for col in jobrole_cols:
    new_col = col + '_Income'
    df_fe[new_col] = df_fe[col] * df_fe['MonthlyIncome']

In [45]:
# Relative income to avarage. of the same JobRole
orig_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-HR-Employee-Attrition.csv')
orig_df['Attrition'] = orig_df['Attrition'].map({'Yes':1,'No':0})
avg_income_by_role = orig_df.groupby('JobRole')['MonthlyIncome'].mean()

# Map each employee's JobRole to the average income of that role
df_fe['RoleAvgIncome'] = orig_df['JobRole'].map(avg_income_by_role)
# relative income = current income / avg income for that role
df_fe['RelIncomeToRoleAvg'] = df_fe['MonthlyIncome'] / df_fe['RoleAvgIncome']

# Tenure ratio (Years at company vs Age)
df_fe['TenureRatio'] = df_fe['YearsAtCompany'] / df_fe['Age']

# Promotion gap (if YearsAtCompany > 0)
df_fe['PromotionGap'] = df_fe.apply(lambda row: row['YearsSinceLastPromotion'] / row['YearsAtCompany']
                                    if row['YearsAtCompany'] > 0 else 0, axis=1)

print("Added engineered features. New shape:", df_fe.shape)

Added engineered features. New shape: (1470, 58)


# 3. Feature Selection

In [56]:
X_fe = df_fe.drop('Attrition', axis=1)
y_fe = df_fe['Attrition']

# Droping highly correlated features (treshold: 0.9)
corr_matrix = X_fe.corr().abs()
corr_matrix = X_fe.corr().abs()

# Use combinations to iterate through each pair of features only once
high_corr_pairs = []
for feature1, feature2 in itertools.combinations(corr_matrix.columns, 2):
    corr_value = corr_matrix.loc[feature1, feature2]
    if corr_value > 0.9:
        high_corr_pairs.append((feature1, feature2, corr_value))

# Print out the highly correlated feature pairs
if high_corr_pairs:
    print("Highly correlated feature pairs (corr > 0.9):")
    for pair in high_corr_pairs:
        print(f"{pair[0]} and {pair[1]}: correlation = {pair[2]:.2f}")
else:
    print("No feature pairs with correlation > 0.9 found.")

Highly correlated feature pairs (corr > 0.9):
JobLevel and MonthlyIncome: correlation = 0.95
JobLevel and MonthlyIncomeLog: correlation = 0.92
MonthlyIncome and MonthlyIncomeLog: correlation = 0.94
MonthlyIncome and RoleAvgIncome: correlation = 0.90
MonthlyIncomeLog and RelIncomeToRoleAvg: correlation = 0.97
Department_Research & Development and Department_Sales: correlation = 0.91
JobRole_Laboratory Technician and JobRole_Laboratory Technician_Income: correlation = 0.93
JobRole_Manager and JobRole_Manager_Income: correlation = 0.98
JobRole_Research Director and JobRole_Research Director_Income: correlation = 0.96
JobRole_Research Scientist and JobRole_Research Scientist_Income: correlation = 0.93
JobRole_Sales Representative and JobRole_Sales Representative_Income: correlation = 0.98


In [57]:
# List of features to remove based on our correlation analysis:
features_to_remove = [
    'JobLevel',          # redundant with salary features
    'MonthlyIncome',     # we will use the log-transformed version instead
    'RoleAvgIncome',     # redundant with MonthlyIncomeLog
    'RelIncomeToRoleAvg' # redundant with MonthlyIncomeLog
]

# For department dummies, drop one
if 'Department_Sales' in X_fe.columns and 'Department_Research & Development' in X_fe.columns:
    features_to_remove.append('Department_Sales')

# For job role interaction terms: drop all columns that have the pattern '_Income' and start with 'JobRole_'
job_role_income_features = [col for col in X_fe.columns if col.startswith('JobRole_') and '_Income' in col]
features_to_remove.extend(job_role_income_features)

# Print the list of features to be removed
print("Features to be removed due to high correlation:", features_to_remove)

# Remove these features from X_fe
X_reduced = X_fe.drop(features_to_remove, axis=1)

print("Shape before removal:", X_fe.shape)
print("Shape after removal:", X_reduced.shape)

Features to be removed due to high correlation: ['JobLevel', 'MonthlyIncome', 'RoleAvgIncome', 'RelIncomeToRoleAvg', 'Department_Sales', 'JobRole_Human Resources_Income', 'JobRole_Laboratory Technician_Income', 'JobRole_Manager_Income', 'JobRole_Manufacturing Director_Income', 'JobRole_Research Director_Income', 'JobRole_Research Scientist_Income', 'JobRole_Sales Executive_Income', 'JobRole_Sales Representative_Income']
Shape before removal: (1470, 57)
Shape after removal: (1470, 44)


In [58]:
# REF using logistic regression
# Use a logistic regression (with no penalty or L2 to not bias feature selection) for RFE
logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
rfe = RFE(logreg, n_features_to_select=10)  # choose top 10 features
rfe.fit(X_fe, y_fe)
selected_indices = rfe.get_support(indices=True)
selected_features = X_fe.columns[selected_indices]
print("Top 10 features selected by RFE:", list(selected_features))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Top 10 features selected by RFE: ['OverTime', 'MonthlyIncomeLog', 'BusinessTravel_Travel_Frequently', 'EducationField_Technical Degree', 'JobRole_Human Resources', 'JobRole_Sales Executive', 'MaritalStatus_Single', 'JobRole_Laboratory Technician_Income', 'JobRole_Sales Executive_Income', 'JobRole_Sales Representative_Income']


In [60]:
# Feature importance from Random Forest

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_fe, y_fe)
importances = pd.Series(rf.feature_importances_, index=X_fe.columns).sort_values(ascending=False)
print("Top 10 features by Random Forest importance:")
print(importances.head(10))

Top 10 features by Random Forest importance:
RelIncomeToRoleAvg    0.049865
MonthlyIncomeLog      0.047681
MonthlyIncome         0.044384
DailyRate             0.042945
Age                   0.042693
OverTime              0.042634
MonthlyRate           0.038774
TotalWorkingYears     0.037628
TenureRatio           0.037091
DistanceFromHome      0.036807
dtype: float64
