In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [24]:
# Read the HR employee-attrition CSV from the mounted Drive folder into `df`,
# the raw frame that the preprocessing cells below work on.
csv_path = '/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns) as a quick sanity check on the load.
print('Data set shape', df.shape)

Data set shape (1470, 35)


# 1. Preprocessing

In [17]:
# Missing-value check: count the null entries in every column of `df`
# and print the per-column totals.
missing_counts = df.isnull().sum()
print("Missing values per column:\n", missing_counts)

Missing values per column:
 Age                                  0
Attrition                            0
DailyRate                            0
DistanceFromHome                     0
Education                            0
EnvironmentSatisfaction              0
Gender                               0
HourlyRate                           0
JobInvolvement                       0
JobLevel                             0
JobSatisfaction                      0
MonthlyIncome                        0
MonthlyRate                          0
NumCompaniesWorked                   0
OverTime                             0
PercentSalaryHike                    0
PerformanceRating                    0
RelationshipSatisfaction             0
StockOptionLevel                     0
TotalWorkingYears                    0
TrainingTimesLastYear                0
WorkLifeBalance                      0
YearsAtCompany                       0
YearsInCurrentRole                   0
YearsSinceLastPromotion             

In [25]:
# Drop irrelevant and constant features.
# Rebinding `df` (instead of inplace=True) keeps the statement chain-friendly.
cols_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df = df.drop(columns=cols_to_drop)

# Outlier handling
# Log transformation to reduce skewness. np.log1p is vectorised, so it is
# applied to the whole column at once rather than row by row through a lambda.
# The log column is derived from the income values *before* capping.
df['MonthlyIncomeLog'] = np.log1p(df['MonthlyIncome'])

# Cap extreme outliers for MonthlyIncome at the 99th percentile.
# The quantile used to be computed but never applied; clip() performs the cap.
income_99 = df['MonthlyIncome'].quantile(0.99)
df['MonthlyIncome'] = df['MonthlyIncome'].clip(upper=income_99)

# Encoding categorical variables: binary text columns become 0/1 integers.
binary_maps = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
    'OverTime': {'Yes': 1, 'No': 0},
}
for binary_col, mapping in binary_maps.items():
    df[binary_col] = df[binary_col].map(mapping)
    # Series.map() turns every value missing from the mapping into NaN without
    # raising (unexpected label, or this cell executed twice on the same frame),
    # so fail loudly here instead of leaking NaNs into the features/target.
    assert df[binary_col].notna().all(), (
        f"{binary_col}: values outside {list(mapping)} found "
        "(was this cell already run on this frame?)"
    )

# One-hot encode the multi-class categories; drop_first=True drops one level
# per category so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['BusinessTravel','Department','EducationField','JobRole','MaritalStatus'], drop_first=True)

print("Shape after encoding:", df.shape)
# Separate features and target
X = df.drop('Attrition', axis=1)
y = df['Attrition']

Shape after encoding: (1470, 46)


In [26]:
# Feature scaling: standardise the numerical features.
# Candidates are the int64/float64 columns of X; 'Attrition' and every column
# whose name contains '_' (the created dummy columns) are left out.
numeric_cols = [
    name
    for name in X.select_dtypes(include=['int64', 'float64']).columns
    if name != 'Attrition' and '_' not in name
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numeric_cols])
X[numeric_cols] = scaled_values

# Save the preprocessed dataset (scaled features plus target) for future use.
processed_df = pd.concat((X, y), axis='columns')
processed_df.to_csv('processed_attrition_data.csv', index=False)
print("Preprocessing done. Processed data saved to CSV.")

Preprocessing done. Processed data saved to CSV.


# 2. Feature Engineering

In [28]:
# Work on a copy of the preprocessed data so feature engineering never
# touches `processed_df` itself.
df_fe = processed_df.copy()

# JobRole x MonthlyIncome interaction: every one-hot 'JobRole_*' column is
# multiplied by MonthlyIncome and stored as '<column>_Income'.
# Note: MonthlyIncome in this frame is the standardised column produced by the
# scaling step, not the raw amount.
jobrole_cols = [name for name in df_fe.columns if name.startswith('JobRole_')]
df_fe = df_fe.assign(
    **{
        role + '_Income': df_fe[role] * df_fe['MonthlyIncome']
        for role in jobrole_cols
    }
)

In [32]:
# Ratio features (relative income, tenure ratio, promotion gap).
#
# These must be computed on the RAW values. `df_fe` descends from
# `processed_df`, whose numeric columns (MonthlyIncome, Age, YearsAtCompany,
# YearsSinceLastPromotion, ...) were standardised, so dividing them - or testing
# `YearsAtCompany > 0` on them - would operate on z-scores rather than on
# amounts/years. The untouched CSV is therefore re-read and used as the source.
orig_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-HR-Employee-Attrition.csv')
orig_df['Attrition'] = orig_df['Attrition'].map({'Yes':1,'No':0})

# Every assignment below aligns on the index, so both frames must describe the
# same rows in the same order; stop early if that is ever not the case.
assert orig_df.index.equals(df_fe.index), "orig_df and df_fe rows are not aligned"

# Relative income to the average of the same JobRole.
avg_income_by_role = orig_df.groupby('JobRole')['MonthlyIncome'].mean()

# Map each employee's JobRole to the average (raw) income of that role
df_fe['RoleAvgIncome'] = orig_df['JobRole'].map(avg_income_by_role)
# relative income = raw current income / raw avg income for that role
df_fe['RelIncomeToRoleAvg'] = orig_df['MonthlyIncome'] / df_fe['RoleAvgIncome']

# Tenure ratio (raw years at company vs raw age)
df_fe['TenureRatio'] = orig_df['YearsAtCompany'] / orig_df['Age']

# Promotion gap = years since last promotion / years at company, and 0 when the
# employee has no tenure yet. Masking the zero-tenure rows to NaN before the
# division avoids dividing by zero; those rows are then filled with 0.
raw_years_at_company = orig_df['YearsAtCompany']
df_fe['PromotionGap'] = (
    orig_df['YearsSinceLastPromotion']
    / raw_years_at_company.where(raw_years_at_company > 0)
).fillna(0.0)

print("Added engineered features. New shape:", df_fe.shape)

Added engineered features. New shape: (1470, 58)
