**Objective:** Detect compensation disparities across gender, department, and job role.


In [70]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from scipy import stats

In [71]:
df = pd.read_csv("../data/HR-Employee-Attrition.csv")

print(f"Dataset: {df.shape[0]} employees")
print(f"Salary range: ${df['MonthlyIncome'].min():,} - ${df['MonthlyIncome'].max():,}")
print(f"Salary mean:  ${df['MonthlyIncome'].mean():,.0f}")
print(f"\nGender split: {dict(df['Gender'].value_counts())}")

Dataset: 1470 employees
Salary range: $1,009 - $19,999
Salary mean:  $6,503

Gender split: {'Male': np.int64(882), 'Female': np.int64(588)}


In [72]:
gender_stats = df.groupby("Gender")["MonthlyIncome"].agg(["count", "mean", "median", "std"]).round(0)
gender_stats.columns = ["Count", "Mean", "Median", "StdDev"]

print("RAW SALARY BY GENDER (unadjusted)")
print(gender_stats.to_string())

male_mean   = df[df["Gender"] == "Male"]["MonthlyIncome"].mean()
female_mean = df[df["Gender"] == "Female"]["MonthlyIncome"].mean()
raw_gap = female_mean - male_mean
raw_gap_pct = raw_gap / male_mean * 100
print(f"\nRaw F-M gap: ${raw_gap:+,.0f} ({raw_gap_pct:+.1f}%)")
print(f"(Positive = Females earn more on average)")

RAW SALARY BY GENDER (unadjusted)
        Count    Mean  Median  StdDev
Gender                               
Female    588  6687.0  5082.0  4696.0
Male      882  6381.0  4838.0  4715.0

Raw F-M gap: $+306 (+4.8%)
(Positive = Females earn more on average)


In [73]:
# Salary by Gender AND JobLevel
print("SALARY BY GENDER x JOB LEVEL")
pivot = df.pivot_table(values="MonthlyIncome", index="JobLevel",
                       columns="Gender", aggfunc="mean").round(0)
pivot["Gap (F-M)"] = (pivot["Female"] - pivot["Male"]).round(0)
pivot["Gap %"] = (pivot["Gap (F-M)"] / pivot["Male"] * 100).round(1)
print(pivot.to_string())

SALARY BY GENDER x JOB LEVEL
Gender     Female     Male  Gap (F-M)  Gap %
JobLevel                                    
1          2780.0   2791.0      -11.0   -0.4
2          5435.0   5549.0     -114.0   -2.1
3          9963.0   9707.0      256.0    2.6
4         15431.0  15571.0     -140.0   -0.9
5         19130.0  19225.0      -95.0   -0.5



**Model:** `MonthlyIncome ~ JobLevel + TotalWorkingYears + JobRole + Department + Education + YearsAtCompany + Gender`


In [74]:
df_eq = df[["MonthlyIncome", "Gender", "Department", "JobRole", "JobLevel",
            "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
            "Education", "PerformanceRating", "EmployeeNumber"]].copy()

le_dept = LabelEncoder()
le_role = LabelEncoder()
df_eq["Dept_enc"] = le_dept.fit_transform(df_eq["Department"])
df_eq["Role_enc"] = le_role.fit_transform(df_eq["JobRole"])
df_eq["Gender_Male"] = (df_eq["Gender"] == "Male").astype(int)

print(f"Analysis sample: {len(df_eq)} employees")

Analysis sample: 1470 employees


In [75]:

legitimate_features = ["JobLevel", "TotalWorkingYears", "Role_enc", "Dept_enc",
                       "YearsAtCompany", "Education", "PerformanceRating"]

X_base = df_eq[legitimate_features]
y_sal  = df_eq["MonthlyIncome"]

model_base = LinearRegression()
model_base.fit(X_base, y_sal)
y_pred_base = model_base.predict(X_base)

print(f"  R\u00b2 = {r2_score(y_sal, y_pred_base):.4f}")
print(f"  MAE = ${mean_absolute_error(y_sal, y_pred_base):,.0f}")

  RÂ² = 0.9075
  MAE = $1,085


In [76]:
df_eq["PredictedSalary"] = y_pred_base.round(0)
df_eq["PayGap"] = df_eq["MonthlyIncome"] - df_eq["PredictedSalary"]
df_eq["PayGapPct"] = (df_eq["PayGap"] / df_eq["PredictedSalary"] * 100).round(1)

df_eq["UnderpaidFlag"] = (df_eq["PayGapPct"] <= -15).astype(int)
df_eq["OverpaidFlag"]  = (df_eq["PayGapPct"] >= 15).astype(int)

print("PAY EQUITY FLAGS (15% threshold)")
print(f"  Underpaid (>15% below expected): {df_eq['UnderpaidFlag'].sum()} employees ({df_eq['UnderpaidFlag'].mean()*100:.1f}%)")
print(f"  Overpaid  (>15% above expected): {df_eq['OverpaidFlag'].sum()} employees ({df_eq['OverpaidFlag'].mean()*100:.1f}%)")
print(f"  Within range:                    {len(df_eq) - df_eq['UnderpaidFlag'].sum() - df_eq['OverpaidFlag'].sum()} employees")

PAY EQUITY FLAGS (15% threshold)
  Underpaid (>15% below expected): 361 employees (24.6%)
  Overpaid  (>15% above expected): 411 employees (28.0%)
  Within range:                    698 employees


In [77]:
gender_equity = df_eq.groupby("Gender").agg(
    Total=("EmployeeNumber", "count"),
    Underpaid=("UnderpaidFlag", "sum"),
    Overpaid=("OverpaidFlag", "sum"),
    AvgGapPct=("PayGapPct", "mean")
).round(1)

gender_equity["UnderpaidPct"] = (gender_equity["Underpaid"] / gender_equity["Total"] * 100).round(1)

print("PAY EQUITY BY GENDER")
print(gender_equity.to_string())

PAY EQUITY BY GENDER
        Total  Underpaid  Overpaid  AvgGapPct  UnderpaidPct
Gender                                                     
Female    588        157       160        3.6          26.7
Male      882        204       251        5.5          23.1


In [78]:
df_eq["Department"] = le_dept.inverse_transform(df_eq["Dept_enc"])

dept_equity = df_eq.groupby("Department").agg(
    Employees=("EmployeeNumber", "count"),
    AvgIncome=("MonthlyIncome", "mean"),
    AvgPredicted=("PredictedSalary", "mean"),
    AvgGapPct=("PayGapPct", "mean"),
    Underpaid=("UnderpaidFlag", "sum")
).round(0)

dept_equity["UnderpaidPct"] = (dept_equity["Underpaid"] / dept_equity["Employees"] * 100).round(1)

print("DEPARTMENT PAY EQUITY")
print(dept_equity.sort_values("UnderpaidPct", ascending=False).to_string())

DEPARTMENT PAY EQUITY
                        Employees  AvgIncome  AvgPredicted  AvgGapPct  Underpaid  UnderpaidPct
Department                                                                                    
Human Resources                63     6655.0        6861.0       -2.0         18          28.6
Sales                         446     6959.0        6988.0        3.0        120          26.9
Research & Development        961     6281.0        6254.0        6.0        223          23.2


In [79]:
df_eq["JobRole"] = le_role.inverse_transform(df_eq["Role_enc"])

print("TOP 5 MOST UNDERPAID EMPLOYEES")
most_underpaid = df_eq.sort_values("PayGapPct").head(5)
print(most_underpaid[["EmployeeNumber", "Department", "JobRole", "Gender", "JobLevel",
                      "MonthlyIncome", "PredictedSalary", "PayGapPct"]].to_string(index=False))

TOP 5 MOST UNDERPAID EMPLOYEES
 EmployeeNumber             Department               JobRole Gender  JobLevel  MonthlyIncome  PredictedSalary  PayGapPct
            132 Research & Development Laboratory Technician   Male         2           2042           6548.0      -68.8
           1937 Research & Development    Research Scientist Female         2           2133           6748.0      -68.4
             42                  Sales  Sales Representative   Male         2           2086           6514.0      -68.0
           1292 Research & Development    Research Scientist Female         2           2372           6839.0      -65.3
            362 Research & Development Laboratory Technician   Male         2           2176           6206.0      -64.9
