
**Objective:** Measure how training impacts engagement, performance, and retention.



In [29]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from scipy import stats

In [30]:
# Load the employee dataset and blank out sentinel / out-of-range values so
# that they are skipped by the NaN-aware aggregations used below.
df = pd.read_csv("../data/employee_ml_dataset_v3.csv")

# 9999 is treated as a placeholder rather than a real day count.
missing_days_sentinel = 9999
df["DaysSinceLastTraining"] = df["DaysSinceLastTraining"].replace(missing_days_sentinel, np.nan)

# Scores strictly below these floors are considered invalid and set to NaN.
score_floors = {"EngagementScore": -10, "BurnoutRiskScore": 0}
for score_col, floor in score_floors.items():
    df.loc[df[score_col] < floor, score_col] = np.nan

n_employees = df.shape[0]
mean_days = df["DaysSinceLastTraining"].mean()
print(f"Dataset: {n_employees} employees")
print(f"  DaysSinceLastTraining: mean={mean_days:.0f} days")

Dataset: 15000 employees
  DaysSinceLastTraining: mean=433 days


In [31]:
# Compare mean outcomes between employees with and without any training
# (split on NoTrainingFlag) and test each difference with a two-sample t-test.
is_untrained = df["NoTrainingFlag"] == 1
is_trained = df["NoTrainingFlag"] == 0
trained = df[is_trained]
untrained = df[is_untrained]

outcomes = [
    "EngagementScore",
    "AvgOverallScore",
    "BurnoutRiskScore",
    "AbsenceDays_Last6M",
    "AttritionFlag",
]

print("TRAINED vs UNTRAINED EMPLOYEES")
print(f"{'Outcome':<25} {'Trained':>10} {'Untrained':>10} {'Diff':>10} {'p-value':>10}")

for name in outcomes:
    trained_col = trained[name]
    untrained_col = untrained[name]

    mean_trained = trained_col.mean()
    mean_untrained = untrained_col.mean()
    gap = mean_trained - mean_untrained

    # NaNs are removed explicitly because ttest_ind is called with its defaults.
    p_value = stats.ttest_ind(trained_col.dropna(), untrained_col.dropna()).pvalue

    # Conventional significance stars; a NaN p-value falls through to "".
    if p_value < 0.001:
        stars = "***"
    elif p_value < 0.01:
        stars = "**"
    elif p_value < 0.05:
        stars = "*"
    else:
        stars = ""

    print(f"{name:<25} {mean_trained:>10.3f} {mean_untrained:>10.3f} {gap:>+10.3f} {p_value:>9.4f} {stars}")

print(f"Sample sizes: Trained={len(trained)}, Untrained={len(untrained)}")

TRAINED vs UNTRAINED EMPLOYEES
Outcome                      Trained  Untrained       Diff    p-value
EngagementScore                1.873      1.260     +0.613    0.0000 ***
AvgOverallScore                3.319      3.309     +0.010    0.6167 
BurnoutRiskScore               0.186      0.181     +0.005    0.5171 
AbsenceDays_Last6M             1.278      1.315     -0.037    0.5525 
AttritionFlag                  0.153      0.151     +0.001    0.8524 
Sample sizes: Trained=12515, Untrained=2485


In [38]:
# Bucket employees that have a recorded last-training date by how long ago it
# was, then compare average outcomes across the buckets.
df_recent = df.dropna(subset=["DaysSinceLastTraining"]).copy()
df_recent["TrainingRecency"] = pd.cut(
    df_recent["DaysSinceLastTraining"],
    # Open-ended top bin: a fixed upper edge would turn any larger value into
    # NaN, and the groupby below would then silently drop that employee.
    bins=[0, 90, 180, 365, 730, np.inf],
    labels=["<3 months", "3-6 months", "6-12 months", "1-2 years", ">2 years"],
    # pd.cut intervals are left-open by default, so a value of exactly 0
    # (trained today) would otherwise fall outside the first bucket.
    include_lowest=True,
)

# observed=True keeps only buckets that actually contain employees.
recency = df_recent.groupby("TrainingRecency", observed=True).agg(
    Employees=("EmployeeID", "count"),
    AvgEngagement=("EngagementScore", "mean"),
    AvgOverallScore=("AvgOverallScore", "mean"),
    AvgBurnout=("BurnoutRiskScore", "mean")
).round(3)

print("OUTCOMES BY TRAINING RECENCY")
print(recency.to_string())

OUTCOMES BY TRAINING RECENCY
                 Employees  AvgEngagement  AvgOverallScore  AvgBurnout
TrainingRecency                                                       
<3 months             3303          1.995            3.391       0.182
3-6 months            1781          1.923            3.346       0.197
6-12 months           2355          1.878            3.308       0.187
1-2 years             2580          1.860            3.288       0.180
>2 years              2496          1.684            3.246       0.187


In [41]:
# Regress EngagementScore on the control variables with and without the
# training features, and report how much in-sample R² the training features add.
target = "EngagementScore"
controls = ["TenureYears", "Salary", "PerformanceRating", "BurnoutRiskScore"]
training_feats = ["TrainingCount", "DaysSinceLastTraining", "NoTrainingFlag"]

# Drop rows with a missing value in ANY model column (LinearRegression rejects NaN).
# NOTE(review): this removes every row whose DaysSinceLastTraining is NaN. If
# never-trained employees are exactly those rows, NoTrainingFlag is constant in
# df_reg and its coefficient is uninformative -- confirm before interpreting it.
df_reg = df.dropna(subset=[target] + controls + training_feats).copy()
y = df_reg[target]

# Baseline model with controls only. `y` and `r2_controls` were previously
# taken from leftover kernel state, so the cell failed on Restart & Run All.
X1 = df_reg[controls]
m1 = LinearRegression().fit(X1, y)
r2_controls = r2_score(y, m1.predict(X1))

# Full model: controls plus training features, scored on the same rows.
X2 = df_reg[controls + training_feats]
m2 = LinearRegression().fit(X2, y)
r2_full = r2_score(y, m2.predict(X2))

print("TRAINING IMPACT ON ENGAGEMENT (regression)")
print(f"  Model (controls + training): R\u00b2 = {r2_full:.4f}")
print(f"  Added R\u00b2 from training:        {r2_full - r2_controls:.4f}")
print()
print("  Training coefficients:")
# Coefficients are per raw unit of each feature (not standardized), so a
# per-day effect can round to +0.0000 at four decimals.
for feat, coef in zip(controls + training_feats, m2.coef_):
    if feat in training_feats:
        print(f"    {feat:<25}: {coef:+.4f}")

TRAINING IMPACT ON ENGAGEMENT (regression)
  Model (controls + training): R² = 0.5162
  Added R² from training:        0.0766

  Training coefficients:
    TrainingCount            : +0.1987
    DaysSinceLastTraining    : +0.0000
    NoTrainingFlag           : +0.0000


In [42]:
# Per-department training profile: headcount, average number of trainings,
# share flagged as untrained, average engagement and average training recency.
dept_training_spec = {
    "Employees": ("EmployeeID", "count"),
    "AvgTrainingCount": ("TrainingCount", "mean"),
    "PctUntrained": ("NoTrainingFlag", "mean"),
    "AvgEngagement": ("EngagementScore", "mean"),
    "AvgDaysSinceTraining": ("DaysSinceLastTraining", "mean"),
}
dept_training = df.groupby("Department").agg(**dept_training_spec).round(3)

# Express the untrained share as a percentage and show recency in whole days.
dept_training = dept_training.assign(
    PctUntrained=lambda t: t["PctUntrained"].mul(100).round(1),
    AvgDaysSinceTraining=lambda t: t["AvgDaysSinceTraining"].round(0),
)

print("DEPARTMENT TRAINING SUMMARY")
dept_training_sorted = dept_training.sort_values("AvgTrainingCount", ascending=True)
print(dept_training_sorted.to_string())

DEPARTMENT TRAINING SUMMARY
                        Employees  AvgTrainingCount  PctUntrained  AvgEngagement  AvgDaysSinceTraining
Department                                                                                            
Research & Development       1667             2.413          17.8          1.761                 441.0
Production                   1647             2.437          18.2          1.789                 437.0
Quality Control              1723             2.439          16.9          1.754                 442.0
Sales                        1679             2.457          17.3          1.737                 434.0
IT Support                   1654             2.477          16.9          1.754                 431.0
Logistics                    1691             2.527          15.3          1.775                 448.0
Marketing                    1664             2.527          15.6          1.798                 427.0
Human Resources              1623            

In [48]:
# Mean engagement for each distinct TrainingCount value, and the count at
# which that mean is highest.
optimal = df["EngagementScore"].groupby(df["TrainingCount"]).mean()
best_count, best_score = optimal.idxmax(), optimal.max()

print(f"  Best training count: {best_count} sessions")
print(f"  Avg engagement at that level: {best_score:.3f}")

  Best training count: 5 sessions
  Avg engagement at that level: 2.269


In [52]:
# Flag employees who need training intervention: anyone meeting at least one
# of the three criteria below gets NeedsTraining = 1.
never_trained = df["NoTrainingFlag"].eq(1)
stale_training = df["DaysSinceLastTraining"].gt(365)   # NaN compares False
few_sessions = df["TrainingCount"].lt(2)

df["NeedsTraining"] = (never_trained | stale_training | few_sessions).astype(int)

needs = df["NeedsTraining"].sum()
print("EMPLOYEES NEEDING TRAINING INTERVENTION")
print(f"  Flagged: {needs} employees ({needs/len(df)*100:.1f}%)")
print("")
# The criteria overlap, so the three counts below can add up to more than `needs`.
print(f"    Never trained:           {df['NoTrainingFlag'].sum()}")
print(f"    >1 year since training:  {stale_training.sum()}")
print(f"    <2 sessions total:       {few_sessions.sum()}")


EMPLOYEES NEEDING TRAINING INTERVENTION
  Flagged: 8525 employees (56.8%)

    Never trained:           2485
    >1 year since training:  5076
    <2 sessions total:       5045


In [53]:
# Department-level training needs: headcount, number of flagged employees and
# average engagement, plus the flagged share as a percentage.
dept_needs_spec = {
    "Employees": ("EmployeeID", "count"),
    "NeedsTraining": ("NeedsTraining", "sum"),
    "AvgEngagement": ("EngagementScore", "mean"),
}
dept_needs = df.groupby("Department").agg(**dept_needs_spec).round(3)

flagged_share = dept_needs["NeedsTraining"].div(dept_needs["Employees"])
dept_needs["NeedsPct"] = flagged_share.mul(100).round(1)

print("TRAINING NEEDS BY DEPARTMENT")
dept_needs_ranked = dept_needs.sort_values("NeedsPct", ascending=False)
print(dept_needs_ranked.to_string())

TRAINING NEEDS BY DEPARTMENT
                        Employees  NeedsTraining  AvgEngagement  NeedsPct
Department                                                               
IT Support                   1654            972          1.754      58.8
Research & Development       1667            974          1.761      58.4
Quality Control              1723           1005          1.754      58.3
Sales                        1679            973          1.737      58.0
Production                   1647            949          1.789      57.6
Logistics                    1691            957          1.775      56.6
Marketing                    1664            927          1.798      55.7
Human Resources              1623            881          1.789      54.3
Finance                      1652            887          1.788      53.7
