Behavioral Risk & Organizational Health Modeling 
---------------------------------
Diagnostic AI for Employee Behavioral Strain


In [1]:
import pandas as pd
import numpy as np


# Read the engineered employee dataset from its relative path under ../data.
employee_csv = "../data/employee_ml_dataset_v3.csv"
df = pd.read_csv(filepath_or_buffer=employee_csv)
print("✅ Data loaded")
# Preview the first five rows to eyeball the loaded columns.
df.head(n=5)

✅ Data loaded


Unnamed: 0,EmployeeID,Department,JobTitle,Gender,Salary,TenureYears,EarlyTenureFlag,PerformanceRating,HighPerformerFlag,AbsenceDays_Last6M,...,TrainingCount,DaysSinceLastTraining,NoTrainingFlag,YearsSinceLastRaise,SalaryChangeCount,PayStagnationFlag,EngagementScore,CareerStagnationFlag,BurnoutRiskScore,AttritionFlag
0,PNR-10012,Production,Production Worker,Female,79833,1,1,2,0,10,...,5,25,0,0,0,0,-0.5,0,0.6,0
1,PNR-10017,Logistics,Warehouse Associate,Female,82975,3,0,5,1,0,...,1,799,0,1,2,0,1.7,0,0.0,0
2,PNR-10019,Logistics,Logistics Coordinator,Female,83588,6,0,5,1,0,...,1,1631,0,3,3,1,1.7,0,0.0,0
3,PNR-10036,Finance,Financial Analyst,Male,75311,6,0,3,0,0,...,1,149,0,4,2,1,1.7,0,0.0,0
4,PNR-10050,Quality Control,QC Inspector,Non-binary,83366,2,0,5,1,0,...,2,7,0,1,1,0,1.4,0,0.0,0


In [2]:
# Print every column label so later cells can reference them exactly.
print(list(df.columns))

['EmployeeID', 'Department', 'JobTitle', 'Gender', 'Salary', 'TenureYears', 'EarlyTenureFlag', 'PerformanceRating', 'HighPerformerFlag', 'AbsenceDays_Last6M', 'AbsenceFrequency_Last6M', 'LongLeaveFlag', 'HighAbsenceFlag', 'AvgOverallScore', 'LastOverallScore', 'AvgCommunication', 'AvgTeamwork', 'AvgProblemSolving', 'PerformanceDropFlag', 'TrainingCount', 'DaysSinceLastTraining', 'NoTrainingFlag', 'YearsSinceLastRaise', 'SalaryChangeCount', 'PayStagnationFlag', 'EngagementScore', 'CareerStagnationFlag', 'BurnoutRiskScore', 'AttritionFlag']


In [3]:
# Work on an independent deep copy so the risk columns added below never touch `df`.
df_risk = df.copy(deep=True)

In [4]:
# Orientation convention: in every *_rank column a larger value means more risk.

# Engagement points the other way, so its percentile rank is flipped
# by subtracting it from 1.
engagement_pct = df_risk["EngagementScore"].rank(pct=True)
df_risk["Engagement_rank"] = 1 - engagement_pct

# The remaining signals are percentile-ranked as-is.
signals_by_rank_column = {
    "Burnout_rank": "BurnoutRiskScore",
    "Absence_rank": "AbsenceDays_Last6M",
    "CareerStagnation_rank": "CareerStagnationFlag",
}
for rank_column, signal_column in signals_by_rank_column.items():
    df_risk[rank_column] = df_risk[signal_column].rank(pct=True)


In [5]:

# Composite score: weighted sum of the four percentile ranks.
risk_weights = {
    "Engagement_rank": 0.4,
    "Burnout_rank": 0.3,
    "Absence_rank": 0.2,
    "CareerStagnation_rank": 0.1,
}
# Terms are accumulated in dictionary order, i.e. the same left-to-right
# order as a written-out sum, so the floating-point result is unchanged.
df_risk["BehavioralRiskScore"] = sum(
    df_risk[rank_column] * weight
    for rank_column, weight in risk_weights.items()
)


In [6]:
# Summary statistics of the composite score, to check its spread before banding.
df_risk.loc[:, "BehavioralRiskScore"].describe()

count    15000.000000
mean         0.500007
std          0.202964
min          0.232710
25%          0.335323
50%          0.458070
75%          0.613320
max          0.996457
Name: BehavioralRiskScore, dtype: float64

In [7]:
# Split the score into three quantile-based groups (terciles),
# labelled from the lowest scores to the highest.
band_labels = ["Low Risk", "Medium Risk", "High Risk"]
df_risk["RiskBand"] = pd.qcut(df_risk["BehavioralRiskScore"], 3, labels=band_labels)


Top 10 High-Risk Employees

In [8]:
# Order employees from highest to lowest composite score and keep the first ten.
ranked_by_risk = df_risk.sort_values(by="BehavioralRiskScore", ascending=False)
top_10_risk = ranked_by_risk.head(10)

# Show the score next to three of the raw signals behind it.
top_10_columns = [
    "EmployeeID",
    "BehavioralRiskScore",
    "RiskBand",
    "EngagementScore",
    "BurnoutRiskScore",
    "AbsenceDays_Last6M",
]
top_10_risk[top_10_columns]


Unnamed: 0,EmployeeID,BehavioralRiskScore,RiskBand,EngagementScore,BurnoutRiskScore,AbsenceDays_Last6M
3734,PNR-13058,0.996457,High Risk,-3.1,1.8,17
10428,PNR-6340,0.992293,High Risk,-4.5,1.2,20
3347,PNR-12423,0.991193,High Risk,-3.0,1.2,15
8719,PNR-2878,0.990233,High Risk,-2.4,1.2,13
8673,PNR-3002,0.987267,High Risk,-1.8,1.2,11
5932,PNR-13446,0.987267,High Risk,-1.8,1.2,11
6814,PNR-15379,0.987267,High Risk,-1.8,1.2,11
14633,PNR-5705,0.981543,High Risk,-1.7,1.8,9
244,PNR-11371,0.96696,High Risk,-0.9,1.2,8
14807,PNR-7707,0.947403,High Risk,-5.4,1.8,22


In [9]:
# Custom quantile edges: bottom 50% -> Low, next 30% -> Medium, top 20% -> High.
bucket_edges = [0, 0.5, 0.8, 1.0]
bucket_labels = ["Low Risk", "Medium Risk", "High Risk"]
df_risk["RiskBucket"] = pd.qcut(
    x=df_risk["BehavioralRiskScore"], q=bucket_edges, labels=bucket_labels
)

In [10]:
# Fraction of employees that landed in each risk bucket.
df_risk.loc[:, "RiskBucket"].value_counts(normalize=True)

RiskBucket
Low Risk       0.520867
Medium Risk    0.279600
High Risk      0.199533
Name: proportion, dtype: float64

In [11]:

# Diagnostic association with attrition.
# Row-normalised cross-tab: each row gives the share of AttritionFlag values
# within one risk bucket, so every row sums to 1.
# NOTE(review): in the previously recorded output the attrition share was
# roughly 15% in every bucket, i.e. no visible association -- re-check after re-running.
risk_attrition = pd.crosstab(
    index=df_risk["RiskBucket"],
    columns=df_risk["AttritionFlag"],
    normalize="index",
)
# Leave the table as the cell's last expression so the notebook renders it
# as a rich table instead of printing plain text.
risk_attrition


AttritionFlag         0         1
RiskBucket                       
Low Risk       0.845002  0.154998
Medium Risk    0.846924  0.153076
High Risk      0.854661  0.145339


In [12]:
# Disabled code (not executed): mean BehavioralRiskScore per department,
# sorted from highest to lowest.
# dept_risk = (
#     df_risk
#     .groupby("Department")["BehavioralRiskScore"]
#     .mean()
#     .sort_values(ascending=False)
# )

# dept_risk

In [13]:

# Flag employees in the "High Risk" bucket, then average the flag per department:
# the mean of a boolean column is the share of flagged employees.
in_high_bucket = (df_risk["RiskBucket"] == "High Risk").rename("HighRisk")
dept_high_risk = (
    in_high_bucket
    .groupby(df_risk["Department"])
    .mean()
    .sort_values(ascending=False)
)

dept_high_risk

Department
IT Support                0.211004
Quality Control           0.207777
Finance                   0.203995
Research & Development    0.202759
Sales                     0.197141
Human Resources           0.195933
Production                0.195507
Marketing                 0.194712
Logistics                 0.186872
Name: HighRisk, dtype: float64

In [14]:

# Share of "High Risk" employees per job title, largest share first.
high_risk_flag = (df_risk["RiskBucket"] == "High Risk").rename("HighRisk")
share_by_job = high_risk_flag.groupby(df_risk["JobTitle"]).mean()
job_high_risk = share_by_job.sort_values(ascending=False)
# Only the ten job titles with the highest share are displayed.
job_high_risk.head(10)

 

JobTitle
Help Desk Specialist      0.235612
QC Manager                0.214617
Controller                0.214411
Marketing Manager         0.212008
Research Lead             0.211957
Regional Sales Manager    0.210721
Recruiter                 0.207957
IT Manager                0.206522
QC Inspector              0.200929
HR Coordinator            0.200730
Name: HighRisk, dtype: float64

In [15]:
# Mean of three raw behavioural signals within each risk bucket.
# RiskBucket is a categorical column (it comes from pd.qcut), so `observed`
# is passed explicitly: observed=False keeps the current behaviour (every
# category gets a row) and silences the pandas FutureWarning about the
# changing default.
df_risk.groupby("RiskBucket", observed=False)[
    ["EngagementScore", "BurnoutRiskScore", "AbsenceDays_Last6M"]
].mean()

  df_risk.groupby("RiskBucket")[


Unnamed: 0_level_0,EngagementScore,BurnoutRiskScore,AbsenceDays_Last6M
RiskBucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low Risk,2.420952,0.0,0.0
Medium Risk,1.633882,0.160515,0.472818
High Risk,0.268627,0.701838,5.771801


In [16]:
# Mean composite score per risk bucket.
# `observed=False` is spelled out because RiskBucket is categorical (from pd.qcut):
# it retains the current grouping behaviour and avoids the pandas FutureWarning
# about the `observed` default changing.
df_risk.groupby("RiskBucket", observed=False)["BehavioralRiskScore"].mean()

  df_risk.groupby("RiskBucket")["BehavioralRiskScore"].mean()


RiskBucket
Low Risk       0.342771
Medium Risk    0.548738
High Risk      0.842174
Name: BehavioralRiskScore, dtype: float64