# HR Dataset Generation

This notebook generates four synthetic datasets for HR and L&D analysis.

**Datasets generated:**
- `employee_master`: Employee demographics and job data
- `ld_spend`: Learning & Development spend per employee (randomly generated; roughly 30% of employees have no spend)
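- `performance_scores`: Performance ratings across 5 quarters (one baseline quarter plus four follow-up quarters), in long format with one row per employee per quarter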
- `satisfaction_engagement`: Job satisfaction and engagement scores for four quarters (Q1–Q4), one row per employee per quarter

In [None]:
import pandas as pd
import numpy as np

# Single seeded generator shared by all cells below; every random draw in the
# notebook goes through it, so the draw order determines the generated data.
rng = np.random.default_rng(seed=42)

# Reference date used when computing employee tenure.
TODAY = pd.Timestamp("2025-06-30")

# Number of synthetic employees to generate.
N_EMP = 500

# Value pools that the categorical employee attributes are sampled from.
DEPTS = ["Sales", "Engineering", "HR", "Marketing", "Finance"]
LEVELS = ["Junior", "Mid", "Senior", "Lead"]
GENDERS = ["Female", "Male", "Non-binary"]
ETHNICITIES = ["White", "Asian", "Black", "Latinx", "Indigenous", "Other"]
STATUSES = ["Active", "Terminated"]

## 1. `employee_master`

In [None]:
# Build the employee master table: one row per synthetic employee.
# The dict entries are evaluated top to bottom, and each random column
# advances the shared generator, so the column order must not change if the
# seeded output is to stay the same.
level_weights = [0.3, 0.4, 0.2, 0.1]  # one weight per entry of LEVELS, in order
hire_window_start = pd.to_datetime("2020-01-01")

emp_columns = {
    # Sequential ids 1..N_EMP.
    "emp_id": np.arange(1, N_EMP + 1),
    "dept": rng.choice(DEPTS, N_EMP),
    "level": rng.choice(LEVELS, N_EMP, p=level_weights),
    "gender": rng.choice(GENDERS, N_EMP),
    "ethnicity": rng.choice(ETHNICITIES, N_EMP),
    # 1.0 is drawn with probability 0.75, 0.8 with probability 0.25.
    "fte": rng.choice([1.0, 0.8], size=N_EMP, p=[0.75, 0.25]),
    # Hire date = window start plus a random offset of 0 .. 4*365 - 1 days.
    "hire_date": hire_window_start
    + pd.to_timedelta(rng.integers(0, 365 * 4, N_EMP), unit="D"),
    "status": rng.choice(STATUSES, N_EMP, p=[0.85, 0.15]),
    # Upper bound is exclusive, so ages fall in 21..59.
    "age_on_hire": rng.integers(21, 60, size=N_EMP),
}
emp = pd.DataFrame(emp_columns)

# Tenure in years up to TODAY, using a 365-day year. It is computed the same
# way for every row, whatever the employee's status.
days_since_hire = (TODAY - emp["hire_date"]).dt.days
emp["tenure_yrs"] = (days_since_hire / 365).round(2)

## 2. `ld_spend`

In [None]:
# Upper bound of the per-employee L&D spend that can be generated.
LD_BUDGET_CAP = 1000
# Spend amounts are rounded to multiples of this step.
LD_SPEND_STEP = 10
# Probability that an employee has any L&D spend at all.
prob_spend = 0.70

# Boolean mask: True where the employee is a spender.
did_spend = rng.random(N_EMP) < prob_spend

# Draw an amount for every employee (non-spenders are zeroed out below) and
# round it to the nearest step.
rounded_amounts = (
    np.round(rng.uniform(1, LD_BUDGET_CAP, size=N_EMP) / LD_SPEND_STEP)
    * LD_SPEND_STEP
)
# Rounding sends draws of 5 or less to 0, which would make a spender
# indistinguishable from a non-spender. Floor spender amounts at one step so
# `did_spend` always implies a positive spend.
spender_amounts = np.maximum(rounded_amounts, LD_SPEND_STEP)

ld_spend = np.where(did_spend, spender_amounts, 0).astype(int)

# One row per employee, keyed by emp_id.
ld_spend_df = pd.DataFrame({
    "emp_id": emp["emp_id"],
    "ld_spend": ld_spend,
})

## 3. `performance_scores`

In [None]:
def perf_growth(row):
    """Return one simulated follow-up performance score for an employee row.

    The score is the row's baseline ``perf_q0`` plus a random uplift scaled by
    the employee's L&D spend (spend / 1000 times a Normal(2.0, 0.8) draw) plus
    Normal(0, 1.8) noise, clipped to the range [30, 100].

    Args:
        row: Mapping-like row (e.g. from ``DataFrame.apply(axis=1)``) that
            provides ``"emp_id"`` and ``"perf_q0"``.

    Returns:
        The clipped score as a float.

    Raises:
        IndexError: If ``row["emp_id"]`` has no matching row in
            ``ld_spend_df``.
    """
    # NOTE(review): this scans all of ld_spend_df on every call; fine at the
    # current table size, but a merge beforehand would scale better.
    is_this_employee = ld_spend_df["emp_id"] == row["emp_id"]
    spend = ld_spend_df.loc[is_this_employee, "ld_spend"].values[0]

    # Two draws per call, uplift first and then noise; the order matters for
    # reproducibility with the shared seeded generator.
    uplift = rng.normal(2.0, 0.8) * (spend / 1000)
    jitter = rng.normal(0, 1.8)

    # NOTE(review): the result is always relative to perf_q0, never to a
    # previously generated quarter — confirm that is the intended model.
    score = row["perf_q0"] + uplift + jitter
    return np.clip(score, 30, 100)

# Wide-format column names for the baseline quarter and four follow-ups.
PERF_COLS = [f"perf_q{i}" for i in range(5)]

# Baseline (Q0) rating: Normal(70, 10), clipped to [30, 95], one decimal.
emp["perf_q0"] = rng.normal(70, 10, N_EMP).clip(30, 95).round(1)

# Follow-up quarters: perf_growth is applied row by row and derives each score
# from the perf_q0 baseline. Round to one decimal so the follow-up quarters
# have the same precision as the baseline.
for perf_col in PERF_COLS[1:]:
    emp[perf_col] = emp.apply(perf_growth, axis=1).round(1)

# Reshape to long format: one row per (employee, quarter).
performance_scores = emp.melt(
    id_vars="emp_id",
    value_vars=PERF_COLS,
    var_name="quarter",
    value_name="performance",
)
# Turn "perf_q0" into "Q0" (not "PERF_Q0") so the quarter labels use the same
# Q<n> format as satisfaction_engagement and the two tables can be joined on
# emp_id and quarter.
performance_scores["quarter"] = (
    performance_scores["quarter"]
    .str.replace("perf_", "", regex=False)
    .str.upper()
)

## 4. `satisfaction_engagement`

In [None]:
# Long-format survey table: one row per (employee, quarter) for Q1-Q4.
quarters = ["Q1", "Q2", "Q3", "Q4"]
n_employees = len(emp)
n_rows = n_employees * len(quarters)

satisfaction_engagement = pd.DataFrame({
    # Repeat the underlying array, not the Series: np.repeat on a Series also
    # repeats its index, which would give this frame a non-unique index
    # (0, 0, 0, 0, 1, 1, ...) instead of a clean 0..n_rows-1 range.
    "emp_id": np.repeat(emp["emp_id"].to_numpy(), len(quarters)),
    # Each employee's four consecutive rows are labelled Q1..Q4.
    "quarter": quarters * n_employees,
    # Satisfaction: Normal(3.5, 0.8), clipped to [1, 5], two decimals.
    "satisfaction_score": rng.normal(3.5, 0.8, n_rows).clip(1, 5).round(2),
    # Engagement: Normal(70, 15), clipped to [20, 100], one decimal.
    "engagement_score": rng.normal(70, 15, n_rows).clip(20, 100).round(1),
})