# Employee Attrition EDA
Goal: understand drivers of attrition (`left`), check class balance, and surface high-signal features for modeling.

In [None]:
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Locate the project root that contains the raw dataset.
# - Run as a script: __file__ exists and the root is taken to be one level
#   above this file's directory.
# - Run in a notebook kernel: __file__ is undefined (NameError), so search the
#   working directory and then each of its ancestors for the dataset.
data_relpath = Path("data/raw/HR_comma_sep.csv")
try:
    search_roots = [Path(__file__).resolve().parents[1]]
except NameError:
    working_dir = Path.cwd()
    search_roots = [working_dir, *working_dir.parents]

project_root = next(
    (root for root in search_roots if (root / data_relpath).exists()),
    None,
)
if project_root is None:
    # Fail early with the list of places searched instead of a bare
    # read_csv error on a guessed path.
    searched = ", ".join(str(root) for root in search_roots)
    raise FileNotFoundError(
        f"Could not find {data_relpath} under any of: {searched}"
    )

raw_path = project_root / data_relpath
df = pd.read_csv(raw_path)

# Display/plot defaults used by the rest of the notebook.
pd.options.display.max_columns = 30
sns.set_theme(style="whitegrid")

print(f"Loaded {raw_path}")
print("Shape:", df.shape)
df.head()

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Fraction of missing values per column, largest first, as a one-column frame.
(
    df.isnull()
    .mean()
    .sort_values(ascending=False)
    .to_frame(name='missing_rate')
)

In [None]:
# Bar chart of the share (in %) of each `left` class, with the value printed
# just above every bar.
class_pct = df['left'].value_counts(normalize=True) * 100
ax = class_pct.plot.bar(color=['#4C78A8', '#F58518'])
ax.set_ylabel('% of employees')
ax.set_xlabel('left (1 = departed)')
ax.set_title('Attrition class balance')

for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x() + 0.1, bar_height + 0.5)
    ax.annotate(f"{bar_height:.1f}%", label_xy)

plt.show()

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# each original column becomes a row.
df.describe(include='all').transpose()

In [None]:
# One 20-bin histogram per numeric column, under a shared figure title.
numeric_cols = df.select_dtypes(include='number').columns
numeric_frame = df[numeric_cols]
numeric_frame.hist(figsize=(12, 8), bins=20)
# y > 1 places the title slightly above the top of the figure.
plt.suptitle('Numeric feature distributions', y=1.02)
plt.show()

In [None]:
# 2x2 grid comparing feature distributions between the two `left` classes.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Top row: per-class density curves. common_norm=False normalizes each class
# independently, so the curves are comparable despite unequal class sizes.
sns.kdeplot(data=df, x='satisfaction_level', hue='left', common_norm=False, ax=axes[0, 0])
axes[0, 0].set_title('Satisfaction vs attrition')
# NOTE: 'average_montly_hours' spelling is kept exactly as the column is referenced.
sns.kdeplot(data=df, x='average_montly_hours', hue='left', common_norm=False, ax=axes[0, 1])
axes[0, 1].set_title('Monthly hours vs attrition')

# Bottom row: box plots per class.
# - order=[0, 1] pins the category order so the 'Stayed'/'Left' labels cannot
#   end up on the wrong box (assumes `left` is coded 0 = stayed, 1 = departed).
# - Tick positions are fixed with set_xticks before relabelling; calling
#   set_xticklabels on its own is discouraged by matplotlib and emits a
#   warning when the axis does not use a fixed locator.
box_specs = (
    (axes[1, 0], 'time_spend_company', 'Tenure vs attrition'),
    (axes[1, 1], 'number_project', 'Projects vs attrition'),
)
for box_ax, column, title in box_specs:
    sns.boxplot(data=df, x='left', y=column, order=[0, 1], ax=box_ax)
    box_ax.set_title(title)
    box_ax.set_xticks([0, 1])
    box_ax.set_xticklabels(['Stayed', 'Left'])

plt.tight_layout()
plt.show()

In [None]:
# Mean of `left` per department (the attrition rate when `left` is a 0/1 flag),
# sorted from highest to lowest.
dept_rate = df.groupby('Department')['left'].mean().sort_values(ascending=False)

plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue` (removal planned for
# 0.14). Assigning the categorical variable to `hue` with legend=False is the
# documented replacement and keeps one colour per bar. Requires seaborn >= 0.13.
sns.barplot(
    x=dept_rate.values,
    y=dept_rate.index,
    hue=dept_rate.index,
    palette='viridis',
    legend=False,
)
plt.xlabel('Attrition rate')
plt.title('Attrition rate by department')
plt.show()

In [None]:
# Mean of `left` per salary band, arranged in the fixed low -> medium -> high
# order (reindex yields NaN for a band that is absent from the data).
sal_rate = df.groupby('salary')['left'].mean().reindex(['low', 'medium', 'high'])

plt.figure(figsize=(6, 4))
# seaborn 0.13 deprecates passing `palette` without `hue` (removal planned for
# 0.14). Assigning the categorical variable to `hue` with legend=False is the
# documented replacement and keeps one colour per bar. Requires seaborn >= 0.13.
sns.barplot(
    x=sal_rate.index,
    y=sal_rate.values,
    hue=sal_rate.index,
    palette='magma',
    legend=False,
)
plt.ylabel('Attrition rate')
plt.title('Attrition rate by salary band')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap with
# the colour scale centred on zero.
numeric_only = df.select_dtypes(include='number')
corr = numeric_only.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=False, center=0, cmap='coolwarm')
plt.title('Correlation heatmap (numeric)')
plt.show()

# Correlation of every numeric column with `left`, strongest positive first.
corr['left'].sort_values(ascending=False)

In [None]:
# Mean of `left` for each value of two flag columns, side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

flag_specs = (
    (axes[0], 'promotion_last_5years', 'Blues'),
    (axes[1], 'Work_accident', 'Greens'),
)
for flag_ax, flag_col, flag_palette in flag_specs:
    # seaborn 0.13 deprecates passing `palette` without `hue` (removal planned
    # for 0.14). Assigning the x variable to `hue` with legend=False is the
    # documented replacement. Requires seaborn >= 0.13.
    sns.barplot(
        data=df,
        x=flag_col,
        y='left',
        hue=flag_col,
        palette=flag_palette,
        legend=False,
        ax=flag_ax,
    )
    flag_ax.set_title(f'Attrition by {flag_col}')

plt.tight_layout()
plt.show()

In [None]:
# Recompute the per-department rate here so this cell runs on its own, then
# print the summary bullets (two computed from the data, three fixed text).
dept_rate = (
    df.groupby('Department')['left']
    .mean()
    .sort_values(ascending=False)
)
overall_pct = df['left'].mean() * 100
top_departments = ', '.join(dept_rate.index[:3])

takeaways = [
    f"Attrition rate: {overall_pct:.1f}%",
    "Low satisfaction and high monthly hours show higher attrition density.",
    f"Top attrition departments: {top_departments}.",
    "Lower salary bands exhibit higher attrition than medium/high.",
    "No recent promotion correlates with higher attrition; workload and tenure matter together.",
]
for bullet in takeaways:
    print(f"- {bullet}")