# 🕵️‍♂️ Feb 12: Exploratory Data Analysis (EDA) Project

**Goal**: Analyze employee data to understand satisfaction, salary distribution, and turnover factors.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_theme(style="whitegrid", palette="muted")
%matplotlib inline

# Random seed for reproducibility
np.random.seed(42)
print("‚úÖ Environment ready!")

## 1. Data Generation
Let's generate a synthetic employee dataset (500 records) with realistic-looking attributes.

In [None]:
# Re-seed the global NumPy RNG inside this cell so that re-running the cell on
# its own (without re-running the setup cell) regenerates exactly the same
# dataset. The value matches the seed used in the setup cell, so a
# top-to-bottom run should produce the same data as before.
np.random.seed(42)

n_employees = 500

# One independently drawn array per column. randint's upper bound is exclusive,
# so Age is 22-59, YearsAtCompany 0-14, the two scores 1-5 and NumberProjects 1-9.
data = {
    'EmployeeID': [f'EMP{i:03d}' for i in range(1, n_employees + 1)],
    'Age': np.random.randint(22, 60, n_employees),
    'Gender': np.random.choice(['Male', 'Female', 'Non-Binary'], n_employees, p=[0.48, 0.48, 0.04]),
    'Department': np.random.choice(['Sales', 'Engineering', 'Marketing', 'HR', 'Finance'], n_employees),
    'YearsAtCompany': np.random.randint(0, 15, n_employees),
    'SatisfactionScore': np.random.randint(1, 6, n_employees),
    'WorkLifeBalance': np.random.randint(1, 6, n_employees),
    'NumberProjects': np.random.randint(1, 10, n_employees)
}

df = pd.DataFrame(data)

# Generate salary based on Age, Dept, and YearsAtCompany with some noise
# (Gaussian noise with a standard deviation of 5000).
base_salary = 30000
df['MonthlySalary'] = base_salary + (df['Age'] * 500) + (df['YearsAtCompany'] * 1200) + np.random.normal(0, 5000, n_employees)
# Engineering gets a 20% uplift, applied on top of the already-noisy figure.
df.loc[df['Department'] == 'Engineering', 'MonthlySalary'] *= 1.2

# Generate turnover (LeftCompany) based on satisfaction and salary:
# higher satisfaction and higher salary both lower the probability of leaving.
turnover_prob = (0.6 - (df['SatisfactionScore'] * 0.1)) + (0.3 - (df['MonthlySalary'] / 150000))
# Keep every probability strictly inside (0, 1) so no outcome is certain.
turnover_prob = np.clip(turnover_prob, 0.05, 0.95)
# One Bernoulli draw per employee, then relabel 1/0 as 'Yes'/'No'.
df['LeftCompany'] = np.random.binomial(1, turnover_prob)
df['LeftCompany'] = df['LeftCompany'].map({1: 'Yes', 0: 'No'})

# The status emoji was mis-encoded in the original string; restored here.
print("✅ Dataset generated!")
df.head()

## 2. Univariate Analysis
Analyzing columns individually.

In [None]:
# Three side-by-side panels: age histogram, salary histogram, and a count of
# each satisfaction score.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.histplot(df['Age'], kde=True, ax=axes[0], color='skyblue')
axes[0].set_title('Age Distribution')

sns.histplot(df['MonthlySalary'], kde=True, ax=axes[1], color='salmon')
axes[1].set_title('Salary Distribution')

# seaborn 0.13 deprecates passing `palette` without `hue` (FutureWarning,
# slated for removal in 0.14). Assign the x variable to `hue` instead so each
# bar still gets its own viridis colour. `dodge=False` keeps the bars
# full-width and centred on seaborn versions that would otherwise dodge by hue.
sns.countplot(
    x='SatisfactionScore',
    hue='SatisfactionScore',
    data=df,
    ax=axes[2],
    palette='viridis',
    dodge=False,
)
# The hue legend only repeats the x-axis labels; drop it if one was drawn
# (whether a legend appears depends on the seaborn version).
score_legend = axes[2].get_legend()
if score_legend is not None:
    score_legend.remove()
axes[2].set_title('Satisfaction Scores Count')

plt.tight_layout()
plt.show()

## 3. Bivariate Analysis
Analyzing relationships between pairs of variables: salary by department and turnover by department.

In [None]:
# Salary distribution per department, one box per department.
fig, ax = plt.subplots(figsize=(12, 6))
# seaborn 0.13 deprecates passing `palette` without `hue` (FutureWarning,
# slated for removal in 0.14). Assign the x variable to `hue` so each
# department keeps its own Set2 colour. `dodge=False` keeps each box centred
# on its category on seaborn versions that would otherwise dodge by hue.
sns.boxplot(
    x='Department',
    y='MonthlySalary',
    hue='Department',
    data=df,
    palette='Set2',
    dodge=False,
    ax=ax,
)
# The hue legend only repeats the x-axis labels; drop it if one was drawn
# (whether a legend appears depends on the seaborn version).
department_legend = ax.get_legend()
if department_legend is not None:
    department_legend.remove()
ax.set_title('Salary by Department')
plt.show()

In [None]:
# Headcount per department, split into employees who left and who stayed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df,
    x='Department',
    hue='LeftCompany',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Turnover by Department')
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns; numeric_only=True leaves
# the string columns (IDs, gender, department, LeftCompany) out of the matrix.
numeric_corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
# Annotate every cell with its coefficient, formatted to two decimals.
sns.heatmap(numeric_corr, annot=True, fmt='.2f', cmap='RdBu', ax=ax)
ax.set_title('Numeric Correlation Matrix')
plt.show()

## 5. Summary Findings
Based on the plots above, write your conclusions here.