# Employees EDA: MySQL → Pandas → Visualizations

## Prerequisites
Install packages if needed:
```bash
pip install pandas mysql-connector-python matplotlib
```


In [None]:
# ---- Configuration ----
# Connection settings are taken from environment variables when they are set,
# so real credentials do not have to be written into the notebook. When a
# variable is absent, the placeholder default below is used instead.
import os

HOST = os.environ.get("MYSQL_HOST", "127.0.0.1")
USER = os.environ.get("MYSQL_USER", "root")
PASSWORD = os.environ.get("MYSQL_PASSWORD", "yourpassword")
DATABASE = os.environ.get("MYSQL_DATABASE", "your_database")
TABLE_NAME = os.environ.get("MYSQL_TABLE", "employees")  # change if needed


In [None]:
# ---- Imports & Connection ----
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt

%matplotlib inline

# Collect the connection settings from the configuration cell, then open the
# MySQL session in a single call. connect() raises if the server cannot be
# reached, so the confirmation below is only printed on success.
connection_settings = {
    "host": HOST,
    "user": USER,
    "password": PASSWORD,
    "database": DATABASE,
}
conn = mysql.connector.connect(**connection_settings)
print("✅ Connected to MySQL!")

In [None]:
# ---- Load employees table ----
# Read every row of the configured table into a DataFrame over the open
# connection and report how many rows came back.
query = f"SELECT * FROM {TABLE_NAME};"
df = pd.read_sql(sql=query, con=conn)
print(f"Loaded {len(df)} rows from '{TABLE_NAME}'.")

# Last expression of the cell: previews the first rows as the cell output.
df.head()

In [None]:
# ---- Info & Missing Values ----
print("DataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing values per column:")
print(df.isna().sum())

# Numeric summary (last expression of the cell, shown as the cell output)
df.describe(include='number')

In [None]:
# ---- Categorical overview ----
categorical_cols = ["degree", "gender", "country", "department"]

# Count every distinct value (missing values included) for each listed column
# that is actually present in the loaded table; absent columns are skipped.
summary = {}
for col in categorical_cols:
    if col not in df.columns:
        continue
    summary[col] = df[col].astype('category').value_counts(dropna=False)

# Print the counts column by column.
for col, counts in summary.items():
    print(f"\nUnique values for {col}:")
    print(counts)

In [None]:
# ---- Chart: Age histogram ----
# Draw on an explicit Axes rather than relying on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 5))
ages = df['age'].dropna()  # missing ages are left out of the histogram
ages.plot.hist(bins=20, ax=ax)
ax.set_title("Age Distribution of Employees")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# ---- Chart: Salary histogram ----
# Draw on an explicit Axes rather than relying on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 5))
salaries = df['salary'].dropna()  # missing salaries are left out
salaries.plot.hist(bins=20, ax=ax)
ax.set_title("Salary Distribution of Employees")
ax.set_xlabel("Salary")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# ---- Chart: Gender bar ----
# Tally employees per gender; missing values get their own bar.
gender_counts = df['gender'].value_counts(dropna=False)

fig, ax = plt.subplots(figsize=(6, 4))
gender_counts.plot.bar(ax=ax)
ax.set_title("Gender Distribution")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

In [None]:
# ---- Group By: Avg Salary by Degree ----
# Mean salary for each degree (rows with a missing degree form their own
# group), ordered from highest to lowest.
salary_by_degree = df.groupby('degree', dropna=False)['salary']
avg_salary_degree = salary_by_degree.mean().sort_values(ascending=False)
display(avg_salary_degree)

fig, ax = plt.subplots(figsize=(9, 5))
avg_salary_degree.plot.bar(ax=ax)
ax.set_title("Average Salary by Degree")
ax.set_xlabel("Degree")
ax.set_ylabel("Average Salary")
fig.tight_layout()
plt.show()

In [None]:
# ---- Group By: Avg Salary by Country ----
# Mean salary for each country (rows with a missing country form their own
# group), ordered from highest to lowest.
salary_by_country = df.groupby('country', dropna=False)['salary']
avg_salary_country = salary_by_country.mean().sort_values(ascending=False)
display(avg_salary_country)

fig, ax = plt.subplots(figsize=(10, 5))
avg_salary_country.plot.bar(ax=ax)
ax.set_title("Average Salary by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Average Salary")
fig.tight_layout()
plt.show()

In [None]:
# ---- Group By: Employees per Department ----
# Headcount per department (missing departments counted too), largest first.
department_tally = df['department'].value_counts(dropna=False)
dept_counts = department_tally.sort_values(ascending=False)
display(dept_counts)

fig, ax = plt.subplots(figsize=(10, 5))
dept_counts.plot.bar(ax=ax)
ax.set_title("Employees per Department")
ax.set_xlabel("Department")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# ---- Boxplot: Age per Department ----
# Collect the non-missing ages of every department. Departments without a
# single recorded age are skipped, so each entry in `groups` has data and
# `labels` stays aligned with it.
groups = []
labels = []
for dept, g in df.groupby('department'):
    ages = g['age'].dropna()
    if not ages.empty:
        groups.append(ages.values)
        labels.append(str(dept))

plt.figure(figsize=(12,6))
plt.boxplot(groups, showmeans=True)
plt.title("Age Distribution per Department")
plt.xlabel("Department")
plt.ylabel("Age")
# Name the boxes through xticks instead of boxplot(labels=...): that keyword
# was renamed to tick_labels in Matplotlib 3.9 and the old spelling is
# deprecated. Boxes are placed at positions 1..N by default.
plt.xticks(list(range(1, len(labels) + 1)), labels, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# ---- Optional: Save charts to PNG files ----
# Re-draws every chart from the cells above and writes each one to `out_dir`.
# Relies on `df`, `avg_salary_degree`, `avg_salary_country`, `dept_counts`,
# `groups` and `labels` created by the earlier cells.
import os

out_dir = "eda_charts"
os.makedirs(out_dir, exist_ok=True)


def _save_chart(draw, filename, title, xlabel, ylabel, figsize):
    """Draw one chart on a fresh figure and save it into ``out_dir`` as a PNG.

    Parameters:
        draw: zero-argument callable that plots onto the current figure.
        filename: name of the PNG file created inside ``out_dir``.
        title, xlabel, ylabel: text applied to the chart after drawing.
        figsize: ``(width, height)`` of the figure in inches.

    The figure is closed in all cases, including when drawing or saving
    raises, so a failing chart does not leave an open figure behind.
    """
    fig = plt.figure(figsize=figsize)
    try:
        draw()
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        # Applied to every chart so that no saved image has clipped labels.
        plt.tight_layout()
        fig.savefig(os.path.join(out_dir, filename))
    finally:
        plt.close(fig)


def _draw_age_boxplot():
    """Draw the per-department age boxplot with rotated department labels."""
    plt.boxplot(groups, showmeans=True)
    # boxplot(labels=...) is deprecated since Matplotlib 3.9 (renamed to
    # tick_labels); setting the ticks explicitly works on old and new versions.
    # Boxes are placed at positions 1..N by default.
    plt.xticks(list(range(1, len(labels) + 1)), labels, rotation=45)


# Age histogram
_save_chart(
    lambda: df['age'].dropna().plot(kind='hist', bins=20),
    "age_hist.png",
    "Age Distribution of Employees",
    "Age",
    "Frequency",
    (8, 5),
)

# Salary histogram
_save_chart(
    lambda: df['salary'].dropna().plot(kind='hist', bins=20),
    "salary_hist.png",
    "Salary Distribution of Employees",
    "Salary",
    "Frequency",
    (8, 5),
)

# Gender distribution
gender_counts = df['gender'].value_counts(dropna=False)
_save_chart(
    lambda: gender_counts.plot(kind='bar'),
    "gender_bar.png",
    "Gender Distribution",
    "Gender",
    "Count",
    (6, 4),
)

# Average salary by degree
_save_chart(
    lambda: avg_salary_degree.plot(kind='bar'),
    "avg_salary_by_degree.png",
    "Average Salary by Degree",
    "Degree",
    "Average Salary",
    (9, 5),
)

# Average salary by country
_save_chart(
    lambda: avg_salary_country.plot(kind='bar'),
    "avg_salary_by_country.png",
    "Average Salary by Country",
    "Country",
    "Average Salary",
    (10, 5),
)

# Employees per department
_save_chart(
    lambda: dept_counts.plot(kind='bar'),
    "employees_per_department.png",
    "Employees per Department",
    "Department",
    "Count",
    (10, 5),
)

# Age boxplot per department
_save_chart(
    _draw_age_boxplot,
    "age_boxplot_by_department.png",
    "Age Distribution per Department",
    "Department",
    "Age",
    (12, 6),
)

print(f"Saved charts to: {os.path.abspath(out_dir)}")

In [None]:
# ---- Cleanup ----
# Best-effort shutdown of the MySQL session: a failure while closing is
# reported instead of raised, so running this cell never aborts the notebook.
try:
    conn.close()
except Exception as err:
    print("Connection close error:", err)
else:
    print("🔌 MySQL connection closed.")