# Exploratory Data Analysis (EDA)

Target Table: `destination_table`

In [None]:

import os

import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine


## Load Data from MySQL

In [None]:

# Build the SQLAlchemy engine for the MySQL source.
# The EDA_DATABASE_URL environment variable, when set, overrides the built-in
# URL so credentials do not have to be edited into the notebook.
# NOTE(review): the fallback below still embeds the root password in source;
# move it into the environment (or a secrets store) and drop the literal.
engine = create_engine(
    os.environ.get(
        "EDA_DATABASE_URL",
        "mysql+mysqlconnector://root:mp22pass@localhost/scu2_homeworks",
    )
)

# Columns are listed explicitly so the resulting frame has a fixed set of
# columns regardless of what else the table contains.
query = '''
SELECT id, name, age, country, salary, continent
FROM destination_table
'''

# Pull the whole result set into memory as a DataFrame.
df = pd.read_sql(query, engine)
# Bare expression: the notebook renders the first rows as a sanity check.
df.head()


## Dataset Overview

In [None]:

# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()
# Summary statistics; as the last expression in the cell, this is the value
# the notebook renders.
df.describe()


## Salary Distribution

In [None]:

# Distribution of the salary column as a 20-bin histogram.
plt.hist(x=df['salary'], bins=20)
# Label the current axes in one call instead of three pyplot helpers.
plt.gca().set(
    title="Salary Distribution",
    xlabel="Salary",
    ylabel="Frequency",
)
plt.show()


## Salary by Continent

In [None]:

# Box plot of salary with one box per continent.
df.boxplot(by='continent', column='salary')
# pandas puts its own "Boxplot grouped by ..." title on the figure; blank it
# so only the axes title set below is displayed.
plt.gcf().suptitle("")
plt.gca().set(
    title="Salary Distribution by Continent",
    xlabel="Continent",
    ylabel="Salary",
)
plt.show()


## Average Salary per Country

In [None]:

# Mean salary for each country, ordered from highest to lowest.
avg_salary_country = (
    df.groupby('country')['salary']
    .mean()
    .sort_values(ascending=False)
)

# One bar per country, in the sorted order computed above.
avg_salary_country.plot.bar()
plt.gca().set(
    title="Average Salary per Country",
    xlabel="Country",
    ylabel="Average Salary",
)
plt.show()


## Employee Count per Continent

In [None]:

# Number of rows for each continent value, drawn as a bar chart
# (value_counts() orders the bars from most to least frequent).
df['continent'].value_counts().plot.bar()
plt.gca().set(
    title="Employee Distribution by Continent",
    xlabel="Continent",
    ylabel="Employee Count",
)
plt.show()


## Age vs Salary

In [None]:

# Scatter of salary (y) against age (x), one point per row of df.
plt.scatter(x=df['age'], y=df['salary'])
plt.gca().set(
    title="Age vs Salary",
    xlabel="Age",
    ylabel="Salary",
)
plt.show()


## Salary Bands

In [None]:

# Bucket each salary into a labelled band and chart how many rows fall in each.
# Bands are right-closed: (0, 40000], (40000, 60000], ... with two adjustments
# so that no non-negative salary is silently left unbinned (NaN):
#   * include_lowest=True keeps a salary of exactly 0 in "Low";
#   * the top edge is open-ended, so salaries above 200000 land in
#     "Very High" instead of dropping out of the chart.
df['salary_band'] = pd.cut(
    df['salary'],
    bins=[0, 40000, 60000, 80000, 100000, float("inf")],
    labels=["Low", "Mid", "Upper-Mid", "High", "Very High"],
    include_lowest=True,
)

# sort_index() orders the bars by band (the categorical order above) rather
# than by descending count, which is value_counts()' default.
df['salary_band'].value_counts().sort_index().plot(kind='bar')
plt.xlabel("Salary Band")
plt.ylabel("Count")
plt.title("Salary Band Distribution")
plt.show()


## Missing Values

In [None]:

# Count of null entries per column of df.
# NOTE(review): if the Salary Bands cell has already run, df also carries the
# derived salary_band column, whose nulls are salaries pd.cut could not place
# in any bin (including missing salaries) — confirm it is meant to appear here.
df.isnull().sum()


## Correlation Analysis

In [None]:

# Pairwise correlation between age and salary. DataFrame.corr() uses the
# Pearson coefficient by default and excludes null values.
corr = df[['age', 'salary']].corr()
# Bare expression so the notebook renders the resulting 2x2 matrix.
corr


In [None]:

# Heatmap of the correlation matrix held in `corr`.
# The colour scale is pinned to the full [-1, 1] range of a correlation
# coefficient. Left to auto-scale, imshow stretches the colormap between the
# smallest and largest values present, so the off-diagonal cell would always
# get the darkest colour no matter how strong the correlation actually is.
# A diverging colormap puts zero correlation at the neutral midpoint.
plt.imshow(corr, vmin=-1, vmax=1, cmap="coolwarm")
plt.colorbar()
# Label columns along x and rows along y (identical names for a corr matrix).
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.index)), corr.index)
plt.title("Correlation Matrix")
plt.show()
