# Kids Dataset Visualization (MySQL-Backed, Extended, Fixed KDE)
This notebook connects to MySQL, loads the `kids` table, and provides multiple visualizations.

**Note:** KDE plots are computed directly with `scipy.stats.gaussian_kde` instead of pandas' built-in `.plot.kde()`, which itself requires SciPy and fails when it is not installed.

## Install Dependencies

In [None]:
!pip install mysql-connector-python scipy

## Connect to MySQL and Load Data

In [None]:
import os
from getpass import getpass

import mysql.connector
import pandas as pd

# Connection settings. Each value can be overridden through an environment
# variable so that credentials never have to be saved inside the notebook.
HOST = os.environ.get("MYSQL_HOST", "localhost")
USER = os.environ.get("MYSQL_USER", "root")
DATABASE = os.environ.get("MYSQL_DATABASE", "testdb")
# Prompt interactively when MYSQL_PASSWORD is unset instead of hardcoding it.
PASSWORD = os.environ.get("MYSQL_PASSWORD") or getpass(f"MySQL password for {USER}@{HOST}: ")

KIDS_QUERY = "SELECT id, race, country, age, height, weight, sex FROM kids;"

conn = mysql.connector.connect(
    host=HOST,
    user=USER,
    password=PASSWORD,
    database=DATABASE,
)
try:
    df = pd.read_sql(KIDS_QUERY, conn)
finally:
    # Always release the connection, even when the query fails.
    conn.close()

# Ensure numeric types for continuous variables; unparseable values become NaN.
for col in ["age", "height", "weight"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df.head()

## Import Plotting Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

## Age Distribution Histogram

In [None]:
# Histogram of all non-missing ages, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df["age"].dropna())
ax.set(xlabel="Age", ylabel="Frequency", title="Age Distribution")
plt.show()

## Height vs Weight Scatter Plot

In [None]:
# One point per row of the frame: height on the x-axis, weight on the y-axis.
fig, ax = plt.subplots()
ax.scatter(df["height"], df["weight"])
ax.set(xlabel="Height", ylabel="Weight", title="Height vs Weight")
plt.show()

## Average Height by Race (Bar Plot)

In [None]:
# Mean height per race (pandas skips NaN heights when averaging).
avg_height = df.groupby("race")["height"].mean()

# One bar per race, labelled with the group key.
fig, ax = plt.subplots()
ax.bar(avg_height.index, avg_height.values)
ax.set(xlabel="Race", ylabel="Average Height", title="Average Height by Race")
# Tilt the category labels so long race names do not overlap.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

## Height Boxplot by Race

In [None]:
# Create the sized figure ourselves and hand its Axes to pandas. Without `ax=`,
# DataFrame.boxplot(by=...) opens a figure of its own, which leaves the figure
# created here empty and silently ignores the requested figsize.
fig, ax = plt.subplots(figsize=(8, 5))
df.boxplot(column="height", by="race", rot=45, ax=ax)

ax.set_title("Height Boxplot by Race")
fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel("Race")
ax.set_ylabel("Height")
plt.show()

## Weight Boxplot by Sex

In [None]:
# Create the sized figure ourselves and hand its Axes to pandas. Without `ax=`,
# DataFrame.boxplot(by=...) opens a figure of its own, which leaves the figure
# created here empty and silently ignores the requested figsize.
fig, ax = plt.subplots(figsize=(6, 5))
df.boxplot(column="weight", by="sex", ax=ax)

ax.set_title("Weight Boxplot by Sex")
fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel("Sex")
ax.set_ylabel("Weight")
plt.show()

## KDE Height Distribution (Using gaussian_kde)

In [None]:
# Non-missing heights as a plain NumPy array for scipy.
height_vals = df["height"].dropna().to_numpy()

# gaussian_kde needs at least two *distinct* values: a sample whose values are
# all identical has zero variance, and the resulting singular covariance makes
# gaussian_kde raise. Checking the number of unique values also covers the
# empty and single-element cases.
if np.unique(height_vals).size > 1:
    kde = gaussian_kde(height_vals)
    # Evaluate the density on an even grid spanning the observed range.
    xs = np.linspace(height_vals.min(), height_vals.max(), 200)

    fig, ax = plt.subplots()
    ax.plot(xs, kde(xs))
    ax.set_xlabel("Height")
    ax.set_ylabel("Density")
    ax.set_title("KDE Density — Height")
    plt.show()
else:
    print("Not enough distinct data points to compute KDE for height.")

## KDE Weight Distribution (Using gaussian_kde)

In [None]:
# Non-missing weights as a plain NumPy array for scipy.
weight_vals = df["weight"].dropna().to_numpy()

# gaussian_kde needs at least two *distinct* values: a sample whose values are
# all identical has zero variance, and the resulting singular covariance makes
# gaussian_kde raise. Checking the number of unique values also covers the
# empty and single-element cases.
if np.unique(weight_vals).size > 1:
    kde = gaussian_kde(weight_vals)
    # Evaluate the density on an even grid spanning the observed range.
    xs = np.linspace(weight_vals.min(), weight_vals.max(), 200)

    fig, ax = plt.subplots()
    ax.plot(xs, kde(xs))
    ax.set_xlabel("Weight")
    ax.set_ylabel("Density")
    ax.set_title("KDE Density — Weight")
    plt.show()
else:
    print("Not enough distinct data points to compute KDE for weight.")

## Height KDE by Race (Violin-Style Approximation)

In [None]:
# Overlay one height density curve per race on a shared Axes.
fig, ax = plt.subplots(figsize=(8, 5))

# dropna(): a missing race never compares equal to anything, so iterating over
# it would only ever produce an empty subset.
for race in df["race"].dropna().unique():
    subset = df.loc[df["race"] == race, "height"].dropna().to_numpy()
    # Skip groups without at least two distinct heights: gaussian_kde raises on
    # zero-variance data, and one degenerate group must not abort the whole plot.
    if np.unique(subset).size > 1:
        kde = gaussian_kde(subset)
        xs = np.linspace(subset.min(), subset.max(), 200)
        ax.plot(xs, kde(xs), label=race)

ax.set_xlabel("Height")
ax.set_ylabel("Density")
ax.set_title("Height KDE by Race")
# Only add a legend when at least one curve was drawn; calling legend() on an
# Axes with no labelled artists makes matplotlib emit a warning.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
plt.show()