# Lesson 4.2: Seaborn - Beautiful Statistical Plots

## Matplotlib But Prettier and Smarter

Seaborn wraps Matplotlib and adds:
- Better default styling
- Statistical plots (distribution, correlation)
- Direct support for pandas DataFrames
- Built-in datasets for practice

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

sns.set_theme(style='whitegrid')  # Clean look
%matplotlib inline

In [None]:
# Create a synthetic water filter dataset for the examples.
# 'tds_output' is deliberately bimodal: roughly 70% of the rows are drawn around
# 50 and the remaining rows around 120, so the distribution plots have some
# structure to reveal.
np.random.seed(42)  # fixed seed -> identical data on every run
n = 100

# Derive both group sizes from n so every column always has exactly n rows.
# Integer arithmetic keeps the split at exactly 70 / 30 for n = 100.
n_low_tds = n * 7 // 10
n_high_tds = n - n_low_tds

df = pd.DataFrame({
    'tds_output': np.concatenate([
        np.random.normal(50, 15, n_low_tds),
        np.random.normal(120, 20, n_high_tds),
    ]),
    'flow_rate': np.random.uniform(0.5, 2.5, n),
    'age_days': np.random.randint(10, 365, n),
    'pressure': np.random.uniform(30, 70, n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'status': np.random.choice(['good', 'degraded', 'needs_repair'], n, p=[0.5, 0.3, 0.2])
})
df.head()

In [None]:
# DISTRIBUTION PLOTS - three different views of how values are spread
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Histogram with a kernel density estimate drawn on top of the bars
sns.histplot(df['tds_output'], kde=True, ax=axes[0])
axes[0].set_title('TDS Distribution')

# Box plot per status. Without an explicit order the categories would appear in
# whatever order they first occur in the data; fix it to the same
# good -> degraded -> needs_repair progression the count plot cell uses.
status_order = ['good', 'degraded', 'needs_repair']
sns.boxplot(x='status', y='tds_output', data=df, order=status_order, ax=axes[1])
axes[1].set_title('TDS by Status')

# Violin plot per region
sns.violinplot(x='region', y='flow_rate', data=df, ax=axes[2])
axes[2].set_title('Flow Rate by Region')

plt.tight_layout()
plt.show()

In [None]:
# CORRELATION HEATMAP - see which features are related
# Only numeric columns can be correlated, so select those first.
numeric_cols = df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Draw on an explicitly created Axes rather than the implicit "current" one.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,        # write each coefficient inside its cell
    cmap='coolwarm',
    center=0,          # centre the diverging colormap on zero correlation
    ax=heat_ax,
)
heat_ax.set_title('Feature Correlations')
plt.show()

In [None]:
# PAIRPLOT - see ALL relationships at once (powerful for EDA!)
# Three numeric features are plotted against each other; 'status' is included
# only so it can drive the colour (hue) of the points and density curves.
pair_columns = ['tds_output', 'flow_rate', 'age_days', 'status']
pair_grid = sns.pairplot(
    df[pair_columns],
    hue='status',
    diag_kind='kde',
    height=2.5,
)
# y > 1 places the title just above the top edge of the figure.
pair_grid.figure.suptitle('Pairwise Relationships', y=1.02)
plt.show()

In [None]:
# COUNTPLOT - frequency of categories
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
status_ax, region_ax = axes

# Left panel: number of filters per status, in a fixed, meaningful order.
status_levels = ['good', 'degraded', 'needs_repair']
sns.countplot(data=df, x='status', order=status_levels, ax=status_ax)
status_ax.set_title('Filter Status Distribution')

# Right panel: number of filters per region (seaborn's default ordering).
sns.countplot(data=df, x='region', ax=region_ax)
region_ax.set_title('Filters by Region')

fig.tight_layout()
plt.show()

## Exercise

1. Load seaborn's `tips` dataset: `tips = sns.load_dataset('tips')`
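2. Create a boxplot of `total_bill` by `day`
3. Create a heatmap of the correlations between the numeric columns (hint: `tips.select_dtypes('number').corr()` — the dataset also contains categorical columns, which must be excluded before calling `.corr()`)
4. Create a scatterplot of `total_bill` vs `tip`, colored by `time` (Lunch/Dinner)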

In [None]:
# YOUR CODE HERE
# tips = sns.load_dataset('tips')