# In [None]:
"""
Intermediate Seaborn: Distribution & Regression Plots
=====================================================
This script demonstrates Seaborn's histplot, displot (histogram and KDE),
and regression plot functions (regplot, lmplot) with detailed explanations.
ECDF is only mentioned in the closing notes; it is not plotted here.

Dataset: wines.csv (must have 'alcohol', 'pH', 'quality', 'type' columns)
"""

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Read the wine data used by every plot in this cell
df = pd.read_csv("wines.csv")

# -----------------------------------------------------------------------------
# 1. Histograms using Pandas, Seaborn histplot(), and Seaborn displot()
# -----------------------------------------------------------------------------

# One handle on the column that all histogram variants draw from
alcohol = df['alcohol']

# Plain Pandas baseline, kept for comparison (basic, no style):
# alcohol.plot.hist()

# Axes-level histplot(): explicit bin count, KDE overlay switched off.
# The call hands back the Axes it drew on, so the labels go on that object.
hist_ax = sns.histplot(alcohol, bins=20, kde=False)
hist_ax.set_title("Alcohol Content Distribution (Histogram)")
hist_ax.set_xlabel("Alcohol (%)")
hist_ax.set_ylabel("Count")
plt.show()

# Figure-level displot(): the heading is a figure suptitle, lifted slightly
# above the axes with y=1.02.
# Earlier attempt, kept for reference:
# sns.displot(alcohol, kind='kde', bins=10)
sns.displot(alcohol, kind='hist', bins=15, color='skyblue')
plt.suptitle("Alcohol Content Distribution (displot histogram)", y=1.02)
plt.show()

# -----------------------------------------------------------------------------
# 2. Kernel Density Estimation (KDE) plots
# -----------------------------------------------------------------------------

# Each entry is (figure heading, keyword arguments specific to that variant).
# The first is a plain filled KDE; the second adds a rug, i.e. a small tick
# mark for each observation.
kde_variants = (
    ("Alcohol Content Distribution (KDE)",
     {"color": "green"}),
    ("Alcohol Content Distribution (KDE with Rug)",
     {"rug": True, "color": "purple"}),
)

for heading, variant_kwargs in kde_variants:
    # Both variants are filled KDE curves of the same column
    sns.displot(df['alcohol'], kind='kde', fill=True, **variant_kwargs)
    plt.suptitle(heading, y=1.02)
    plt.show()

# -----------------------------------------------------------------------------
# 3. Regression & Scatter Plots
# -----------------------------------------------------------------------------

# regplot() is the axes-level function: scatter points plus a fitted line.
# Bare version, kept for reference:
# sns.regplot(data=df, x="alcohol", y="pH")
point_style = {"alpha": 0.5}    # semi-transparent markers
fit_style = {"color": "red"}    # fitted line drawn in red
sns.regplot(
    data=df,
    x="alcohol",
    y="pH",
    scatter_kws=point_style,
    line_kws=fit_style,
)
plt.title("Alcohol vs pH (Regression Line)")
plt.show()

# lmplot() is the figure-level counterpart of regplot(); here hue="type"
# splits the fit by wine type.
# Bare version, kept for reference:
# sns.lmplot(data=df, x="quality", y="alcohol", hue="type")
sns.lmplot(
    data=df,
    x="quality",
    y="alcohol",
    hue="type",
    height=6,
    aspect=1.2,
)
plt.suptitle("Quality vs Alcohol by Wine Type", y=1.02)
plt.show()

# -----------------------------------------------------------------------------
# NOTES:
# - kind parameter in displot(): 'hist', 'kde', 'ecdf' (Empirical CDF)
# - histplot() is for quick, single-axes histograms
# - displot() is for multi-facet or figure-level plotting
# - regplot() adds regression lines, lmplot() adds faceting + hue options
# -----------------------------------------------------------------------------


# In [None]:
# =====================================
# Intermediate Seaborn - Chapter 2 Summary
# Styles, Colors, and Matplotlib Customization
# =====================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load dataset ---
# Example dataset; the plots below read its 'Tuition' column
df = pd.read_csv("college_tuition.csv")

# =====================================
# 1. Seaborn Default Style
# =====================================
# sns.set() switches on Seaborn's default styling globally, so the Pandas
# histogram drawn right after it is rendered with that styling too.
sns.set()
tuition = df['Tuition']
tuition.plot.hist()
plt.show()

# =====================================
# 2. Seaborn Themes
# =====================================
# Draw the same tuition distribution once per built-in style
theme_names = ('white', 'dark', 'whitegrid', 'darkgrid', 'ticks')
for theme_name in theme_names:
    sns.set_style(theme_name)
    sns.displot(tuition)
    plt.show()

# =====================================
# 3. Removing Plot Spines
# =====================================
tuition = df['Tuition']

sns.set_style('white')
sns.displot(tuition)
# Drop the left spine as well, for a cleaner look
sns.despine(left=True)
plt.show()

# =====================================
# 4. Assigning Colors
# =====================================
# Matplotlib single-letter colors:
#   'b'=blue, 'g'=green, 'r'=red, 'c'=cyan, 'm'=magenta, 'y'=yellow, 'k'=black
# color_codes=True enables these short codes.
sns.set(color_codes=True)
sns.displot(tuition, color='g')   # green histogram
plt.show()

# =====================================
# 5. Color Palettes
# =====================================
# The six named palettes used in this section and the next
palettes = ['deep', 'muted', 'pastel', 'bright', 'dark', 'colorblind']

# Plot the tuition distribution once under each palette
for palette_name in palettes:
    sns.set_palette(palette_name)
    sns.displot(df['Tuition'])
    plt.show()

# =====================================
# 6. Displaying Palettes
# =====================================
# Activate each palette in turn, then draw whatever palette is now current
# as a strip of color swatches.
for palette_name in palettes:
    sns.set_palette(palette_name)
    active_colors = sns.color_palette()
    sns.palplot(active_colors)
    plt.show()

# =====================================
# 7. Custom Palettes
# =====================================
# Each strip is followed by its own plt.show(), like every other plot in this
# file. Without it, a script run would hold the three palette figures back
# until the next unrelated plt.show() and display them together with that plot.

# Circular (categorical, unordered data)
sns.palplot(sns.color_palette("Paired", 12))
plt.show()

# Sequential (data from low to high)
sns.palplot(sns.color_palette("Blues", 12))
plt.show()

# Diverging (low and high both important)
sns.palplot(sns.color_palette("BrBG", 12))
plt.show()

# =====================================
# 8. Matplotlib Axes for Customization
# =====================================
# Create the Axes up front and pass it to Seaborn, so the plot can be
# adjusted afterwards through the ordinary Matplotlib Axes API.
fig, ax = plt.subplots()
sns.histplot(df['Tuition'], ax=ax)
ax.set_xlabel('Tuition 2013-14')   # custom x-axis label
plt.show()

# =====================================
# 9. Further Axes Customizations
# =====================================
# Several Axes properties collected in one mapping and applied in one call
axes_settings = {
    "xlabel": "Tuition 2013-14",
    "ylabel": "Distribution",
    "xlim": (0, 50000),
    "title": "2013-14 Tuition and Fees Distribution",
}
fig, ax = plt.subplots()
sns.histplot(df['Tuition'], ax=ax)
ax.set(**axes_settings)
plt.show()

# =====================================
# 10. Combining Multiple Plots
# =====================================
# Two panels side by side in one row, sharing the y-axis
fig, (ax0, ax1) = plt.subplots(1, 2, sharey=True, figsize=(7, 4))

# Left panel: tuition density over the whole dataset
all_tuition = df['Tuition']
sns.histplot(all_tuition, stat='density', ax=ax0)
ax0.set_title("All States")

# Right panel: the same density restricted to rows where State is "MN"
mn_tuition = df.query('State == "MN"')['Tuition']
sns.histplot(mn_tuition, stat='density', ax=ax1)
ax1.set_xlabel('Tuition (MN)')
ax1.set_xlim((0, 70000))

# Dashed vertical marker at 20000; its label is what the legend displays
ax1.axvline(x=20000, label='My Budget', linestyle='--')
ax1.legend()

plt.tight_layout()
plt.show()

# =====================================
# Quick Reference
# =====================================
# Styles: sns.set_style('white' | 'dark' | 'whitegrid' | 'darkgrid' | 'ticks')
# Palettes: 'deep', 'muted', 'pastel', 'bright', 'dark', 'colorblind'
# Palette Types:
#   - Circular: unordered categories → "Paired", "Set2", etc.
#   - Sequential: ordered, one-directional data → "Blues", "Greens", etc.
#   - Diverging: two extremes important → "BrBG", "RdBu", etc.
# Matplotlib Customization via ax.set():
#   xlabel=..., ylabel=..., title=..., xlim=(min, max), ylim=(min, max)


# In [None]:
# =====================================
# Intermediate Seaborn - Chapter 3 Summary
# Categorical Plots, Regression Plots, and Matrix Plots
# =====================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Load datasets ---
df = pd.read_csv("hospital_charges.csv")   # used by the categorical plots
df_bike = pd.read_csv("bike_sharing.csv")  # used by the regression/matrix plots

# =====================================
# 1. Categorical Plots
# =====================================
# Every plot except the count plot puts the DRG definition on the y-axis
# and the average covered charges on the x-axis.
charges_by_drg = {
    "data": df,
    "y": "DRG Definition",
    "x": "Average Covered Charges",
}

# -------- Show each observation --------
# stripplot: one marker per observation; jitter spreads overlapping markers
sns.stripplot(jitter=True, **charges_by_drg)
plt.show()

# swarmplot: like stripplot, but positions the markers so they do not overlap
sns.swarmplot(**charges_by_drg)
plt.show()

# -------- Abstract representations --------
# boxplot:    median, quartiles, and outliers
# violinplot: box plot combined with a KDE to show the distribution's shape
# boxenplot:  box plot variant for large datasets, with more distribution detail
for draw_summary in (sns.boxplot, sns.violinplot, sns.boxenplot):
    draw_summary(**charges_by_drg)
    plt.show()

# -------- Statistical estimates --------
# barplot:   mean value (the default estimator) with confidence intervals
# pointplot: mean points joined by lines, useful for spotting trends
# Both are split by Region through hue.
for draw_estimate in (sns.barplot, sns.pointplot):
    draw_estimate(hue="Region", **charges_by_drg)
    plt.show()

# countplot: number of rows per DRG_Code category, separated by Region
sns.countplot(data=df, y="DRG_Code", hue="Region")
plt.show()

# =====================================
# 2. Regression Plots
# =====================================
# Two variable pairings are reused below: rentals against temperature and
# rentals against month.
rentals_vs_temp = {"data": df_bike, "x": "temp", "y": "total_rentals"}
rentals_vs_month = {"data": df_bike, "x": "mnth", "y": "total_rentals"}

# Scatter ('+' markers) with a linear regression line
sns.regplot(marker='+', **rentals_vs_temp)
plt.show()

# Residuals of the linear fit, to check how well the model fits
sns.residplot(**rentals_vs_temp)
plt.show()

# Second-order polynomial fit (order=2 gives a quadratic curve)
sns.regplot(order=2, **rentals_vs_temp)
plt.show()

# Residuals of the quadratic fit
sns.residplot(order=2, **rentals_vs_temp)
plt.show()

# Month is discrete, so jitter the x positions to separate the points
sns.regplot(x_jitter=.1, order=2, **rentals_vs_month)
plt.show()

# Show one mean estimate per month instead of every individual point
sns.regplot(x_estimator=np.mean, order=2, **rentals_vs_month)
plt.show()

# Display temperature in 4 bins; the regression still uses all points
sns.regplot(x_bins=4, **rentals_vs_temp)
plt.show()

# =====================================
# 3. Matrix Plots
# =====================================
# Reshape the bike data into a grid: one row per month, one column per
# weekday, each cell holding the mean of total_rentals for that combination.
# round(0) rounds the means to whole numbers, but the values remain floats.
df_crosstab = pd.crosstab(
    df_bike["mnth"], df_bike["weekday"],
    values=df_bike["total_rentals"], aggfunc="mean"
).round(0)

# Basic heatmap: cell values shown as colors
sns.heatmap(df_crosstab)
plt.show()

# Annotated heatmap with whole-number labels and no color bar.
# fmt=".0f" is used instead of "d": the rounded means are still floats, and
# the integer format code "d" raises ValueError when applied to a float.
sns.heatmap(df_crosstab, annot=True, fmt=".0f", cmap="YlGnBu", cbar=False, linewidths=.5)
plt.show()

# Center the colormap on one specific cell's value for contrast.
# NOTE(review): .loc[9, 6] assumes 'mnth' contains 9 and 'weekday' contains 6
# as labels; confirm against the dataset.
sns.heatmap(df_crosstab, annot=True, fmt=".0f", cmap="YlGnBu", cbar=True,
            center=df_crosstab.loc[9, 6])
plt.show()

# Heatmap of the pairwise correlations between the selected columns
sns.heatmap(df_bike[['total_rentals', 'temp', 'casual', 'hum', 'windspeed']].corr(), cmap='YlGnBu', annot=True)
plt.show()
