In [None]:
import pandas as pd
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import os
import glob
import time

# --- Custom Style Import ---
import plotly.io as pio

# Register the project's custom "te" Plotly template and make it the default.
# Later cells request `template="te"` explicitly, so that name must resolve
# even when the optional style module is not installed.
try:
    import custom_plotly_style_te as cpte

    pio.templates["te"] = cpte.te_style_template
    pio.templates.default = "te"
except ImportError:
    print("‚ö†Ô∏è Custom style not found. Falling back to Plotly default.")
    # Alias "te" to the built-in fallback so `update_layout(template="te")`
    # keeps working instead of raising on an unknown template name.
    pio.templates["te"] = pio.templates["plotly_white"]
    pio.templates.default = "plotly_white"

# --- Global Config ---
# Paths
# Each location defaults to the original author's local layout but can be
# overridden through an environment variable, so the notebook can run on
# another machine without editing this cell.
PATH_AGG_NETWORK = os.environ.get(
    "TLC_AGG_NETWORK_PATH",
    r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Aggregates\Aggregates_Processed\agg_network_monthly.parquet",
)
PATH_SAMPLES_DIR = os.environ.get(
    "TLC_SAMPLES_DIR",
    r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Samples",
)
# Default expects the zone lookup CSV in the current working folder.
PATH_ZONES = os.environ.get("TLC_ZONES_CSV", "./taxi_zones_detailed.csv")

# Display settings
pd.set_option("display.max_columns", None)  # never truncate wide frames
pd.set_option("display.float_format", "{:,.2f}".format)  # thousands separator, 2 decimals


In [None]:
import plotly.io as pio
import uber_style as ub


# Register the Uber-branded template from `uber_style` and make it the session
# default. NOTE(review): this replaces the default chosen by the earlier style
# cell; figures that pass template="te" explicitly still use "te".
pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"

In [None]:
def load_zone_lookup():
    """Read the taxi-zone CSV at PATH_ZONES and keep only the join columns.

    Returns:
        A Polars DataFrame restricted to LocationID, Zone, Borough and
        service_zone (in that order).
    """
    join_columns = ["LocationID", "Zone", "Borough", "service_zone"]
    zones = pl.read_csv(PATH_ZONES)
    return zones.select(join_columns)


def load_mart_network():
    """Open the network aggregate ("Mart 2: Network Backbone") as a lazy scan.

    Prints the name of the file being opened and returns the result of
    `pl.scan_parquet` on PATH_AGG_NETWORK.
    """
    mart_file = os.path.basename(PATH_AGG_NETWORK)
    print(f"üìÇ Loading Network Mart: {mart_file}...")
    network_scan = pl.scan_parquet(PATH_AGG_NETWORK)
    return network_scan


def load_samples(years=None):
    """
    Lazily scans the sample parquet files found in PATH_SAMPLES_DIR.

    Args:
        years: Optional year filter, e.g. ['2023', '2024']. Integers and a
            single str/int are accepted as well. A file is kept when its
            *file name* contains any requested year. If None (or empty),
            ALL files are scanned (careful with RAM once collected).

    Returns:
        The LazyFrame produced by `pl.scan_parquet` over the selected files.

    Raises:
        FileNotFoundError: If no parquet file matches the request.
    """
    pattern = os.path.join(PATH_SAMPLES_DIR, "*.parquet")
    files = sorted(glob.glob(pattern))

    if years:
        # A bare "2023" would otherwise be iterated character by character.
        requested = [years] if isinstance(years, (str, int)) else years
        year_tokens = [str(y) for y in requested]
        # Match on the file name only: the samples directory name itself
        # contains "2019-2025", so testing the full path would let '2019' or
        # '2025' match every file.
        files = [f for f in files if any(token in os.path.basename(f) for token in year_tokens)]

    print(f"üìÇ Loading Samples: Found {len(files)} files matching {years if years else 'ALL'}...")
    if not files:
        # Fail here with a clear message rather than deep inside Polars.
        raise FileNotFoundError(f"No sample parquet files matched {pattern!r} (years={years!r}).")
    return pl.scan_parquet(files)


def plot_quadrant_scatter(df_pd, x_col, y_col, hover_name, title, x_label, y_label):
    """
    Reusable Quadrant Scatter Plotter.
    Shades the 'Attractive' quadrant (High Volume / Low Volatility).

    Args:
        df_pd: pandas DataFrame holding the points to plot.
        x_col: Column used for the x axis (treated as "higher is better").
        y_col: Column used for the y axis (treated as "lower is better").
        hover_name: Column whose values are shown as the bold hover title.
        title: Figure title.
        x_label: X-axis title, also used in the hover text.
        y_label: Y-axis title, also used in the hover text.

    Returns:
        A plotly `go.Figure`; the caller decides whether to show or save it.
    """
    # Quadrant thresholds are the medians of each axis.
    x_med = df_pd[x_col].median()
    y_med = df_pd[y_col].median()
    x_max = df_pd[x_col].max()

    # Colour by the precomputed rank when the frame carries one; otherwise fall
    # back to the y values so the plotter works on frames without "cv_rank".
    color_values = df_pd["cv_rank"] if "cv_rank" in df_pd.columns else df_pd[y_col]

    fig = go.Figure()

    # Add Scatter
    fig.add_trace(
        go.Scatter(
            x=df_pd[x_col],
            y=df_pd[y_col],
            mode="markers",
            text=df_pd[hover_name],
            marker=dict(
                size=8,
                color=color_values,  # Color by rank/intensity
                colorscale="Viridis",
                showscale=True,
                line=dict(width=0.5, color="DarkSlateGrey"),
            ),
            hovertemplate="<b>%{text}</b><br>"
            + f"{x_label}: %{{x:,.0f}}<br>"
            + f"{y_label}: %{{y:.2f}}<extra></extra>",
        )
    )

    # Add Quadrant Lines
    fig.add_vline(x=x_med, line_width=1, line_dash="dash", line_color="grey")
    fig.add_hline(y=y_med, line_width=1, line_dash="dash", line_color="grey")

    # Shade the "Gold Mine" (High Volume / Low Volatility) -> Bottom Right.
    # The rectangle spans from the x median to 5% past the largest x value and
    # from y=0 up to the y median.
    fig.add_shape(
        type="rect",
        x0=x_med,
        y0=0,
        x1=x_max * 1.05,
        y1=y_med,
        fillcolor="green",
        opacity=0.1,
        layer="below",
        line_width=0,
    )

    # Corner annotations: bottom-right (attractive) and top-left (unattractive).
    fig.add_annotation(
        x=x_max,
        y=0,
        text="<b>STABLE CASH COWS</b>",
        showarrow=False,
        xanchor="right",
        yanchor="bottom",
        font=dict(color="green"),
    )
    fig.add_annotation(
        x=df_pd[x_col].min(),
        y=df_pd[y_col].max(),
        text="<b>NOISY / RARE</b>",
        showarrow=False,
        xanchor="left",
        yanchor="top",
        font=dict(color="red"),
    )

    fig.update_layout(title=title, xaxis_title=x_label, yaxis_title=y_label, height=600)
    return fig


In [None]:
# --- Configuration ---
# 1. Data scope
# None loads every year (2019-2025); use e.g. ['2023', '2024'] for post-COVID only.
TARGET_YEARS = None
# The data is a 1% sample, so volumes are multiplied by 100 to estimate true counts.
SCALING_FACTOR = 100

# 2. Zone highlights (the "story" points)
# LocationID -> marker styling. Zones of the same kind share one style dict and
# differ only in their label.
_AIRPORT_STYLE = {"type": "Airport", "mode": "emoji", "marker": "‚úàÔ∏è", "color": "black"}
_HUB_STYLE = {"type": "Hub", "mode": "emoji", "marker": "‚≠ê", "color": "#FF4500"}
_LOCAL_HUB_STYLE = {"type": "Local Hub", "mode": "shape", "marker": "diamond", "color": "#9400D3"}

SPECIAL_ZONES = {
    # Airports (emoji marker)
    132: {**_AIRPORT_STYLE, "label": "JFK"},
    138: {**_AIRPORT_STYLE, "label": "LGA"},
    1: {**_AIRPORT_STYLE, "label": "EWR"},
    # Major hubs (emoji marker)
    230: {**_HUB_STYLE, "label": "Times Sq"},
    161: {**_HUB_STYLE, "label": "Midtown Center"},
    237: {**_HUB_STYLE, "label": "Upper East Side S"},
    186: {**_HUB_STYLE, "label": "Penn Station"},
    # Local hubs (geometric marker shape instead of an emoji)
    61: {**_LOCAL_HUB_STYLE, "label": "Crown Heights"},
    10: {**_LOCAL_HUB_STYLE, "label": "Baisley Park"},
}


In [None]:
# ==========================================
# ❄️ HUNT 4: THE SNOW PARADOX - PHASE 1
# ==========================================
def analyze_weather_impact(years=None):
    """
    Phase 1: per-weather-state summary statistics plus a row sample for plots.

    Args:
        years: Forwarded to load_samples() (None = all available files).

    Returns:
        (w_stats, w_dist) as pandas DataFrames:
        - w_stats: one row per weather_state with trip count, median/mean
          fare, median distance, duration, speed and wait, mean tip percent
          and tip propensity (share of trips with tipping_pct > 0), sorted by
          count descending.
        - w_dist: a seeded random sample of at most 800,000 trips with the
          columns needed by the distribution plots.
    """
    print("‚ùÑÔ∏è Initializing Weather Forensics Phase 1...")

    # Load & Filter
    # We exclude "Free Rides" (Fare=0) to keep distributions clean
    lf = load_samples(years).filter(pl.col("base_passenger_fare") > 0)

    # --- 1. Global Stats by Weather State ---
    print("   ...Computing Summary Stats...")
    w_stats = (
        lf.group_by("weather_state")
        .agg([
            pl.len().alias("count"),
            pl.col("base_passenger_fare").median().alias("med_fare"),
            pl.col("base_passenger_fare").mean().alias("avg_fare"),
            pl.col("trip_km").median().alias("med_dist"),
            pl.col("duration_min").median().alias("med_duration"),
            pl.col("speed_kmh").median().alias("med_speed"),
            pl.col("total_wait_time_min").median().alias("med_wait"),
            pl.col("tipping_pct").mean().alias("avg_tip_pct"),
            (pl.col("tipping_pct") > 0).mean().alias("tip_propensity"),
        ])
        .sort("count", descending=True)
        .collect()
        .to_pandas()
    )

    # --- 2. Distribution Sampling (For Violin Plots) ---
    print("   ...Sampling for Distribution Plots...")
    # We grab 800k rows max for plotting speed
    max_plot_rows = 800_000
    dist_frame = lf.select(
        ["weather_state", "base_passenger_fare", "trip_km", "duration_min", "total_wait_time_min"]
    ).collect()
    # Sampling without replacement raises when n exceeds the row count, so cap
    # n at the frame height (matters for narrow year filters / small samples).
    w_dist = dist_frame.sample(n=min(max_plot_rows, dist_frame.height), seed=105).to_pandas()

    return w_stats, w_dist


# Execute: run Phase 1 for the configured year scope. The summary table and the
# plotting sample are kept as notebook-level variables for the cells below.
df_w_stats, df_w_dist = analyze_weather_impact(TARGET_YEARS)



In [None]:
# --- VISUALIZATION ---
# One fixed colour per weather state, reused by every phase so a state always
# looks the same across figures.
weather_colors = dict(
    clear_cloudy="#1f77b4",  # Blue
    raining="#ff7f0e",  # Orange
    snow_on_ground="#2ca02c",  # Green
    snowing="#d62728",  # Red
)

# 2x2 grid: fare, distance, duration and wait time, one panel each.
phase1_titles = (
    "<b>1. Fare Distribution ($)</b><br><i>Is Snow cheaper?</i>",
    "<b>2. Distance Distribution (km)</b><br><i>Are trips shorter?</i>",
    "<b>3. Duration Distribution (min)</b><br><i>Are trips faster?</i>",
    "<b>4. Wait Time Distribution (min)</b><br><i>Is service worse?</i>",
)
fig = make_subplots(rows=2, cols=2, subplot_titles=phase1_titles, vertical_spacing=0.15)


# Helper for Violins
def add_violin(fig, df, y_col, row, col, name_map, y_max=None):
    """Add one violin per weather state to a single subplot cell.

    Args:
        fig: Figure created with make_subplots; it is modified in place.
        df: pandas DataFrame with a "weather_state" column and `y_col`.
        y_col: Name of the numeric column to draw.
        row, col: 1-based subplot coordinates.
        name_map: Mapping of weather state -> line colour. Its keys decide
            which states are drawn and in which order.
        y_max: Optional upper bound; when truthy the y axis of this cell is
            fixed to [0, y_max], otherwise the axis range is left untouched.
    """
    # Use the mapping that was passed in rather than a notebook-level global,
    # so the helper honours its own parameter.
    for state, color in name_map.items():
        subset = df[df["weather_state"] == state]
        fig.add_trace(
            go.Violin(
                y=subset[y_col],
                name=state,
                line_color=color,
                box_visible=True,
                meanline_visible=True,
                showlegend=(row == 1 and col == 1),  # Only show legend once
                points=False,
            ),
            row=row,
            col=col,
        )
    if y_max:
        fig.update_yaxes(range=[0, y_max], row=row, col=col)


# One violin panel per metric: (column, subplot row, subplot col, y-axis cap).
# The caps zoom each panel in on the bulk of the distribution.
phase1_panels = [
    ("base_passenger_fare", 1, 1, 60),  # Plot 1: Fare, capped at $60
    ("trip_km", 1, 2, 15),  # Plot 2: Distance, capped at 15 km
    ("duration_min", 2, 1, 60),  # Plot 3: Duration, capped at 60 min
    ("total_wait_time_min", 2, 2, 20),  # Plot 4: Wait time, capped at 20 min
]
for metric, panel_row, panel_col, axis_cap in phase1_panels:
    add_violin(fig, df_w_dist, metric, panel_row, panel_col, weather_colors, y_max=axis_cap)

phase1_layout = {
    "height": 900,
    "title_text": "<b>Phase 1: Weather State Vital Signs</b>",
    "template": "te",
    "hovermode": False,
}
fig.update_layout(**phase1_layout)
fig.show()

# Summary table, with the three headline medians shaded.
print("\nüîé KEY STATS (Medians & Means):")
shaded_columns = ["med_fare", "med_dist", "med_wait"]
display(df_w_stats.style.background_gradient(cmap="Reds", subset=shaded_columns))


In [None]:
# ==========================================
# ❄️ HUNT 4: THE SNOW PARADOX - PHASE 2 (Composition)
# ==========================================


def analyze_weather_composition(years=None):
    """
    Phase 2: how the trip mix (archetype, pickup borough) shifts with weather.

    Args:
        years: Forwarded to load_samples() (None = all available files).

    Returns:
        (arch_stats, boro_stats, unit_dist) as pandas DataFrames:
        - arch_stats: count and pct_share of each trip_archetype within every
          weather_state.
        - boro_stats: count and pct_share of each pickup_borough within every
          weather_state.
        - unit_dist: a seeded random sample of at most 500,000 trips with
          weather_state, cost_per_km and speed_kmh.
        Both share tables are sorted by (weather_state, dimension) so the
        row order, and therefore bar and stack order downstream, is stable
        between runs.
    """
    print("‚ùÑÔ∏è Initializing Weather Forensics Phase 2: Composition...")

    lf = load_samples(years).filter(pl.col("base_passenger_fare") > 0)

    def share_within_weather(dimension):
        """Count trips per (weather_state, dimension) and normalise within each weather state."""
        return (
            lf.group_by(["weather_state", dimension])
            .agg(pl.len().alias("count"))
            .with_columns((pl.col("count") / pl.col("count").sum().over("weather_state")).alias("pct_share"))
            # group_by does not guarantee an output order; sort explicitly.
            .sort(["weather_state", dimension])
            .collect()
            .to_pandas()
        )

    # --- 1. Archetype Breakdown ---
    print("   ...Analyzing Trip Types...")
    arch_stats = share_within_weather("trip_archetype")

    # --- 2. Borough Breakdown ---
    print("   ...Analyzing Geography...")
    boro_stats = share_within_weather("pickup_borough")

    # --- 3. Unit Economics (Surge Check) ---
    print("   ...Sampling Unit Costs...")
    # We assume Cost/Km is the best proxy for "Surge Intensity"
    unit_frame = lf.select(["weather_state", "cost_per_km", "speed_kmh"]).collect()
    # Sampling without replacement raises when n exceeds the row count.
    unit_dist = unit_frame.sample(n=min(500_000, unit_frame.height), seed=42).to_pandas()

    return arch_stats, boro_stats, unit_dist


# Execute: run Phase 2 for the configured year scope. The archetype shares,
# borough shares and unit-cost sample feed the visualisation cells below.
df_arch, df_boro, df_unit = analyze_weather_composition(TARGET_YEARS)



In [None]:
# --- VISUALIZATION ---
def _pct_label(share):
    """Format a 0-1 share as a one-decimal percentage label."""
    return f"{share:.1%}"


# Two panels on the top row, one full-width panel underneath.
phase2_titles = (
    "<b>1. The 'Airport Collapse': Trip Types by Weather</b>",
    "<b>2. The 'Manhattan Exodus': Geography by Weather</b>",
    "<b>3. The 'Surge' Reality: Cost per Km ($/km)</b>",
)
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"type": "xy"}, {"type": "xy"}], [{"colspan": 2}, None]],
    subplot_titles=phase2_titles,
    vertical_spacing=0.15,
)

# Plot 1: one stacked bar series per trip archetype.
for archetype in df_arch["trip_archetype"].unique():
    subset = df_arch[df_arch["trip_archetype"] == archetype]
    archetype_bar = go.Bar(
        x=subset["weather_state"],
        y=subset["pct_share"],
        name=archetype,
        text=subset["pct_share"].apply(_pct_label),
        textposition="auto",
    )
    fig.add_trace(archetype_bar, row=1, col=1)

# Plot 2: Manhattan against the summed share of every other borough.
manhattan = df_boro[df_boro["pickup_borough"] == "Manhattan"]
others = df_boro[df_boro["pickup_borough"] != "Manhattan"].groupby("weather_state")["pct_share"].sum().reset_index()

manhattan_bar = go.Bar(
    x=manhattan["weather_state"],
    y=manhattan["pct_share"],
    name="Manhattan",
    marker_color="#1f77b4",
    text=manhattan["pct_share"].apply(_pct_label),
    textposition="auto",
)
fig.add_trace(manhattan_bar, row=1, col=2)

outer_bar = go.Bar(
    x=others["weather_state"],
    y=others["pct_share"],
    name="Outer Boroughs",
    marker_color="grey",
    text=others["pct_share"].apply(_pct_label),
    textposition="auto",
)
fig.add_trace(outer_bar, row=1, col=2)

# Plot 3: cost-per-km violins in the full-width bottom panel. No y_max is
# passed here; the axis window is set explicitly after the layout update.
add_violin(fig, df_unit, "cost_per_km", 2, 1, weather_colors)

fig.update_layout(
    height=900,
    title_text="<b>Phase 2: Unmasking the Composition Shift</b>",
    template="te",
    barmode="stack",
)
# Bottom panel only: no hover, and a fixed y window of [-11, 35].
fig.update_traces(hoverinfo="skip", row=2, col=1)
fig.update_yaxes(range=[-11, 35], row=2, col=1)
fig.show()

# Airport rows only, smallest share first.
print("\nüîé AIRPORT SHARE CHECK:")
airport_rows = df_arch[df_arch["trip_archetype"] == "airport"]
display(airport_rows.sort_values("pct_share"))


In [None]:
def visualize_composition_shift_v2(df_arch, df_boro):
    """
    Branded stacked-bar view of how the trip mix changes with the weather.

    Left panel: share of each trip archetype per weather state.
    Right panel: Manhattan versus all other pickup boroughs per weather state.
    The figure is shown, then written to a JSON file and an HTML file in the
    current working directory.

    Args:
        df_arch: pandas frame with weather_state, trip_archetype and pct_share.
        df_boro: pandas frame with weather_state, pickup_borough and pct_share.
    """

    def full_label(share):
        return f"{share:.1%}"

    def label_unless_tiny(share):
        # Shares of 4% or less get no in-bar label.
        return f"{share:.1%}" if share > 0.04 else ""

    # Fill colour per archetype.
    archetype_colors = {
        "commute": ub.UBER_BLACK,
        "leisure": ub.UBER_GREEN,
        "nightlife": ub.UBER_PURPLE,
        "airport": ub.UBER_ORANGE,
    }

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "<b>1. The 'Airport Collapse'</b><br><span style='font-size:14px; color:gray'>Trip Types by Weather</span>",
            "<b>2. The 'Manhattan Exodus'</b><br><span style='font-size:14px; color:gray'>Geography by Weather</span>",
        ),
        horizontal_spacing=0.15,
    )

    # --- Panel 1: archetype share ---
    # Traces are added bottom-up: commute, leisure, nightlife, then airport.
    white_text_types = ["commute", "nightlife", "leisure"]
    for archetype in ["commute", "leisure", "nightlife", "airport"]:
        type_rows = df_arch[df_arch["trip_archetype"] == archetype]

        # White labels for these three series, black for the remaining one.
        if archetype in white_text_types:
            label_color = ub.UBER_WHITE
        else:
            label_color = ub.UBER_BLACK

        type_bar = go.Bar(
            x=type_rows["weather_state"],
            y=type_rows["pct_share"],
            name=archetype.capitalize(),
            marker_color=archetype_colors.get(archetype, ub.GRAY_500),
            marker_line_width=0,
            text=type_rows["pct_share"].apply(label_unless_tiny),
            textposition="auto",
            textfont=dict(color=label_color),
            hovertemplate="<b>%{x}</b><br>%{y:.1%} Share<extra></extra>",
        )
        fig.add_trace(type_bar, row=1, col=1)

    # --- Panel 2: geography share ---
    # Manhattan is the focus series.
    manhattan = df_boro[df_boro["pickup_borough"] == "Manhattan"]
    manhattan_bar = go.Bar(
        x=manhattan["weather_state"],
        y=manhattan["pct_share"],
        name="Manhattan",
        marker_color=ub.UBER_BLACK,
        marker_line_width=0,
        text=manhattan["pct_share"].apply(full_label),
        textposition="auto",
        textfont=dict(color=ub.UBER_WHITE),
        hovertemplate="<b>Manhattan</b><br>%{y:.1%} Share<extra></extra>",
    )
    fig.add_trace(manhattan_bar, row=1, col=2)

    # Everything that is not Manhattan is summed into one context series.
    others = df_boro[df_boro["pickup_borough"] != "Manhattan"].groupby("weather_state")["pct_share"].sum().reset_index()
    outer_bar = go.Bar(
        x=others["weather_state"],
        y=others["pct_share"],
        name="Outer Boroughs",
        marker_color=ub.GRAY_300,
        marker_line_width=0,
        text=others["pct_share"].apply(full_label),
        textposition="auto",
        textfont=dict(color=ub.UBER_BLACK),
        hovertemplate="<b>Outer Boroughs</b><br>%{y:.1%} Share<extra></extra>",
    )
    fig.add_trace(outer_bar, row=1, col=2)

    # --- Branding: title, subtitle, source line ---
    title = "Trip Volume Composition Shifts Under Severe Weather Conditions"
    subtitle = (
        f"In more destructive weather (Snowing and snow on ground), <span style='color:{ub.UBER_ORANGE}'><b>Airport</b></span> trips vanish <br>and activity shifts to the "
        f"<span style='color:{ub.GRAY_600}'><b>Outer Boroughs</b></span>, dropping Manhattan share lower than usual."
    )
    fig = ub.apply_uber_branding(
        fig,
        title=title,
        subtitle=subtitle,
        source="Source: TLC NYC High Volume FHV Records (2019-2025)",
        footer_y=-0.15,
        logo_y=-0.16,
    )

    # --- Layout: stacked bars, horizontal legend, extra top/bottom margins ---
    fig.update_layout(
        barmode="stack",
        bargap=0.3,
        legend={"orientation": "h", "y": 1.25},
        height=900,
        margin={"t": 230, "b": 130},
    )

    # Both y axes use percent ticks over the same [0, 1.03] range.
    fig.update_yaxes(tickformat=".0%", range=[0, 1.03], showgrid=True, gridcolor=ub.GRAY_100)

    # Show, then export.
    fig.show()
    fig.write_json("composition_shift_severe_weather_figure_data.json")
    fig.write_html("composition_shift_severe_weather_figure.html")


# Execute: render the branded composition figure. Besides showing it, the call
# writes a JSON file and an HTML file to the current working directory.
visualize_composition_shift_v2(df_arch, df_boro)


In [None]:
# ==========================================
# ❄️ HUNT 4: THE SNOW PARADOX - PHASE 3 (Mechanics)
# ==========================================


def analyze_weather_mechanics(years=None):
    """
    Phase 3: demand velocity per weather state plus a borough pricing sample.

    Args:
        years: Forwarded to load_samples() (None = all available files).

    Returns:
        (vol_velocity, boro_price) as pandas DataFrames:
        - vol_velocity: one row per weather_state with total_trips,
          hours_observed (distinct pickup_date/pickup_hour slots seen in that
          state) and est_trips_per_hour (trips per observed hour multiplied by
          SCALING_FACTOR), sorted by est_trips_per_hour descending.
        - boro_price: a seeded random sample of at most 500,000 paid trips
          picked up in Manhattan, Queens or Brooklyn, with weather_state,
          pickup_borough, cost_per_km and speed_kmh.
    """
    print("‚ùÑÔ∏è Initializing Weather Mechanics Phase 3...")

    lf = load_samples(years)

    # --- 1. The "Volume Crash" (Trips per Hour) ---
    print("   ...Calculating True Velocity (Trips/Hour)...")

    # Step A: Count total trips per state
    trip_counts = lf.group_by("weather_state").agg(pl.len().alias("total_trips"))

    # Step B: Count distinct hours observed per state (To normalize)
    # We group by [Date, Hour] to find unique time slots
    time_counts = (
        lf.select(["pickup_date", "pickup_hour", "weather_state"])
        .unique()
        .group_by("weather_state")
        .agg(pl.len().alias("hours_observed"))
    )

    # Join the two per-state tables and scale the sample rate up.
    vol_velocity = (
        trip_counts.join(time_counts, on="weather_state")
        .with_columns(((pl.col("total_trips") / pl.col("hours_observed")) * SCALING_FACTOR).alias("est_trips_per_hour"))
        .collect()
        .to_pandas()
        .sort_values("est_trips_per_hour", ascending=False)
    )

    # --- 2. The "Borough Price War" (Sample for Boxplot) ---
    print("   ...Sampling Borough Pricing...")
    # We grab Manhattan vs Queens vs Brooklyn
    boro_frame = (
        lf.filter(
            (pl.col("base_passenger_fare") > 0) & (pl.col("pickup_borough").is_in(["Manhattan", "Queens", "Brooklyn"]))
        )
        .select(["weather_state", "pickup_borough", "cost_per_km", "speed_kmh"])
        .collect()
    )
    # Sampling without replacement raises when n exceeds the row count, so cap
    # n at the frame height.
    boro_price = boro_frame.sample(n=min(500_000, boro_frame.height), seed=42).to_pandas()

    return vol_velocity, boro_price


# Execute
df_velocity, df_mechanics = analyze_weather_mechanics(TARGET_YEARS)

# --- VISUALIZATION ---
# Full-width volume bar on top, two borough box panels underneath.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"colspan": 2}, None], [{"type": "xy"}, {"type": "xy"}]],
    subplot_titles=(
        "<b>1. The 'Volume Crash': True Demand (Est. Trips per Hour)</b>",
        "<b>2. The Price War: Cost ($/km) by Borough & Weather</b>",
        "<b>3. The Empty Road: Speed (km/h) by Borough & Weather</b>",
    ),
    vertical_spacing=0.15,
)

# Plot 1: Volume Velocity (Bar)
# Color map matching previous phases; an unexpected state is drawn grey instead
# of aborting the cell with a KeyError.
colors_mapped = [weather_colors.get(s, "grey") for s in df_velocity["weather_state"]]

fig.add_trace(
    go.Bar(
        x=df_velocity["weather_state"],
        y=df_velocity["est_trips_per_hour"],
        marker_color=colors_mapped,
        text=df_velocity["est_trips_per_hour"].apply(lambda x: f"{x:,.0f}"),
        textposition="auto",
        name="Trips/Hour",
    ),
    row=1,
    col=1,
)

# Plots 2 & 3: grouped boxplots per borough (cost on the left, speed on the right).
# Each borough gets ONE colour and ONE legend entry shared by both panels:
# `legendgroup` ties the two traces together and only the first panel's trace
# is listed, so the legend no longer shows every borough twice in two colours.
borough_colors = {"Manhattan": "#111111", "Queens": "#FF7D49", "Brooklyn": "#47B275"}
box_panels = (
    ("cost_per_km", 1, True),  # Plot 2: Borough Pricing
    ("speed_kmh", 2, False),  # Plot 3: Speed Impact
)
for metric, panel_col, list_in_legend in box_panels:
    for boro in ["Manhattan", "Queens", "Brooklyn"]:
        subset = df_mechanics[df_mechanics["pickup_borough"] == boro]
        fig.add_trace(
            go.Box(
                x=subset["weather_state"],
                y=subset[metric],
                name=boro,
                legendgroup=boro,
                marker_color=borough_colors[boro],
                showlegend=list_in_legend,
            ),
            row=2,
            col=panel_col,
        )

# Cap Y to focus on the mass
fig.update_yaxes(range=[0, 12], title="Cost per Km ($)", row=2, col=1)
fig.update_yaxes(range=[0, 60], title="Speed (km/h)", row=2, col=2)

fig.update_layout(
    height=900,
    title_text="<b>Phase 3: The Mechanics of Demand Destruction</b>",
    template="te",
    boxmode="group",  # Group boxes by x-axis (Weather)
    hovermode=False,
)
fig.show()

# Output Table
print("\nüîé VELOCITY STATS (Volume Crash):")
display(df_velocity[["weather_state", "hours_observed", "est_trips_per_hour"]])


In [None]:
from tkinter import font


def visualize_volume_crash_v2(df_velocity):
    """
    Refined Volume Crash Plot.
    - Sorted Descending (Peak -> Crash).
    - Unified Red for Snow categories.
    - Baseline reference line (Clear/Cloudy).

    The figure is shown, then written to a JSON file and an HTML file in the
    current working directory.

    Args:
        df_velocity: pandas frame with one row per weather_state and an
            est_trips_per_hour column.

    Raises:
        ValueError: If df_velocity has no "clear_cloudy" row, since every
            comparison in the chart is made against that baseline.
    """

    def is_snow(state):
        # str() keeps a null weather_state from raising on the `in` test.
        return "snow" in str(state)

    # 1. Sort Data Descending (Highest Volume First)
    df_sorted = df_velocity.sort_values("est_trips_per_hour", ascending=False)

    # 2. Identify Baseline (Clear/Cloudy) for calculations
    # We compare everything to "Normal" weather, not "Rain" (which is a surge)
    baseline_rows = df_velocity.loc[df_velocity["weather_state"] == "clear_cloudy", "est_trips_per_hour"]
    if baseline_rows.empty:
        raise ValueError("df_velocity has no 'clear_cloudy' row; cannot compute the clear-weather baseline.")
    baseline_val = baseline_rows.iloc[0]

    # 3. Color Logic: dark grey (Normal/Surge) vs Red (Destructive)
    colors = [ub.UBER_RED if is_snow(state) else ub.GRAY_900 for state in df_sorted["weather_state"]]

    # 4. Build Trace
    fig = go.Figure()

    fig.add_trace(
        go.Bar(
            x=df_sorted["weather_state"],
            y=df_sorted["est_trips_per_hour"],
            marker_color=colors,
            marker_line_width=0,
            marker_cornerradius=4,
            text=df_sorted["est_trips_per_hour"].apply(lambda x: f"{x:,.0f}"),
            textposition="auto",
            textfont=dict(color=ub.UBER_WHITE),
            name="Trips/Hour",
            hovertemplate="<b>%{x}</b><br>Volume: %{y:,.0f} trips/hr<extra></extra>",
        )
    )

    # 5. Add Baseline Line (The "Normal" Watermark)
    fig.add_hline(
        y=baseline_val,
        line_width=1.5,
        line_dash="dot",
        line_color=ub.GRAY_600,
        annotation_text="Baseline (Clear)",
        annotation_position="top right",
        annotation_font=dict(size=10, color=ub.GRAY_900),
    )

    # 6. Add "Crash" Annotations for Snow States
    for state, trips in zip(df_sorted["weather_state"], df_sorted["est_trips_per_hour"]):
        if not is_snow(state):
            continue
        # Relative change versus the clear-weather baseline.
        drop_pct = (trips - baseline_val) / baseline_val

        # Only label if it's actually a drop (more than half a percent below).
        if drop_pct < -0.005:
            fig.add_annotation(
                x=state,
                y=trips,
                text=f"<b>{drop_pct:.1%}</b>",
                yshift=25,  # Push above the bar
                showarrow=False,
                font=dict(color=ub.UBER_RED, size=14),
            )

    # 7. Branding
    title = "The Volume Crash: Impact of Destructive Weather"
    subtitle = (
        f"While rain induces a demand surge, <span style='color:{ub.UBER_RED}'><b>snow conditions</b></span> cause network liquidity to collapse "
        f"significantly below the clear-weather baseline."
    )

    fig = ub.apply_uber_branding(
        fig,
        title=title,
        subtitle=subtitle,
        source="Source: TLC NYC High Volume FHV Records (2019-2025)",
        footer_y=-0.155,
        logo_y=-0.15,
    )

    # 8. Layout Specs
    fig.update_layout(
        xaxis=dict(title="Weather Condition", showgrid=False),
        yaxis=dict(
            title="<b>Estimated Trips per Hour</b>",
            showgrid=True,
            gridcolor=ub.GRAY_300,
            # Add headroom for annotations
            range=[0, df_sorted["est_trips_per_hour"].max() * 1.15],
        ),
        bargap=0.3,
        height=700,
        margin=dict(b=100, l=100, r=60),
    )

    fig.show()
    # fig.write_image("volume_crash_severe_weather.png", scale=10, height=700, width=1400)
    fig.write_json("volume_crash_severe_weather_figure_data.json")
    fig.write_html("volume_crash_severe_weather_figure.html")


# Execute: render the branded volume chart from the Phase 3 velocity table.
# Besides showing it, the call writes a JSON file and an HTML file to disk.
visualize_volume_crash_v2(df_velocity)


In [None]:
def visualize_borough_price_war_v4(df_mechanics):
    """
    Grouped box plot of cost per km by weather state, one box per borough.

    - Box fills are 80%-opaque RGBA colours; outlines use a solid colour
      defined alongside each fill.
    - Individual points are hidden.
    - The y axis is fixed to [0, 12.5] to zoom in on the box bodies.

    The figure is shown, then written to a JSON file and an HTML file in the
    current working directory.

    Args:
        df_mechanics: pandas frame with weather_state, pickup_borough and
            cost_per_km columns.
    """
    # Fixed left-to-right order of the weather categories on the x axis.
    ordered_states = ["clear_cloudy", "raining", "snow_on_ground", "snowing"]

    # Per-borough look; the key order is also the trace order.
    palette = {
        "Manhattan": {"fill": "rgba(0, 0, 0, 0.8)", "line": "#000000"},
        "Brooklyn": {"fill": "rgba(71, 178, 117, 0.8)", "line": "#0E3F25"},
        "Queens": {"fill": "rgba(255, 125, 73, 0.8)", "line": "#993311"},
    }

    fig = go.Figure()

    for borough, look in palette.items():
        borough_rows = df_mechanics[df_mechanics["pickup_borough"] == borough]
        box_trace = go.Box(
            x=borough_rows["weather_state"],
            y=borough_rows["cost_per_km"],
            name=borough,
            fillcolor=look["fill"],
            line=dict(color=look["line"], width=1.75),
            marker=dict(color=look["line"]),
            boxpoints=False,  # no individual points
            quartilemethod="exclusive",
            offsetgroup=borough,  # one slot per borough inside each weather group
        )
        fig.add_trace(box_trace)

    # Branding: title, subtitle and source line.
    title = "Cost per Km ($) by Borough x Weather"
    subtitle = (
        f"<b>Rain</b> drives <span style='color:{ub.UBER_BLACK}'><b>Manhattan</b></span> prices up (High Median). "
        f"<b>Snow</b> drops prices across all boroughs."
    )
    fig = ub.apply_uber_branding(
        fig, title=title, subtitle=subtitle, source="Source: TLC NYC High Volume FHV Records (2019-2025)", footer_y=-0.15
    )

    # Layout: grouped boxes, ordered categories, fixed y window.
    x_axis = {
        "title": "Weather Condition",
        "categoryorder": "array",
        "categoryarray": ordered_states,
        "showgrid": False,
    }
    y_axis = {
        "title": "Cost per Km ($)",
        "showgrid": True,
        "gridcolor": ub.GRAY_300,
        "range": [0, 12.5],
        "zeroline": True,
        "zerolinecolor": ub.UBER_BLACK,
    }
    fig.update_layout(
        boxmode="group",
        boxgap=0.37,  # gap between weather groups
        boxgroupgap=0.08,  # gap between boroughs inside a group
        xaxis=x_axis,
        yaxis=y_axis,
        height=600,
        hovermode="closest",
    )

    # Callout anchored at the median Manhattan cost while raining, then nudged
    # by fixed pixel offsets.
    manhattan_rain = df_mechanics[
        (df_mechanics["weather_state"] == "raining") & (df_mechanics["pickup_borough"] == "Manhattan")
    ]
    rain_spike = manhattan_rain["cost_per_km"].median()

    fig.add_annotation(
        x="raining",
        y=rain_spike,
        text="<b>Rain Surge</b>",
        yshift=140,
        xshift=-120,
        showarrow=True,
        font=dict(size=11, color=ub.UBER_BLACK),
    )

    # Show, then export.
    fig.show()
    fig.write_json("cost_per_km_by_borough_weather_figure_data.json")
    fig.write_html("cost_per_km_by_borough_weather_figure.html")


# Execute: render the branded borough pricing chart from the Phase 3 sample.
# Besides showing it, the call writes a JSON file and an HTML file to disk.
visualize_borough_price_war_v4(df_mechanics)


In [None]:
# ==========================================
# ❄️ HUNT 4: THE SNOW PARADOX - PHASE 4 (Driver Economics)
# ==========================================

def analyze_driver_economics(years=None):
    """
    Phase 4: sample of driver-side economics per trip.

    Args:
        years: Forwarded to load_samples() (None = all available files).

    Returns:
        pandas DataFrame: a seeded random sample of at most 500,000 trips with
        weather_state, pay_per_hour, driver_response_time_min and
        driver_revenue_share. Only trips longer than 5 minutes whose
        pay_per_hour lies between 10 and 100 are considered.
    """
    print("‚ùÑÔ∏è Initializing Weather Forensics Phase 4: Driver Economics...")

    # Load & Filter
    # Filter: Trips > 5 mins (to stabilize hourly pay calc)
    lf = load_samples(years).filter(
        (pl.col("duration_min") > 5)
        & (pl.col("pay_per_hour").is_between(10, 100))  # Filter extreme outliers
    )

    # Sampling for Distributions
    print("   ...Sampling Economics...")
    eco_frame = lf.select([
        "weather_state",
        "pay_per_hour",
        "driver_response_time_min",
        "driver_revenue_share",
    ]).collect()
    # Sampling without replacement raises when n exceeds the row count, so cap
    # n at the frame height.
    eco_dist = eco_frame.sample(n=min(500_000, eco_frame.height), seed=42).to_pandas()

    return eco_dist

# Execute
df_eco = analyze_driver_economics(TARGET_YEARS)

# --- VISUALIZATION ---
# Three side-by-side panels: hourly pay, driver arrival time, driver share.
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=(
        "<b>1. The Bottom Line: Hourly Pay ($/hr)</b>",
        "<b>2. Supply Stress: Driver Arrival Time (min)</b>",
        "<b>3. The Platform's Hand: Driver Share (%)</b>"
    ),
    horizontal_spacing=0.05
)

# Plot 1: Hourly Pay (Violin)
# Does speed compensate for low fares?
add_violin(fig, df_eco, "pay_per_hour", 1, 1, weather_colors, y_max=80)

# Plots 2 & 3: one box per weather state, driven by a small spec table instead
# of two copy-pasted loops: (column, subplot col, y range, y-axis title).
box_panels = [
    # Plot 2: Response Time - did drivers stay home?
    ("driver_response_time_min", 2, [0, 15], "Arrival Time (min)"),
    # Plot 3: Driver Share - did Uber subsidize?
    ("driver_revenue_share", 3, [0.4, 1.0], "Driver Share of Fare"),
]
for metric, panel_col, y_range, y_title in box_panels:
    for state in ["clear_cloudy", "raining", "snow_on_ground", "snowing"]:
        subset = df_eco[df_eco["weather_state"] == state]
        fig.add_trace(
            go.Box(
                y=subset[metric],
                name=state,
                marker_color=weather_colors[state],
                showlegend=False,
                boxpoints=False,
            ),
            row=1, col=panel_col
        )
    fig.update_yaxes(range=y_range, title=y_title, row=1, col=panel_col)

fig.update_layout(
    height=500,
    title_text="<b>Phase 4: The Driver's Dilemma (To Drive or Not to Drive?)</b>",
    template="te",
    hovermode=False,
)
fig.show()

# Summary Stats: per-state medians, rendered as a rich table like the other
# phases rather than as plain printed text.
print("\nüîé ECONOMIC SUMMARY:")
display(df_eco.groupby("weather_state")[["pay_per_hour", "driver_response_time_min", "driver_revenue_share"]].median())

In [None]:
def visualize_driver_pay_economics_v3(df_eco):
    """
    Draw the V3 hourly-pay violin chart, one violin per weather state.

    Design choices:
    - Inner box and mean line are drawn in WHITE so they stay visible against
      the dark violin fill.
    - No ``spanmode`` is passed, so the density curve is not hard-clipped at
      the data extremes (avoids the "flat top" look).
    - The y-axis is extended to 0-115 so the full shape is visible.

    Args:
        df_eco: table with ``weather_state`` and ``pay_per_hour`` columns
            (a pandas DataFrame in this notebook).

    Side effects:
        Displays the figure and writes
        ``driver_hourly_pay_weather_state_figure_data.json`` and
        ``driver_hourly_pay_weather_state_figure.html`` to the working
        directory. Nothing is returned.
    """

    # Weather states in display order, each mapped to its fill colour:
    # black for the standard states, red for the snow states.
    state_colors = {
        **dict.fromkeys(("clear_cloudy", "raining"), ub.UBER_BLACK),
        **dict.fromkeys(("snow_on_ground", "snowing"), ub.UBER_RED),
    }

    # Styling shared by every violin: the inner box and the mean line must be
    # styled explicitly, otherwise they disappear into the fill colour.
    inner_box = {
        "visible": True,
        "width": 0.15,
        "line": {"color": ub.UBER_WHITE, "width": 1.5},
    }
    mean_marker = {"visible": True, "color": ub.UBER_WHITE, "width": 1}

    chart = go.Figure()

    for weather, tone in state_colors.items():
        in_state = df_eco["weather_state"] == weather
        violin = go.Violin(
            y=df_eco.loc[in_state, "pay_per_hour"],
            name=weather,
            # Outline and body share one colour.
            line={"color": tone},
            fillcolor=tone,
            opacity=0.8,
            width=0.8,
            box=inner_box,
            meanline=mean_marker,
            # spanmode is intentionally left unset (see docstring).
            points=False,
        )
        chart.add_trace(violin)

    # Headline and colour-coded subtitle.
    standard_line = f"During <span style='color:{ub.UBER_BLACK}'><b>Standard Operations (Clear/Rain)</b></span>, pay is concentrated and more reliable.<br>"
    hazard_line = f"During <span style='color:{ub.UBER_RED}'><b>Hazard Operations (Snow)</b></span>, drivers are paid less much more frequently, dragging the mean and median down."

    chart = ub.apply_uber_branding(
        chart,
        title="Drivers' Hourly Earnings Distribution",
        subtitle=standard_line + hazard_line,
        source="Source: TLC NYC High Volume FHV Records (2019-2025)",
        footer_y=-0.15,
        logo_y=-0.15,
    )

    # Axis and canvas settings.
    x_axis = {"title": "Weather Condition", "showgrid": False}
    y_axis = {
        "title": "Driver Pay per Hour ($)",
        "showgrid": True,
        "gridcolor": ub.GRAY_300,
        # Headroom above the bulk of the data so the peaks are not cramped.
        "range": [0, 115],
        "zeroline": True,
        "zerolinecolor": ub.UBER_BLACK,
    }
    chart.update_layout(
        xaxis=x_axis,
        yaxis=y_axis,
        violingap=0.35,
        height=700,
        showlegend=False,
        margin={"t": 120, "l": 100, "b": 100},
    )

    # Disabled: median annotation on the "raining" violin.
    # rain_med = df_eco[df_eco["weather_state"] == "raining"]["pay_per_hour"].median()
    # chart.add_annotation(
    #     x="raining", y=rain_med,
    #     text=f"<b>Median: ${rain_med:.0f}</b>",
    #     font=dict(color=ub.UBER_WHITE, size=10),  # white text on the black violin
    #     showarrow=False,
    #     yshift=0
    # )

    chart.show()

    # Disabled: static PNG export.
    # chart.write_image("driver_hourly_pay_weather_state.png", scale=10, height=700, width=1400)
    chart.write_json("driver_hourly_pay_weather_state_figure_data.json")
    chart.write_html("driver_hourly_pay_weather_state_figure.html")

# Execute: draw the hourly-pay violin chart from df_eco; the function also
# writes the figure's JSON and HTML exports to the working directory.
visualize_driver_pay_economics_v3(df_eco)