In [1]:
import pandas as pd
import plotly.graph_objects as go

import plotly.io as pio
import uber_style as ub


pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"

# --- CONFIGURATION ---
# Paths to your Aggregate Data Marts
PATH_RAW = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Aggregates\Aggregates_Raw\agg_executive_daily.csv"
PATH_PROC = r"X:\Programming\Python\Projects\Data processing\TLC NYC datasets\HVFHV subsets 2019-2025 - Aggregates\Aggregates_Processed\agg_executive_daily.csv"


In [2]:
# --- DATA PREP ---
# 1. Load Data
df_raw = pd.read_csv(PATH_RAW)
df_proc = pd.read_csv(PATH_PROC)

# 2. Convert Dates
df_raw["pickup_date"] = pd.to_datetime(df_raw["pickup_date"])
df_proc["pickup_date"] = pd.to_datetime(df_proc["pickup_date"])

# 3. Extract Day of Week (0=Monday, 6=Sunday)
# Note: Aggregates are Daily, so we group by DOW sum
df_raw["dow"] = df_raw["pickup_date"].dt.day_name()
df_proc["dow"] = df_proc["pickup_date"].dt.day_name()

# 4. Group & Sum
# We define the order explicitly to ensure Mon-Sun sorting
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

agg_raw = df_raw.groupby("dow")["total_trips"].sum().reindex(dow_order)
agg_proc = df_proc.groupby("dow")["total_trips"].sum().reindex(dow_order)

# 5. Calculate the "Filtered" Segment (The Red part)
# Filtered = Raw - Processed
agg_filtered = agg_raw - agg_proc


In [91]:
# --- PLOTTING ---
fig = go.Figure()

# Trace 1: Processed Data (The Clean Foundation)
fig.add_trace(
    go.Bar(
        x=agg_proc.index,
        y=agg_proc.values,
        name="Clean Uber Data",
        marker_color=ub.UBER_GREEN,
        text=agg_proc.values / 1e6,  # Convert to Millions
        texttemplate="%{text:.0f}M",  # Format: "140M"
        textposition="inside",
        insidetextanchor="middle",
        textfont=dict(color="white", size=12),
        hovertemplate="<b>%{x}</b><br>Clean: %{y:,.0f} trips<extra></extra>",
    )
)

# Trace 2: Filtered Data (The Noise)
fig.add_trace(
    go.Bar(
        x=agg_filtered.index,
        y=agg_filtered.values,
        name="Filtered (Lyft/Errors)",
        marker_color=ub.UBER_BLACK,
        opacity=0.25,
        text=agg_filtered.values / 1e6,  # Convert to Millions
        texttemplate="<b>%{text:.0f}M</b>",
        textposition="outside",  # Place label above the ghost bar
        textfont=dict(color=ub.GRAY_600, size=11),
        hovertemplate="<b>%{x}</b><br>Filtered: %{y:,.0f} trips<br><i>(Lyft, Via, Errors)</i><extra></extra>",
    )
)

# --- STYLING ---
# Apply Uber Branding
fig = ub.apply_uber_branding(
    fig,
    title="<b>Impact of Filtering: Trip Volume</b>",
    subtitle="Total Trip Volume by Day of Week (2019-2025) - <span style='color:#bfbfbf;'>Raw</span> vs. <span style='color:#47b275;'>Processed</span>",
    source="TLC High Volume Records",
    footer_y=-0.1,
)

# Custom Stacked Layout
fig.update_layout(
    barmode="stack",  # This creates the comparison effect
    yaxis_title="<b>Total Trips (Billions)</b>",
    height=600,
    # Add annotation explaining the gap
    annotations=[
        dict(
            x=5,
            y=agg_raw["Saturday"],  # Point at Saturday Top
            xref="x",
            yref="y",
            text="<b>~30-40% Reduction</b><br>Removing Competitors & Noise",
            showarrow=True,
            arrowhead=2,
            ax=-70,
            ay=-55,
            font=dict(color=ub.GRAY_600, size=12),
        )
    ],
    margin=dict(l=80, b=100),
    bargap=0.38,
)
fig.update_yaxes(range=[0, 280_000_000])
fig.show()
# fig.write_image(
#     "raw_vs_processed_trip_volume.png",
#     height=600,
#     width=1200,
#     scale=8,
# )
fig.write_html("raw_vs_processed_trip_volume.html")


In [25]:
# --- DATA PREP ---
# 1. Extract Year
df_raw["year"] = df_raw["pickup_date"].dt.year
df_proc["year"] = df_proc["pickup_date"].dt.year

# 2. Group by Year & Sum Revenue
# Note: Raw uses 'total_fare_revenue', Processed uses 'total_gross_booking_value' (which is broader),
# but for apples-to-apples volume filtering, we compare the base 'total_fare_revenue' if available,
# OR we compare the aggregate totals.
# Let's use the 'total_fare_revenue' column which exists in both Marts to keep it consistent.
agg_rev_raw = df_raw.groupby("year")["total_fare_revenue"].sum()
agg_rev_proc = df_proc.groupby("year")["total_fare_revenue"].sum()

# 3. Calculate Filtered
agg_rev_filtered = agg_rev_raw - agg_rev_proc

In [90]:
# --- PLOTTING ---
fig2 = go.Figure()

# Trace 1: Clean Revenue (Uber)
fig2.add_trace(
    go.Bar(
        x=agg_rev_proc.index,
        y=agg_rev_proc.values,
        name="Valid Uber Revenue",
        marker_color=ub.UBER_GREEN,
        text=agg_rev_proc.values / 1e9,  # Convert to Billions for label
        texttemplate="<b>$%{text:.1f}B</b>",
        textposition="inside",
        insidetextanchor="middle",
        textfont=dict(color="white"),
        hovertemplate="<b>%{x}</b><br>Clean: $%{y:,.0f}<extra></extra>",
    )
)

# Trace 2: Filtered Revenue (Competitors/Errors)
fig2.add_trace(
    go.Bar(
        x=agg_rev_filtered.index,
        y=agg_rev_filtered.values,
        name="Filtered Revenue",
        marker_color=ub.UBER_BLACK,
        opacity=0.25,
        text=agg_rev_filtered.values / 1e9,
        texttemplate="<b>$%{text:.1f}B</b>",
        textposition="outside",  # Place on top of the ghost bar
        insidetextanchor="end",
        textfont=dict(color=ub.UBER_BLACK, size=12),
        hovertemplate="<b>%{x}</b><br>Filtered: $%{y:,.0f}<extra></extra>",
    )
)

# --- STYLING ---
fig2 = ub.apply_uber_branding(
    fig2,
    title="<b>The Revenue Reality Check</b>",
    subtitle="Total Base Fare Revenue by Year (<span style='color:#bfbfbf;'>Raw</span> vs. <span style='color:#47b275;'>Processed</span>)",
    source="TLC High Volume Records",
    footer_y=-0.15,
    logo_y=-0.13,
)

fig2.update_layout(
    barmode="stack",
    yaxis_title="<b>Total Fare Revenue ($)</b>",
    height=600,
    xaxis=dict(
        tickmode="linear",  # Show every year
        dtick=1,
    ),
    legend=dict(y=1.15),  # Push legend up slightly
    margin=dict(t=120, l=80),  # More room for title
    bargap=0.38,
)

fig2.show()
# fig2.write_image(
#     "raw_vs_processed_yearly_revenue.png",
#     height=600,
#     width=1200,
#     scale=8,
# )
fig2.write_html("raw_vs_processed_yearly_revenue.html")

In [67]:
# --- DATA PREP ---
# 1. Group by Date & Sum Volume
daily_raw = df_raw.groupby("pickup_date")["total_trips"].sum()
daily_proc = df_proc.groupby("pickup_date")["total_trips"].sum()

# 2. Rolling Average (7-Day) to smooth weekly seasonality noise
rolling_raw = daily_raw.rolling(window=7).mean()
rolling_proc = daily_proc.rolling(window=7).mean()


In [92]:
# --- PLOTTING ---
fig3 = go.Figure()

# Trace 1: Raw Data (The Shadow)
fig3.add_trace(
    go.Scatter(
        x=rolling_raw.index,
        y=rolling_raw.values,
        mode="lines",
        name="Raw Input (All Platforms)",
        line=dict(color=ub.GRAY_500, width=1.5, dash="dot"),  # Faint, dotted line
        opacity=0.9,
        hovertemplate="<b>Raw</b>: %{y:,.0f} trips<extra></extra>",
    )
)

# Trace 2: Processed Data (The Signal)
fig3.add_trace(
    go.Scatter(
        x=rolling_proc.index,
        y=rolling_proc.values,
        mode="lines",
        name="Clean Uber Data",
        line=dict(color=ub.UBER_GREEN, width=2.5),  # Bold, solid line
        hovertemplate="<b>Clean</b>: %{y:,.0f} trips<extra></extra>",
    )
)

# --- ANNOTATIONS (Storytelling) ---
# annotations = [
#     dict(
#         x="2020-03-20",
#         y=100000,
#         text="<b>COVID Crash</b>",
#         showarrow=True,
#         arrowhead=2,
#         ax=40,
#         ay=-40,
#         font=dict(color=ub.UBER_RED),
#     ),
# ]

# --- STYLING ---
fig3 = ub.apply_uber_branding(
    fig3,
    title="<b>Signal vs. Noise: Daily Trip Volume</b>",
    subtitle="7-Day Rolling Average (2019-2025) - <span style='color:#bfbfbf;'>Raw</span> vs. <span style='color:#47b275;'>Processed</span>",
    source="TLC High Volume Records",
    footer_y=-0.18,
)

fig3.update_layout(
    yaxis_title="<b>Daily Trips (7-Day Avg)</b>",
    height=600,
    # annotations=annotations,
    # Zoom in slightly to avoid the empty space at start/end
    xaxis=dict(
        tickformat="%Y",  # <-- Add this line
        # range=["2019-01-01", "2025-10-01"] # If you uncommented this
    ),
    # legend=dict(y=1.12, x=0.01, xanchor="left"),  # Legend on left for lines
    margin=dict(l=90, t=100),
)

fig3.show()
fig3.write_image(
    "signal_vs_noise_daily_trip_volume.png",
    height=600,
    width=1300,
    scale=8,
)
# fig3.write_html("signal_vs_noise_daily_trip_volume.html")

OSError: [Errno 22] Invalid argument: 'signal_vs_noise_daily_trip_volume.png'