In [1]:
# =============================================================================
# IMPORT ENVIRONMENT & DATA FROM MODULE 04a
# =============================================================================

%run ./04a_Setup_and_Data.ipynb

print("\n" + "=" * 80)
print("üí∞ MODULE 04b1: COST-PER-KM OUTLIER IDENTIFICATION")
print("=" * 80)
print(f"\n‚úÖ Imported datasets:")
print(f"   - df_sample: {df_sample.height:,} trips")
print(f"   - Analysis focus: Cost outliers only")

üé® Uber BI template + color system loaded successfully
‚úÖ Statistical utility functions loaded
‚úÖ Data loading functions defined

‚è≥ LOADING DATA FOR RISK MANAGEMENT ANALYSIS

üìä Loading Full Sample Data (tlc_sample_*_processed)...
   üí° Using lazy evaluation to handle 5M+ rows efficiently...
   üìÇ Located 7 sample files
      - tlc_sample_2019_processed.parquet
      - tlc_sample_2020_processed.parquet
      - tlc_sample_2021_processed.parquet
      - tlc_sample_2022_processed.parquet
      - tlc_sample_2023_processed.parquet
      - tlc_sample_2024_processed.parquet
      - tlc_sample_2025_processed.parquet
   üîß Using lazy evaluation (scan_parquet) for memory efficiency...

   ‚úÖ Loaded: 9,830,241 trips

   ‚úÖ Loaded: 9,830,241 trips
   üíæ Memory footprint: 3694.2 MB
   üìÖ Date range: 2019-02-01 00:00:16 to 2025-09-30 23:58:55

üìä Calculating Daily Executive Metrics from Sample...
   üîß Calculating daily metrics from trip-level data...
   üíæ Memory footprint

---

# § 1. Define Outlier Thresholds

In [2]:
# Section banner
section_rule = "=" * 80
print(f"\n{section_rule}")
print("ANALYSIS: COST-PER-KM OUTLIER IDENTIFICATION")
print(section_rule)

# Thresholds are read off the empirical cost-per-km distribution:
# the 99th percentile marks "extreme" trips, the 99.9th marks "critical" ones.
cost_per_km_series = df_sample.get_column('cost_per_km')
cost_per_km_p99 = cost_per_km_series.quantile(0.99)
cost_per_km_p999 = cost_per_km_series.quantile(0.999)

print("\nüìä Outlier Threshold Definition:")
print(f"   99th percentile (extreme): ${cost_per_km_p99:.2f}/km")
print(f"   99.9th percentile (critical): ${cost_per_km_p999:.2f}/km")

# One boolean flag column per severity level.
cost_expr = pl.col('cost_per_km')
extreme_flag = (cost_expr >= cost_per_km_p99).alias('is_extreme_outlier')      # top 1%
critical_flag = (cost_expr >= cost_per_km_p999).alias('is_critical_outlier')   # top 0.1%
# Independent heuristic: under 2 km travelled but more than $50 charged.
short_expensive_flag = (
    (pl.col('trip_km') < 2.0) & (pl.col('total_rider_cost') > 50)
).alias('is_short_expensive')

df_outliers = df_sample.with_columns([extreme_flag, critical_flag, short_expensive_flag])

# Tally the trips carrying each flag.
flag_counts = {
    flag_name: df_outliers.filter(pl.col(flag_name)).height
    for flag_name in ('is_extreme_outlier', 'is_critical_outlier', 'is_short_expensive')
}
n_extreme = flag_counts['is_extreme_outlier']
n_critical = flag_counts['is_critical_outlier']
n_short_expensive = flag_counts['is_short_expensive']

# Report each count with its share of the full sample.
total_trips = df_sample.height
print("\nüö® Outlier Counts:")
for count_label, count in (
    ("Extreme outliers (>P99)", n_extreme),
    ("Critical outliers (>P99.9)", n_critical),
    ("Short-but-expensive", n_short_expensive),
):
    print(f"   {count_label}: {count:,} trips ({count/total_trips*100:.2f}%)")


ANALYSIS: COST-PER-KM OUTLIER IDENTIFICATION

üìä Outlier Threshold Definition:
   99th percentile (extreme): $19.48/km
   99.9th percentile (critical): $37.96/km

üìä Outlier Threshold Definition:
   99th percentile (extreme): $19.48/km
   99.9th percentile (critical): $37.96/km

üö® Outlier Counts:
   Extreme outliers (>P99): 98,303 trips (1.00%)
   Critical outliers (>P99.9): 9,831 trips (0.10%)
   Short-but-expensive: 4,151 trips (0.04%)

üö® Outlier Counts:
   Extreme outliers (>P99): 98,303 trips (1.00%)
   Critical outliers (>P99.9): 9,831 trips (0.10%)
   Short-but-expensive: 4,151 trips (0.04%)


---

# § 2. Outlier Characterization

In [3]:
# Extract extreme outliers for detailed analysis
df_extreme = df_outliers.filter(pl.col('is_extreme_outlier'))

# Both cohorts are summarized with the SAME expression list so the two
# one-row result frames can never drift apart through copy-paste edits.
cohort_mean_exprs = [
    pl.col('cost_per_km').mean().alias('avg_cost_per_km'),
    pl.col('speed_kmh').mean().alias('avg_speed'),
    pl.col('duration_min').mean().alias('avg_duration'),
    pl.col('trip_km').mean().alias('avg_distance'),
]

outlier_stats = df_extreme.select(cohort_mean_exprs)
normal_stats = df_outliers.filter(~pl.col('is_extreme_outlier')).select(cohort_mean_exprs)

print(f"\nüìà Outlier Characteristics vs Normal Trips:")
print(f"   {'Metric':<20} {'Normal':>15} {'Outlier':>15} {'Ratio':>10}")
print(f"   {'-'*20} {'-'*15} {'-'*15} {'-'*10}")

# (row label, stats column, value format, ratio format)
comparison_rows = [
    ('Cost/km',        'avg_cost_per_km', '${:.2f}', '{:.1f}'),
    ('Speed (km/h)',   'avg_speed',       '{:.1f}',  '{:.2f}'),
    ('Duration (min)', 'avg_duration',    '{:.1f}',  '{:.2f}'),
    ('Distance (km)',  'avg_distance',    '{:.1f}',  '{:.2f}'),
]

for row_label, stats_column, value_fmt, ratio_fmt in comparison_rows:
    normal_value = normal_stats[stats_column].item()
    outlier_value = outlier_stats[stats_column].item()
    # Format the value first (including any "$"), THEN right-align it. Padding a
    # bare number after a literal "$" rendered as "$          4.91".
    normal_text = value_fmt.format(normal_value)
    outlier_text = value_fmt.format(outlier_value)
    ratio_text = ratio_fmt.format(outlier_value / normal_value)
    print(f"   {row_label:<20} {normal_text:>15} {outlier_text:>15} {ratio_text:>9}x")


def _baseline_summary(series):
    """Return median, mean, std and the 25th/75th percentiles of `series` as a dict."""
    return {
        'median': series.median(),
        'mean': series.mean(),
        'std': series.std(),
        'p25': series.quantile(0.25),
        'p75': series.quantile(0.75),
    }


# Calculate baseline statistics (over the full sample) for reference
cost_per_km_stats = _baseline_summary(df_sample['cost_per_km'])
speed_stats = _baseline_summary(df_sample['speed_kmh'])

print(f"\nüìä Baseline Statistics for Reference:")
print(f"   Cost per km: Median = ${cost_per_km_stats['median']:.2f}/km, Mean = ${cost_per_km_stats['mean']:.2f}/km")
print(f"   Speed: Median = {speed_stats['median']:.1f} km/h, Mean = {speed_stats['mean']:.1f} km/h")


üìà Outlier Characteristics vs Normal Trips:
   Metric                        Normal         Outlier      Ratio
   -------------------- --------------- --------------- ----------
   Cost/km              $          4.91 $         27.54       5.6x
   Speed (km/h)                    21.7            10.0      0.46x
   Duration (min)                  18.5             9.3      0.51x
   Distance (km)                    7.2             1.3      0.18x

üìä Baseline Statistics for Reference:
   Cost per km: Median = $4.18/km, Mean = $5.14/km
   Speed: Median = 18.7 km/h, Mean = 21.6 km/h

üìä Baseline Statistics for Reference:
   Cost per km: Median = $4.18/km, Mean = $5.14/km
   Speed: Median = 18.7 km/h, Mean = 21.6 km/h


---

# § 3. Visualization: Cost vs Distance Anomaly Detection

In [9]:
# =============================================================================
# FIGURE 4.1 - COST VS DISTANCE ANOMALY DETECTION
# Principles: SWD (Focus, Contrast), Lesson 11 (Overplotting handling), Uber Style
#
# Traces are added in this order: dotted $/km reference lines, a 2D density of
# trips NOT flagged `is_extreme_outlier`, then red markers for flagged trips.
# =============================================================================

FIG_NAME = "fig_4_1_cost_distance_anomaly"

# View window shared by the data filter, the reference lines and the axis ranges.
X_MAX_KM = 50
Y_MAX_COST = 200

# Reference pricing rates as ($/km, label yanchor); tuple order = trace order.
# NOTE(review): the baseline printed in section 2 reports a median of $4.18/km,
# i.e. above the upper reference line - confirm $2-$4/km is the intended band.
CORRIDOR_RATES = ((4, "bottom"), (2, "top"))

# Upper bound on non-outlier points fed to the density layer.
MAX_NORMAL_POINTS = 50_000

# The setup module loads sample files for 2019 through 2025, so the footer must
# not claim a single year.
DATA_SOURCE_TEXT = "Source: TLC High-Volume FHV Records (2019-2025)"
SYNTHETIC_SOURCE_TEXT = "Source: SYNTHETIC demo data (df_outliers was not available)"

# ------------------------------------------------------------
# 0. LOAD / SAVE LOGIC
# ------------------------------------------------------------
# The plot helpers are expected to come from the setup module. They are looked
# up explicitly (instead of catching NameError around the call) so that a
# NameError raised INSIDE a helper is not mistaken for "helper not imported".
_load_plot = globals().get("load_plot_if_exists")
if callable(_load_plot):
    fig, loaded = _load_plot(FIG_NAME)
else:
    fig, loaded = None, False  # Fallback if function not imported

if not loaded:
    print(f"   üé® Generating {FIG_NAME}...")

    # ------------------------------------------------------------
    # 1. PREPARE DATA
    # ------------------------------------------------------------
    if 'df_outliers' in globals():
        using_synthetic_data = False
        df_plot_source = df_outliers
        if 'cost_per_km_p99' not in globals():
            cost_per_km_p99 = df_outliers["cost_per_km"].quantile(0.99)
        threshold_p99 = cost_per_km_p99
    else:
        # Standalone fallback. The synthetic frame is kept under its own names
        # (never bound to `df_outliers` / `cost_per_km_p99`) so a re-run or a
        # later cell cannot mistake fabricated data for the real sample.
        using_synthetic_data = True
        print("   WARNING: df_outliers is not defined - plotting SYNTHETIC demo data. "
              "The figure is labelled as synthetic and will not be saved.")

        rng = np.random.default_rng(42)  # local generator: global RNG state untouched
        n_rows = 50000
        df_synthetic = pl.DataFrame({
            "trip_km": rng.lognormal(1.5, 0.8, n_rows),
            "total_rider_cost": rng.lognormal(3.0, 0.6, n_rows)
        })

        # Enforce logic: Cost ~ Distance + Noise + Anomalies
        df_synthetic = df_synthetic.with_columns(
            (pl.col("trip_km") * 3 + rng.normal(0, 5, n_rows)).abs().alias("expected_cost")
        ).with_columns(
            (pl.col("expected_cost") + pl.col("total_rider_cost")).abs().alias("total_rider_cost")
        )

        # Drop near-zero distances so cost-per-km stays bounded
        df_synthetic = df_synthetic.filter(pl.col("trip_km") > 0.1)

        # Metrics & thresholds (">=" matches the flag definition used in section 1)
        df_synthetic = df_synthetic.with_columns(
            (pl.col("total_rider_cost") / pl.col("trip_km")).alias("cost_per_km")
        )
        threshold_p99 = df_synthetic["cost_per_km"].quantile(0.99)
        df_plot_source = df_synthetic.with_columns(
            (pl.col("cost_per_km") >= threshold_p99).alias("is_extreme_outlier")
        )

    # 1.1 Filter View Range (Zoom in on relevant area)
    df_viz = df_plot_source.filter(
        (pl.col("trip_km") <= X_MAX_KM) &
        (pl.col("total_rider_cost") <= Y_MAX_COST)
    )

    # 1.2 Split Data: Normal (Context) vs Anomaly (Focus)
    df_normal = df_viz.filter(~pl.col("is_extreme_outlier"))
    df_anom = df_viz.filter(pl.col("is_extreme_outlier"))

    # Cap normal points for performance, but keep all anomalies.
    # Fixed seed so the sampled density is identical across runs.
    n_sample = min(MAX_NORMAL_POINTS, df_normal.height)
    df_normal_samp = df_normal.sample(n_sample, shuffle=True, seed=42)

    pdf_normal = df_normal_samp.to_pandas()
    pdf_anom = df_anom.to_pandas()

    # ------------------------------------------------------------
    # 2. BUILD FIGURE
    # ------------------------------------------------------------
    fig = go.Figure()

    # --- LAYER 1: REFERENCE LINES (Context) ---
    # SWD Principle: Enclosure/Connection - Defining the "Normal" corridor
    x_ref = np.linspace(0, X_MAX_KM, 100)
    label_x = X_MAX_KM - 1  # keep the rate labels just inside the right edge

    for rate_per_km, label_yanchor in CORRIDOR_RATES:
        fig.add_trace(go.Scatter(
            x=x_ref, y=x_ref * rate_per_km,
            mode="lines",
            line=dict(color=ub.GRAY_300, width=1, dash="dot"),
            hoverinfo="skip", showlegend=False
        ))

        fig.add_annotation(
            x=label_x, y=label_x * rate_per_km, text=f"<b>${rate_per_km}/km</b>",
            font=dict(size=10, color=ub.GRAY_600),
            showarrow=False, yanchor=label_yanchor, xanchor="right", bgcolor="rgba(255,255,255,0.8)"
        )

    # --- LAYER 2: DENSITY HEATMAP (Mass Data) ---
    # Lesson 11: Use Density/Heatmaps to solve overplotting
    fig.add_trace(go.Histogram2d(
        x=pdf_normal["trip_km"],
        y=pdf_normal["total_rider_cost"],
        colorscale=[
            [0.0, "rgba(255,255,255,0)"], # Transparent at 0
            [0.1, "#D3EFDE"],             # Very light green
            [0.5, ub.UBER_GREEN],         # Brand Green
            [1.0, "#0E3F25"]              # Deep Green
        ],
        nbinsx=120, nbinsy=120,
        showscale=False, # Remove colorbar to reduce clutter (SWD)
        hoverinfo="none"
    ))

    # --- LAYER 3: ANOMALIES (Focus) ---
    # SWD Principle: Preattentive Attribute (Color Red)
    fig.add_trace(go.Scattergl(
        x=pdf_anom["trip_km"],
        y=pdf_anom["total_rider_cost"],
        mode="markers",
        marker=dict(
            size=6,
            color=ub.UBER_RED,
            opacity=0.8,
            line=dict(width=0.5, color="white")
        ),
        name="Extreme Anomalies",
        hovertemplate=(
            "<b>Anomaly</b><br>"
            "Dist: %{x:.1f} km<br>"
            "Cost: $%{y:.2f}<extra></extra>"
        )
    ))

    # ------------------------------------------------------------
    # 3. UBER LAYOUT & STORYTELLING
    # ------------------------------------------------------------

    # Title: Explanatory (What is the takeaway?)
    title_main = "Cost vs Distance Anomaly Detection"
    title_sub = f"Identifying extreme pricing anomalies (> ${threshold_p99:.2f}/km)"
    formatted_title = ub.format_title(title_main, title_sub)

    fig.update_layout(
        template="uber",
        title=dict(text=formatted_title),
        width=1100,
        height=750,

        # Large bottom margin leaves room for the caption, footer and logo
        margin=dict(l=80, r=40, t=120, b=200),

        xaxis=dict(title="Trip Distance (km)", range=[0, X_MAX_KM], zeroline=False),
        yaxis=dict(title="Total Rider Cost ($)", range=[0, Y_MAX_COST], zeroline=False, tickprefix="$"),

        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        hovermode="closest"
    )

    # Insight Caption (Bottom)
    caption_text = (
        "<b>Insight:</b> The <span style='color:#47B275'><b>green density</b></span> represents the standard pricing corridor ($2‚Äì4/km).<br>"
        "The <span style='color:#F25138'><b>red cluster</b></span> on the left reveals extreme anomalies in short trips (< 5km),<br>"
        "likely attributed to minimum fare floors, heavy congestion surcharges, or data errors."
    )

    fig.add_annotation(
        # y is in paper coordinates; -0.22 places the caption below the x-axis labels
        x=0, y=-0.22,
        xref="paper", yref="paper",
        text=caption_text,
        showarrow=False,
        font=dict(size=13, color=ub.GRAY_600, family="Uber Move Text"),
        align="left", xanchor="left"
    )

    # Add Branding (Footer & Logo), positioned below the caption so they do not touch it
    footer_source_text = SYNTHETIC_SOURCE_TEXT if using_synthetic_data else DATA_SOURCE_TEXT
    fig = ub.add_source_footer(fig, source_text=footer_source_text, footer_y=-0.30)
    fig = ub.add_uber_logo(fig, position="bottom_right", logo_y=-0.34)

    # ------------------------------------------------------------
    # 4. SAVE
    # ------------------------------------------------------------
    _save_plot = globals().get("save_plot")
    if using_synthetic_data:
        # Presumably load_plot_if_exists() would later return whatever is saved
        # under FIG_NAME, so a synthetic figure must never be stored there.
        print(f"   WARNING: {FIG_NAME} was built from synthetic data and was NOT saved.")
    elif callable(_save_plot):
        _save_plot(fig, FIG_NAME)
        print(f"   ‚úÖ {FIG_NAME} generated and saved")
    else:
        print("   ‚ö†Ô∏è save_plot function not found. Skipping file save.")

# Inline display is intentionally off (Lesson 11: Clean notebook).
# Call fig.show() manually to preview the figure.

   üé® Generating fig_4_1_cost_distance_anomaly...
   ‚úÖ fig_4_1_cost_distance_anomaly generated and saved
   ‚úÖ fig_4_1_cost_distance_anomaly generated and saved


---

## 💡 Key Insights

**Root Causes of Cost Outliers:**

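1. **Severe Congestion (Primary):**
   - Outlier trips are markedly slower: 10.0 km/h on average versus 21.7 km/h for normal trips (0.46x, see § 2)
   - Average cost-per-km is 5.6x higher for outliers ($27.54/km vs $4.91/km)
   - An average outlier speed of about 10 km/h is consistent with stop-and-go traffic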

2. **Minimum Fare Impact (Secondary):**
   - Short trips (<2 km) disproportionately affected
   - Base fare creates high cost-per-km for very short distances
   - Platform economics require minimum viable revenue

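3. **Likely NOT Fraud:**
   - Outliers follow systematic patterns (slow speeds, short distances) rather than random ones
   - These patterns are consistent with actual operational stress conditions
   - Individual records were not audited in this module, so isolated data errors cannot be ruled out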

**Business Recommendations:**
- Review minimum fare policies for equity
- Implement congestion-aware routing
- Dynamic pricing should reflect operational cost, not just demand

---

**Next Module:** `04b2_Duration_Paradox.ipynb` (Duration-distance analysis)