In [1]:
# =============================================================================
# PROJECT ATLAS: 001c1. DISTANCE DISTRIBUTION ANALYSIS
# =============================================================================
#
# OBJECTIVE: Analyze trip distance distribution patterns
# DATA SOURCE: tlc_sample_*_processed.parquet
# =============================================================================

# -----------------------------------------------------------------------------
# § 1. ENVIRONMENT SETUP
# -----------------------------------------------------------------------------

import polars as pl
import pandas as pd
import numpy as np
import os
import warnings
from datetime import datetime
from pathlib import Path
import plotly.graph_objects as go
import plotly.io as pio
from scipy import stats

# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, so deprecation notices from the imported libraries are
# hidden as well.
warnings.filterwarnings('ignore')

# Configuration
# Directory that load_sample_data() scans for tlc_sample_*_processed.parquet
# files; the path is relative to the current working directory.
SAMPLE_DIR = './HVFHV subsets 2019-2025 - Samples/'

# Uber Style Template
# Register the project's Plotly template under the name "uber" and make it the
# default template for figures created afterwards.
import uber_style as ub
pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"
# NOTE(review): the wildcard import also places uber_style's public names
# directly in the notebook namespace; the cells visible here use the explicit
# `ub.` prefix instead.
from uber_style import *

# Plot cache directory
# Figures are cached here as <name>.json / <name>.html (see save_plot). The
# directory is created relative to the current working directory if missing.
PLOT_DIR = Path("plots")
PLOT_DIR.mkdir(exist_ok=True)

def _plot_paths(fig_name: str):
    """Return the ``(json_path, html_path)`` cache locations for a figure.

    Both paths sit directly under ``PLOT_DIR`` and use ``fig_name`` as the
    file stem. Nothing is created on disk.
    """
    return tuple(PLOT_DIR / f"{fig_name}{suffix}" for suffix in (".json", ".html"))

def load_plot_if_exists(fig_name: str):
    """Load a cached figure from its JSON file, if that file exists.

    Args:
        fig_name: File stem of the cached figure.

    Returns:
        ``(fig, True)`` when ``<fig_name>.json`` is present in the plot cache
        directory, otherwise ``(None, False)``.
    """
    cache_file = _plot_paths(fig_name)[0]
    if not cache_file.exists():
        return None, False

    serialized = cache_file.read_text(encoding="utf-8")
    return pio.from_json(serialized), True

def save_plot(fig, fig_name: str):
    """Persist a figure to the plot cache as JSON and as standalone HTML.

    Args:
        fig: Plotly figure to store.
        fig_name: File stem; the outputs are ``<fig_name>.json`` and
            ``<fig_name>.html`` under ``PLOT_DIR``.
    """
    json_path, html_path = _plot_paths(fig_name)

    # Serialize *before* touching the file. Opening in "w" mode truncates
    # immediately, so a failure inside to_json() would otherwise leave an empty
    # .json behind that load_plot_if_exists() later picks up as a cache hit.
    payload = pio.to_json(fig)
    json_path.write_text(payload, encoding="utf-8")

    pio.write_html(fig, file=str(html_path), include_plotlyjs="cdn", auto_open=False)

# Confirmation banner for the setup cell (check-mark glyph restored from its
# mis-encoded form; the second line needs no f-string).
print("✅ Environment configured successfully")
print("   - Notebook: 001c1_Distance_Distribution")

‚úÖ Environment configured successfully
   - Notebook: 001c1_Distance_Distribution


In [2]:
# -----------------------------------------------------------------------------
# § 2. DATA LOADING
# -----------------------------------------------------------------------------

def load_sample_data(directory: str, max_rows_per_file: int = 50000) -> pl.DataFrame:
    """Load and concatenate the sample parquet files, capped per file.

    At most ``max_rows_per_file`` rows are taken from each file via a lazy
    scan followed by ``head``. Note that this is the leading rows of each
    file, not a random draw from it.

    Args:
        directory: Folder containing ``tlc_sample_*_processed.parquet`` files.
        max_rows_per_file: Upper bound on the rows taken from each file.

    Returns:
        A single DataFrame with the per-file chunks stacked vertically in
        ascending file-name order.

    Raises:
        FileNotFoundError: If ``directory`` holds no matching file.
        ValueError: If a loaded file lacks one of the required columns.
    """
    required_cols = ['trip_km']

    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across machines and runs.
    sample_files = sorted(
        f for f in os.listdir(directory)
        if f.startswith('tlc_sample_') and f.endswith('_processed.parquet')
    )

    if not sample_files:
        raise FileNotFoundError(f"No sample files found in {directory}")

    print(f"   📁 Found {len(sample_files)} files, loading max {max_rows_per_file:,} rows each...")

    dfs = []
    for fname in sample_files:
        fpath = os.path.join(directory, fname)
        # Use lazy loading and limit rows
        df_chunk = pl.scan_parquet(fpath).head(max_rows_per_file).collect()

        # Validate each file as it is read so the error names the offending
        # file instead of surfacing later as a schema mismatch in pl.concat.
        missing = [col for col in required_cols if col not in df_chunk.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing} (file: {fname})")

        dfs.append(df_chunk)
        print(f"      • {fname}: {df_chunk.height:,} rows")

    return pl.concat(dfs, how='vertical')

# Load the sample once for the whole notebook. Any failure is reported and then
# re-raised so that later cells never run without `df_sample`.
print("⏳ Loading data for distance analysis...")
print("-" * 60)

try:
    print("📊 Loading Sample Data (tlc_sample_*_processed)...")
    df_sample = load_sample_data(SAMPLE_DIR)
    print(f"   ✅ Loaded: {df_sample.height:,} trip samples")
    print("   📅 Coverage: Full 2019-2025 period (stratified sampling)")

    print("\n" + "=" * 60)
    print("✅ DATA LOADING COMPLETE - Ready for distance analysis")
    print("   Note: Only §5.1 Distance Distribution in this notebook")
    print("=" * 60)

except Exception as e:
    print("\n❌ ERROR: Data loading failed")
    # An f-string already applies str() to the exception.
    print(f"   Details: {e}")
    raise

‚è≥ Loading data for distance analysis...
------------------------------------------------------------
üìä Loading Sample Data (tlc_sample_*_processed)...
   üìÅ Found 7 files, loading max 50,000 rows each...
      ‚Ä¢ tlc_sample_2019_processed.parquet: 50,000 rows      ‚Ä¢ tlc_sample_2019_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2020_processed.parquet: 50,000 rows

      ‚Ä¢ tlc_sample_2020_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2021_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2022_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2021_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2022_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2023_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2024_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2023_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2024_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2025_processed.parquet: 50,000 rows
   ‚úÖ Loaded: 350,000 trip samples
   üìÖ Cove

In [3]:
# =============================================================================
# § 5.1 DISTANCE DISTRIBUTION ANALYSIS
# =============================================================================

def analyze_distribution(df: pl.DataFrame, column: str) -> dict:
    """Summarize one column: moments, extremes and selected percentiles.

    Args:
        df: Frame holding the data.
        column: Name of the column to summarize.

    Returns:
        Dict with keys ``mean``, ``std``, ``min``, ``max``, ``p5``, ``p25``,
        ``p50``, ``p75``, ``p95`` and ``p99``, in that order.
    """
    values = df[column]

    summary = {
        'mean': values.mean(),
        'std': values.std(),
        'min': values.min(),
        'max': values.max(),
    }

    # Percentile label -> quantile level, evaluated in display order.
    quantile_levels = (
        ('p5', 0.05),
        ('p25', 0.25),
        ('p50', 0.50),
        ('p75', 0.75),
        ('p95', 0.95),
        ('p99', 0.99),
    )
    for label, level in quantile_levels:
        summary[label] = values.quantile(level)

    return summary

print("\n" + "=" * 80)
print("ANALYSIS 3.1: TRIP DISTANCE DISTRIBUTION")
print("=" * 80)

# Filter outliers for visualization clarity: keep strictly positive distances
# up to and including 50 km.
distance_data = df_sample.filter(
    (pl.col('trip_km') > 0) & (pl.col('trip_km') <= 50)
)

# Statistics of an empty column cannot be formatted with ':.2f' below; fail
# early with a clear message instead of an opaque TypeError.
if distance_data.height == 0:
    raise ValueError("No trips with 0 < trip_km <= 50 found in df_sample")

# Calculate statistics
dist_stats = analyze_distribution(distance_data, 'trip_km')

print("\n📊 Distance Distribution Statistics:")
print(f"   Sample size (filtered): {distance_data.height:,} trips")
print(f"   Median (P50): {dist_stats['p50']:.2f} km")
print(f"   Mean: {dist_stats['mean']:.2f} km")
print(f"   Std Dev: {dist_stats['std']:.2f} km")
print(f"   IQR (P25-P75): {dist_stats['p25']:.2f} - {dist_stats['p75']:.2f} km")
# The value shown is the 95th percentile, so the label must say 95%, not 90%.
print(f"   95% of trips: < {dist_stats['p95']:.2f} km")

# Convert to numpy for plotting (1-D array consumed by the figure cell)
dist_array = distance_data.select('trip_km').to_numpy().flatten()


ANALYSIS 3.1: TRIP DISTANCE DISTRIBUTION

üìä Distance Distribution Statistics:
   Sample size (filtered): 349,858 trips
   Median (P50): 4.38 km
   Mean: 6.89 km
   Std Dev: 6.81 km
   IQR (P25-P75): 2.38 - 8.75 km
   90% of trips: < 21.68 km


In [4]:
# =============================================================================
# FIGURE 3.1 — DISTANCE DISTRIBUTION (Refined: Uber Style Module)
# =============================================================================

from scipy import stats
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
import uber_style as ub  # Importing the provided style module

FIG_NAME = "fig_1_4_distance_distribution"

# ------------------------------------------------------------
# 0. LOAD / SAVE LOGIC
# ------------------------------------------------------------
# Reuse the cached figure when the cache helper from the setup cell exists.
# Testing for the helper explicitly (instead of catching NameError around the
# call) keeps a NameError raised *inside* the helper from being mistaken for
# "helper not defined" and silently triggering a rebuild.
if 'load_plot_if_exists' in globals():
    fig, loaded = load_plot_if_exists(FIG_NAME)
else:
    # Standalone execution without the setup cell: always build the figure.
    loaded = False

if not loaded:
    print(f"   🎨 Generating {FIG_NAME}...")

    # ------------------------------------------------------------
    # 1. PREPARE DATA
    # ------------------------------------------------------------
    # Mock data generation if 'dist_array' is missing (for standalone execution)
    if 'dist_array' not in locals():
        # Announce the substitution: the chart carries a TLC source footer and
        # would otherwise be indistinguishable from one built on real data.
        print("   ⚠️ 'dist_array' not found. Using synthetic log-normal mock data.")
        np.random.seed(42)
        dist_array = np.random.lognormal(mean=1.5, sigma=0.8, size=10000)
        dist_array = dist_array[dist_array <= 50]

    # Markers are recomputed from the exact array being plotted so they always
    # agree with the curve (this rebinds the notebook-level `dist_stats`).
    dist_stats = {
        'mean': np.mean(dist_array),
        'p25': np.percentile(dist_array, 25),
        'p50': np.median(dist_array),
        'p75': np.percentile(dist_array, 75)
    }

    # Gaussian KDE evaluated on a fixed 0-50 km grid of 500 points.
    data_plot = dist_array
    x_kde = np.linspace(0, 50, 500)
    kde = stats.gaussian_kde(data_plot)
    y_kde = kde(x_kde)
    # Peak density; every annotation height below is a fraction of it.
    y_max_kde = y_kde.max()

    p25, p50, p75 = dist_stats['p25'], dist_stats['p50'], dist_stats['p75']
    mean_val = dist_stats['mean']

    # ------------------------------------------------------------
    # 2. BUILD FIGURE
    # ------------------------------------------------------------
    fig = go.Figure()

    # A. Histogram (Context - Background Layer)
    fig.add_trace(go.Histogram(
        x=data_plot,
        histnorm='probability density',
        # Use Uber Green with transparency for context
        marker=dict(color=ub.UBER_GREEN),
        opacity=0.2,
        name="Trips",
        xbins=dict(size=1.0),
        hoverinfo="x"
    ))

    # B. KDE Curve (Signal - Foreground Layer)
    fig.add_trace(go.Scatter(
        x=x_kde,
        y=y_kde,
        mode='lines',
        line=dict(color=ub.UBER_GREEN, width=3),
        name="Distribution",
        fill='tozeroy',
        fillcolor='rgba(71, 178, 117, 0.05)',  # Very light fill
        hoverinfo="skip"
    ))

    # ------------------------------------------------------------
    # 3. ANNOTATIONS (STRATEGIC LAYERING)
    # ------------------------------------------------------------

    # --- LAYER 1: MEDIAN (PRIMARY FOCUS) ---
    # Red Dashed Line for Median (Skewness Indicator), drawn slightly above
    # the KDE peak so its label clears the curve.
    fig.add_shape(
        type="line",
        x0=p50, x1=p50,
        y0=0, y1=y_max_kde * 1.1,
        line=dict(color=ub.UBER_RED, width=2, dash="dash"),
        layer="above"
    )

    fig.add_annotation(
        x=p50, y=y_max_kde * 1.1,
        text=f"<b>Median<br>{p50:.1f} km</b>",
        font=dict(color=ub.UBER_RED, size=12, family="Uber Move Text"),
        showarrow=False,
        yshift=10,
        bgcolor="white", bordercolor=ub.UBER_RED, borderwidth=1, borderpad=4
    )

    # --- LAYER 2: MEAN (REFERENCE) ---
    # Shorter than the median line so the two labels do not collide.
    mean_height = y_max_kde * 0.75

    fig.add_shape(
        type="line",
        x0=mean_val, x1=mean_val,
        y0=0, y1=mean_height,
        line=dict(color=ub.GRAY_900, width=1.5, dash="dot")
    )

    fig.add_annotation(
        x=mean_val, y=mean_height,
        text=f"Mean: {mean_val:.1f} km",
        font=dict(color=ub.GRAY_900, size=11, family="Uber Move Text"),
        showarrow=True,
        arrowhead=0, arrowsize=1,
        ax=40, ay=0,
        bgcolor="rgba(255,255,255,0.9)",
        borderpad=2
    )

    # --- LAYER 3: IQR (CONTEXT) ---
    iqr_height = y_max_kde * 0.15

    fig.add_annotation(
        x=(p25 + p75) / 2,
        y=iqr_height,
        text=f"<b>Middle 50% (IQR)</b><br>{p25:.1f} - {p75:.1f} km",
        showarrow=False,
        font=dict(size=10, color=ub.GRAY_600, family="Uber Move Text"),
        bgcolor="rgba(255,255,255,0.95)",
        yshift=15
    )

    # IQR Bracket: one horizontal bar plus a short vertical tick at each end.
    fig.add_shape(type="line", x0=p25, x1=p75, y0=iqr_height, y1=iqr_height, line=dict(color=ub.GRAY_600, width=1))
    fig.add_shape(type="line", x0=p25, x1=p25, y0=iqr_height*0.8, y1=iqr_height*1.2, line=dict(color=ub.GRAY_600, width=1))
    fig.add_shape(type="line", x0=p75, x1=p75, y0=iqr_height*0.8, y1=iqr_height*1.2, line=dict(color=ub.GRAY_600, width=1))


    # ------------------------------------------------------------
    # 4. UBER LAYOUT & STORYTELLING
    # ------------------------------------------------------------

    # Title: Descriptive with hierarchy
    formatted_title = ub.format_title(
        "Trip Distance Distribution",
        "Demand is highly concentrated in short trips (Median < Mean)"
    )

    fig.update_layout(
        template="uber",
        title=dict(text=formatted_title),
        width=1200,
        height=600,
        margin=dict(l=80, r=60, t=120, b=100),  # Adjusted for footer

        # X-Axis
        xaxis=dict(
            title="Distance (km)",
            title_font=dict(size=12, color=ub.GRAY_600),
            showgrid=False,
            range=[0, 30],  # Zoom to relevant area
            tickfont=dict(color=ub.GRAY_600),
            zeroline=True,
            zerolinecolor=ub.GRAY_300
        ),

        # Y-Axis (Hidden to reduce clutter)
        yaxis=dict(visible=False),

        showlegend=False,
        hovermode="x"
    )

    # Branding Footer
    fig = ub.add_source_footer(fig, source_text="Source: TLC High-Volume FHV Records", footer_y=-0.15)
    fig = ub.add_uber_logo(fig, position="bottom_right", logo_y=-0.2)

    # ------------------------------------------------------------
    # 5. SAVE
    # ------------------------------------------------------------
    # Only the *absence* of the helper means "skip saving". Errors raised while
    # saving are real failures and must surface rather than be reported as a
    # missing function.
    if 'save_plot' in globals():
        save_plot(fig, FIG_NAME)
        print(f"   ✅ {FIG_NAME} generated and saved")
    else:
        print("   ⚠️ save_plot function not found. Skipping file save.")

    # The insight depends only on the computed statistics, not on whether the
    # figure could be written to disk.
    print("\n💡 KEY INSIGHT (Fig 3.1):")
    print(f"   Typical trip distance: {dist_stats['p50']:.1f} km (median)")
    print("   This aligns with intra-borough travel (neighborhood-scale mobility)")
    print("   Platform serves primarily short-distance urban trips")

# fig.show() 


   üé® Generating fig_1_4_distance_distribution...


   ‚úÖ fig_1_4_distance_distribution generated and saved

üí° KEY INSIGHT (Fig 3.1):
   Typical trip distance: 4.4 km (median)
   This aligns with intra-borough travel (neighborhood-scale mobility)
   Platform serves primarily short-distance urban trips


### Technical Analysis: Trip Distance Distribution

#### 1\. Visualization Strategy and Chart Selection

The **Histogram combined with a Kernel Density Estimate (KDE)** is the statistically robust choice for visualizing the distribution of continuous numerical data (Trip Distance).

  * **Dual Representation:** The histogram provides the "ground truth" of the data frequency in discrete bins, while the KDE curve offers a smooth, generalized approximation of the probability density function. This dual approach validates the distribution's shape (unimodal, right-skewed) to the analyst while presenting a clean curve to the executive audience.
  * **Alternative Rejection:** A **Box Plot** was considered for its compactness in showing statistical summary (Median, IQR). However, box plots often obscure the underlying distribution shape (e.g., hiding bimodality or the specific nature of the tail). The chosen design overcomes this by *overlaying* the statistical markers (Median, IQR) directly onto the distribution curve.

#### 2\. Adherence to Storytelling with Data (SWD) Principles

##### A. Decluttering

  * **Y-Axis Removal:** The probability-density values on the Y-axis (the histogram is normalized with `histnorm='probability density'`) are abstract and less meaningful to a business audience than the relative shape of the curve. By hiding the Y-axis (`yaxis=dict(visible=False)`), the visual load is reduced, forcing the viewer to focus on the *distribution* and the *central tendency* rather than arbitrary axis ticks.
  * **Layering:** The histogram is rendered with low opacity (`0.2`), pushing it to the visual background as "context," while the KDE line is solid and prominent, acting as the "signal."

##### B. Preattentive Attributes (Color & Position)

  * **Semantic Coloring:**
      * **Green (`UBER_GREEN`):** Used for the distribution curve, establishing it as the primary data entity.
      * **Red (`UBER_RED`):** Strategically applied to the **Median** line. In right-skewed distributions (like this roughly log-normal one), the Median is a more robust measure of "central tendency" than the Mean. The red color acts as a visual anchor, drawing the eye immediately to the "typical" trip distance (4.4 km).
      * **Grey (`GRAY_900`):** Used for the Mean line, subordinating it to the Median to highlight the skewness (Mean \> Median) without competing for primary attention.

##### C. Narrative Structure

  * **Annotations as Narrative:** Instead of a legend, statistical indicators are labeled directly on the chart. The annotations are vertically stratified to avoid collision:
      * **Top Layer:** Median (Most important).
      * **Middle Layer:** Mean (Reference).
      * **Bottom Layer:** IQR (Spread of the middle 50%).
  * **Action Title:** The subtitle explicitly states the takeaway: "Demand is highly concentrated in short trips (Median \< Mean)," guiding the interpretation of the visual skew.

#### 3\. Conclusion

This visualization effectively synthesizes statistical rigor with business clarity. By prioritizing the Median via color contrast and simplifying the axis layout, it communicates the "short-haul" nature of the service model instantly, facilitating strategic discussions on pricing tiers or driver incentives for short vs. long trips.