In [3]:
# =============================================================================
# § 1. ENVIRONMENT SETUP
# =============================================================================

import os
import sys
import json
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path

# Plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# =============================================================================
# PLOTLY + UBER STYLE BOOTSTRAP
# =============================================================================

import uber_style as ub 

pio.templates["uber"] = ub.uber_style_template
pio.templates.default = "uber"

from uber_style import *

# Paths
# Input data folders, given relative to the notebook's working directory.
SAMPLE_DIR = './HVFHV subsets 2019-2025 - Samples/'
AGG_PROCESSED_DIR = './HVFHV subsets 2019-2025 - Aggregates/Aggregates_Processed/'
AGG_RAW_DIR = './HVFHV subsets 2019-2025 - Aggregates/Aggregates_Raw/'

# Output folder for saved figures; created here if it does not exist yet.
PLOT_DIR = Path("plots")
PLOT_DIR.mkdir(exist_ok=True)


def _plot_paths(fig_name: str):
    """Build the cache file paths for one figure name.

    Returns:
        A ``(json_path, html_path)`` tuple; both files live in ``PLOT_DIR``
        and are named ``<fig_name>.json`` / ``<fig_name>.html``.
    """
    return tuple(PLOT_DIR / f"{fig_name}.{ext}" for ext in ("json", "html"))


def load_plot_if_exists(fig_name: str):
    """Load a previously saved figure from its JSON file, if there is one.

    Returns:
        ``(fig, True)`` when the figure's JSON file exists, with ``fig``
        rebuilt from that file via ``pio.from_json``; ``(None, False)``
        when the file does not exist.
    """
    cached_json = _plot_paths(fig_name)[0]

    # Guard clause: nothing cached for this figure name.
    if not cached_json.exists():
        return None, False

    payload = cached_json.read_text(encoding="utf-8")
    return pio.from_json(payload), True


def save_plot(fig, fig_name: str):
    """Write ``fig`` to disk as ``<fig_name>.json`` and ``<fig_name>.html``.

    The figure is only written, never displayed: the HTML export is called
    with ``auto_open=False`` and loads plotly.js from the CDN.
    """
    target_json, target_html = _plot_paths(fig_name)

    # JSON copy (this is the file load_plot_if_exists reads back).
    target_json.write_text(pio.to_json(fig), encoding="utf-8")

    # Standalone HTML copy.
    pio.write_html(fig, file=str(target_html), include_plotlyjs="cdn", auto_open=False)
# Setup confirmation banner, emitted as one joined write (same text as
# printing each line separately).
print("\n".join([
    "=" * 80,
    "ENVIRONMENT INITIALIZED",
    "=" * 80,
    "   ‚úÖ Uber style template registered",
    "   ‚úÖ Data paths configured",
    "   ‚úÖ Plot utilities loaded",
    "=" * 80,
]))

ENVIRONMENT INITIALIZED
   ‚úÖ Uber style template registered
   ‚úÖ Data paths configured
   ‚úÖ Plot utilities loaded


In [4]:
# -----------------------------------------------------------------------------
# § 2. DATA LOADING (MEMORY-OPTIMIZED)
# -----------------------------------------------------------------------------

def load_sample_data(
    directory: str,
    max_rows_per_file: int = 50000,
    columns: "list[str] | None" = None,
) -> "pl.DataFrame":
    """Load a capped number of rows from every processed sample file.

    Files named ``tlc_sample_*_processed.parquet`` inside ``directory`` are
    read in sorted filename order. ``os.listdir`` returns entries in arbitrary
    order, so sorting keeps the row order of the result reproducible.

    Args:
        directory: Folder containing the sample parquet files.
        max_rows_per_file: Max rows taken from each file (default 50K). The
            cap is applied with ``head`` on the lazy scan before ``collect``,
            so it keeps the leading rows of each file rather than a random draw.
        columns: Columns to read from each file. Defaults to
            ``['total_rider_cost']``, the only column the cost analysis needs.

    Returns:
        A single DataFrame with the per-file chunks stacked vertically.

    Raises:
        FileNotFoundError: If ``directory`` holds no matching sample file.
    """
    sample_files = sorted(
        f for f in os.listdir(directory)
        if f.startswith('tlc_sample_') and f.endswith('_processed.parquet')
    )

    if not sample_files:
        raise FileNotFoundError(f"No sample files found in {directory}")

    # Only load required columns
    required_cols = ['total_rider_cost'] if columns is None else list(columns)

    print(f"   üìÅ Found {len(sample_files)} files, loading max {max_rows_per_file:,} rows each...")

    dfs = []
    for fname in sample_files:
        fpath = os.path.join(directory, fname)
        # Select + head are applied to the lazy scan; only collect() materialises rows.
        df_chunk = pl.scan_parquet(fpath).select(required_cols).head(max_rows_per_file).collect()
        dfs.append(df_chunk)
        print(f"      ‚Ä¢ {fname}: {df_chunk.height:,} rows")

    df_combined = pl.concat(dfs, how='vertical')
    return df_combined

# Progress header for the load step.
print("‚è≥ Loading data for cost analysis...")
print("-" * 60)

try:
    print("üìä Loading Sample Data (tlc_sample_*_processed)...")
    df_sample = load_sample_data(SAMPLE_DIR)
    print("   ‚úÖ Loaded: {:,} trip samples".format(df_sample.height))
    print("   üìÖ Coverage: Full 2019-2025 period (stratified sampling)")

    # Closing banner: blank line, rule, message, rule.
    print(
        "\n" + "=" * 60,
        "‚úÖ DATA LOADING COMPLETE - Ready for cost analysis",
        "=" * 60,
        sep="\n",
    )

except Exception as load_error:
    # Report the failure, then re-raise so the cell still stops with the error.
    print("\n‚ùå ERROR: Data loading failed")
    print(f"   Details: {load_error!s}")
    raise

‚è≥ Loading data for cost analysis...
------------------------------------------------------------
üìä Loading Sample Data (tlc_sample_*_processed)...
   üìÅ Found 7 files, loading max 50,000 rows each...
      ‚Ä¢ tlc_sample_2019_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2020_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2021_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2022_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2023_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2024_processed.parquet: 50,000 rows
      ‚Ä¢ tlc_sample_2025_processed.parquet: 50,000 rows
   ‚úÖ Loaded: 350,000 trip samples
   üìÖ Coverage: Full 2019-2025 period (stratified sampling)

‚úÖ DATA LOADING COMPLETE - Ready for cost analysis


In [5]:
# =============================================================================
# § 3. COST DISTRIBUTION ANALYSIS
# =============================================================================

def analyze_distribution(df: pl.DataFrame, column: str) -> dict:
    """Summarise one column of ``df`` as a dict of distribution statistics.

    Returns:
        A dict with keys ``mean``, ``std``, ``min``, ``max`` followed by the
        quantiles ``p5``, ``p25``, ``p50``, ``p75``, ``p95`` and ``p99``.
    """
    values = df[column]

    summary = {
        'mean': values.mean(),
        'std': values.std(),
        'min': values.min(),
        'max': values.max(),
    }

    # Quantile keys are appended in ascending order after the moment statistics.
    quantile_levels = (
        ('p5', 0.05),
        ('p25', 0.25),
        ('p50', 0.50),
        ('p75', 0.75),
        ('p95', 0.95),
        ('p99', 0.99),
    )
    for key, level in quantile_levels:
        summary[key] = values.quantile(level)

    return summary

print("\n" + "=" * 80)
print("ANALYSIS 3.3: TOTAL RIDER COST DISTRIBUTION")
print("=" * 80)

# Filter outliers for visualization clarity: keep strictly positive costs up
# to $120. Every statistic printed below describes this filtered subset, not
# the full sample.
cost_data = df_sample.filter(
    (pl.col('total_rider_cost') > 0) & (pl.col('total_rider_cost') <= 120)
)

# Calculate statistics
cost_stats = analyze_distribution(cost_data, 'total_rider_cost')

print("\nüìä Cost Distribution Statistics:")
print(f"   Sample size (filtered): {cost_data.height:,} trips")
print(f"   Median (P50): ${cost_stats['p50']:.2f}")
print(f"   Mean: ${cost_stats['mean']:.2f}")
print(f"   Std Dev: ${cost_stats['std']:.2f}")
print(f"   IQR (P25-P75): ${cost_stats['p25']:.2f} - ${cost_stats['p75']:.2f}")
# The value shown is the 95th percentile, so the label must say 95%
# (it previously said 90%).
print(f"   95% of trips cost: < ${cost_stats['p95']:.2f}")

# Convert to a flat numpy array for plotting in the figure cell.
cost_array = cost_data.select('total_rider_cost').to_numpy().flatten()


ANALYSIS 3.3: TOTAL RIDER COST DISTRIBUTION

üìä Cost Distribution Statistics:
   Sample size (filtered): 348,645 trips
   Median (P50): $17.54
   Mean: $22.87
   Std Dev: $17.10
   IQR (P25-P75): $11.31 - $28.53
   90% of trips cost: < $58.14


In [None]:
# =============================================================================
# FIGURE 3.3 — COST DISTRIBUTION (Refined: Uber Style Module)
# =============================================================================

import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import uber_style as ub  # Importing the provided style module

# File stem used for the cached JSON/HTML. NOTE(review): the stem says "1_6"
# while the notebook labels this figure 3.3; it is left unchanged so existing
# cache files keep matching.
FIG_NAME = "fig_1_6_cost_distribution"

# ------------------------------------------------------------
# 0. LOAD / SAVE LOGIC
# ------------------------------------------------------------
# Reuse the cached figure when one exists. The NameError guard lets the cell
# run even when the plot helpers from the setup cell are not defined.
try:
    fig, loaded = load_plot_if_exists(FIG_NAME)
except NameError:
    loaded = False

if not loaded:
    print(f"   üé® Generating {FIG_NAME}...")

    # ------------------------------------------------------------
    # 1. PREPARE DATA
    # ------------------------------------------------------------
    # Mock data generation if 'cost_array' is missing (for standalone execution).
    # 'cost_array' is a notebook-level variable created by the statistics cell,
    # so the check is made against globals().
    if 'cost_array' not in globals():
        # Make the fallback visible: without this message a synthetic figure
        # could be saved under the real figure name without anyone noticing.
        print("   ‚ö†Ô∏è 'cost_array' not found - using SYNTHETIC demo data, not TLC records.")
        np.random.seed(42)
        # 95% mass market trips: Lognormal distribution around $16-$20
        mass_market = np.random.lognormal(mean=2.8, sigma=0.4, size=4800)
        # 5% premium/airport trips: Uniform distribution $60-$140
        premium_trips = np.random.uniform(60, 140, 200)
        cost_array = np.concatenate([mass_market, premium_trips])

    # Calculate key statistics
    p25 = np.percentile(cost_array, 25)
    p50 = np.median(cost_array)
    p75 = np.percentile(cost_array, 75)
    iqr = p75 - p25
    # Upper fence (P75 + 1.5 x IQR); values above it are labelled as outliers.
    max_whisker = p75 + 1.5 * iqr

    # ------------------------------------------------------------
    # 2. BUILD FIGURE
    # ------------------------------------------------------------
    fig = go.Figure()

    # Boxplot with all points (Data Cloud)
    fig.add_trace(go.Box(
        x=cost_array,
        orientation='h',
        name="",

        # Data cloud configuration: Jittered points to show density
        boxpoints='all',
        jitter=0.5,
        pointpos=-1.6,        # Push points below the box

        # Point styling (Context)
        marker=dict(
            color=ub.UBER_GREEN,
            size=2,
            opacity=0.15,  # High transparency to show density
            line=dict(width=0)
        ),

        # Box styling (Structure)
        fillcolor="rgba(255,255,255,0)",  # Transparent box
        line=dict(color=ub.GRAY_900, width=1.5),

        hoverinfo="x"
    ))

    # ------------------------------------------------------------
    # 3. ANNOTATIONS (STORYTELLING LAYERS)
    # ------------------------------------------------------------

    # A. Core Market (The Box)
    fig.add_annotation(
        x=p50, y=0.4,
        text=f"<b>Core Market (${p25:.0f}‚Äì${p75:.0f})</b><br>Median: ${p50:.2f}",
        font=dict(color=ub.GRAY_900, size=11, family="Uber Move Text"),
        showarrow=True, arrowhead=2, arrowsize=1, arrowcolor=ub.GRAY_900,
        ax=0, ay=-30
    )

    # B. Premium/Outliers (The Tail)
    # Position annotation in the whitespace of the outliers:
    # just past the upper fence, but never left of $80.
    outlier_x = max(max_whisker * 1.1, 80)

    fig.add_annotation(
        x=outlier_x, y=-0.4,
        text=f"<b>Long-distance / Airport Trips</b><br>(Outliers > ${max_whisker:.0f})",
        font=dict(color=ub.UBER_RED, size=10, family="Uber Move Text"),
        showarrow=False,
        bgcolor="white", bordercolor=ub.UBER_RED, borderwidth=1, borderpad=4
    )

    # ------------------------------------------------------------
    # 4. UBER LAYOUT
    # ------------------------------------------------------------

    # Title: Descriptive with hierarchy
    formatted_title = ub.format_title(
        "Trip Cost Segmentation",
        "Separating everyday mobility from premium travel demand"
    )

    fig.update_layout(
        template="uber",
        title=dict(text=formatted_title),
        width=1200,
        height=500,
        margin=dict(l=80, r=60, t=100, b=80),

        # X-Axis
        xaxis=dict(
            title="Total Rider Cost ($)",
            title_font=dict(size=12, color=ub.GRAY_600),
            showgrid=True,
            gridcolor=ub.GRAY_300,
            gridwidth=0.5,
            tickprefix="$",
            range=[0, 150],  # Cap range to keep focus on the relevant distribution
            zeroline=True,
            zerolinecolor=ub.GRAY_300
        ),

        # Y-Axis (Purely for layout, hide details)
        yaxis=dict(
            showgrid=False,
            showticklabels=False,
            range=[-1, 1]
        ),

        showlegend=False,
        hovermode=False
    )

    # Branding Footer
    fig = ub.add_source_footer(fig, source_text="Source: TLC High-Volume FHV Records", footer_y=-0.15)
    fig = ub.add_uber_logo(fig, position="bottom_right", logo_y=-0.2)

    # ------------------------------------------------------------
    # 5. SAVE
    # ------------------------------------------------------------
    try:
        save_plot(fig, FIG_NAME)
        print(f"   ‚úÖ {FIG_NAME} generated and saved")

        print(f"\nüí° KEY INSIGHT (Fig 3.3):")
        print(f"   Core market: ${p25:.0f}-${p75:.0f} (50% of trips)")
        # Threshold is a dollar amount, like the other values in this summary.
        print(f"   Premium segment (>${max_whisker:.0f}) represents airport/long-distance travel")
        print("   Clear market segmentation visible in cost distribution")

    except NameError:
        print("   ‚ö†Ô∏è save_plot function not found. Skipping file save.")

else:
    # Cache hit: the figure came from the saved JSON, nothing was recomputed.
    print(f"   ‚úÖ {FIG_NAME} loaded from cache")

# fig.show() 

   ‚úÖ fig_1_6_cost_distribution loaded from cache


### Technical Analysis: Trip Cost Segmentation

#### 1\. Visualization Strategy and Chart Selection

The **Box Plot combined with a Jittered Strip Plot (Data Cloud)** is the optimal choice for visualizing the distribution of **Trip Cost**, a variable characterized by a dense central tendency and a significant long tail of high-value outliers.

  * **Why not a Histogram?** A histogram would show the skew, but it cannot show the individual high-value outliers (Airport/Premium trips) that are business-critical: the long tail would appear as a nearly flat line close to zero frequency.
  * **Why not a standard Box Plot?** A standard box plot abstracts the data into five summary statistics. By overlaying the **raw data points** (`boxpoints='all'`) with jitter, we reveal the *actual density* of the mass market and the *sparsity* of the premium segment, providing a more honest and nuanced view of the customer base.

#### 2\. Adherence to Storytelling with Data (SWD) Principles

##### A. Decluttering

  * **Removal of Y-Axis:** Since the data is one-dimensional (Cost distribution), the Y-axis carries no information. Removing it (`showticklabels=False`) eliminates unnecessary cognitive load.
  * **Transparent Box:** The box itself is rendered with `fillcolor="rgba(255,255,255,0)"` (transparent). This allows the underlying data points to be visible, ensuring that the statistical summary (the box) frames the data rather than obscuring it.

##### B. Preattentive Attributes (Color & Opacity)

  * **Opacity as Density:** The individual data points are rendered with very low opacity (`0.15`). This is a sophisticated technique where overlapping points naturally create darker, more saturated areas (the "Core Market"), while sparse outliers remain faint. This visually encodes density without requiring a separate heatmap.
  * **Color Coding:**
      * **Green (`UBER_GREEN`):** Used for the data points, aligning with the brand and signaling the volume of business.
      * **Red (`UBER_RED`):** Strategically applied to the annotation for the "Long-distance / Airport Trips" segment. This draws the eye to the high-value outliers, effectively segmenting the customer base visually.

##### C. Narrative Structure

  * **Annotated Segmentation:** The chart effectively segments the market into two clear groups via annotations:
    1.  **"Core Market":** Defined by the IQR (about \$11–\$29 in this sample), representing everyday mobility.
    2.  **"Premium/Airport":** Defined by the outliers above the upper fence, P75 + 1.5 × IQR (about \$54 in this sample), representing high-yield trips.
  * This transforms a statistical distribution into a **customer segmentation model**, directly answering business questions about pricing strategy and service tiers.

#### 3\. Conclusion

This visualization is a powerful example of combining statistical rigor with business storytelling. It moves beyond simply showing *how much* trips cost to showing *who* the customers are (Mass Market vs. Premium), facilitating strategic decisions on product differentiation (e.g., UberX vs. Uber Black).