In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from rich.console import Console
from rich.table import Table
from IPython.display import display, HTML

# --- Visual Styling ---
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.2f}".format)
console = Console()

# Standard Color Palette
COLOR_PASS = "#2E8B57"  # SeaGreen
COLOR_FAIL = "#CD5C5C"  # IndianRed
COLOR_WARN = "#FF8C00"  # DarkOrange


def display_header(text, subtext=""):
    """Orion Standard Header"""
    console.rule(f"[bold cyan]{text}")
    if subtext:
        console.print(f"[italic dim]{subtext}[/]", justify="center")


In [2]:
# ==========================================
# ‚öôÔ∏è CONFIGURATION
# ==========================================
AUDIT_FILE_PATH = "TLC_Universal_Audit_Report_Processed.csv"  # <--- CHANGE THIS PATH
# ==========================================

# Load Data
try:
    df = pd.read_csv(AUDIT_FILE_PATH)

    # Handle Date Parsing (Audit script produces 'audit_month' usually YYYY-MM-DD)
    if "audit_month" in df.columns:
        df["audit_month"] = pd.to_datetime(df["audit_month"])
        df = df.sort_values("audit_month")
        TIME_COL = "audit_month"
    else:
        # Fallback if column name differs
        TIME_COL = df.columns[0]

    print(f"‚úÖ Successfully loaded: {AUDIT_FILE_PATH}")
    print(f"üìÖ Range: {df[TIME_COL].min().date()} to {df[TIME_COL].max().date()}")
    print(f"üìä Months: {len(df)}")
    print(f"üöï Total Rows Audited: {df['total_rows'].sum():,.0f}")

except FileNotFoundError:
    print(f"‚ùå Error: File not found at {AUDIT_FILE_PATH}")


‚úÖ Successfully loaded: TLC_Universal_Audit_Report_Processed.csv
üìÖ Range: 2019-02-01 to 2025-09-01
üìä Months: 80
üöï Total Rows Audited: 983,027,963


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="paradox-check" style="display: none;">1. The Paradox Check (Physics & Logic)</h1>
<!-- VISIBLE H1 -->
<h1 id="paradox-check-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">1. The Paradox Check (Physics & Logic)</h1>

In [3]:
# Identify Paradox Columns dynamically
paradox_cols = [c for c in df.columns if "paradox" in c]

if not paradox_cols:
    print("‚úÖ No Paradox Columns found (Data might be too raw or pre-cleaned).")
else:
    # Summary Table
    p_summary = df[paradox_cols].sum().sort_values(ascending=False).to_frame(name="Total Failures")
    p_summary["% Failure"] = (p_summary["Total Failures"] / df["total_rows"].sum()) * 100

    # Styling
    def highlight_fail(val):
        color = "red" if val > 0 else "green"
        return f"color: {color}; font-weight: bold"

    display(p_summary.style.applymap(highlight_fail).format({"Total Failures": "{:,.0f}", "% Failure": "{:.6f}%"}))

    # Visualization over time
    if p_summary["Total Failures"].sum() > 0:
        fig = px.line(
            df,
            x=TIME_COL,
            y=paradox_cols,
            title="<b>Paradox Violations Over Time</b><br><i>(Spikes indicate bad data ingestion or corruption)</i>",
            markers=True,
        )
        fig.update_layout(hovermode="x unified", height=400)
        fig.show()
    else:
        console.print("[bold green]‚ú® ZERO PARADOXES DETECTED. PHYSICS INTEGRITY: 100%[/]")


  display(p_summary.style.applymap(highlight_fail).format({"Total Failures": "{:,.0f}", "% Failure": "{:.6f}%"}))


Unnamed: 0,Total Failures,% Failure
paradox_teleport_count,0,0.000000%
paradox_slave_labor_count,0,0.000000%
paradox_time_travel_count,0,0.000000%


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="health-matrix" style="display: none;">2. The Health Matrix (Nulls, Zeros, Negatives)</h1>
<!-- VISIBLE H1 -->
<h1 id="health-matrix-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">2. The Health Matrix (Nulls, Zeros, Negatives)</h1>

In [4]:
# Extract Quality Columns
quality_cols = [c for c in df.columns if c.endswith(("_nulls", "_zeros", "_negatives"))]

if quality_cols:
    # Aggregate totals
    q_total = df[quality_cols].sum().reset_index()
    q_total.columns = ["Metric_Full", "Count"]

    # Parse Feature Name and Issue Type
    q_total["Feature"] = q_total["Metric_Full"].apply(lambda x: x.rsplit("_", 1)[0])
    q_total["Issue"] = q_total["Metric_Full"].apply(lambda x: x.rsplit("_", 1)[1])

    # Pivot
    q_pivot = q_total.pivot(index="Feature", columns="Issue", values="Count").fillna(0)

    # Calculate %
    total_global = df["total_rows"].sum()
    for col in ["nulls", "zeros", "negatives"]:
        if col in q_pivot.columns:
            q_pivot[f"{col}_%"] = (q_pivot[col] / total_global) * 100

    # Order columns
    display_cols = []
    for issue in ["nulls", "zeros", "negatives"]:
        if issue in q_pivot.columns:
            display_cols.extend([issue, f"{issue}_%"])

    # Display Heatmap
    display(
        q_pivot[display_cols]
        .style.background_gradient(cmap="Reds", subset=[c for c in display_cols if "%" in c])
        .format("{:,.0f}", subset=[c for c in display_cols if "%" not in c])
        .format("{:.4f}%", subset=[c for c in display_cols if "%" in c])
    )
else:
    print("No health metrics found in report.")


Issue,nulls,nulls_%,zeros,zeros_%,negatives,negatives_%
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
airport_fee,0,0.0000%,929405287,94.5452%,0,0.0000%
base_passenger_fare,0,0.0000%,0,0.0000%,0,0.0000%
bcf,0,0.0000%,67913591,6.9086%,0,0.0000%
borough_flow_type,0,0.0000%,0,0.0000%,0,0.0000%
cbd_congestion_fee,0,0.0000%,941896648,95.8159%,0,0.0000%
congestion_surcharge,0,0.0000%,593866897,60.4120%,0,0.0000%
cost_per_km,0,0.0000%,0,0.0000%,0,0.0000%
displacement_speed_kmh,481305,0.0490%,68262554,6.9441%,0,0.0000%
driver_pay,0,0.0000%,0,0.0000%,0,0.0000%
driver_response_time_min,8588775,0.8737%,13769,0.0014%,0,0.0000%


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="dist-atlas" style="display: none;">3. The Distribution Atlas (Statistical Deep Dive)</h1>
<!-- VISIBLE H1 -->
<h1 id="dist-atlas-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">3. The Distribution Atlas (Statistical Deep Dive)</h1>

In [13]:
# ==========================================
# 3. THE DISTRIBUTION ATLAS
# ==========================================
import warnings
display_header("3. The Distribution Atlas (Statistical Deep Dive)")

# 1. Dynamic Feature Detection
all_cols = df.columns.tolist()
potential_features = set()
for c in all_cols:
    if c.endswith("_mean"):
        potential_features.add(c.replace("_mean", ""))

sorted_features = sorted(list(potential_features))


# 2. Display Function
def show_feature_stats(feature_name):
    # Filter columns
    target_suffixes = ["_min", "_p01", "_p50", "_mean", "_std", "_p99", "_p99.9", "_max"]
    selected_cols = []
    for suffix in target_suffixes:
        col = f"{feature_name}{suffix}"
        if col in df.columns:
            selected_cols.append(col)

    if not selected_cols:
        return

    # Create mini dataframe
    cols_to_show = [TIME_COL, "total_rows"] + selected_cols
    temp = df[cols_to_show].copy()

    # Rename columns (Remove prefix, Upper Case)
    rename_map = {c: c.replace(f"{feature_name}_", "").upper() for c in selected_cols}
    temp = temp.rename(columns=rename_map)

    # --- Define Style Groups ---
    # 1. Low/Safe Distribution -> Blue Gradient
    cols_grad_blue = [c for c in ["MIN", "P01", "P50"] if c in temp.columns]

    # 2. Magnitude/Average -> Blue Bar
    cols_bar_blue = [c for c in ["MEAN"] if c in temp.columns]

    # 3. High/Risk Distribution -> Red Gradient
    cols_grad_red = [c for c in ["STD", "P99", "P99.9"] if c in temp.columns]

    # 4. Maximums -> Red Bar
    cols_bar_red = [c for c in ["MAX"] if c in temp.columns]

    # Formatting
    format_cols = cols_grad_blue + cols_bar_blue + cols_grad_red + cols_bar_red

    print(f"\nüîπ STATISTICS FOR: {feature_name.upper()}")

    # Suppress "Divide by Zero" warnings for columns with constant values (like 0.0 fees)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        styler = (
            temp.style.background_gradient(cmap="Blues", subset=cols_grad_blue)
            .bar(subset=cols_bar_blue, color="#5DA5DA", height=70, width=80)
            .background_gradient(cmap="Reds", subset=cols_grad_red)
            .bar(subset=cols_bar_red, color="#D9534F", height=70, width=80)
            .format({TIME_COL: "{:%b %Y}"})
            .format("{:,.2f}", subset=format_cols)
            .format("{:,.0f}", subset=["total_rows"])
        )

        display(styler)


# 3. Execute Loop
for feat in sorted_features:
    show_feature_stats(feat)



üîπ STATISTICS FOR: AIRPORT_FEE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mar 2019,16731889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apr 2019,15181141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,May 2019,15383051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Jun 2019,13926600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,12953745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,11224729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,13460235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Oct 2019,13986805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Nov 2019,15098569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



üîπ STATISTICS FOR: BASE_PASSENGER_FARE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.1,2.04,10.28,14.45,13.06,64.34,110.6,282.52
1,Mar 2019,16731889,0.1,2.4,9.64,13.92,12.84,63.21,110.51,298.9
2,Apr 2019,15181141,0.1,2.22,10.5,14.73,13.2,64.89,111.76,292.53
3,May 2019,15383051,0.1,2.88,11.77,16.43,14.37,70.73,120.7,278.32
4,Jun 2019,13926600,0.1,3.46,12.46,17.34,14.95,74.67,125.04,296.53
5,Jul 2019,12953745,0.1,3.06,11.63,16.29,14.32,72.08,121.28,292.3
6,Aug 2019,11224729,0.1,2.99,11.72,16.4,14.2,70.39,117.21,275.17
7,Sep 2019,13460235,0.1,2.99,12.15,16.9,14.51,71.41,119.98,287.73
8,Oct 2019,13986805,0.1,2.85,11.91,16.59,14.25,70.48,117.82,288.81
9,Nov 2019,15098569,0.1,2.75,10.7,15.25,13.66,68.05,113.26,296.49



üîπ STATISTICS FOR: BCF


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.26,0.37,0.36,1.81,3.02,7.04
1,Mar 2019,16731889,0.0,0.0,0.24,0.36,0.35,1.79,3.02,7.46
2,Apr 2019,15181141,0.0,0.0,0.26,0.38,0.36,1.83,3.04,7.01
3,May 2019,15383051,0.0,0.0,0.0,0.18,0.32,1.5,2.7,6.93
4,Jun 2019,13926600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,12953745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,11224729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,13460235,0.0,0.0,0.0,0.22,0.37,1.74,2.95,9.11
8,Oct 2019,13986805,0.0,0.0,0.31,0.44,0.41,2.09,3.38,11.46
9,Nov 2019,15098569,0.0,0.0,0.34,0.57,0.71,3.57,7.09,15.0



üîπ STATISTICS FOR: CBD_CONGESTION_FEE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mar 2019,16731889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apr 2019,15181141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,May 2019,15383051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Jun 2019,13926600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,12953745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,11224729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,13460235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Oct 2019,13986805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Nov 2019,15098569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



üîπ STATISTICS FOR: CONGESTION_SURCHARGE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.0,1.0,1.27,2.75,2.75,2.75
1,Mar 2019,16731889,0.0,0.0,0.0,0.99,1.28,2.75,2.75,2.75
2,Apr 2019,15181141,0.0,0.0,0.0,0.99,1.29,2.75,2.75,2.75
3,May 2019,15383051,0.0,0.0,0.0,1.03,1.3,2.75,2.75,2.75
4,Jun 2019,13926600,0.0,0.0,0.0,1.05,1.31,2.75,2.75,2.75
5,Jul 2019,12953745,0.0,0.0,0.0,1.04,1.31,2.75,2.75,2.75
6,Aug 2019,11224729,0.0,0.0,0.0,1.04,1.31,2.75,2.75,2.75
7,Sep 2019,13460235,0.0,0.0,0.0,1.1,1.32,2.75,2.75,2.75
8,Oct 2019,13986805,0.0,0.0,0.0,1.12,1.33,2.75,2.75,2.75
9,Nov 2019,15098569,0.0,0.0,0.0,1.03,1.31,2.75,2.75,2.75



üîπ STATISTICS FOR: COST_PER_KM


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.01,0.6,2.83,3.49,2.9,13.86,27.54,801.25
1,Mar 2019,16731889,0.01,0.68,2.64,3.28,2.64,12.89,26.04,791.4
2,Apr 2019,15181141,0.01,0.68,2.82,3.45,2.74,13.2,26.1,865.06
3,May 2019,15383051,0.01,0.86,3.1,3.81,2.91,14.53,28.54,685.56
4,Jun 2019,13926600,0.01,0.99,3.27,3.99,2.98,14.87,29.26,824.58
5,Jul 2019,12953745,0.01,0.9,3.1,3.8,2.89,14.33,28.32,704.22
6,Aug 2019,11224729,0.01,0.88,3.12,3.74,2.76,13.69,27.16,743.24
7,Sep 2019,13460235,0.01,0.91,3.27,3.94,2.9,14.38,28.31,733.35
8,Oct 2019,13986805,0.01,0.89,3.38,4.12,3.1,15.4,29.95,804.96
9,Nov 2019,15098569,0.01,0.85,3.11,3.87,3.04,14.82,28.52,938.4



üîπ STATISTICS FOR: DISPLACEMENT_SPEED_KMH


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,13.23,14.56,8.87,41.46,57.3,399.61
1,Mar 2019,16731889,0.0,0.0,13.38,14.61,8.76,41.14,56.57,398.47
2,Apr 2019,15181141,0.0,0.0,13.44,14.68,8.73,41.23,55.92,262.44
3,May 2019,15383051,0.0,0.0,13.2,14.44,8.66,40.94,55.88,288.26
4,Jun 2019,13926600,0.0,0.0,13.32,14.51,8.65,41.02,55.99,279.11
5,Jul 2019,12953745,0.0,0.0,13.79,14.96,8.83,41.65,56.96,266.42
6,Aug 2019,11224729,0.0,0.0,13.99,15.14,8.8,41.64,56.92,204.46
7,Sep 2019,13460235,0.0,0.0,13.3,14.58,8.79,41.51,56.3,257.69
8,Oct 2019,13986805,0.0,0.0,13.16,14.43,8.82,41.26,56.57,413.32
9,Nov 2019,15098569,0.0,0.0,13.37,14.61,8.98,41.87,57.56,298.87



üîπ STATISTICS FOR: DRIVER_PAY


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.01,2.83,10.52,14.3,11.92,61.79,99.65,199.8
1,Mar 2019,16731889,0.03,3.04,10.86,14.75,12.22,62.87,100.89,199.97
2,Apr 2019,15181141,0.01,3.11,10.72,14.73,12.33,63.29,101.41,199.98
3,May 2019,15383051,0.09,3.26,11.21,15.73,13.61,69.25,110.12,199.99
4,Jun 2019,13926600,0.01,3.65,11.24,15.69,13.6,69.67,112.5,199.91
5,Jul 2019,12953745,0.1,3.56,10.78,15.03,12.96,66.88,108.58,199.71
6,Aug 2019,11224729,0.11,3.5,10.75,15.05,12.98,66.39,107.26,199.76
7,Sep 2019,13460235,0.01,3.66,11.09,15.13,12.5,63.34,102.08,199.93
8,Oct 2019,13986805,0.02,3.82,10.81,14.31,11.07,55.69,88.62,197.98
9,Nov 2019,15098569,0.02,4.33,10.78,14.03,10.6,54.05,85.19,199.9



üîπ STATISTICS FOR: DRIVER_RESPONSE_TIME_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.02,0.08,1.87,2.8,3.07,13.48,24.28,137.58
1,Mar 2019,16731889,0.0,0.1,2.07,2.93,3.01,13.15,22.92,162.73
2,Apr 2019,15181141,0.0,0.1,1.97,2.8,2.86,12.62,20.02,1100.53
3,May 2019,15383051,0.0,0.12,2.18,3.0,2.99,13.42,22.27,297.8
4,Jun 2019,13926600,0.0,0.1,1.95,2.71,2.88,12.88,23.58,181.7
5,Jul 2019,12953745,0.0,0.12,1.58,2.34,2.67,12.12,22.4,655.47
6,Aug 2019,11224729,0.0,0.12,1.52,2.25,2.56,11.55,21.0,187.53
7,Sep 2019,13460235,0.0,0.1,1.7,2.48,2.7,12.03,20.85,181.42
8,Oct 2019,13986805,0.0,0.1,1.63,2.37,2.7,11.77,23.07,201.32
9,Nov 2019,15098569,0.0,0.1,1.9,2.57,2.57,11.52,20.97,238.15



üîπ STATISTICS FOR: DRIVER_REVENUE_SHARE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.54,0.94,1.17,1.36,3.83,13.44,846.0
1,Mar 2019,16731889,0.0,0.57,1.08,1.21,1.31,3.44,11.7,344.17
2,Apr 2019,15181141,0.0,0.54,0.97,1.15,1.24,3.46,10.17,488.75
3,May 2019,15383051,0.0,0.51,0.91,1.04,1.02,2.78,7.41,374.8
4,Jun 2019,13926600,0.0,0.49,0.87,0.97,0.82,2.41,6.15,530.5
5,Jul 2019,12953745,0.0,0.5,0.9,1.01,0.95,2.57,7.26,551.33
6,Aug 2019,11224729,0.0,0.5,0.87,1.01,0.99,2.62,7.98,619.62
7,Sep 2019,13460235,0.0,0.51,0.84,0.99,0.96,2.67,8.0,440.09
8,Oct 2019,13986805,0.0,0.51,0.82,0.98,0.96,2.63,7.95,386.18
9,Nov 2019,15098569,0.0,0.52,0.9,1.08,1.0,2.84,8.45,322.64



üîπ STATISTICS FOR: DURATION_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,1.0,3.12,15.23,18.18,11.94,58.0,84.87,249.12
1,Mar 2019,16731889,1.0,3.13,15.33,18.4,12.26,60.07,86.98,248.8
2,Apr 2019,15181141,1.0,3.18,15.2,18.35,12.33,60.82,89.08,248.23
3,May 2019,15383051,1.0,3.18,15.38,18.96,13.41,67.17,98.37,248.07
4,Jun 2019,13926600,1.0,3.25,15.2,18.67,13.14,66.25,98.6,249.63
5,Jul 2019,12953745,1.0,3.17,14.6,17.76,12.19,61.03,89.78,248.3
6,Aug 2019,11224729,1.0,3.17,14.6,17.75,12.1,60.58,87.72,249.42
7,Sep 2019,13460235,1.0,3.23,15.18,18.66,13.11,65.8,97.72,250.0
8,Oct 2019,13986805,1.0,3.17,14.87,18.14,12.51,62.48,92.37,246.55
9,Nov 2019,15098569,1.0,3.08,14.42,17.52,12.06,60.73,90.37,239.95



üîπ STATISTICS FOR: DURATION_SECONDS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,60.0,187.0,914.0,1090.91,716.22,3480.0,5092.0,14947.0
1,Mar 2019,16731889,60.0,188.0,920.0,1104.01,735.43,3604.0,5219.0,14928.0
2,Apr 2019,15181141,60.0,191.0,912.0,1100.71,739.69,3649.0,5345.0,14894.0
3,May 2019,15383051,60.0,191.0,923.0,1137.4,804.64,4030.0,5902.0,14884.0
4,Jun 2019,13926600,60.0,195.0,912.0,1120.38,788.47,3975.0,5916.0,14978.0
5,Jul 2019,12953745,60.0,190.0,876.0,1065.67,731.11,3662.0,5387.0,14898.0
6,Aug 2019,11224729,60.0,190.0,876.0,1065.23,726.18,3635.0,5263.0,14965.0
7,Sep 2019,13460235,60.0,194.0,911.0,1119.82,786.4,3948.0,5863.0,15000.0
8,Oct 2019,13986805,60.0,190.0,892.0,1088.16,750.65,3749.0,5542.0,14793.0
9,Nov 2019,15098569,60.0,185.0,865.0,1051.48,723.83,3644.0,5422.0,14397.0



üîπ STATISTICS FOR: PAY_PER_HOUR


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.02,17.28,42.73,46.6,19.69,118.68,179.84,2214.33
1,Mar 2019,16731889,0.04,17.63,43.16,47.28,18.8,116.33,175.24,1286.27
2,Apr 2019,15181141,0.04,17.74,42.92,47.05,18.21,113.88,170.21,2629.76
3,May 2019,15383051,0.04,18.17,43.6,48.56,19.37,119.92,179.67,2569.85
4,Jun 2019,13926600,0.06,19.44,43.63,48.88,18.96,120.52,180.17,1172.96
5,Jul 2019,12953745,0.21,19.54,43.82,49.03,18.95,120.82,181.35,2500.11
6,Aug 2019,11224729,0.36,19.36,43.9,48.95,18.79,119.57,178.2,4092.24
7,Sep 2019,13460235,0.09,19.28,42.88,47.65,17.81,115.44,175.41,1197.64
8,Oct 2019,13986805,0.02,20.02,42.5,46.87,16.9,110.99,176.23,1464.2
9,Nov 2019,15098569,0.05,21.16,43.12,47.82,17.28,113.68,181.95,3578.29



üîπ STATISTICS FOR: SALES_TAX


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.91,1.28,1.18,5.76,9.72,24.52
1,Mar 2019,16731889,0.0,0.0,0.86,1.25,1.17,5.69,9.76,26.48
2,Apr 2019,15181141,0.0,0.0,0.93,1.31,1.2,5.79,9.79,24.42
3,May 2019,15383051,0.0,0.0,1.02,1.43,1.28,6.24,10.51,25.64
4,Jun 2019,13926600,0.0,0.0,1.07,1.49,1.32,6.52,10.82,30.69
5,Jul 2019,12953745,0.0,0.0,1.0,1.41,1.26,6.29,10.49,24.01
6,Aug 2019,11224729,0.0,0.0,1.01,1.42,1.25,6.16,10.16,28.01
7,Sep 2019,13460235,0.0,0.0,1.05,1.48,1.3,6.34,10.53,25.51
8,Oct 2019,13986805,0.0,0.0,1.05,1.47,1.29,6.34,10.55,28.85
9,Nov 2019,15098569,0.0,0.0,0.95,1.37,1.25,6.15,10.21,28.55



üîπ STATISTICS FOR: SPEED_KMH


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,1.0,6.83,18.6,21.32,10.45,56.1,68.34,98.49
1,Mar 2019,16731889,1.0,6.71,18.64,21.27,10.28,55.45,67.79,96.69
2,Apr 2019,15181141,1.0,6.51,18.68,21.28,10.31,55.45,67.75,98.67
3,May 2019,15383051,1.0,6.24,18.36,20.87,10.19,54.99,67.59,99.83
4,Jun 2019,13926600,1.0,6.08,18.44,20.9,10.16,55.01,67.7,98.82
5,Jul 2019,12953745,1.0,6.26,18.98,21.53,10.32,55.41,67.71,99.91
6,Aug 2019,11224729,1.0,6.73,19.24,21.82,10.26,55.4,67.81,99.27
7,Sep 2019,13460235,1.0,6.04,18.44,21.06,10.5,56.03,68.47,99.75
8,Oct 2019,13986805,1.0,5.94,18.25,20.84,10.47,55.57,68.04,99.45
9,Nov 2019,15098569,1.0,6.15,18.41,21.09,10.59,56.25,68.72,97.82



üîπ STATISTICS FOR: TIPS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.0,0.36,1.41,5.0,15.0,100.0
1,Mar 2019,16731889,0.0,0.0,0.0,0.36,1.4,5.0,15.0,100.0
2,Apr 2019,15181141,0.0,0.0,0.0,0.38,1.48,6.13,15.0,100.0
3,May 2019,15383051,0.0,0.0,0.0,0.47,1.76,8.94,17.96,100.0
4,Jun 2019,13926600,0.0,0.0,0.0,0.53,1.93,9.85,19.71,100.0
5,Jul 2019,12953745,0.0,0.0,0.0,0.5,1.85,9.31,19.11,100.0
6,Aug 2019,11224729,0.0,0.0,0.0,0.5,1.85,9.39,18.76,100.0
7,Sep 2019,13460235,0.0,0.0,0.0,0.51,1.9,9.77,19.11,100.0
8,Oct 2019,13986805,0.0,0.0,0.0,0.54,1.96,10.0,19.62,100.0
9,Nov 2019,15098569,0.0,0.0,0.0,0.5,1.86,9.41,19.08,100.0



üîπ STATISTICS FOR: TOLLS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.0,0.0,0.0,0.52,2.21,5.76,23.45,49.4
1,Mar 2019,16731889,0.0,0.0,0.0,0.53,2.21,5.76,23.45,49.95
2,Apr 2019,15181141,0.0,0.0,0.0,0.56,2.29,6.12,23.45,49.72
3,May 2019,15383051,0.0,0.0,0.0,0.61,2.41,6.12,23.45,49.95
4,Jun 2019,13926600,0.0,0.0,0.0,0.6,2.41,6.12,23.45,49.92
5,Jul 2019,12953745,0.0,0.0,0.0,0.58,2.38,6.12,23.45,49.92
6,Aug 2019,11224729,0.0,0.0,0.0,0.61,2.43,6.12,23.45,49.72
7,Sep 2019,13460235,0.0,0.0,0.0,0.63,2.48,6.12,23.45,49.92
8,Oct 2019,13986805,0.0,0.0,0.0,0.61,2.48,6.12,23.45,49.92
9,Nov 2019,15098569,0.0,0.0,0.0,0.56,2.42,6.12,25.0,49.92



üîπ STATISTICS FOR: TORTUOSITY_INDEX


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.01,0.57,1.45,16.33,101.47,291.29,1173.21,11965.44
1,Mar 2019,16731889,0.01,0.57,1.44,16.07,96.17,284.85,1065.38,11691.85
2,Apr 2019,15181141,0.02,0.57,1.43,13.83,77.53,257.49,661.44,11838.31
3,May 2019,15383051,0.03,0.56,1.43,13.78,75.68,257.49,640.52,11963.83
4,Jun 2019,13926600,0.03,0.56,1.43,13.72,75.17,259.1,637.3,11719.21
5,Jul 2019,12953745,0.02,0.56,1.43,13.71,75.69,257.49,634.08,11661.28
6,Aug 2019,11224729,0.07,0.56,1.43,13.67,75.96,257.49,629.25,11135.02
7,Sep 2019,13460235,0.05,0.57,1.43,13.74,77.36,257.49,651.78,11992.8
8,Oct 2019,13986805,0.05,0.56,1.43,14.13,76.05,257.49,640.52,11926.82
9,Nov 2019,15098569,0.03,0.55,1.43,14.89,76.31,260.71,646.95,11946.13



üîπ STATISTICS FOR: TOTAL_RIDER_COST


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.1,2.72,12.78,17.98,16.51,83.16,138.52,419.61
1,Mar 2019,16731889,0.11,3.12,12.05,17.41,16.28,81.92,138.64,371.39
2,Apr 2019,15181141,0.1,2.95,12.99,18.36,16.78,84.02,139.48,340.49
3,May 2019,15383051,0.11,3.64,14.3,20.15,18.02,90.12,148.98,338.12
4,Jun 2019,13926600,0.1,4.0,14.96,21.0,18.55,93.48,152.83,351.71
5,Jul 2019,12953745,0.1,3.82,14.06,19.82,17.82,90.45,148.08,324.26
6,Aug 2019,11224729,0.1,3.86,14.17,19.98,17.73,88.81,145.66,308.4
7,Sep 2019,13460235,0.11,3.9,14.84,20.83,18.34,91.62,149.73,394.38
8,Oct 2019,13986805,0.1,3.82,14.79,20.78,18.31,91.63,148.28,388.85
9,Nov 2019,15098569,0.11,3.54,13.42,19.28,17.75,89.44,145.22,437.97



üîπ STATISTICS FOR: TOTAL_WAIT_TIME_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.07,0.97,4.3,5.09,3.25,16.17,28.52,170.57
1,Mar 2019,16731889,0.0,1.03,4.33,5.12,3.22,15.83,26.93,575.62
2,Apr 2019,15181141,0.0,0.98,4.07,4.83,2.99,15.07,23.48,1103.78
3,May 2019,15383051,0.0,1.0,4.2,4.98,3.14,15.97,26.32,432.93
4,Jun 2019,13926600,0.0,0.97,3.93,4.67,3.09,15.5,28.82,380.95
5,Jul 2019,12953745,0.0,0.92,3.63,4.31,2.86,14.58,26.68,655.47
6,Aug 2019,11224729,0.0,0.88,3.55,4.2,2.74,13.98,25.27,190.1
7,Sep 2019,13460235,0.0,0.9,3.7,4.4,2.84,14.38,24.28,184.57
8,Oct 2019,13986805,0.0,0.88,3.58,4.26,2.83,14.07,26.58,201.32
9,Nov 2019,15098569,0.0,0.93,3.68,4.3,2.63,13.45,23.3,243.42



üîπ STATISTICS FOR: TRIP_KM


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,13056381,0.16,0.79,4.54,6.94,6.77,32.04,47.52,119.93
1,Mar 2019,16731889,0.16,0.79,4.6,6.98,6.77,31.99,46.98,119.93
2,Apr 2019,15181141,0.16,0.8,4.55,6.92,6.68,31.66,46.27,119.24
3,May 2019,15383051,0.16,0.79,4.52,6.96,6.81,31.95,46.56,119.64
4,Jun 2019,13926600,0.16,0.8,4.46,6.88,6.72,31.7,46.24,119.78
5,Jul 2019,12953745,0.16,0.79,4.43,6.8,6.65,31.51,46.08,118.69
6,Aug 2019,11224729,0.16,0.8,4.49,6.95,6.8,31.96,46.38,117.35
7,Sep 2019,13460235,0.16,0.8,4.44,6.93,6.84,32.07,46.85,119.93
8,Oct 2019,13986805,0.16,0.79,4.25,6.72,6.73,31.7,46.69,119.27
9,Nov 2019,15098569,0.16,0.77,4.2,6.57,6.59,31.48,46.61,119.69


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="cat-flags" style="display: none;">4. Categorical & Boolean Analysis</h1>
<!-- VISIBLE H1 -->
<h1 id="cat-flags-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">4. Categorical & Boolean Analysis</h1>

In [6]:
# 1. Boolean Flags (Columns ending in _count_true)
flag_cols = [c for c in df.columns if c.endswith("_count_true")]

if flag_cols:
    display_header("Boolean Flag Prevalence")

    # Calculate % for visualization
    flag_df = df[[TIME_COL, "total_rows"] + flag_cols].copy()
    plot_cols = []

    for c in flag_cols:
        short_name = c.replace("_count_true", "")
        pct_col = f"{short_name} (%)"
        flag_df[pct_col] = (flag_df[c] / flag_df["total_rows"]) * 100
        plot_cols.append(pct_col)

    # Plot
    fig = px.line(flag_df, x=TIME_COL, y=plot_cols, title="<b>Boolean Flags over Time (%)</b>", markers=True)
    fig.update_layout(hovermode="x unified", height=450, yaxis_title="Percentage of Trips")
    fig.show()

# 2. Categorical Null Checks
# If we tracked nulls for categoricals (like weather_state_nulls)
cat_nulls = [c for c in df.columns if c.endswith("_nulls") and c.replace("_nulls", "") not in sorted_features]

if cat_nulls:
    display_header("Categorical Data Completeness")
    cat_df = df[cat_nulls].sum().to_frame(name="Total Missing")
    cat_df["% Missing"] = (cat_df["Total Missing"] / df["total_rows"].sum()) * 100
    display(cat_df[cat_df["Total Missing"] > 0].style.format({"Total Missing": "{:,.0f}", "% Missing": "{:.4f}%"}))


Unnamed: 0,Total Missing,% Missing
