In [1]:
import warnings

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML
from plotly.subplots import make_subplots
from rich.console import Console
from rich.table import Table

# --- Visual Styling ---
# Pandas display options: no cap on displayed columns, display.max_rows of 100,
# and floats rendered with thousands separators and two decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = "{:,.2f}".format

# Shared rich console used for section rules and styled status messages.
console = Console()

# Standard Color Palette (hex codes; the trailing comment is the CSS color name)
COLOR_FAIL = "#CD5C5C"  # IndianRed
COLOR_PASS = "#2E8B57"  # SeaGreen
COLOR_WARN = "#FF8C00"  # DarkOrange


def display_header(text, subtext=""):
    """Print a section header on the shared rich console.

    Draws a horizontal rule titled ``text`` in bold cyan. When ``subtext`` is
    non-empty it is printed centered underneath in dim italics.

    Args:
        text: Title placed on the rule.
        subtext: Optional caption; skipped when empty (the default).
    """
    console.rule(f"[bold cyan]{text}")
    if not subtext:
        return
    console.print(f"[italic dim]{subtext}[/]", justify="center")


In [2]:
# ==========================================
# ⚙️ CONFIGURATION
# ==========================================
AUDIT_FILE_PATH = "TLC_Universal_Audit_Report_Sampled.csv"  # <--- CHANGE THIS PATH
# ==========================================

# Load Data
# Only the read itself is guarded: every later cell needs `df` and `TIME_COL`,
# so after printing the friendly message the error is re-raised instead of
# letting the notebook continue and fail later with an unrelated NameError.
try:
    df = pd.read_csv(AUDIT_FILE_PATH)
except FileNotFoundError:
    print(f"‚ùå Error: File not found at {AUDIT_FILE_PATH}")
    raise

# Pick the time axis (Audit script produces 'audit_month' usually YYYY-MM-DD)
if "audit_month" in df.columns:
    TIME_COL = "audit_month"
else:
    # Fallback if column name differs: use the first column as the time axis.
    TIME_COL = df.columns[0]
    if pd.api.types.is_numeric_dtype(df[TIME_COL]):
        # pd.to_datetime would silently read numbers as epoch offsets, which
        # would yield a meaningless 1970 timeline; fail loudly instead.
        raise ValueError(
            f"No 'audit_month' column found and the first column {TIME_COL!r} is numeric; "
            "cannot use it as the time axis."
        )

# Parse and sort in both branches: the range printout below calls `.date()`
# on the column's min/max, and later cells format this column as a date.
df[TIME_COL] = pd.to_datetime(df[TIME_COL])
df = df.sort_values(TIME_COL)

print(f"‚úÖ Successfully loaded: {AUDIT_FILE_PATH}")
print(f"üìÖ Range: {df[TIME_COL].min().date()} to {df[TIME_COL].max().date()}")
print(f"üìä Months: {len(df)}")
print(f"üöï Total Rows Audited: {df['total_rows'].sum():,.0f}")


‚úÖ Successfully loaded: TLC_Universal_Audit_Report_Sampled.csv
üìÖ Range: 2019-02-01 to 2025-09-01
üìä Months: 80
üöï Total Rows Audited: 9,830,241


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="paradox-check" style="display: none;">1. The Paradox Check (Physics & Logic)</h1>
<!-- VISIBLE H1 -->
<h1 id="paradox-check-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">1. The Paradox Check (Physics & Logic)</h1>

In [3]:
# Identify Paradox Columns dynamically (any column whose name contains "paradox")
paradox_cols = [c for c in df.columns if "paradox" in c]

if not paradox_cols:
    print("‚úÖ No Paradox Columns found (Data might be too raw or pre-cleaned).")
else:
    # Summary Table: total violations per paradox column across all rows of the
    # report, plus their share of the summed `total_rows`.
    p_summary = df[paradox_cols].sum().sort_values(ascending=False).to_frame(name="Total Failures")
    p_summary["% Failure"] = (p_summary["Total Failures"] / df["total_rows"].sum()) * 100

    # Styling
    def highlight_fail(val):
        """Return CSS that renders a cell bold red when ``val`` > 0, else bold green."""
        color = "red" if val > 0 else "green"
        return f"color: {color}; font-weight: bold"

    # Styler.applymap is deprecated in favor of Styler.map and emits a
    # FutureWarning on recent pandas. Prefer `map`; fall back to `applymap`
    # only when `map` does not exist (older pandas). The `or` short-circuits,
    # so `applymap` is never touched on versions that provide `map`.
    summary_styler = p_summary.style
    style_each_cell = getattr(summary_styler, "map", None) or summary_styler.applymap
    display(style_each_cell(highlight_fail).format({"Total Failures": "{:,.0f}", "% Failure": "{:.6f}%"}))

    # Visualization over time (only worth plotting when something actually failed)
    if p_summary["Total Failures"].sum() > 0:
        fig = px.line(
            df,
            x=TIME_COL,
            y=paradox_cols,
            title="<b>Paradox Violations Over Time</b><br><i>(Spikes indicate bad data ingestion or corruption)</i>",
            markers=True,
        )
        fig.update_layout(hovermode="x unified", height=400)
        fig.show()
    else:
        console.print("[bold green]‚ú® ZERO PARADOXES DETECTED. PHYSICS INTEGRITY: 100%[/]")


  display(p_summary.style.applymap(highlight_fail).format({"Total Failures": "{:,.0f}", "% Failure": "{:.6f}%"}))


Unnamed: 0,Total Failures,% Failure
paradox_teleport_count,0,0.000000%
paradox_slave_labor_count,0,0.000000%
paradox_time_travel_count,0,0.000000%


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="health-matrix" style="display: none;">2. The Health Matrix (Nulls, Zeros, Negatives)</h1>
<!-- VISIBLE H1 -->
<h1 id="health-matrix-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">2. The Health Matrix (Nulls, Zeros, Negatives)</h1>

In [4]:
# Extract Quality Columns: per-feature counters named <feature>_nulls / _zeros / _negatives
quality_cols = [c for c in df.columns if c.endswith(("_nulls", "_zeros", "_negatives"))]

if not quality_cols:
    print("No health metrics found in report.")
else:
    # Sum every counter over all rows of the report -> one row per counter column
    q_total = df[quality_cols].sum().reset_index()
    q_total.columns = ["Metric_Full", "Count"]

    # Split "<feature>_<issue>" on the LAST underscore only, since feature
    # names themselves contain underscores.
    name_parts = [metric.rsplit("_", 1) for metric in q_total["Metric_Full"]]
    q_total["Feature"] = [parts[0] for parts in name_parts]
    q_total["Issue"] = [parts[1] for parts in name_parts]

    # One row per feature, one column per issue type; absent combinations count as 0
    q_pivot = q_total.pivot(index="Feature", columns="Issue", values="Count").fillna(0)

    # Add a percentage column per issue (relative to all audited rows) and
    # record the display order: count followed by its percentage.
    total_global = df["total_rows"].sum()
    display_cols = []
    for issue in ("nulls", "zeros", "negatives"):
        if issue not in q_pivot.columns:
            continue
        pct_name = f"{issue}_%"
        q_pivot[pct_name] = (q_pivot[issue] / total_global) * 100
        display_cols.extend([issue, pct_name])

    pct_cols = [name for name in display_cols if "%" in name]
    count_cols = [name for name in display_cols if "%" not in name]

    # Display Heatmap: red gradient on the percentage columns only
    health_styler = q_pivot[display_cols].style.background_gradient(cmap="Reds", subset=pct_cols)
    health_styler = health_styler.format("{:,.0f}", subset=count_cols)
    health_styler = health_styler.format("{:.4f}%", subset=pct_cols)
    display(health_styler)


Issue,nulls,nulls_%,zeros,zeros_%,negatives,negatives_%
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
airport_fee,0,0.0000%,9294569,94.5508%,0,0.0000%
base_passenger_fare,0,0.0000%,0,0.0000%,0,0.0000%
bcf,0,0.0000%,679076,6.9080%,0,0.0000%
borough_flow_type,0,0.0000%,0,0.0000%,0,0.0000%
cbd_congestion_fee,0,0.0000%,9418531,95.8118%,0,0.0000%
congestion_surcharge,0,0.0000%,5937864,60.4041%,0,0.0000%
cost_per_km,0,0.0000%,0,0.0000%,0,0.0000%
displacement_speed_kmh,4810,0.0489%,682776,6.9457%,0,0.0000%
driver_pay,0,0.0000%,0,0.0000%,0,0.0000%
driver_response_time_min,86322,0.8781%,119,0.0012%,0,0.0000%


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="dist-atlas" style="display: none;">3. The Distribution Atlas (Statistical Deep Dive)</h1>
<!-- VISIBLE H1 -->
<h1 id="dist-atlas-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">3. The Distribution Atlas (Statistical Deep Dive)</h1>

In [5]:
# ==========================================
# 3. THE DISTRIBUTION ATLAS
# ==========================================
display_header("3. The Distribution Atlas (Statistical Deep Dive)")

# 1. Dynamic Feature Detection
# A feature is any column family that has a "<feature>_mean" column.
# Only the trailing suffix is removed: str.replace would delete every
# "_mean" in the name (e.g. "x_mean_y_mean" -> "x_y") and produce a feature
# name that matches no real column.
_MEAN_SUFFIX = "_mean"
all_cols = df.columns.tolist()
potential_features = {c[: -len(_MEAN_SUFFIX)] for c in all_cols if c.endswith(_MEAN_SUFFIX)}

# Alphabetical order keeps the per-feature tables in a stable sequence.
sorted_features = sorted(potential_features)


# 2. Display Function
def show_feature_stats(feature_name):
    """Display a styled per-row statistics table for one feature.

    Collects the feature's ``_min``, ``_p01``, ``_p50``, ``_mean``, ``_std``,
    ``_p99``, ``_p99.9`` and ``_max`` columns that exist in the global ``df``,
    shows them next to ``TIME_COL`` and ``total_rows`` with short upper-case
    labels, and renders the result with ``display``. Returns ``None``; does
    nothing (no output) when none of those columns exist.

    Args:
        feature_name: Column-name prefix of the feature, e.g. ``"trip_km"``.
    """
    stat_suffixes = ("_min", "_p01", "_p50", "_mean", "_std", "_p99", "_p99.9", "_max")
    stat_columns = [
        f"{feature_name}{suffix}" for suffix in stat_suffixes if f"{feature_name}{suffix}" in df.columns
    ]
    if len(stat_columns) == 0:
        return

    # Mini dataframe: time axis, row count, then the available statistics with
    # the feature prefix removed and the remainder upper-cased (e.g. "P99").
    prefix = f"{feature_name}_"
    short_labels = {name: name.replace(prefix, "").upper() for name in stat_columns}
    temp = df[[TIME_COL, "total_rows", *stat_columns]].copy().rename(columns=short_labels)

    def _available(labels):
        # Keep only the labels that actually made it into the table.
        return [label for label in labels if label in temp.columns]

    # --- Style groups ---
    low_gradient = _available(["MIN", "P01", "P50"])  # low end of the distribution -> blue gradient
    mean_bar = _available(["MEAN"])  # average magnitude -> blue bar
    high_gradient = _available(["STD", "P99", "P99.9"])  # spread / upper tail -> red gradient
    max_bar = _available(["MAX"])  # maximum -> red bar
    numeric_labels = low_gradient + mean_bar + high_gradient + max_bar

    print(f"\nüîπ STATISTICS FOR: {feature_name.upper()}")

    # All warnings are silenced while styling and rendering; the original intent
    # is to hide divide-by-zero noise from constant columns (like 0.0 fees).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        styler = temp.style.background_gradient(cmap="Blues", subset=low_gradient)
        styler = styler.bar(subset=mean_bar, color="#5DA5DA", height=70, width=80)
        styler = styler.background_gradient(cmap="Reds", subset=high_gradient)
        styler = styler.bar(subset=max_bar, color="#D9534F", height=70, width=80)
        styler = styler.format({TIME_COL: "{:%b %Y}"})
        styler = styler.format("{:,.2f}", subset=numeric_labels)
        styler = styler.format("{:,.0f}", subset=["total_rows"])

        display(styler)


# 3. Execute Loop
# Render one statistics table per detected feature, in the order of
# `sorted_features` (the sorted feature list built during feature detection).
for feat in sorted_features:
    show_feature_stats(feat)



üîπ STATISTICS FOR: AIRPORT_FEE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mar 2019,167318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apr 2019,151811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,May 2019,153830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Jun 2019,139266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,129537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,112247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,134602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Oct 2019,139868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Nov 2019,150985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



üîπ STATISTICS FOR: BASE_PASSENGER_FARE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.1,2.05,10.25,14.41,12.99,64.26,108.89,256.9
1,Mar 2019,167318,0.1,2.38,9.64,13.9,12.78,62.83,110.63,215.68
2,Apr 2019,151811,0.1,2.22,10.53,14.77,13.24,64.73,111.94,247.44
3,May 2019,153830,0.11,2.92,11.76,16.43,14.46,71.37,122.51,262.66
4,Jun 2019,139266,0.16,3.5,12.47,17.36,15.02,75.18,124.67,242.82
5,Jul 2019,129537,0.1,3.06,11.59,16.21,14.28,71.8,122.86,219.6
6,Aug 2019,112247,0.1,3.04,11.73,16.39,14.14,70.45,116.88,207.02
7,Sep 2019,134602,0.15,3.06,12.13,16.81,14.27,69.88,116.81,272.38
8,Oct 2019,139868,0.1,2.84,11.88,16.57,14.21,70.13,118.99,231.01
9,Nov 2019,150985,0.1,2.73,10.74,15.3,13.75,68.46,116.35,239.22



üîπ STATISTICS FOR: BCF


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.26,0.37,0.35,1.8,2.99,6.57
1,Mar 2019,167318,0.0,0.0,0.24,0.36,0.35,1.77,3.06,5.9
2,Apr 2019,151811,0.0,0.0,0.26,0.38,0.36,1.82,3.01,6.34
3,May 2019,153830,0.0,0.0,0.0,0.18,0.32,1.49,2.66,6.72
4,Jun 2019,139266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,129537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,112247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,134602,0.0,0.0,0.0,0.22,0.36,1.72,2.86,6.96
8,Oct 2019,139868,0.0,0.0,0.31,0.44,0.41,2.09,3.4,6.36
9,Nov 2019,150985,0.0,0.0,0.34,0.57,0.71,3.64,7.2,14.86



üîπ STATISTICS FOR: CBD_CONGESTION_FEE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mar 2019,167318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apr 2019,151811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,May 2019,153830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Jun 2019,139266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jul 2019,129537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Aug 2019,112247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sep 2019,134602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Oct 2019,139868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Nov 2019,150985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



üîπ STATISTICS FOR: CONGESTION_SURCHARGE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.0,1.0,1.27,2.75,2.75,2.75
1,Mar 2019,167318,0.0,0.0,0.0,0.99,1.28,2.75,2.75,2.75
2,Apr 2019,151811,0.0,0.0,0.0,1.0,1.29,2.75,2.75,2.75
3,May 2019,153830,0.0,0.0,0.0,1.03,1.3,2.75,2.75,2.75
4,Jun 2019,139266,0.0,0.0,0.0,1.06,1.31,2.75,2.75,2.75
5,Jul 2019,129537,0.0,0.0,0.0,1.04,1.31,2.75,2.75,2.75
6,Aug 2019,112247,0.0,0.0,0.0,1.05,1.31,2.75,2.75,2.75
7,Sep 2019,134602,0.0,0.0,0.0,1.09,1.32,2.75,2.75,2.75
8,Oct 2019,139868,0.0,0.0,0.0,1.11,1.33,2.75,2.75,2.75
9,Nov 2019,150985,0.0,0.0,0.0,1.03,1.31,2.75,2.75,2.75



üîπ STATISTICS FOR: COST_PER_KM


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.04,0.6,2.84,3.5,2.9,13.8,27.98,174.03
1,Mar 2019,167318,0.05,0.68,2.64,3.27,2.54,12.84,25.43,114.08
2,Apr 2019,151811,0.03,0.68,2.82,3.46,2.66,13.14,26.14,160.6
3,May 2019,153830,0.02,0.86,3.1,3.81,2.89,14.66,28.84,119.96
4,Jun 2019,139266,0.05,0.99,3.27,3.99,2.94,14.8,29.92,151.99
5,Jul 2019,129537,0.03,0.88,3.1,3.79,2.78,14.22,28.23,81.06
6,Aug 2019,112247,0.02,0.88,3.12,3.73,2.65,13.61,25.55,100.53
7,Sep 2019,134602,0.06,0.91,3.27,3.93,2.8,14.2,27.63,145.15
8,Oct 2019,139868,0.02,0.89,3.37,4.11,3.0,15.33,29.65,100.58
9,Nov 2019,150985,0.02,0.85,3.11,3.87,2.96,14.73,27.91,107.47



üîπ STATISTICS FOR: DISPLACEMENT_SPEED_KMH


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,13.21,14.53,8.84,41.03,57.42,201.25
1,Mar 2019,167318,0.0,0.0,13.37,14.59,8.7,40.82,55.73,142.49
2,Apr 2019,151811,0.0,0.0,13.41,14.68,8.73,41.06,55.93,125.95
3,May 2019,153830,0.0,0.0,13.21,14.45,8.67,41.01,56.1,156.23
4,Jun 2019,139266,0.0,0.0,13.29,14.51,8.68,40.96,55.74,165.29
5,Jul 2019,129537,0.0,0.0,13.81,14.96,8.8,41.62,56.44,133.21
6,Aug 2019,112247,0.0,0.0,14.01,15.14,8.82,41.47,57.53,172.39
7,Sep 2019,134602,0.0,0.0,13.3,14.57,8.77,41.52,56.58,115.68
8,Oct 2019,139868,0.0,0.0,13.14,14.4,8.81,41.31,57.04,159.23
9,Nov 2019,150985,0.0,0.0,13.37,14.63,8.99,41.96,56.72,127.32



üîπ STATISTICS FOR: DRIVER_PAY


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.06,2.85,10.53,14.25,11.83,61.76,98.1,191.15
1,Mar 2019,167318,0.05,3.03,10.87,14.75,12.15,62.66,99.69,195.91
2,Apr 2019,151811,0.11,3.15,10.76,14.75,12.33,63.27,101.05,188.8
3,May 2019,153830,0.21,3.3,11.21,15.71,13.62,69.02,111.41,195.68
4,Jun 2019,139266,0.19,3.67,11.26,15.72,13.68,70.28,113.71,199.91
5,Jul 2019,129537,0.7,3.52,10.77,14.98,12.89,66.23,109.52,171.32
6,Aug 2019,112247,0.73,3.45,10.75,15.05,12.98,66.52,107.15,171.96
7,Sep 2019,134602,0.16,3.64,11.11,15.07,12.34,62.43,98.91,196.11
8,Oct 2019,139868,0.61,3.79,10.81,14.31,11.08,55.57,89.38,168.91
9,Nov 2019,150985,0.03,4.43,10.8,14.08,10.67,54.23,87.54,172.24



üîπ STATISTICS FOR: DRIVER_RESPONSE_TIME_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.03,0.08,1.87,2.79,3.07,13.43,24.5,42.6
1,Mar 2019,167318,0.0,0.1,2.07,2.93,3.01,13.13,23.0,80.2
2,Apr 2019,151811,0.0,0.1,1.97,2.8,2.84,12.63,19.63,49.45
3,May 2019,153830,0.0,0.1,2.17,2.99,2.96,13.27,21.55,67.35
4,Jun 2019,139266,0.0,0.1,1.93,2.71,2.87,12.82,23.38,92.13
5,Jul 2019,129537,0.0,0.12,1.58,2.33,2.68,12.0,22.13,97.97
6,Aug 2019,112247,0.03,0.12,1.52,2.25,2.56,11.58,21.1,74.13
7,Sep 2019,134602,0.0,0.1,1.7,2.48,2.69,12.02,20.83,53.25
8,Oct 2019,139868,0.0,0.1,1.62,2.39,2.71,12.02,23.35,69.32
9,Nov 2019,150985,0.02,0.1,1.9,2.57,2.57,11.55,21.02,47.48



üîπ STATISTICS FOR: DRIVER_REVENUE_SHARE


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.54,0.94,1.17,1.23,3.77,12.81,146.67
1,Mar 2019,167318,0.0,0.57,1.08,1.22,1.31,3.48,12.64,240.58
2,Apr 2019,151811,0.0,0.54,0.97,1.15,1.24,3.44,9.63,164.57
3,May 2019,153830,0.03,0.51,0.91,1.04,0.85,2.75,6.74,156.87
4,Jun 2019,139266,0.01,0.49,0.87,0.97,0.61,2.42,5.56,77.21
5,Jul 2019,129537,0.09,0.5,0.9,1.02,0.9,2.59,8.0,128.43
6,Aug 2019,112247,0.04,0.5,0.87,1.01,1.02,2.58,8.2,202.28
7,Sep 2019,134602,0.01,0.51,0.84,0.99,0.93,2.63,7.74,165.79
8,Oct 2019,139868,0.02,0.51,0.82,0.98,0.9,2.65,7.83,105.58
9,Nov 2019,150985,0.0,0.52,0.9,1.09,1.33,2.87,9.08,322.64



üîπ STATISTICS FOR: DURATION_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,1.05,3.08,15.23,18.14,11.92,58.03,84.52,155.75
1,Mar 2019,167318,1.0,3.17,15.35,18.43,12.26,60.13,86.78,196.95
2,Apr 2019,151811,1.0,3.18,15.23,18.36,12.31,60.6,90.12,182.42
3,May 2019,153830,1.05,3.17,15.42,18.95,13.36,66.67,97.8,196.95
4,Jun 2019,139266,1.02,3.22,15.23,18.68,13.17,66.13,98.28,209.83
5,Jul 2019,129537,1.05,3.13,14.57,17.69,12.06,60.62,86.73,197.13
6,Aug 2019,112247,1.0,3.18,14.6,17.76,12.09,60.48,86.58,174.48
7,Sep 2019,134602,1.0,3.22,15.2,18.63,13.02,65.1,96.62,188.6
8,Oct 2019,139868,1.02,3.2,14.85,18.15,12.51,62.42,91.55,147.07
9,Nov 2019,150985,1.02,3.08,14.47,17.59,12.15,61.27,92.58,239.47



üîπ STATISTICS FOR: DURATION_SECONDS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,63.0,185.0,914.0,1088.65,714.98,3482.0,5071.0,9345.0
1,Mar 2019,167318,60.0,190.0,921.0,1105.57,735.38,3608.0,5207.0,11817.0
2,Apr 2019,151811,60.0,191.0,914.0,1101.31,738.55,3636.0,5407.0,10945.0
3,May 2019,153830,63.0,190.0,925.0,1136.73,801.83,4000.0,5868.0,11817.0
4,Jun 2019,139266,61.0,193.0,914.0,1120.62,790.08,3968.0,5897.0,12590.0
5,Jul 2019,129537,63.0,188.0,874.0,1061.53,723.72,3637.0,5204.0,11828.0
6,Aug 2019,112247,60.0,191.0,876.0,1065.42,725.67,3629.0,5195.0,10469.0
7,Sep 2019,134602,60.0,193.0,912.0,1117.93,781.17,3906.0,5797.0,11316.0
8,Oct 2019,139868,61.0,192.0,891.0,1088.83,750.54,3745.0,5493.0,8824.0
9,Nov 2019,150985,61.0,185.0,868.0,1055.31,729.13,3676.0,5555.0,14368.0



üîπ STATISTICS FOR: PAY_PER_HOUR


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.22,17.23,42.77,46.61,19.87,119.12,183.52,1109.19
1,Mar 2019,167318,0.06,17.59,43.12,47.19,18.68,115.3,173.65,632.61
2,Apr 2019,151811,0.25,17.66,42.92,47.1,18.26,113.95,171.53,585.4
3,May 2019,153830,0.78,18.19,43.56,48.53,19.25,119.9,177.42,544.91
4,Jun 2019,139266,0.57,19.47,43.68,48.94,19.05,120.64,180.29,787.55
5,Jul 2019,129537,4.37,19.57,43.87,49.03,18.96,121.27,183.81,587.23
6,Aug 2019,112247,9.0,19.42,43.9,48.92,18.59,119.56,176.43,454.09
7,Sep 2019,134602,0.3,19.05,42.86,47.59,17.71,114.39,178.5,325.76
8,Oct 2019,139868,1.81,20.01,42.49,46.8,16.82,111.23,175.21,473.66
9,Nov 2019,150985,0.67,21.22,43.12,47.83,17.3,114.11,182.15,429.93



üîπ STATISTICS FOR: SALES_TAX


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.91,1.28,1.17,5.76,9.54,23.31
1,Mar 2019,167318,0.0,0.0,0.86,1.25,1.17,5.66,9.71,20.95
2,Apr 2019,151811,0.0,0.0,0.93,1.31,1.2,5.78,9.7,22.5
3,May 2019,153830,0.0,0.0,1.02,1.43,1.28,6.23,10.7,23.85
4,Jun 2019,139266,0.0,0.0,1.06,1.49,1.32,6.55,10.92,22.08
5,Jul 2019,129537,0.0,0.0,1.0,1.4,1.26,6.27,10.82,18.58
6,Aug 2019,112247,0.0,0.0,1.01,1.42,1.25,6.2,10.05,18.45
7,Sep 2019,134602,0.0,0.0,1.05,1.47,1.28,6.23,10.45,24.72
8,Oct 2019,139868,0.0,0.0,1.05,1.47,1.29,6.28,10.58,21.95
9,Nov 2019,150985,0.0,0.0,0.95,1.37,1.25,6.08,10.36,21.23



üîπ STATISTICS FOR: SPEED_KMH


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,1.29,6.87,18.59,21.26,10.39,55.82,67.87,82.98
1,Mar 2019,167318,1.27,6.72,18.63,21.23,10.25,55.28,67.89,82.34
2,Apr 2019,151811,1.38,6.52,18.66,21.27,10.31,55.3,67.57,86.35
3,May 2019,153830,1.03,6.24,18.36,20.87,10.2,54.9,67.75,87.22
4,Jun 2019,139266,1.13,6.13,18.46,20.92,10.17,54.95,68.08,84.48
5,Jul 2019,129537,1.1,6.24,18.97,21.51,10.28,55.15,67.55,86.09
6,Aug 2019,112247,1.43,6.75,19.27,21.84,10.28,55.54,67.89,80.54
7,Sep 2019,134602,1.01,6.08,18.43,21.04,10.46,56.12,68.33,83.83
8,Oct 2019,139868,1.02,5.9,18.24,20.82,10.46,55.54,67.94,85.96
9,Nov 2019,150985,1.34,6.11,18.42,21.14,10.63,56.26,69.09,93.37



üîπ STATISTICS FOR: TIPS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.0,0.36,1.44,5.0,14.4,100.0
1,Mar 2019,167318,0.0,0.0,0.0,0.36,1.39,5.0,15.0,50.5
2,Apr 2019,151811,0.0,0.0,0.0,0.38,1.46,6.34,15.0,100.0
3,May 2019,153830,0.0,0.0,0.0,0.46,1.72,8.81,17.51,44.14
4,Jun 2019,139266,0.0,0.0,0.0,0.53,1.95,10.0,19.78,100.0
5,Jul 2019,129537,0.0,0.0,0.0,0.5,1.86,9.2,18.97,100.0
6,Aug 2019,112247,0.0,0.0,0.0,0.51,1.86,9.37,18.61,50.0
7,Sep 2019,134602,0.0,0.0,0.0,0.51,1.87,9.52,18.8,65.83
8,Oct 2019,139868,0.0,0.0,0.0,0.54,1.96,10.0,19.44,50.0
9,Nov 2019,150985,0.0,0.0,0.0,0.51,1.9,9.68,20.0,80.0



üîπ STATISTICS FOR: TOLLS


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.0,0.0,0.0,0.51,2.19,5.76,23.45,44.75
1,Mar 2019,167318,0.0,0.0,0.0,0.52,2.15,5.76,23.45,44.75
2,Apr 2019,151811,0.0,0.0,0.0,0.56,2.3,6.12,23.45,44.75
3,May 2019,153830,0.0,0.0,0.0,0.61,2.45,6.12,23.45,46.12
4,Jun 2019,139266,0.0,0.0,0.0,0.6,2.44,6.12,24.0,46.12
5,Jul 2019,129537,0.0,0.0,0.0,0.57,2.34,6.12,23.45,45.15
6,Aug 2019,112247,0.0,0.0,0.0,0.61,2.43,6.12,23.45,47.55
7,Sep 2019,134602,0.0,0.0,0.0,0.62,2.48,6.12,23.45,46.4
8,Oct 2019,139868,0.0,0.0,0.0,0.62,2.49,6.12,23.45,46.12
9,Nov 2019,150985,0.0,0.0,0.0,0.57,2.46,6.12,25.15,47.55



üîπ STATISTICS FOR: TORTUOSITY_INDEX


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.04,0.57,1.45,16.3,93.9,296.12,1086.3,6768.88
1,Mar 2019,167318,0.02,0.57,1.44,15.9,96.33,283.24,1021.93,10801.89
2,Apr 2019,151811,0.13,0.56,1.44,13.77,77.44,255.89,688.8,10755.22
3,May 2019,153830,0.11,0.56,1.43,13.61,80.36,257.49,640.52,11310.44
4,Jun 2019,139266,0.1,0.56,1.43,13.86,79.83,257.49,629.25,10814.76
5,Jul 2019,129537,0.08,0.56,1.42,13.61,73.37,257.49,621.21,7625.05
6,Aug 2019,112247,0.09,0.57,1.43,13.63,71.76,259.1,661.44,5614.99
7,Sep 2019,134602,0.15,0.57,1.43,13.58,76.37,257.49,609.94,6802.68
8,Oct 2019,139868,0.06,0.56,1.43,14.03,71.14,254.28,646.95,6733.48
9,Nov 2019,150985,0.1,0.55,1.43,15.17,84.82,263.93,672.7,10771.31



üîπ STATISTICS FOR: TOTAL_RIDER_COST


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.13,2.72,12.75,17.92,16.41,82.97,136.1,292.54
1,Mar 2019,167318,0.11,3.13,12.03,17.38,16.2,81.23,138.37,270.63
2,Apr 2019,151811,0.12,2.97,13.02,18.41,16.82,84.2,138.6,299.27
3,May 2019,153830,0.12,3.68,14.3,20.15,18.09,90.87,148.71,302.1
4,Jun 2019,139266,0.29,4.01,14.94,21.03,18.65,94.0,153.21,287.46
5,Jul 2019,129537,0.13,3.82,14.0,19.73,17.77,90.07,151.17,247.77
6,Aug 2019,112247,0.14,3.89,14.18,19.98,17.69,88.78,142.96,234.34
7,Sep 2019,134602,0.3,3.93,14.82,20.72,18.08,90.24,147.18,312.93
8,Oct 2019,139868,0.12,3.81,14.76,20.75,18.28,91.6,147.71,274.12
9,Nov 2019,150985,0.11,3.54,13.45,19.35,17.86,89.55,151.08,276.69



üîπ STATISTICS FOR: TOTAL_WAIT_TIME_MIN


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.12,0.97,4.3,5.08,3.25,16.07,28.43,70.0
1,Mar 2019,167318,0.07,1.05,4.33,5.11,3.2,15.78,26.92,82.45
2,Apr 2019,151811,0.07,1.0,4.07,4.83,2.96,15.03,23.48,49.45
3,May 2019,153830,0.0,1.0,4.18,4.96,3.12,15.73,25.52,81.13
4,Jun 2019,139266,0.02,0.97,3.93,4.67,3.09,15.47,29.7,142.8
5,Jul 2019,129537,0.0,0.92,3.63,4.3,2.85,14.52,26.22,102.38
6,Aug 2019,112247,0.02,0.88,3.55,4.19,2.76,14.02,26.18,77.5
7,Sep 2019,134602,0.02,0.9,3.7,4.4,2.82,14.45,23.4,61.37
8,Oct 2019,139868,0.02,0.88,3.58,4.27,2.85,14.28,27.47,98.4
9,Nov 2019,150985,0.02,0.93,3.68,4.3,2.62,13.5,23.73,48.58



üîπ STATISTICS FOR: TRIP_KM


Unnamed: 0,audit_month,total_rows,MIN,P01,P50,MEAN,STD,P99,P99.9,MAX
0,Feb 2019,130563,0.16,0.77,4.54,6.9,6.73,32.01,47.62,85.36
1,Mar 2019,167318,0.16,0.79,4.6,6.98,6.76,31.93,47.25,118.64
2,Apr 2019,151811,0.16,0.8,4.55,6.92,6.68,31.49,46.56,107.55
3,May 2019,153830,0.16,0.79,4.51,6.96,6.82,31.9,46.61,113.1
4,Jun 2019,139266,0.16,0.8,4.47,6.88,6.71,31.64,46.06,108.15
5,Jul 2019,129537,0.16,0.8,4.41,6.77,6.62,31.45,45.46,84.31
6,Aug 2019,112247,0.16,0.8,4.49,6.95,6.78,31.91,46.49,102.71
7,Sep 2019,134602,0.16,0.8,4.44,6.91,6.8,31.98,46.99,91.81
8,Oct 2019,139868,0.16,0.79,4.25,6.72,6.72,31.82,46.61,83.01
9,Nov 2019,150985,0.16,0.77,4.23,6.6,6.61,31.33,46.56,107.71


<!-- HIDDEN H1 FOR OUTLINE VIEW -->
<h1 id="cat-flags" style="display: none;">4. Categorical & Boolean Analysis</h1>
<!-- VISIBLE H1 -->
<h1 id="cat-flags-visible" style="font-family: 'Roboto Condensed', 'Arial Narrow', sans-serif; color: white; font-size: 22px; font-weight: bold; background-color: #0771A4; border-radius: 4px; padding: 12px 0px 12px 15px; margin-top: 20px;">4. Categorical & Boolean Analysis</h1>

In [6]:
# 1. Boolean Flags (Columns ending in _count_true)
_FLAG_SUFFIX = "_count_true"
flag_cols = [c for c in df.columns if c.endswith(_FLAG_SUFFIX)]

if flag_cols:
    display_header("Boolean Flag Prevalence")

    # Calculate % for visualization: flag count relative to that row's total_rows
    flag_df = df[[TIME_COL, "total_rows"] + flag_cols].copy()
    plot_cols = []

    for c in flag_cols:
        # Slice off only the trailing suffix; str.replace would also delete
        # the same text if it appeared earlier in the column name.
        short_name = c[: -len(_FLAG_SUFFIX)]
        pct_col = f"{short_name} (%)"
        flag_df[pct_col] = (flag_df[c] / flag_df["total_rows"]) * 100
        plot_cols.append(pct_col)

    # Plot
    fig = px.line(flag_df, x=TIME_COL, y=plot_cols, title="<b>Boolean Flags over Time (%)</b>", markers=True)
    fig.update_layout(hovermode="x unified", height=450, yaxis_title="Percentage of Trips")
    fig.show()

# 2. Categorical Null Checks
# If we tracked nulls for categoricals (like weather_state_nulls): a "_nulls"
# column counts as categorical when its base name is not one of the features
# in `sorted_features` (the names detected from "<feature>_mean" columns).
_NULLS_SUFFIX = "_nulls"
_numeric_features = set(sorted_features)
cat_nulls = [
    c for c in df.columns if c.endswith(_NULLS_SUFFIX) and c[: -len(_NULLS_SUFFIX)] not in _numeric_features
]

if cat_nulls:
    display_header("Categorical Data Completeness")
    cat_df = df[cat_nulls].sum().to_frame(name="Total Missing")
    cat_df["% Missing"] = (cat_df["Total Missing"] / df["total_rows"].sum()) * 100
    # Only columns that actually have missing values are shown.
    display(cat_df[cat_df["Total Missing"] > 0].style.format({"Total Missing": "{:,.0f}", "% Missing": "{:.4f}%"}))


Unnamed: 0,Total Missing,% Missing
