Real-World Business Scenario: Performance Benchmarking

Requirement: The operations team needs to identify "Extreme Weather Rides." We must:

1. Filter: Only analyze high-volume days (days with > 50 rides, the threshold the code below applies) to ensure the baseline is statistically significant.

2. Transform: For every individual ride, calculate how much the temperature deviated from the average temperature of that specific day.

3. Aggregate (Collapse): Produce a daily summary report showing the total ride count, the single highest temperature deviation recorded, and the most common weather condition for that day.

In [None]:
import pandas as pd

# Load dataset from the MDA Repository Knowledge Source
def _most_common_event(events: pd.Series):
    """Return the most frequent non-null value in *events*, or None if there is none."""
    counts = events.value_counts()
    # value_counts() drops missing values, so a day with no recorded events
    # yields an empty result; indexing [0] on it would raise IndexError.
    return counts.index[0] if len(counts) else None


def generate_weather_anomaly_report(path: str, *, min_rides: int = 50) -> pd.DataFrame:
    """Build a per-day summary of how far ride temperatures stray from the daily mean.

    The CSV at *path* must provide the columns ``starttime``, ``events``,
    ``temperature`` and ``wind_speed``.

    Pipeline:
      0. Parse ``starttime`` and derive a calendar ``date`` per ride.
      1. Filter: keep only days with strictly more than *min_rides* rides.
      2. Transform: broadcast each day's mean temperature back onto its rides
         and compute the absolute deviation per ride.
      3. Aggregate: collapse to one row per day.

    Args:
        path: Location of the rides CSV (anything ``pd.read_csv`` accepts).
        min_rides: A day is kept only when its ride count is greater than
            this value. Defaults to 50.

    Returns:
        A DataFrame with columns ``date``, ``total_rides``,
        ``max_temp_deviation`` and ``primary_weather`` (the most frequent
        ``events`` value, or a missing value when the day has none), sorted by
        ``max_temp_deviation`` in descending order.

    NOTE(review): temperatures are used as-is. A placeholder reading such as
    9999 would dominate ``max_temp_deviation`` for its day; consider cleaning
    the column upstream.
    """
    return (
        pd.read_csv(path)
        # 0. INGESTION & TYPE OPTIMIZATION (The Logic Standard)
        .assign(
            starttime=lambda x: pd.to_datetime(x['starttime']),
            date=lambda x: x['starttime'].dt.date,
        )
        .loc[:, ['date', 'events', 'temperature', 'wind_speed']]

        # 1. FILTER: Data Quality Pruning (The "Bouncer")
        # Rule: Drop any day that does not have MORE than `min_rides` rides.
        # This prevents outliers on low-volume days from skewing our logic.
        .groupby('date')
        .filter(lambda day: len(day) > min_rides)

        # 2. TRANSFORM: Feature Engineering (The "Broadcaster")
        # Rule: Calculate the daily average temp and map it back to every ride.
        # This allows row-level comparison without losing the ride details.
        .assign(
            daily_avg_temp=lambda x: x.groupby('date')['temperature'].transform('mean'),
            temp_anomaly=lambda x: (x['temperature'] - x['daily_avg_temp']).abs(),
        )

        # 3. AGGREGATE: Final Reporting (The "Collapse")
        # Rule: Collapse the rides of each day into one summary row.
        .groupby('date')
        .agg(
            total_rides=('temperature', 'size'),
            max_temp_deviation=('temp_anomaly', 'max'),
            primary_weather=('events', _most_common_event),
        )
        .reset_index()
        # Sort by the most "anomalous" days
        .sort_values('max_temp_deviation', ascending=False)
    )

# Execution
# Build the daily anomaly report from the bikes CSV (relative path).
report = generate_weather_anomaly_report('../data/bikes.csv')

print("--- Production Business Report: Weather Anomalies ---")
# A bare expression as the last statement of a notebook cell renders the
# DataFrame; in a plain script this line has no visible effect.
report

--- Production Business Report: Weather Anomalies ---


Unnamed: 0,date,total_rides,max_temp_deviation,primary_weather
119,2016-06-30,60,9904.795000,mostlycloudy
95,2016-05-23,52,23.823077,partlycloudy
202,2017-05-15,66,22.693939,cloudy
91,2016-04-18,58,21.827586,partlycloudy
105,2016-06-10,87,21.601149,partlycloudy
...,...,...,...,...
139,2016-07-30,51,3.925490,mostlycloudy
75,2015-10-01,52,3.138462,mostlycloudy
179,2016-09-29,53,2.828302,cloudy
175,2016-09-23,69,2.724638,cloudy


In [16]:
import pandas as pd

# ------------------------------------------------------------------------------
# INGESTION & TYPE OPTIMIZATION
# ------------------------------------------------------------------------------
# Architect's Note:
# - Only the columns named in `cols` are read from the CSV.
# - String grouping keys become 'category' for group-heavy operations.
# - 'cancelled' is cast to bool for memory efficiency.
# NOTE(review): astype(bool) maps missing values to True -- confirm the
# 'cancelled' column has no gaps.
cols = [
    'date', 'airline', 'origin', 'dest', 'dep_time', 'arr_time',
    'cancelled', 'air_time', 'distance', 'carrier_delay',
]

flights = (
    pd.read_csv('../data/flights.csv', usecols=cols)
    # Parse the date strings first; the remaining columns are plain casts.
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .astype({
        'airline': 'category',
        'origin': 'category',
        'dest': 'category',
        'cancelled': bool,
    })
)

# ------------------------------------------------------------------------------
# EXERCISE 1: First and Last per Airline
# ------------------------------------------------------------------------------
# Architect's Note: .nth([0, -1]) keeps the first and last row of every airline
# group. The sort uses a stable algorithm so that, within one airline, the
# "first" row still precedes the "last" row after ordering by airline (the
# default sort algorithm does not guarantee the order of tied keys).
first_last_airline = (flights
    .groupby('airline', observed=True)
    .nth([0, -1])
    .sort_values('airline', ascending=True, kind='stable')
)

# ------------------------------------------------------------------------------
# EXERCISE 2: 500th Flight per Route
# ------------------------------------------------------------------------------
# Architect's Note: positions passed to .nth start at 0, so the 500th flight of
# a route sits at position 499. Routes with fewer than 500 flights contribute
# no row.
five_hundredth_flight = flights.groupby(
    by=['origin', 'dest'],
    observed=True,
).nth(499)

# ------------------------------------------------------------------------------
# EXERCISE 3: Date of 10th Cancelled Flight per Airline
# ------------------------------------------------------------------------------
# Keep only cancelled flights, then take position 9 (the 10th row) within each
# airline. The result holds the full flight rows; the date is one of its columns.
tenth_cancelled_date = (
    flights
    .loc[lambda frame: frame['cancelled'].eq(True)]
    .groupby('airline', observed=True)
    .nth(9)
)

# ------------------------------------------------------------------------------
# EXERCISE 4: Avg Delay for Routes with > 300 Flights
# ------------------------------------------------------------------------------
# Architect's Note: aggregate every route once, then keep the busy ones.
# This yields the same rows as filtering the routes first and grouping again,
# but avoids calling a Python function per route and a second groupby pass.
avg_delay_busy_routes = (
    flights
    # 1. Collapse each route into a summary row ('size' counts every flight,
    #    including those with a missing carrier_delay).
    .groupby(['origin', 'dest'], observed=True)
    .agg(
        avg_carrier_delay=('carrier_delay', 'mean'),
        flight_count=('carrier_delay', 'size'),
    )
    # 2. Filter out low-volume routes (The Bouncer)
    .loc[lambda routes: routes['flight_count'] > 300]
)

In [18]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------------------
# SETUP: Load Data
# ------------------------------------------------------------------------------
# Reads the bikes CSV with pandas' default type inference; no validation or
# cleaning is applied at this point.
bikes = pd.read_csv('../data/bikes.csv')

# Architect's Note:
# Binning transforms continuous numerical data into discrete categories.
# This is useful for handling outliers, non-linear relationships, and simplifying reporting.

# ------------------------------------------------------------------------------
# EXERCISE 1: Custom Binning (0-100, 101-1000, 1001+)
# ------------------------------------------------------------------------------
def classify_trip_duration(df: pd.DataFrame) -> pd.Series:
    """
    Count trips per hand-picked duration bucket.

    Explanation:
    - pd.cut() with explicit edges yields right-closed intervals:
      (-1, 100], (100, 1000] and (1000, inf].
    - The lowest edge is -1 rather than 0 so that a duration of exactly 0
      lands in the first bucket.
    - Values outside every interval (at or below -1, or missing) are not counted.
    - The result is ordered by bucket, not by frequency.
    """
    edges = [-1, 100, 1000, np.inf]
    names = ['Short (0-100)', 'Medium (101-1000)', 'Long (1001+)']

    buckets = pd.cut(df['tripduration'], bins=edges, labels=names)
    return buckets.value_counts().sort_index()

# ------------------------------------------------------------------------------
# EXERCISE 2: Equal-Width Binning
# ------------------------------------------------------------------------------
def equal_width_analysis(df: pd.DataFrame) -> pd.Series:
    """
    Count trips in 5 bins that each span the same range of durations.

    Explanation:
    - Passing an integer to pd.cut() splits [min, max] into that many
      equal-width intervals.
    - CAVEAT: with a long-tailed distribution nearly every row falls into the
      first interval, while the remaining intervals exist only to reach the
      outliers.
    - The result is ordered by interval, not by frequency.
    """
    durations = df['tripduration']
    width_bins = pd.cut(durations, bins=5)
    per_bin = width_bins.value_counts()
    return per_bin.sort_index()

# ------------------------------------------------------------------------------
# EXERCISE 3: Equal-Frequency (Quantile) Binning
# ------------------------------------------------------------------------------
def equal_freq_analysis(df: pd.DataFrame) -> pd.Series:
    """
    Count trips in 5 quantile bins (each holding roughly the same number of rows).

    Explanation:
    - pd.qcut() places the bin edges at the 20/40/60/80th percentiles, so every
      bin receives about a fifth of the data.
    - For skewed data such as trip durations, the resulting edges show where
      the bulk of the population actually sits.
    - The result is ordered by interval, not by frequency.
    """
    durations = df['tripduration']
    quintiles = pd.qcut(durations, q=5)
    per_bin = quintiles.value_counts()
    return per_bin.sort_index()

# ------------------------------------------------------------------------------
# EXERCISE 4: Bivariate Quantile Analysis (Crosstab)
# ------------------------------------------------------------------------------
def duration_temp_crosstab(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-tabulate trip-duration quintiles against temperature quintiles.

    Explanation:
    - Both variables are discretized independently with pd.qcut(q=5) and given
      readable labels.
    - pd.crosstab() then counts the rides in every (duration, temperature)
      cell -- the raw data for a heatmap.
    - Pattern Search: Do long trips happen more in mild weather?
    """
    duration_names = ['Shortest', 'Short', 'Med', 'Long', 'Longest']
    temp_names = ['Coldest', 'Cold', 'Med', 'Hot', 'Hottest']

    duration_q = pd.qcut(df['tripduration'], q=5, labels=duration_names)
    temp_q = pd.qcut(df['temperature'], q=5, labels=temp_names)

    return pd.crosstab(index=duration_q, columns=temp_q)

# ------------------------------------------------------------------------------
# EXERCISE 5: Pivot Table with Binned Columns
# ------------------------------------------------------------------------------
def pivot_duration_gender_temp(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates Average Trip Duration by Gender across Temperature Deciles.

    Explanation:
    - Temperature is binned into 10 quantile buckets (deciles) and used as the
      pivot's column grouper.
    - Rows are the distinct 'gender' values; cells hold the mean 'tripduration'.
    - The decile grouper is categorical. `observed=False` is passed explicitly
      so the result does not depend on pandas' changing default for
      categorical groupers, and so the related FutureWarning is not emitted.

    Returns:
        DataFrame indexed by gender with one column per temperature decile.
    """
    temperature_deciles = pd.qcut(df['temperature'], q=10)
    return df.pivot_table(
        index='gender',
        columns=temperature_deciles,
        values='tripduration',
        aggfunc='mean',
        observed=False,
    )

# ------------------------------------------------------------------------------
# EXERCISE 6: Handling Data Quality & Contextual Binning
# ------------------------------------------------------------------------------
def clean_and_bin_temperature(df: pd.DataFrame) -> pd.Series:
    """
    Count temperature readings per semantic bucket, keeping missing values visible.

    Steps:
    1. Blank out implausible readings: anything that is not below 150
       (e.g. a 9999 placeholder) becomes NaN.
    2. Bin the remaining readings into right-closed intervals:
       Cold (-inf, 40], Cool (40, 55], Mild (55, 70], Warm (70, 85], Hot (85, inf).
       The edges assume Fahrenheit.
    3. Count rows per bucket, with NaN reported as its own entry when present.
    """
    # 1. Clean: where() keeps values satisfying the condition, NaN elsewhere
    readings = df['temperature']
    plausible = readings.where(readings < 150)

    # 2. Bin: semantic boundaries
    edges = [-np.inf, 40, 55, 70, 85, np.inf]
    names = ['Cold', 'Cool', 'Mild', 'Warm', 'Hot']
    bucketed = pd.cut(plausible, bins=edges, labels=names)

    # 3. Count: dropna=False keeps the NaN bucket in the tally
    return bucketed.value_counts(dropna=False).sort_index()

# ------------------------------------------------------------------------------
# EXECUTION & ANALYSIS
# ------------------------------------------------------------------------------
# Runs each exercise against the `bikes` frame and prints its result under a
# banner. The verdict lines are fixed commentary strings, not computed values.
print("--- Ex 1: Custom Binning (0-100, 101-1000, 1001+) ---")
print(classify_trip_duration(bikes))

print("\n--- Ex 2: Equal Width Binning (Does it make sense?) ---")
print(equal_width_analysis(bikes))
print("Architect's Verdict: NO. The data is heavily right-skewed. Bin 1 contains 99% of data.")

print("\n--- Ex 3: Equal Frequency Binning (Quantiles) ---")
print(equal_freq_analysis(bikes))
print("Architect's Verdict: YES. This creates balanced groups for comparison.")

print("\n--- Ex 4: Duration vs Temperature Crosstab ---")
print(duration_temp_crosstab(bikes))

print("\n--- Ex 5: Avg Duration by Gender & Temp Deciles ---")
print(pivot_duration_gender_temp(bikes).iloc[:, :3]) # Showing first 3 deciles for brevity

print("\n--- Ex 6: Semantic Weather Buckets (with NaNs) ---")
print(clean_and_bin_temperature(bikes))

--- Ex 1: Custom Binning (0-100, 101-1000, 1001+) ---
tripduration
Short (0-100)          242
Medium (101-1000)    39669
Long (1001+)         10178
Name: count, dtype: int64

--- Ex 2: Equal Width Binning (Does it make sense?) ---
tripduration
(-26.128, 17285.6]    50060
(17285.6, 34511.2]       11
(34511.2, 51736.8]        9
(51736.8, 68962.4]        3
(68962.4, 86188.0]        6
Name: count, dtype: int64
Architect's Verdict: NO. The data is heavily right-skewed. Bin 1 contains 99% of data.

--- Ex 3: Equal Frequency Binning (Quantiles) ---
tripduration
(59.999, 317.0]      10043
(317.0, 480.0]       10011
(480.0, 682.0]       10024
(682.0, 1007.0]       9997
(1007.0, 86188.0]    10014
Name: count, dtype: int64
Architect's Verdict: YES. This creates balanced groups for comparison.

--- Ex 4: Duration vs Temperature Crosstab ---
temperature   Coldest  Cold   Med   Hot  Hottest
tripduration                                    
Shortest         2712  2204  1931  1670     1526
Short       

  return df.pivot_table(


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
