In [2]:
import pandas as pd
import numpy as np

# DATA INGESTION
# We load the full dataset but will isolate specific Series for this Master Class.
df = pd.read_csv('../data/bikes.csv')

# ISOLATING RAW SERIES FOR THE CASE STUDY
duration_series: pd.Series = df['tripduration']
weather_series: pd.Series = df['temperature']
event_series: pd.Series = df['events']
station_series: pd.Series = df['from_station_name']
time_series: pd.Series = df['starttime']

print(f"Audit Started. Total Records: {len(df)}")

Audit Started. Total Records: 50089


In [3]:
event_series

0        mostlycloudy
1        partlycloudy
2        mostlycloudy
3        mostlycloudy
4        partlycloudy
             ...     
50084    partlycloudy
50085    partlycloudy
50086    partlycloudy
50087    partlycloudy
50088    partlycloudy
Name: events, Length: 50089, dtype: object

Chapter 1: The Integrity Check (Numeric Methods)
The Business Question: Our pricing model relies on tripduration. However, we suspect sensor errors are producing negative values or impossibly long rides (outliers). How do we stabilize this metric?

In production, raw means are dangerous. We must sanitize the distribution first.

In [4]:
# 1. BASIC DIAGNOSTICS
# Standard: Use describe() to get the 5-number summary immediately.
stats_summary = duration_series.describe(percentiles=[0.01, 0.5, 0.99])

# 2. NUMERIC SANITATION PIPELINE
# Goal: Create a "Trusted" duration series for modeling.
clean_duration = (
    duration_series
    # Handling Negatives: Use .abs() to fix sign errors from sensors
    .abs()
    
    # Handling Outliers: The "Floor and Ceiling" approach.
    # We clip the top 1% to prevent a single 10-day ride from skewing the mean.
    .clip(lower=60, upper=duration_series.quantile(0.99))
)

# 3. AGGREGATION
# Now we calculate robust metrics on the cleaned series.
avg_duration = clean_duration.mean()
median_duration = clean_duration.median()

print(f"Original Mean: {duration_series.mean():.2f}")
print(f"Robust Mean (Clipped): {avg_duration:.2f}")

Original Mean: 716.87
Robust Mean (Clipped): 679.31


Chapter 2: The Missing Data Mystery (Missing Value Methods)
The Business Question: The events sensor (Rain, Snow, etc.) only records data when something happens. "Sunny" days are recorded as NaN. We need to fill these gaps without introducing bias.

Missing data is not always "bad"; sometimes it is "informative."

In [5]:
# 1. IDENTIFYING THE GAP
# Standard: Check the percentage of missing data.
missing_pct = event_series.isna().mean() * 100

# 2. CONTEXTUAL FILLING
# Logic: If 'events' is null, it implies 'Clear' weather in this specific context.
filled_events = (
    event_series
    # Validation: Check unique values before filling
    .fillna('Clear')
)

# 3. DROPPING STRATEGIES (Scenario: Temperature)
# Logic: If temperature is missing, we cannot guess it. We must flag it or drop it.
valid_temps = (
    weather_series
    .dropna() 
)

print(f"Missing Events: {missing_pct:.1f}%")
print(f"Imputed 'Clear' Events: {(filled_events == 'Clear').sum()}")

Missing Events: 0.0%
Imputed 'Clear' Events: 0


Chapter 3: The Leaderboard (Sorting, Ranking & Uniqueness)
The Business Question: Marketing wants to send "Gold User" coupons to riders with the longest trips. We also need to identify the busiest starting stations.

We need to rank data relative to itself and count distinct occurrences.

In [6]:
# 1. UNIQUENESS
# How many unique stations are there?
unique_stations_count = station_series.nunique()

# 2. FREQUENCY DISTRIBUTION
# What are the top 3 busiest stations?
# Standard: normalize=True gives relative frequency (percentages).
top_stations = station_series.value_counts(normalize=True).head(3)

# 3. RANKING
# Logic: Rank trips by duration. 'min' method assigns the same rank to ties (e.g., tie for 1st place).
# We want the longest trips (ascending=False).
trip_ranks = clean_duration.rank(method='min', ascending=False)

# 4. SELECTING THE "WINNERS" (nlargest)
# Get the exact values of the top 5 longest rides.
top_5_durations = clean_duration.nlargest(5)

print(f"Network Size: {unique_stations_count} Stations")
print("Top Station Traffic:\n", top_stations)

Network Size: 600 Stations
Top Station Traffic:
 from_station_name
Clinton St & Washington Blvd    0.017549
Canal St & Adams St             0.014993
Clinton St & Madison St         0.013436
Name: proportion, dtype: float64


Chapter 4: Trends & Anomalies (Advanced Series Methods)
The Business Question: We need to detect sudden shifts in weather (temperature) to warn riders. Also, we need to perform a random audit of 10% of our transaction IDs.

This requires looking at "row vs. previous row" logic and random sampling.

In [None]:
# 1. DELTA ANALYSIS (diff & pct_change)
# Logic: Calculate the temperature drop/rise from the previous ride record.
temp_change = weather_series.diff()
temp_pct_change = weather_series.pct_change()

# 2. FINDING EXTREMES (idxmin / idxmax)
# Logic: Which specific record (Index ID) had the coldest temperature?
coldest_ride_index = weather_series.idxmin()
coldest_temp_value = weather_series.loc[coldest_ride_index]

# 3. DATA REPLACEMENT
# Scenario: A sensor typo labeled some events 'rain' (lowercase) instead of 'Rain'.
corrected_events = event_series.replace({'rain': 'Rain', 'snow': 'Snow'})

# 4. RANDOM SAMPLING
# Logic: Pull 10% of data for the audit, ensuring reproducibility with random_state.
audit_sample = duration_series.sample(frac=0.10, random_state=42)

print(f"Coldest Ride Index: {coldest_ride_index} ({coldest_temp_value} F)")
print(f"Audit Sample Size: {len(audit_sample)}")

Coldest Ride Index: 27168 (-9999.0 F)
Audit Sample Size: 5009


Chapter 5: Text Wrangling (String Methods)
The Business Question: The station names are messy. We need to standardize them for the app display. Specifically, we want to find all stations located near a "Park".

Vectorized string operations (.str) are the only acceptable way to process text in Pandas.

In [15]:
# 1. STANDARDIZATION
# Logic: Lowercase everything and strip whitespace to prevent join errors.
clean_names = (
    station_series
    .astype(str) # Safety cast
    .str.lower()
    .str.strip()
)

# 2. PATTERN MATCHING (contains)
# Logic: Find boolean mask for stations containing 'park'.
is_park_station = clean_names.str.contains('park', regex=False)

# 3. EXTRACTION (split)
# Logic: The data format is "StationName (Neighborhood)". We want just the Neighborhood.
# We split by '(' and take the second part (index 1).
neighborhoods = (
    station_series
    .str.split('(')
    .str.get(1) # Access the second element of the list
    .str.replace(')', '', regex=False) # Remove the closing parenthesis
)

print(f"Park Stations Found: {is_park_station.sum()}")
print("Sample Neighborhoods:", neighborhoods.dropna().head(3).values)

Park Stations Found: 1259
Sample Neighborhoods: ['Madison St' 'State St & 16th St' 'State St & 16th St']


Chapter 6: The Timeline (Datetime Methods)
The Business Question: Usage patterns vary by day. We need to convert the raw time strings into actual dates to determine which day of the week has the highest load.

In [22]:
# 1. CONVERSION
# Logic: Coerce errors to NaT (Not a Time) to avoid crashing on garbage data.
# Note: In production, explicitly specify 'format' if known for 10x speedup.
dt_series = pd.to_datetime(time_series, errors='coerce')

# 2. COMPONENT EXTRACTION
# Logic: Extract the day name (Mon/Tue) and the hour (0-23).
day_names = dt_series.dt.day_name()
ride_hours = dt_series.dt.hour

# 3. AGGREGATING BY TIME
# Logic: Count rides per day name.
weekly_traffic = day_names.value_counts()

print("Busiest Day:", weekly_traffic.idxmax())
print("Peak Hour:", ride_hours.mode()[0])

Busiest Day: Tuesday
Peak Hour: 17


0    17
Name: starttime, dtype: int32