In [1]:
import pytz, os, sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(os.path.join(os.getcwd(), '..'))

from scripts.decomposition import perform_mstl
from scripts.correlate import analyze_weather_impact, analyze_all_conditions
from scripts.ingest import build_mta_df, get_combined_residuals_df
from scripts.filter import filter_df_by_hour_range, filter_date_range, split_df_at_datetime
from datetime import datetime, timedelta
from scripts.model import *

In [2]:
# The data files live in an `assets` folder one level above the working directory.
assets_path = os.path.join(os.getcwd(), '..', 'assets')

# Build the three CSV paths in the positional order build_mta_df is called with:
# subway ridership, bus ridership, weather.
subway_csv_path, bus_csv_path, weather_csv_path = (
    os.path.join(assets_path, csv_name)
    for csv_name in (
        'hourly_subway_ridership.csv',
        'hourly_bus_ridership.csv',
        'nyc_hourly_weather.csv',
    )
)

hourly_subway_df, hourly_bus_df, weather_df = build_mta_df(
    subway_csv_path, bus_csv_path, weather_csv_path
)

In [None]:
def _style_ridership_axis(ax, title, xlabel):
    """Apply the title, axis labels, legend and tick styling shared by every ridership panel.

    Must be called after the lines have been drawn on ``ax`` so the legend
    picks up the hue entries.
    """
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(xlabel, fontsize=16)
    ax.set_ylabel('Total Estimated Ridership', fontsize=16)
    ax.legend(title='Transportation Type', title_fontsize='16', fontsize='14')
    ax.tick_params(axis='x', rotation=45, labelsize=16)
    ax.tick_params(axis='y', labelsize=16)


# Combine subway and bus hourly data
combined_hourly_df = pd.concat([hourly_subway_df, hourly_bus_df])

# Aggregate hourly data to daily totals per transportation type.
# The grouping key is the calendar date of 'hour'; the resulting column keeps
# the name 'hour', which lets the daily plot reuse x='hour' below.
daily_combined_df = (
    combined_hourly_df
    .groupby(['transportation', combined_hourly_df['hour'].dt.date])['total_ridership']
    .sum()
    .reset_index()
)
daily_combined_df['hour'] = pd.to_datetime(daily_combined_df['hour'])

# Set up the plots with larger font sizes (this changes the global matplotlib default)
plt.rcParams.update({'font.size': 14})

fig1, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))
fig1.suptitle('Hourly Subway and Bus Ridership', fontsize=24)

# Panel 1: subway and bus ridership for all hourly data
sns.lineplot(x='hour', y='total_ridership', hue='transportation', data=combined_hourly_df, ax=ax1)
_style_ridership_axis(ax1, 'All Hourly Data', 'Date and Time')

# Fixed two-week sample window used for the zoomed panel.
# The end bound is exclusive, so the last plotted hour is 23:00 on June 14.
start_month = pd.Timestamp('2023-06-01')
end_month = pd.Timestamp('2023-06-15')

# Filter data for the sample window
random_month_data = combined_hourly_df[
    (combined_hourly_df['hour'] >= start_month) & (combined_hourly_df['hour'] < end_month)
]

# Panel 2: subway and bus ridership for the sample window
sns.lineplot(x='hour', y='total_ridership', hue='transportation', data=random_month_data, ax=ax2)
_style_ridership_axis(ax2, 'June 1st - 15th 2023', 'Date and Time')

plt.tight_layout()
plt.show()

# Create a new figure for the daily aggregated data
fig2, ax3 = plt.subplots(figsize=(15, 8))
fig2.suptitle('Daily Subway and Bus Ridership', fontsize=24)

# Plot subway and bus ridership for all daily aggregated data
sns.lineplot(x='hour', y='total_ridership', hue='transportation', data=daily_combined_df, ax=ax3)
_style_ridership_axis(ax3, 'All Daily Aggregated Data', 'Date')

plt.tight_layout()
plt.show()


Sanity check to make sure the daily and hourly data align with each other

In [None]:
# Seasonal periods passed to MSTL, in samples. The frames are hourly, so these
# are 24 (one day) and 24 * 7 (one week).
mstl_periods = (24, 24*7)

# Decompose each ridership series with the same periods; a fresh list is
# passed per call so the two calls never share a mutable argument.
subway_decomposition = perform_mstl(
    hourly_subway_df['total_ridership'],
    periods=list(mstl_periods),
)
bus_decomposition = perform_mstl(
    hourly_bus_df['total_ridership'],
    periods=list(mstl_periods),
)

# Plot the decomposition results
def plot_decomposition(decomposition, title, df):
    """Plot the trend, seasonal and residual components of a decomposition.

    Draws one figure with three stacked panels and shows it.

    Parameters
    ----------
    decomposition : object
        Result object exposing ``trend``, ``seasonal`` and ``resid`` attributes.
        Each component is plotted against ``df['hour']`` positionally, so it
        must have one row per row of ``df``.
    title : str
        Figure-level title.
    df : pandas.DataFrame
        Frame whose ``'hour'`` column supplies the x values.

    Returns
    -------
    None
    """
    components = ['trend', 'seasonal', 'resid']

    fig, axs = plt.subplots(len(components), 1, figsize=(15, 15))
    fig.suptitle(title, fontsize=16)

    # The previous version also computed the 20 largest absolute residuals here
    # but never used them; that dead work (which additionally required the
    # residual to be a pandas Series) has been dropped.
    for ax, component in zip(axs, components):
        ax.plot(df['hour'], getattr(decomposition, component))
        ax.set_title(component.capitalize(), fontsize=16)
        ax.set_xlabel('Date', fontsize=16)
        ax.set_ylabel('Value', fontsize=16)
        ax.tick_params(axis='both', which='major', labelsize=16)

    plt.tight_layout()
    plt.show()

# Render the full-range decomposition figure for each mode, subway first.
for mode_decomposition, figure_title, mode_hourly_df in (
    (subway_decomposition, 'Subway Ridership - MSTL Decomposition', hourly_subway_df),
    (bus_decomposition, 'Bus Ridership - MSTL Decomposition', hourly_bus_df),
):
    plot_decomposition(mode_decomposition, figure_title, mode_hourly_df)


In [None]:
# Window used to zoom in on the trend, seasonality and residual of both modes:
# June 1 to June 15, 2023.
start_date, end_date = '2023-06-01', '2023-06-15'

def plot_zoomed_decomposition(decomposition, df, title, start_date, end_date):
    """Plot the decomposition components restricted to a date window.

    Draws one figure with three stacked panels (trend, seasonal, residual)
    containing only the rows of ``df`` whose ``'hour'`` lies in
    ``[start_date, end_date]``, then shows it.

    Parameters
    ----------
    decomposition : object
        Result object exposing ``trend``, ``seasonal`` and ``resid`` attributes,
        each with one row per row of ``df`` (same order).
    df : pandas.DataFrame
        Frame whose ``'hour'`` column supplies the x values and the filter key.
    title : str
        Prefix of the figure-level title.
    start_date, end_date : str or timestamp-like
        Inclusive bounds compared against ``df['hour']``. Note that a date-only
        upper bound such as ``'2023-06-15'`` compares as midnight, so only the
        first instant of that day is included.

    Returns
    -------
    None
    """
    in_window = (df['hour'] >= start_date) & (df['hour'] <= end_date)
    zoomed_df = df[in_window]

    # Components are paired with df by position (as in the un-zoomed plot), so
    # apply the mask positionally too. Indexing a pandas component with the
    # boolean Series itself aligns on index labels and raises when the
    # component's index differs from df's.
    row_selector = in_window.to_numpy()

    components = ['trend', 'seasonal', 'resid']

    fig, axs = plt.subplots(len(components), 1, figsize=(15, 15))
    fig.suptitle(f'{title} - Zoomed Decomposition ({start_date} to {end_date})', fontsize=16)

    for ax, component in zip(axs, components):
        component_values = np.asarray(getattr(decomposition, component))
        ax.plot(zoomed_df['hour'], component_values[row_selector])
        ax.set_title(component.capitalize(), fontsize=16)
        ax.set_xlabel('Date', fontsize=16)
        ax.set_ylabel('Ridership', fontsize=16)

        # Rotate x-axis labels for better readability
        ax.tick_params(axis='x', rotation=45, labelsize=16)
        ax.tick_params(axis='y', labelsize=16)

    plt.tight_layout()
    plt.show()

# Render the zoomed decomposition figure for each mode, subway first.
for mode_decomposition, mode_hourly_df, mode_title in (
    (subway_decomposition, hourly_subway_df, 'Subway Ridership'),
    (bus_decomposition, hourly_bus_df, 'Bus Ridership'),
):
    plot_zoomed_decomposition(mode_decomposition, mode_hourly_df, mode_title, start_date, end_date)


Read in weather data

In [None]:
# Plot the precipitation weather data over time (explicit figure/axes interface)
weather_fig, weather_ax = plt.subplots(figsize=(15, 6))
weather_ax.plot(weather_df['time'], weather_df['Precipitation (in)'], label='Precipitation')
weather_ax.set_title('Hourly Precipitation in NYC')
weather_ax.set_xlabel('Date')
weather_ax.set_ylabel('Precipitation (in)')
weather_ax.legend()
weather_ax.grid(True)
plt.tight_layout()
plt.show()

# Report the row counts of the three frames so mismatched coverage is easy to spot
weather_rows = len(weather_df)
subway_rows = len(hourly_subway_df)
bus_rows = len(hourly_bus_df)
print(f"Number of hours in weather data: {weather_rows}")
print(f"Number of hours in subway data after cutoff: {subway_rows}")
print(f"Number of hours in bus data after cutoff: {bus_rows}")

In [7]:
# Build the frame every later analysis cell reads from. Later cells use its
# 'hour', 'season', 'is_weekend', 'subway_residual' and 'bus_residual' columns
# together with the weather variables.
combined_df = get_combined_residuals_df(
    hourly_subway_df,
    hourly_bus_df,
    weather_df,
    subway_decomposition,
    bus_decomposition,
)

In [None]:
# Date window for the analysis, given as 'MM-DD' strings; this one spans the whole year.
start_date, end_date = '01-01', '12-31'
# Alternative settings kept for quick experiments:
#   start_date, end_date = '12-01', '02-28'
#   hours = (8, 12)

# NOTE(review): None presumably means "do not filter on this dimension" -
# confirm against analyze_weather_impact in scripts.correlate.
hours = None
is_weekend = None

result_df = analyze_weather_impact(
    combined_df,
    lag=0,
    start_date=start_date,
    end_date=end_date,
    hour_range=hours,
    is_weekend=is_weekend,
    plot=True,
)


In [None]:
# Calculate correlations
correlations_df = analyze_all_conditions(combined_df, lag=0)

# Figure 1: a 2x2 grid, one panel per (mode, day type), one line per weather variable.
# Panels are numbered row by row: Subway fills the top row, Bus the bottom row.
panel_specs = [
    ('Subway', 'Weekday'),
    ('Subway', 'Weekend'),
    ('Bus', 'Weekday'),
    ('Bus', 'Weekend'),
]

plt.figure(figsize=(24, 20))
for panel_number, (mode, day_type) in enumerate(panel_specs, start=1):
    plt.subplot(2, 2, panel_number)
    is_mode = correlations_df['Transportation Type'] == mode
    is_day_type = correlations_df['Day Type'] == day_type
    data = correlations_df[is_mode & is_day_type]
    sns.lineplot(
        data=data,
        x='Season_Hour',
        y='Correlation',
        hue='Weather Variable',
        markers=True,
        dashes=False,
        errorbar=None,
    )
    plt.title(f'{mode} - {day_type} Correlations')
    plt.xlabel('Season and Hour Segment')
    plt.ylabel('Correlation')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=90)

plt.tight_layout()
plt.show()

# Figure 2: one stacked panel per weather variable, one line per (mode, day type).
variables = correlations_df['Weather Variable'].unique()

fig, axs = plt.subplots(len(variables), 1, figsize=(20, 8*len(variables)))
fig.suptitle('Weather Variable Correlations Across Modes and Day Types', fontsize=16)

# plt.subplots returns a bare Axes (not an array) when there is a single row.
panel_axes = axs if len(variables) > 1 else [axs]

for ax, variable in zip(panel_axes, variables):
    is_variable = correlations_df['Weather Variable'] == variable

    for mode in ['Subway', 'Bus']:
        for day_type in ['Weekday', 'Weekend']:
            data = correlations_df[
                (correlations_df['Transportation Type'] == mode)
                & (correlations_df['Day Type'] == day_type)
                & is_variable
            ]
            sns.lineplot(
                data=data,
                x='Season_Hour',
                y='Correlation',
                label=f'{mode} - {day_type}',
                markers=True,
                dashes=False,
                ax=ax,
            )

    ax.set_title(f'{variable} Correlation')
    ax.set_xlabel('Season and Hour Segment')
    ax.set_ylabel('Correlation')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Pin the ticks at their current positions before re-applying the labels
    # with rotation, so the labels cannot drift away from their ticks.
    current_labels = ax.get_xticklabels()
    x_ticks = np.arange(len(current_labels))
    x_labels = [tick_label.get_text() for tick_label in current_labels]
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=14)

    ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# Marker used for each season's line in the per-season figures
season_markers = dict(Winter='o', Spring='s', Summer='^', Fall='D')

# Raise the default font size for every text element drawn from here on
plt.rcParams.update({'font.size': 16})

# Correlation table and the list of distinct weather variables found in it
correlations_df = analyze_all_conditions(combined_df, lag=0)
variables = correlations_df['Weather Variable'].unique()

# Function to convert hour segment to military time
def to_military_time(hour_segment):
    """Format an hour segment such as ``'8-12'`` as ``'0800-1200'``.

    ``hour_segment`` must be a string holding exactly two integers separated by
    a single ``'-'``; each is zero-padded to two digits and suffixed with ``00``.
    """
    first_hour, last_hour = (int(part) for part in hour_segment.split('-'))
    return '{:02d}00-{:02d}00'.format(first_hour, last_hour)

# One figure per (day type, weather variable): Subway on the left, Bus on the
# right, one line per season.
season_order = ['Winter', 'Spring', 'Summer', 'Fall']

for day_type in ['Weekday', 'Weekend']:
    day_type_correlations = correlations_df[correlations_df['Day Type'] == day_type]

    for variable in variables:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
        fig.suptitle(f'{variable} Correlations on {day_type}s', fontsize=20)

        for mode, ax in (('Subway', ax1), ('Bus', ax2)):
            is_mode = day_type_correlations['Transportation Type'] == mode
            is_variable = day_type_correlations['Weather Variable'] == variable
            data = day_type_correlations[is_mode & is_variable]

            # Shade the |correlation| <= 0.2 band
            ax.axhspan(-0.2, 0.2, facecolor='gray', alpha=0.2)

            for season in season_order:
                sns.lineplot(
                    data=data[data['Season'] == season],
                    x='Hour Segment',
                    y='Correlation',
                    label=season,
                    marker=season_markers[season],
                    markersize=10,
                    linewidth=2,
                    ax=ax,
                )

            ax.set_title(f'{mode} - {variable} Correlation', fontsize=18)
            ax.set_xlabel('Time of Day', fontsize=16)
            ax.set_ylabel('Correlation', fontsize=16)
            ax.legend(title='Season', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14, title_fontsize=16)

            # Relabel the x axis in military time at fixed integer positions,
            # one per distinct hour segment in order of first appearance.
            hour_segments = data['Hour Segment'].unique()
            ax.set_xticks(np.arange(len(hour_segments)))
            ax.set_xticklabels(
                [to_military_time(segment) for segment in hour_segments],
                rotation=45, ha='right', fontsize=14,
            )

            ax.tick_params(axis='both', which='major', labelsize=14)

            ax.grid(True, linestyle='--', alpha=0.7)

            # Fixed y range keeps the shaded band visible and panels comparable
            ax.set_ylim(-0.65, 0.65)

        plt.tight_layout()
        plt.show()


In [None]:
# Slice under study. Defined once so the row filter, the hour bin and the
# figure title cannot drift apart when one of them is edited.
focus_season = 'Spring'
focus_start_hour, focus_end_hour = 12, 16  # rows with start <= hour < end are kept

# Filter data for weekends within the chosen season and hour window
weekend_mask = (combined_df['is_weekend'] == 1) & \
               (combined_df['hour'].dt.hour >= focus_start_hour) & \
               (combined_df['hour'].dt.hour < focus_end_hour) & \
               (combined_df['season'] == focus_season)
weekend_data = combined_df[weekend_mask].copy()

# Single-bin categorical column: every remaining row gets the same label, which
# is used only as the hue so the scatter points receive a legend entry.
weekend_data['hour_segment'] = pd.cut(weekend_data['hour'].dt.hour,
                                      bins=[focus_start_hour, focus_end_hour],
                                      labels=['Ridership Residual'],
                                      include_lowest=True)

weather_vars = ['Temperature (°F)', 'Precipitation (in)', 'Relative Humidity (%)',
                'Pressure (inHg)', 'Cloud Cover (%)']
modes = ['subway', 'bus']

figure_title = (
    f'Residuals vs Weather Variables during {focus_season} Weekends '
    f'({focus_start_hour:02d}:00-{focus_end_hour:02d}:00)'
)

# One figure per weather variable, with subway on the left and bus on the right
for var in weather_vars:

    fig, axs = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(figure_title, fontsize=16)

    for ax, mode in zip(axs, modes):
        # Residual column of this mode in the combined frame
        residuals = weekend_data[f'{mode}_residual']

        # Create scatter plot
        sns.scatterplot(x=weekend_data[var], y=residuals, hue=weekend_data['hour_segment'],
                        ax=ax, alpha=0.6)

        # Add linear regression line without shaded area
        sns.regplot(x=weekend_data[var], y=residuals, scatter=False, ax=ax,
                    color='red' if mode == 'subway' else 'blue', label='Linear Regression', ci=None)

        ax.set_title(f'{mode.capitalize()} - {var}', fontsize=16)
        ax.set_xlabel(var, fontsize=16)
        ax.set_ylabel('Residuals', fontsize=16)
        ax.legend(fontsize=16)
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.tick_params(axis='both', which='major', labelsize=16)

    plt.tight_layout()
    plt.show()

In [None]:
# Create histograms of residuals for subway and bus
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle('Distribution of Residuals', fontsize=16)

# (axes, residual column, panel title, x-axis limits) for each histogram
histogram_panels = (
    (ax1, 'subway_residual', 'Subway Residuals', (-50000, 50000)),
    (ax2, 'bus_residual', 'Bus Residuals', (-20000, 20000)),
)

for panel_ax, residual_column, panel_title, (x_low, x_high) in histogram_panels:
    sns.histplot(data=combined_df, x=residual_column, ax=panel_ax, bins=400)
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.set_xlabel('Residual Value', fontsize=14)
    panel_ax.set_ylabel('Count', fontsize=14)
    panel_ax.tick_params(axis='both', which='major', labelsize=12)
    # The limits only crop the view; the 400 bins still span the full data range.
    panel_ax.set_xlim(x_low, x_high)
    panel_ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Print mean absolute value of residuals
subway_mean_abs_residual = combined_df['subway_residual'].abs().mean()
bus_mean_abs_residual = combined_df['bus_residual'].abs().mean()
print(f"Mean absolute value of subway residuals: {subway_mean_abs_residual}")
print(f"Mean absolute value of bus residuals: {bus_mean_abs_residual}")

In [None]:

# Conditions to model: (mode, season, day type, hour range)
conditions = [
    ('subway', 'Spring', 'weekend', (12, 14)),
    ('bus', 'Spring', 'weekend', (12, 14)),
]

# Candidate models, keyed by the name shown in the report.
# The same instances are handed to every run_model_analysis call.
models = dict(
    GLM=GLMModel(),
    Quantile=QuantileModel(quantile=0.5),
    Robust=RobustModel(),
    GradientBoosting=GradientBoostingModel(),
)

# Chronological split: train_df and val_df are the two parts returned for the cut-off timestamp
split_timestamp = pd.Timestamp('2023-11-16')
train_df, val_df = split_df_at_datetime(combined_df, split_timestamp)

separator = "-" * 80

for mode, season, day_type, hours in conditions:
    print(f"\nAnalysis for {mode.capitalize()} - {season} {day_type} {hours}")
    print(separator)

    # Fit on the training part and evaluate on the validation part
    results = run_model_analysis(models, train_df, mode, season, day_type, hours, val_df)

    for model_name in results:
        result = results[model_name]
        print(f"\n{model_name} Results:")
        print("Train Metrics:", result['train_metrics'])
        print("Validation Metrics:", result['val_metrics'])
        print("Summary:", result['summary'])



In [None]:
# Create bar plot comparing train vs validation MAE
import matplotlib.pyplot as plt
import numpy as np

# Collect one (label, train MAE, validation MAE) row per model and condition.
# The analysis is run again here, so this cell does not rely on `results`
# left over from an earlier cell.
mae_rows = []
for mode, season, day_type, hours in conditions:
    results = run_model_analysis(models, train_df, mode, season, day_type, hours, val_df)
    mae_rows.extend(
        (f"{model_name}\n({mode})", result['train_metrics']['MAE'], result['val_metrics']['MAE'])
        for model_name, result in results.items()
    )

model_names = [row[0] for row in mae_rows]
train_maes = [row[1] for row in mae_rows]
val_maes = [row[2] for row in mae_rows]

# Grouped bars: train on the left half of each slot, validation on the right half
width = 0.35
x = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(12, 6))
train_bars = ax.bar(x - width/2, train_maes, width, label='Train MAE')
val_bars = ax.bar(x + width/2, val_maes, width, label='Validation MAE')

# Titles, ticks and legend
ax.set_title('Model Performance Comparison: Train vs Validation MAE')
ax.set_ylabel('Mean Absolute Error')
ax.set_xticks(x)
ax.set_xticklabels(model_names)
ax.legend()

# Add value labels on bars
def autolabel(rects):
    """Write each bar's height, rounded to a whole number, just above the bar.

    The annotations are drawn on the notebook-level ``ax``, so this must be
    called after that axes object has been created.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width()/2
        ax.annotate(
            f'{bar_height:.0f}',
            xy=(bar_center_x, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
            rotation=90,
        )

# Label the train bars first, then the validation bars
for bar_group in (train_bars, val_bars):
    autolabel(bar_group)

plt.tight_layout()
plt.show()
