In [None]:
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the three ridership exports, relative to the notebook directory
file_path = os.path.join('..', 'assets', 'MTA_Daily_Ridership_Data__Beginning_2020_20241001.csv')
bus_file_path = os.path.join('..', 'assets', 'hourly_bus_ridership.csv')
subway_file_path = os.path.join('..', 'assets', 'hourly_subway_ridership.csv')

# Daily ridership table
df = pd.read_csv(file_path)

# Hourly bus and subway tables, with the 'hour' column parsed into datetimes
hourly_bus_df = pd.read_csv(bus_file_path).assign(
    hour=lambda frame: pd.to_datetime(frame['hour'])
)
hourly_subway_df = pd.read_csv(subway_file_path).assign(
    hour=lambda frame: pd.to_datetime(frame['hour'])
)

# Preview the first rows of both hourly tables
print("Hourly Bus Ridership Data:", hourly_bus_df.head(), sep="\n")
print("\nHourly Subway Ridership Data:", hourly_subway_df.head(), sep="\n")



In [None]:
import pytz
from datetime import timedelta

def _localize_hours(hourly_df):
    """Rewrite hourly_df['hour'] in place as America/New_York timestamps (parsed as UTC first)."""
    utc_hours = pd.to_datetime(hourly_df['hour'], utc=True)
    hourly_df['hour'] = utc_hours.dt.tz_convert('America/New_York')


def _sum_by_day(hourly_df):
    """Sum hourly_df into daily bins shifted by a four-hour offset; return it with a tz-naive 'Date' column."""
    daily = (
        hourly_df
        .set_index('hour')
        .resample('D', offset=timedelta(hours=4))
        .sum()
        .reset_index()
    )
    daily['hour'] = daily['hour'].dt.tz_localize(None)
    return daily.rename(columns={'hour': 'Date'})


# Bus: localize the hourly timestamps, then roll them up to daily totals
_localize_hours(hourly_bus_df)
daily_bus_df = _sum_by_day(hourly_bus_df)

# Subway: same treatment
_localize_hours(hourly_subway_df)
daily_subway_df = _sum_by_day(hourly_subway_df)

# Preview the derived daily tables
print("Daily Bus Ridership Data:", daily_bus_df.head(), sep="\n")
print("\nDaily Subway Ridership Data:", daily_subway_df.head(), sep="\n")

# Show the published daily figures next to them for comparison
print(
    "\nOriginal Daily Data:",
    df[['Date', 'Subways: Total Estimated Ridership', 'Buses: Total Estimated Ridership']].head(),
    sep="\n",
)

# Grand totals from each source, printed in the same order for both modes
print("\nTotal Ridership Comparison:")
for _label, _frame, _column in (
    ("Bus (from hourly data)", daily_bus_df, 'total_ridership'),
    ("Subway (from hourly data)", daily_subway_df, 'total_ridership'),
    ("Bus (from daily data)", df, 'Buses: Total Estimated Ridership'),
    ("Subway (from daily data)", df, 'Subways: Total Estimated Ridership'),
):
    print(f"{_label}: {_frame[_column].sum():,}")


In [None]:


# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Set up the plot style.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# use the renamed 'seaborn-v0_8' sheet when present and fall back on older matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, axs = plt.subplots(3, 1, figsize=(15, 20))
fig.suptitle('Total Estimated Ridership Over Time', fontsize=16)

# One panel per mode, top to bottom: (column in df, panel title)
daily_panels = [
    ('Subways: Total Estimated Ridership', 'Subways: Daily Total Estimated Ridership'),
    ('Buses: Total Estimated Ridership', 'Buses: Daily Total Estimated Ridership'),
    ('LIRR: Total Estimated Ridership', 'LIRR Ridership'),
]
for ax, (column, panel_title) in zip(axs, daily_panels):
    sns.lineplot(x='Date', y=column, data=df, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Date')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


In [None]:
# Convert 'hour' column to datetime for hourly data and remove timezone info
# (tz_localize(None) keeps the local wall-clock time and only drops the zone).
hourly_bus_df['hour'] = pd.to_datetime(hourly_bus_df['hour']).dt.tz_localize(None)
hourly_subway_df['hour'] = pd.to_datetime(hourly_subway_df['hour']).dt.tz_localize(None)

# Ensure 'Date' column in daily_subway_df and daily_bus_df has no timezone info
daily_subway_df['Date'] = pd.to_datetime(daily_subway_df['Date']).dt.tz_localize(None)
daily_bus_df['Date'] = pd.to_datetime(daily_bus_df['Date']).dt.tz_localize(None)

# Parse df['Date'] here as well so this cell does not rely on another cell
# having converted it already; the conversion is a no-op on datetime input.
df['Date'] = pd.to_datetime(df['Date'])

# Set up the plot style.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# use the renamed 'seaborn-v0_8' sheet when present and fall back on older matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, axs = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Comparison of Daily and Hourly-Derived Daily Ridership', fontsize=16)

# Top row: published daily totals overlaid with the totals re-derived from hourly data
daily_panels = [
    (axs[0, 0], 'Subways: Total Estimated Ridership', daily_subway_df,
     'Subway: Daily Total Estimated Ridership'),
    (axs[0, 1], 'Buses: Total Estimated Ridership', daily_bus_df,
     'Bus: Daily Total Estimated Ridership'),
]
for ax, daily_column, derived_df, panel_title in daily_panels:
    sns.lineplot(x='Date', y=daily_column, data=df, ax=ax, label='Daily Data')
    sns.lineplot(x='Date', y='total_ridership', data=derived_df, ax=ax, label='Derived from Hourly')
    ax.set_title(panel_title)
    ax.set_xlabel('Date')
    ax.legend()

# Bottom row: the raw hourly series
hourly_panels = [
    (axs[1, 0], hourly_subway_df, 'Subway: Hourly Total Estimated Ridership'),
    (axs[1, 1], hourly_bus_df, 'Bus: Hourly Total Estimated Ridership'),
]
for ax, hourly_df, panel_title in hourly_panels:
    sns.lineplot(x='hour', y='total_ridership', data=hourly_df, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Date')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Convert 'hour' column to datetime for hourly data and remove timezone info
# (tz_localize(None) keeps the local wall-clock time and only drops the zone).
hourly_bus_df['hour'] = pd.to_datetime(hourly_bus_df['hour']).dt.tz_localize(None)
hourly_subway_df['hour'] = pd.to_datetime(hourly_subway_df['hour']).dt.tz_localize(None)

# Ensure 'Date' column in daily_subway_df and daily_bus_df has no timezone info
daily_subway_df['Date'] = pd.to_datetime(daily_subway_df['Date']).dt.tz_localize(None)
daily_bus_df['Date'] = pd.to_datetime(daily_bus_df['Date']).dt.tz_localize(None)

# Convert 'Date' column in df to datetime without timezone
df['Date'] = pd.to_datetime(df['Date']).dt.tz_localize(None)

# Set up the plot style.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# use the renamed 'seaborn-v0_8' sheet when present and fall back on older matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, axs = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Comparison of Daily and Hourly-Derived Daily Ridership', fontsize=16)

# Top row: (axes, column in df, hourly-derived daily frame, panel title)
comparison_specs = (
    (axs[0, 0], 'Subways: Total Estimated Ridership', daily_subway_df,
     'Subway: Daily Total Estimated Ridership'),
    (axs[0, 1], 'Buses: Total Estimated Ridership', daily_bus_df,
     'Bus: Daily Total Estimated Ridership'),
)
for ax, published_column, derived_daily_df, panel_title in comparison_specs:
    # Published daily series first, then the one summed up from hourly data
    sns.lineplot(x='Date', y=published_column, data=df, ax=ax, label='Daily Data')
    sns.lineplot(x='Date', y='total_ridership', data=derived_daily_df, ax=ax,
                 label='Derived from Hourly')
    ax.set_title(panel_title)
    ax.set_xlabel('Date')
    ax.legend()

# Bottom row: (axes, hourly frame, panel title)
hourly_specs = (
    (axs[1, 0], hourly_subway_df, 'Subway: Hourly Total Estimated Ridership'),
    (axs[1, 1], hourly_bus_df, 'Bus: Hourly Total Estimated Ridership'),
)
for ax, hourly_frame, panel_title in hourly_specs:
    sns.lineplot(x='hour', y='total_ridership', data=hourly_frame, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Date')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Import required libraries
from statsmodels.tsa.seasonal import MSTL
import numpy as np

def perform_mstl(data, periods=(7, 365)):
    """Fit an MSTL decomposition to ``data`` and return the fitted result.

    Parameters
    ----------
    data : array-like
        The series to decompose; passed unchanged to ``MSTL``.
    periods : int or sequence of int, default (7, 365)
        Seasonal period lengths, in number of observations, forwarded to
        ``MSTL`` as its ``periods`` argument.

    Returns
    -------
    The object returned by ``MSTL(data, periods=periods).fit()``.
    """
    # The default is a tuple rather than a list so the shared default object
    # can never be mutated between calls.
    return MSTL(data, periods=periods).fit()

# Decompose the hourly subway series using a 24-observation and a 168-observation season
subway_decomposition = perform_mstl(
    data=hourly_subway_df['total_ridership'],
    periods=[24, 24 * 7],
)

# Same decomposition for the hourly bus series
bus_decomposition = perform_mstl(
    data=hourly_bus_df['total_ridership'],
    periods=[24, 24 * 7],
)

# Plot the decomposition results
def plot_decomposition(decomposition, title, df, n_outliers=20):
    """Plot the decomposition components over time and flag the largest residuals.

    Four stacked panels are drawn (observed, trend, seasonal, resid), each
    against ``df['hour']``. On the residual panel the ``n_outliers`` residuals
    with the largest magnitude are marked with red dots, and their times and
    values are printed.

    Parameters
    ----------
    decomposition : object
        Must expose ``observed``, ``trend``, ``seasonal`` and ``resid``
        attributes; ``resid`` must be a pandas Series.
    title : str
        Figure title, also used in the printed outlier heading.
    df : pandas.DataFrame
        Frame with an 'hour' column. Its rows are paired with the component
        values by position, the same way the line plots pair them.
    n_outliers : int, default 20
        How many of the largest-magnitude residuals to highlight and print.

    Returns
    -------
    None
    """
    fig, axs = plt.subplots(4, 1, figsize=(15, 20))
    fig.suptitle(title, fontsize=16)

    for ax, component in zip(axs, ('observed', 'trend', 'seasonal', 'resid')):
        values = getattr(decomposition, component)
        ax.plot(df['hour'], values)
        ax.set_title(component.capitalize())
        ax.set_xlabel('Date')

        if component != 'resid':
            continue

        # Rank residuals by magnitude, but work with row positions so the
        # lookup is correct whatever index labels the series carries.
        magnitudes = np.abs(values).reset_index(drop=True)
        positions = magnitudes.nlargest(n_outliers).index

        # Keep the signed residuals: markers must sit on the plotted curve,
        # including for large negative residuals.
        largest = values.iloc[positions]
        outlier_times = df['hour'].iloc[positions]

        # Plot the outliers as red dots
        ax.scatter(outlier_times, largest, color='red', s=50)

        # Print the times of the outliers, largest magnitude first
        print(f"\n{n_outliers} biggest outliers for {title}:")
        for time, value in zip(outlier_times, largest):
            print(f"Time: {time}, Residual: {value}")

    plt.tight_layout()
    plt.show()

# Draw the component plots for each mode: subway first, then bus
for _decomposition, _title, _hourly_df in (
    (subway_decomposition, 'Subway Ridership - MSTL Decomposition', hourly_subway_df),
    (bus_decomposition, 'Bus Ridership - MSTL Decomposition', hourly_bus_df),
):
    plot_decomposition(_decomposition, _title, _hourly_df)


In [None]:
# Read in hourly weather data (path built with os.path.join like the other inputs)
weather_file_path = os.path.join('..', 'assets', 'nyc_hourly_weather_2023_10_08.csv')
weather_df = pd.read_csv(weather_file_path)
weather_df['time'] = pd.to_datetime(weather_df['time'])

# Find the earliest datetime in the weather data
earliest_weather_datetime = weather_df['time'].min()

# Drop any timezone from the ridership timestamps before comparing, so this cell
# does not depend on an earlier cell having stripped it already. tz_localize(None)
# keeps the wall-clock time and leaves already-naive timestamps unchanged.
# NOTE(review): assumes weather_df['time'] parses as timezone-naive — confirm.
subway_hours = pd.to_datetime(hourly_subway_df['hour']).dt.tz_localize(None)
bus_hours = pd.to_datetime(hourly_bus_df['hour']).dt.tz_localize(None)

# Cutoff the hourly ridership data before the earliest hour in the weather data
hourly_subway_df = hourly_subway_df.assign(hour=subway_hours).loc[
    subway_hours >= earliest_weather_datetime
]
hourly_bus_df = hourly_bus_df.assign(hour=bus_hours).loc[
    bus_hours >= earliest_weather_datetime
]

# Plot the precipitation weather data over time
plt.figure(figsize=(15, 6))
plt.plot(weather_df['time'], weather_df['precip_in'], label='Precipitation')
plt.title('Hourly Precipitation in NYC')
plt.xlabel('Date')
# The plotted column is 'precip_in', so the axis is labelled in inches
plt.ylabel('Precipitation (in)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Print some information about the data
print(f"Weather data starts from: {earliest_weather_datetime}")
print(f"Number of hours in weather data: {len(weather_df)}")
print(f"Number of hours in subway data after cutoff: {len(hourly_subway_df)}")
print(f"Number of hours in bus data after cutoff: {len(hourly_bus_df)}")



In [None]:
def _merge_residuals_with_weather(hourly_df, decomposition, weather_df_lagged, lag):
    """Pair each ridership hour's MSTL residual with the weather row carrying the same timestamp."""
    residual_df = pd.DataFrame({'hour': hourly_df['hour'], 'residual': decomposition.resid})
    merged = pd.merge(residual_df, weather_df_lagged, left_on='hour', right_on='time')
    # The merge only keeps matching hours; for a positive lag the first and
    # last `lag` merged rows are additionally dropped.
    return merged.iloc[lag:-lag] if lag > 0 else merged


def _scatter_residuals(subway_weather_df, bus_weather_df, column, heading, unit, lag):
    """Scatter one weather column against the subway and bus residuals on a single figure."""
    plt.figure(figsize=(12, 6))
    plt.scatter(subway_weather_df[column], subway_weather_df['residual'], alpha=0.5, label='Subway')
    plt.scatter(bus_weather_df[column], bus_weather_df['residual'], alpha=0.5, label='Bus')
    plt.xlabel(f'{heading} ({unit})')
    plt.ylabel('MSTL Residual')
    plt.title(f'{heading} vs. MSTL Residual (lag = {lag} hour(s))')
    plt.legend()
    plt.tight_layout()
    plt.show()


def analyze_weather_impact(hourly_subway_df, hourly_bus_df, weather_df, subway_decomposition, bus_decomposition, lag=0):
    """Relate MSTL residuals of subway and bus ridership to hourly weather.

    The weather timestamps are shifted forward by ``lag`` hours, so each
    ridership hour is matched with the weather observed ``lag`` hours earlier.
    The function then prints and plots:

    * correlations between each numeric weather variable and the residuals
      (table and heatmap, sorted by absolute subway correlation);
    * mean residual per wind direction and per weather condition;
    * scatter plots of residuals against temperature, precipitation, cloud
      cover and humidity.

    Parameters
    ----------
    hourly_subway_df, hourly_bus_df : pandas.DataFrame
        Hourly ridership frames with an 'hour' column. Residuals are attached
        by index alignment with the corresponding ``decomposition.resid``.
    weather_df : pandas.DataFrame
        Hourly weather with a 'time' column, the numeric columns listed in
        ``weather_vars`` below, and 'wind_dir' and 'condition_text'. It is
        copied, not modified.
    subway_decomposition, bus_decomposition : object
        Fitted decompositions exposing a ``resid`` attribute.
    lag : int, default 0
        Hours by which weather leads ridership.

    Returns
    -------
    None
    """
    # Shift weather data by the specified lag (on a copy, leaving the caller's frame intact)
    weather_df_lagged = weather_df.copy()
    weather_df_lagged['time'] = weather_df_lagged['time'] + pd.Timedelta(hours=lag)

    # Merge weather data with residuals from MSTL decomposition
    subway_weather_df = _merge_residuals_with_weather(
        hourly_subway_df, subway_decomposition, weather_df_lagged, lag
    )
    bus_weather_df = _merge_residuals_with_weather(
        hourly_bus_df, bus_decomposition, weather_df_lagged, lag
    )

    # List of weather variables to analyze
    weather_vars = ['temp_c', 'temp_f', 'precip_mm', 'precip_in', 'humidity', 'wind_kph', 'wind_mph',
                    'pressure_mb', 'pressure_in', 'cloud', 'feelslike_c', 'feelslike_f', 'uv',
                    'gust_kph', 'gust_mph']

    # Correlation of every weather variable with the residuals, one column per mode
    correlation_df = pd.DataFrame({
        'Subway Residual Correlation':
            subway_weather_df[weather_vars + ['residual']].corr()['residual'].drop('residual'),
        'Bus Residual Correlation':
            bus_weather_df[weather_vars + ['residual']].corr()['residual'].drop('residual'),
    })

    # Sort by absolute correlation values for subway
    subway_order = correlation_df['Subway Residual Correlation'].abs().sort_values(ascending=False).index
    correlation_df = correlation_df.reindex(subway_order)

    # Display the correlations
    print(f"Correlations between weather variables and MSTL residuals (lag = {lag} hour(s)):")
    print(correlation_df)

    # Visualize the correlations
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_df, annot=True, cmap='coolwarm', center=0)
    plt.title(f'Correlation between Weather Variables and MSTL Residuals (lag = {lag} hour(s))')
    plt.tight_layout()
    plt.show()

    # Analyze categorical variables: wind_dir and condition_text
    for merged_df, transport_mode in [(subway_weather_df, 'Subway'), (bus_weather_df, 'Bus')]:
        print(f"\n{transport_mode} Residuals by Wind Direction (lag = {lag} hour(s)):")
        print(merged_df.groupby('wind_dir')['residual'].mean().sort_values(ascending=False))

        print(f"\n{transport_mode} Residuals by Weather Condition (lag = {lag} hour(s)):")
        print(merged_df.groupby('condition_text')['residual'].mean().sort_values(ascending=False))

    # One scatter figure per weather variable of interest: (column, heading, unit)
    scatter_specs = [
        ('temp_c', 'Temperature', '°C'),
        ('precip_mm', 'Precipitation', 'mm'),
        ('cloud', 'Cloud Cover', '%'),
        ('humidity', 'Humidity', '%'),
    ]
    for column, heading, unit in scatter_specs:
        _scatter_residuals(subway_weather_df, bus_weather_df, column, heading, unit, lag)

# Example run: pair each ridership hour with the weather from one hour earlier
analyze_weather_impact(
    hourly_subway_df=hourly_subway_df,
    hourly_bus_df=hourly_bus_df,
    weather_df=weather_df,
    subway_decomposition=subway_decomposition,
    bus_decomposition=bus_decomposition,
    lag=1,
)
