In [None]:
import pandas as pd
import os

In [None]:
# Construct the file path using os.path (relative to the notebook's directory)
file_path = os.path.join('..','assets', 'MTA_Daily_Ridership_Data__Beginning_2020_20241001.csv')

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to verify the data
print(df.head())

# Display basic information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Display summary statistics
print(df.describe())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert 'Date' column to datetime so the x-axis is a real time axis.
# Re-running this cell is safe: converting an already-converted column is a no-op.
df['Date'] = pd.to_datetime(df['Date'])

# Set up the plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# deprecated the old 'seaborn' name (later releases removed it), so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, axs = plt.subplots(3, 1, figsize=(15, 20))
fig.suptitle('Total Estimated Ridership Over Time', fontsize=16)

# Plot for Subways
sns.lineplot(x='Date', y='Subways: Total Estimated Ridership', data=df, ax=axs[0])
axs[0].set_title('Subways: Daily Total Estimated Ridership')
axs[0].set_xlabel('Date')

# Plot for Buses
sns.lineplot(x='Date', y='Buses: Total Estimated Ridership', data=df, ax=axs[1])
axs[1].set_title('Buses: Daily Total Estimated Ridership')
axs[1].set_xlabel('Date')

# Plot for LIRR
sns.lineplot(x='Date', y='LIRR: Total Estimated Ridership', data=df, ax=axs[2])
axs[2].set_title('LIRR Ridership')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


In [None]:
# Restrict the analysis window to observations dated 2020-10-01 or later.
# (Alternative cut-offs tried during exploration: 2022-02-01 and 2023-01-01.)
start_date = pd.Timestamp('2020-10-01')
df_trimmed = df.loc[df['Date'] >= start_date]

import numpy as np
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

# Exponential model used as the target function for curve fitting
def exponential(x, a, b, c):
    """Evaluate ``a * exp(b * x) + c``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the model.
    a, b, c : float
        Scale, rate and vertical offset of the curve.

    Returns
    -------
    Scalar or ndarray
        Model value(s), following numpy's broadcasting of ``x``.
    """
    growth = np.exp(b * x)
    return a * growth + c

# Set up the plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# deprecated the old 'seaborn' name (later releases removed it), so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# One stacked panel per transportation mode
fig, axs = plt.subplots(3, 1, figsize=(15, 20))
fig.suptitle('Total Estimated Ridership Over Time (From October 2020)', fontsize=16)

# Function to fit and plot exponential model
def fit_and_plot_exponential(ax, x, y, label, dates=None):
    """Fit ``exponential`` to ``(x, y)`` and draw the data and the fit on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives the two lines, the title, the x label and the legend.
    x : array-like
        Numeric predictor handed to ``curve_fit`` and to the fitted model.
    y : array-like
        Observed values, one per entry of ``x``.
    label : str
        Prefix used in the panel title.
    dates : array-like, optional
        Values for the horizontal axis, one per observation. When omitted the
        function falls back to the notebook-level ``df_trimmed['Date']``, which
        is what it always used before this parameter existed.

    Returns
    -------
    popt : ndarray
        Fitted ``(a, b, c)`` parameters as returned by ``curve_fit``.

    Notes
    -----
    ``curve_fit`` raises (per the SciPy docs, a ``RuntimeError``) if the
    optimisation does not converge within ``maxfev`` function evaluations.
    """
    # Explicit argument wins; the global is only a backward-compatible default.
    if dates is None:
        dates = df_trimmed['Date']

    # Fit exponential model
    popt, _ = curve_fit(exponential, x, y, p0=[1, 0.001, 1], maxfev=10000)

    # Plot original data
    ax.plot(dates, y, label='Actual Data')

    # Plot fitted exponential model
    ax.plot(dates, exponential(x, *popt), 'r--', label='Exponential Fit')

    ax.set_title(f'{label}: Daily Total Estimated Ridership')
    ax.set_xlabel('Date')
    ax.legend()

    return popt

# Function to fit and plot linear regression
def fit_and_plot_linear(ax, x, y, label, dates=None):
    """Fit a straight line to ``(x, y)`` and draw the data and the fit on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives the two lines, the title, the x label and the legend.
    x : numpy.ndarray
        One-dimensional numeric predictor; it is reshaped to a single-feature
        column before being passed to ``LinearRegression``.
    y : array-like
        Observed values, one per entry of ``x``.
    label : str
        Prefix used in the panel title.
    dates : array-like, optional
        Values for the horizontal axis, one per observation. When omitted the
        function falls back to the notebook-level ``df_trimmed['Date']``, which
        is what it always used before this parameter existed.

    Returns
    -------
    tuple
        ``(slope, intercept, model)`` where ``slope`` is ``model.coef_[0]``,
        ``intercept`` is ``model.intercept_`` and ``model`` is the fitted
        ``LinearRegression`` instance.
    """
    # Explicit argument wins; the global is only a backward-compatible default.
    if dates is None:
        dates = df_trimmed['Date']

    # Build the (n_samples, 1) feature matrix once and reuse it for fit and predict.
    features = x.reshape(-1, 1)

    # Fit linear regression
    model = LinearRegression()
    model.fit(features, y)

    # Plot original data
    ax.plot(dates, y, label='Actual Data')

    # Plot fitted linear model
    ax.plot(dates, model.predict(features), 'r--', label='Linear Fit')

    ax.set_title(f'{label}: Daily Total Estimated Ridership')
    ax.set_xlabel('Date')
    ax.legend()

    return model.coef_[0], model.intercept_, model

# Express every observation as whole days elapsed since the first trimmed date,
# giving the fitting routines a plain numeric predictor.
first_date = df_trimmed['Date'].min()
x = (df_trimmed['Date'] - first_date).dt.days.values

# Pull the three ridership series out as plain arrays
y_subway = df_trimmed['Subways: Total Estimated Ridership'].values
y_buses = df_trimmed['Buses: Total Estimated Ridership'].values
y_lirr = df_trimmed['LIRR: Total Estimated Ridership'].values

# Subways (top panel) and Buses (middle panel) get the exponential model
popt_subway = fit_and_plot_exponential(axs[0], x, y_subway, 'Subways')
popt_buses = fit_and_plot_exponential(axs[1], x, y_buses, 'Buses')

# LIRR (bottom panel) gets a straight-line fit; the first returned value is the slope
popt_lirr, intercept_lirr, model_lirr = fit_and_plot_linear(axs[2], x, y_lirr, 'LIRR')

# Tidy the layout and render the figure
plt.tight_layout()
plt.show()

# Report the fitted parameters for each mode
print("Subway parameters:", popt_subway)
print("Bus parameters:", popt_buses)
print("LIRR parameters (slope, intercept):", popt_lirr, intercept_lirr)


In [None]:
# Perform STL decomposition for each transportation mode
from statsmodels.tsa.seasonal import STL

def perform_stl(data, period=7, trend_window=201, trend_deg=0, seasonal_window=201, seasonal_deg=1, robust=False):
    """Run a statsmodels ``STL`` decomposition on ``data`` and return the fitted result.

    Parameters
    ----------
    data : array-like
        Series to decompose.
    period : int
        Forwarded as ``STL``'s ``period``.
    trend_window : int
        Forwarded as ``STL``'s ``trend``.
    trend_deg : int
        Forwarded as ``STL``'s ``trend_deg``.
    seasonal_window : int
        Forwarded as ``STL``'s ``seasonal``.
    seasonal_deg : int
        Forwarded as ``STL``'s ``seasonal_deg``.
    robust : bool
        Forwarded as ``STL``'s ``robust``.

    Returns
    -------
    The object returned by ``STL(...).fit()``; the notebook reads its
    ``trend``, ``seasonal`` and ``resid`` attributes.
    """
    # Translate this helper's argument names into the keyword names STL expects.
    stl_options = dict(
        period=period,
        trend=trend_window,
        trend_deg=trend_deg,
        seasonal=seasonal_window,
        seasonal_deg=seasonal_deg,
        robust=robust,
    )
    return STL(data, **stl_options).fit()

# Create a new figure for plotting STL decomposition (one row per mode, one column per component)
fig, axs = plt.subplots(3, 4, figsize=(20, 15))

# List of transportation modes and their ridership arrays, in matching order
modes = ['Subway', 'Buses', 'LIRR']
data_list = [y_subway, y_buses, y_lirr]

# Decompose every series exactly once; the plots and the statistics below share
# these results instead of re-running STL for each use.
stl_results = [perform_stl(series) for series in data_list]

# The panels are labelled 'Date', so plot against the real dates rather than the
# integer day offsets in `x` (this also matches the MSTL figure further down).
plot_dates = df_trimmed['Date']

for i, (mode, data, stl_result) in enumerate(zip(modes, data_list, stl_results)):
    # Plot original data
    axs[i, 0].plot(plot_dates, data)
    axs[i, 0].set_title(f'{mode} - Original')
    axs[i, 0].set_xlabel('Date')
    axs[i, 0].set_ylabel('Ridership')

    # Plot trend component
    axs[i, 1].plot(plot_dates, stl_result.trend)
    axs[i, 1].set_title(f'{mode} - Trend')
    axs[i, 1].set_xlabel('Date')
    axs[i, 1].set_ylabel('Trend')

    # Plot seasonal component
    axs[i, 2].plot(plot_dates, stl_result.seasonal)
    axs[i, 2].set_title(f'{mode} - Seasonal')
    axs[i, 2].set_xlabel('Date')
    axs[i, 2].set_ylabel('Seasonal')

    # Plot residual component
    axs[i, 3].plot(plot_dates, stl_result.resid)
    axs[i, 3].set_title(f'{mode} - Residual')
    axs[i, 3].set_xlabel('Date')
    axs[i, 3].set_ylabel('Residual')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

# Print some statistics about the decomposition.
# Each strength is 1 - Var(resid) / Var(component + resid).
# The loop variable is deliberately still called `stl_result`: after this cell it
# holds the decomposition of the last mode, which the ACF cell below reads.
for mode, stl_result in zip(modes, stl_results):
    resid_var = np.var(stl_result.resid)
    print(f"\n{mode} STL Decomposition:")
    print(f"Trend strength: {1 - resid_var / np.var(stl_result.trend + stl_result.resid):.4f}")
    print(f"Seasonality strength: {1 - resid_var / np.var(stl_result.seasonal + stl_result.resid):.4f}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_residuals_with_outliers(data, mode, n_outliers=20):
    """Plot STL residuals of ``data`` and highlight the largest ones.

    Parameters
    ----------
    data : array-like
        Series to decompose with ``perform_stl``; expected to line up
        one-to-one with the rows of the notebook-level ``df_trimmed``.
    mode : str
        Name used in the figure title and in the printed listing.
    n_outliers : int, default 20
        How many residuals (largest absolute value) to mark. Zero or a
        negative number marks none; a number larger than the series marks all.

    Returns
    -------
    None
        The function shows a figure and prints the marked dates and residuals.
    """
    # Perform STL decomposition
    stl_result = perform_stl(data)
    # Force a plain array so the integer indexing below is always positional,
    # whether STL handed back an ndarray or a pandas Series.
    residuals = np.asarray(stl_result.resid)
    dates = df_trimmed['Date']

    # Indices sorted by ascending |residual|; the tail holds the largest ones.
    # The slice start is computed explicitly because `order[-0:]` would return
    # the whole array, i.e. n_outliers=0 used to flag every point.
    order = np.argsort(np.abs(residuals))
    n_selected = min(max(int(n_outliers), 0), len(order))
    largest_residuals_idx = order[len(order) - n_selected:]

    # Ensure indices are within bounds of the date axis
    valid_indices = largest_residuals_idx[largest_residuals_idx < len(dates)]

    # Create the plot
    plt.figure(figsize=(15, 6))
    plt.plot(dates, residuals, label='Residuals')
    plt.scatter(dates.iloc[valid_indices], residuals[valid_indices],
                color='red', s=50, label=f'Top {len(valid_indices)} Outliers')

    # Customize the plot
    plt.title(f'{mode} - Residuals with Top Outliers')
    plt.xlabel('Date')
    plt.ylabel('Residual Value')
    plt.legend()

    # Rotate and align the tick labels so they look better
    plt.gcf().autofmt_xdate()

    # Use tight layout to prevent clipping of tick-labels
    plt.tight_layout()

    # Show the plot
    plt.show()

    # Print the dates of the outliers (ascending by absolute residual)
    print(f"\n{mode} - Dates of the {len(valid_indices)} largest outliers:")
    for idx in valid_indices:
        print(f"Date: {dates.iloc[idx]}, Residual: {residuals[idx]:.2f}")

# Render the STL residual/outlier figure and listing for each transportation mode in turn
for mode, data in zip(modes, data_list):
    plot_residuals_with_outliers(data=data, mode=mode)



In [None]:
from statsmodels.tsa.seasonal import MSTL

def perform_mstl(data, periods=(7, 365)):
    """Run a statsmodels ``MSTL`` decomposition on ``data`` and return the fitted result.

    Parameters
    ----------
    data : array-like
        Series to decompose.
    periods : int or sequence of int, default (7, 365)
        Seasonal period lengths forwarded unchanged as ``MSTL``'s ``periods``.
        The default is an immutable tuple so that it cannot be altered between
        calls the way a shared list default could.

    Returns
    -------
    The object returned by ``MSTL(...).fit()``; the notebook reads its
    ``trend``, ``seasonal`` and ``resid`` attributes.
    """
    mstl = MSTL(data, periods=periods).fit()
    return mstl

# Create a new figure for plotting MSTL decomposition (one row per mode, one column per component)
fig, axs = plt.subplots(3, 5, figsize=(25, 15))

# Fit MSTL exactly once per series; the plots and the statistics below share
# these results instead of repeating the decomposition for each use.
mstl_results = [perform_mstl(series) for series in data_list]

for i, (mode, data, mstl_result) in enumerate(zip(modes, data_list, mstl_results)):
    # Plot original data
    axs[i, 0].plot(df_trimmed['Date'], data)
    axs[i, 0].set_title(f'{mode} - Original')
    axs[i, 0].set_xlabel('Date')
    axs[i, 0].set_ylabel('Ridership')

    # Plot trend component
    axs[i, 1].plot(df_trimmed['Date'], mstl_result.trend)
    axs[i, 1].set_title(f'{mode} - Trend')
    axs[i, 1].set_xlabel('Date')
    axs[i, 1].set_ylabel('Trend')

    # Plot weekly seasonal component (first seasonal column)
    axs[i, 2].plot(df_trimmed['Date'], mstl_result.seasonal[:, 0])
    axs[i, 2].set_title(f'{mode} - Weekly Seasonal')
    axs[i, 2].set_xlabel('Date')
    axs[i, 2].set_ylabel('Weekly Seasonal')

    # Plot yearly seasonal component (second seasonal column)
    axs[i, 3].plot(df_trimmed['Date'], mstl_result.seasonal[:, 1])
    axs[i, 3].set_title(f'{mode} - Yearly Seasonal')
    axs[i, 3].set_xlabel('Date')
    axs[i, 3].set_ylabel('Yearly Seasonal')

    # Plot residual component
    axs[i, 4].plot(df_trimmed['Date'], mstl_result.resid)
    axs[i, 4].set_title(f'{mode} - Residuals')
    axs[i, 4].set_xlabel('Date')
    axs[i, 4].set_ylabel('Residuals')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

# Print some statistics about the decomposition.
# Each strength is 1 - Var(resid) / Var(component + resid).
for mode, mstl_result in zip(modes, mstl_results):
    resid_var = np.var(mstl_result.resid)
    print(f"\n{mode} MSTL Decomposition:")
    print(f"Trend strength: {1 - resid_var / np.var(mstl_result.trend + mstl_result.resid):.4f}")
    print(f"Weekly seasonality strength: {1 - resid_var / np.var(mstl_result.seasonal[:, 0] + mstl_result.resid):.4f}")
    print(f"Yearly seasonality strength: {1 - resid_var / np.var(mstl_result.seasonal[:, 1] + mstl_result.resid):.4f}")

In [None]:
from statsmodels.tsa.stattools import acf
import matplotlib.pyplot as plt
import numpy as np

# The residuals analysed here are those of the STL fit of the last series in
# `data_list` (LIRR). Previously this cell silently picked them up from the
# `stl_result` variable left behind by an earlier loop; recomputing them makes
# the cell independent of which cells ran before it, and in what order.
stl_result = perform_stl(data_list[-1])

# Compute autocorrelation values and confidence intervals
acf_values, confint = acf(stl_result.resid, nlags=400, alpha=0.05)

# Generate lag numbers
lags = np.arange(len(acf_values))

# Find lags where autocorrelation amplitude is larger than 0.15
# (lag 0 is always included because a series correlates perfectly with itself)
significant_lags = lags[np.abs(acf_values) > 0.15]

# Plot the autocorrelation function with significant lags highlighted.
# `use_line_collection` is no longer passed: Matplotlib made it the default,
# then deprecated and removed the keyword, so passing it raises a TypeError on
# current releases.
plt.figure(figsize=(12, 6))
plt.stem(lags, acf_values)
plt.scatter(significant_lags, acf_values[significant_lags], color='red', zorder=5)
plt.title('Autocorrelation Function (ACF) of Residuals')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.axhline(y=0, color='k', linestyle='--')
plt.show()

print(f"Number of significant lags: {len(significant_lags)}")
print(f"Significant lags: {significant_lags}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.seasonal import MSTL

def plot_mstl_residuals_with_outliers(data, mode, n_outliers=20):
    """Plot MSTL residuals of ``data`` and highlight the largest ones.

    Parameters
    ----------
    data : array-like
        Series to decompose with ``perform_mstl``; expected to line up
        one-to-one with the rows of the notebook-level ``df_trimmed``.
    mode : str
        Name used in the figure title and in the printed listing.
    n_outliers : int, default 20
        How many residuals (largest absolute value) to mark. Zero or a
        negative number marks none; a number larger than the series marks all.

    Returns
    -------
    None
        The function shows a figure and prints the marked dates and residuals.
    """
    # Perform MSTL decomposition
    mstl_result = perform_mstl(data)
    # Force a plain array so the integer indexing below is always positional,
    # whether MSTL handed back an ndarray or a pandas Series.
    residuals = np.asarray(mstl_result.resid)
    dates = df_trimmed['Date']

    # Indices sorted by ascending |residual|; the tail holds the largest ones.
    # The slice start is computed explicitly because `order[-0:]` would return
    # the whole array, i.e. n_outliers=0 used to flag every point.
    order = np.argsort(np.abs(residuals))
    n_selected = min(max(int(n_outliers), 0), len(order))
    largest_residuals_idx = order[len(order) - n_selected:]

    # Ensure indices are within bounds of the date axis
    valid_indices = largest_residuals_idx[largest_residuals_idx < len(dates)]

    # Create the plot
    plt.figure(figsize=(15, 6))
    plt.plot(dates, residuals, label='Residuals')
    plt.scatter(dates.iloc[valid_indices], residuals[valid_indices],
                color='red', s=50, label=f'Top {len(valid_indices)} Outliers')

    # Customize the plot
    plt.title(f'{mode} - MSTL Residuals with Top Outliers')
    plt.xlabel('Date')
    plt.ylabel('Residual Value')
    plt.legend()

    # Rotate and align the tick labels so they look better
    plt.gcf().autofmt_xdate()

    # Use tight layout to prevent clipping of tick-labels
    plt.tight_layout()

    # Show the plot
    plt.show()

    # Print the dates of the outliers (ascending by absolute residual)
    print(f"\n{mode} - Dates of the {len(valid_indices)} largest MSTL residual outliers:")
    for idx in valid_indices:
        print(f"Date: {dates.iloc[idx]}, Residual: {residuals[idx]:.2f}")

# Render the MSTL residual/outlier figure and listing for each transportation mode in turn
for mode, data in zip(modes, data_list):
    plot_mstl_residuals_with_outliers(data=data, mode=mode)

In [None]:
import pandas as pd
import os

def save_combined_residuals_to_csv(modes, data_list):
    """Write the MSTL residuals of every mode, side by side, to one CSV file.

    Parameters
    ----------
    modes : iterable of str
        Mode names; each becomes a ``'<mode> Residual'`` column.
    data_list : iterable of array-like
        Series to decompose with ``perform_mstl``, paired with ``modes`` by
        position. Each is expected to line up one-to-one with the rows of the
        notebook-level ``df_trimmed``.

    Returns
    -------
    None
        The table is written to ``../assets/combined_daily_ridership_residuals.csv``
        (without the index) and a confirmation line is printed.

    Notes
    -----
    All residual series share the same date axis, so the columns are assembled
    directly instead of being outer-merged one at a time on ``'Date'``. That
    keeps one output row per input row in ``df_trimmed`` order (a merge would
    multiply rows if a date ever appeared twice) and avoids rebuilding the
    frame on every iteration. With no modes the file holds only the ``Date``
    column.
    """
    dates = df_trimmed['Date']

    # Plain arrays throughout so columns are matched by position, not by index label.
    columns = {'Date': dates.to_numpy()}
    for mode, data in zip(modes, data_list):
        # Perform MSTL decomposition and keep only its residual component
        mstl_result = perform_mstl(data)
        columns[f'{mode} Residual'] = np.asarray(mstl_result.resid)

    # pandas raises ValueError here if any residual series differs in length from the dates.
    combined_residuals = pd.DataFrame(columns)

    # Save to CSV
    filename = os.path.join('..', 'assets', 'combined_daily_ridership_residuals.csv')
    combined_residuals.to_csv(filename, index=False)
    print(f"Combined residuals saved to {filename}")

# Export the MSTL residuals of every transportation mode into one shared CSV
save_combined_residuals_to_csv(modes=modes, data_list=data_list)
