# Report

## Research Question
What are the isolated long-term effects of COVID-19 on Sydney train patronage patterns from January 2020 to July 2025, after controlling for extreme weather events and natural disasters, and how do these effects interact with infrastructure changes such as the opening of new Metro lines?

In [5]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import geopandas as gpd
import statsmodels.formula.api as smf
import os
import re



In [9]:
# Data wrangling setup: Opal patronage extracts (2020-2025)
#
# Builds the list of daily 'Opal_Patronage_YYYYMMDD.txt' files named in the
# file-list manifest so the TXT files can be combined for the COVID/weather
# analysis (e.g., filtering 2025 floods for recovery trends).
# Dates in the extracts are formatted as '%Y-%m-%d'; no date conversion is
# performed while building this list.

folder_path = 'WDAC_2025_data/OpalPatronage'
file_names_path = 'WDAC_2025_data/OpalPatronage/opal_patronage_filelist.txt'

# Regex pattern for Opal_Patronage_YYYYMMDD.txt
pattern = r'(Opal_Patronage_\d{8}\.txt)'

# File names extracted from the manifest, kept in manifest order
file_names_list = []

with open(file_names_path, mode='r') as file_names_object:
    # Iterate the file object directly instead of materialising every line
    for line_number, file_link in enumerate(file_names_object, start=1):
        # Blank lines (e.g. a trailing empty line) carry no file name
        if not file_link.strip():
            continue

        match_obj = re.search(pattern, file_link)
        if match_obj is None:
            # re.search returns None on no match; fail with a message that
            # points at the offending manifest line rather than an opaque
            # AttributeError from calling .group() on None
            raise ValueError(
                f"{file_names_path}, line {line_number}: "
                f"no Opal_Patronage_YYYYMMDD.txt file name found in {file_link!r}"
            )

        file_name = match_obj.group()
        file_names_list.append(file_name)


# Load every listed daily extract (pipe-delimited text) into its own DataFrame
df_list = [
    pd.read_csv(os.path.join(folder_path, daily_file), sep='|')
    for daily_file in file_names_list
]

# Stack the daily frames into a single table with a fresh 0..n-1 row index
combined_df = pd.concat(df_list, ignore_index=True)

combined_df

Unnamed: 0,trip_origin_date,mode_name,ti_region,tap_hour,Tap_Ons,Tap_Offs
0,2020-01-01,Bus,Chatswood,0,<50,<50
1,2020-01-01,Bus,Macquarie Park,0,<50,<50
2,2020-01-01,Bus,Newcastle and surrounds,0,<50,<50
3,2020-01-01,Bus,North Sydney,0,700,100
4,2020-01-01,Bus,Other,0,4500,3200
...,...,...,...,...,...,...
1372289,2025-07-15,Train,Sydney CBD,23,2600,2400
1372290,2025-07-15,Train,Wollongong and surrounds,23,<100,<100
1372291,2025-07-15,Train,All - NSW,23,5100,9000
1372292,2025-07-15,UNKNOWN,Other,21,<100,<100


In [None]:
# Sanity check: show the (rows, columns) shape of the combined DataFrame.
# NOTE(review): this cell has no execution count and no saved output;
# re-run it as part of a Restart Kernel -> Run All pass.
combined_df.shape


In [27]:
# Preview the combined DataFrame.
# NOTE(review): the saved output of this cell does not match the In [9]
# output above: it has an extra column named after a file-list URL,
# float tap_hour values, and a different row count. It appears to come from
# a different kernel state; confirm with Restart Kernel -> Run All.
combined_df

Unnamed: 0,trip_origin_date,mode_name,ti_region,tap_hour,Tap_Ons,Tap_Offs,https://opendata-tpa.transport.nsw.gov.au/Opal_Patronage/2020-01/Opal_Patronage_20200101.txt
0,2024-11-12,Bus,Chatswood,0.0,<100,<100,
1,2024-11-12,Bus,Macquarie Park,0.0,<100,<100,
2,2024-11-12,Bus,Newcastle and surrounds,0.0,<100,<100,
3,2024-11-12,Bus,North Sydney,0.0,<100,<100,
4,2024-11-12,Bus,Other,0.0,800,1100,
...,...,...,...,...,...,...,...
1374311,2021-04-22,UNKNOWN,All - NSW,21.0,<50,<50,
1374312,2021-04-22,UNKNOWN,Other,22.0,<50,<50,
1374313,2021-04-22,UNKNOWN,All - NSW,22.0,<50,<50,
1374314,2021-04-22,UNKNOWN,Other,23.0,<50,<50,
