In [1]:
import petl as etl
from utils.paths import HUQ
from datetime import datetime
from statistics import mean

In [2]:
# Helper: parse a record's ISO-format 'datestamp' string into a datetime
def _stamp(rec):
    return datetime.fromisoformat(rec['datestamp'])

footfall = (
    # Load the raw footfall CSV
    etl.fromcsv(HUQ / "footfall.csv")
    # Cast the footfall estimate to an integer, then drop rows where it is None
    .convert('estimated_actual_footfall', int)
    .selectnotnone('estimated_actual_footfall')
    # Break the datestamp out into one column per component
    .addfield('month', lambda rec: _stamp(rec).month)
    .addfield('year', lambda rec: _stamp(rec).year)
    .addfield('day', lambda rec: _stamp(rec).day)
    # "%w" gives the weekday number as a string ("0" = Sunday ... "6" = Saturday)
    .addfield('day_of_week', lambda rec: _stamp(rec).strftime("%w"))
    # "%A" gives the full weekday name
    .addfield('day_name', lambda rec: _stamp(rec).strftime("%A"))
)

Create a baseline per location per calendar day (month and day of month) by averaging the footfall for the years 2019, 2023 and 2024.

In [3]:
# Keep only the rows belonging to the baseline years
baseline_years = footfall.selectin('year', [2019, 2023, 2024])

# Mean footfall per centre for each (month, day) across the baseline years;
# the aggregated column comes back as 'value' and is renamed to 'baseline'
baseline_by_date = (
    baseline_years
    .aggregate(
        key=['centre_name', 'month', 'day'],
        aggregation=mean,
        value='estimated_actual_footfall',
    )
    .rename('value', 'baseline')
)
baseline_by_date

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukestrangetdc/GitHub/oi/bradford-2025/data/huq/footfall.csv'

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukestrangetdc/GitHub/oi/bradford-2025/data/huq/footfall.csv'

In [4]:
# Footfall recorded in 2025: keep the join keys plus the footfall estimate,
# and name the estimate column after the year
values_2025 = (
    footfall
    .selecteq('year', 2025)
    .cut('centre_name', 'month', 'day', 'estimated_actual_footfall')
    .rename('estimated_actual_footfall', '2025')
)
values_2025

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukestrangetdc/GitHub/oi/bradford-2025/data/huq/footfall.csv'

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukestrangetdc/GitHub/oi/bradford-2025/data/huq/footfall.csv'

In [5]:
# Join the baseline and 2025 into one table, format it, and write it out as CSV.
def _is_date_in_2025(month, day):
    """Return True when (month, day) is a real calendar date in 2025."""
    try:
        datetime(2025, month, day)
    except ValueError:
        return False
    return True


summary = (
    # Left join: every baseline (centre, month, day) is kept, with the 2025
    # value left empty where no 2025 observation exists.
    etl.leftjoin(left=baseline_by_date, right=values_2025, key=['centre_name', 'month', 'day'])
    # The baseline years include 2024 (a leap year), so the baseline can contain
    # a 29 February row. 2025 has no such day; drop it rather than emit the
    # non-existent date '2025-02-29'.
    .select(lambda row: _is_date_in_2025(row['month'], row['day']))
    # Rebuild a zero-padded ISO-style date string in the 2025 calendar
    .addfield('datestamp', lambda row: '2025-' + str(row['month']).zfill(2) + '-' + str(row['day']).zfill(2))
    .sort(('datestamp', 'centre_name'))
    # Fix the output column order (this also discards 'month' and 'day')
    .cut('datestamp', 'centre_name', '2025', 'baseline')
    # The baseline is a mean; round it to a whole number
    .convert('baseline', round)
)

etl.tocsv(summary, HUQ / 'footfall_2025_vs_baseline_by_centre_name_by_date.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukestrangetdc/GitHub/oi/bradford-2025/data/huq/footfall_2025_vs_baseline_by_centre_name_by_date.csv'

# TODO

We also want to look at comparing years by correcting for day of the week, rather than using dates (as above). This should give a fairer comparison because weekends (primarily) have different footfall.

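To do this:
- For each year, find 1 January and work out which day of the week it falls on (1-7, where Monday = 1). Note that the existing `day_of_week` column uses `%w` (0-6, where Sunday = 0), so it would need converting.
- 2025 starts on a Wednesday (3).
- 2024 starts on a Monday (1), so we need to shift all 2024 dates back by 2 days, so that the first Wednesday of 2024 (3 January) lines up with the first day of 2025 (Wednesday 1 January).
- We'd then add a 'day of year' column to align the days together.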