In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.axes
import numpy as np
from pathlib import Path

# Make the project root importable so that the `src` package resolves.
# NOTE(review): this takes the parent of the current working directory, so it
# presumably expects the notebook to run from a folder directly under the
# project root - confirm if notebooks are ever launched from elsewhere.
project_root = Path().resolve().parent
_project_root_entry = str(project_root)

# Drop any existing entry first, so that re-running this cell keeps a single
# entry at the front of sys.path instead of accumulating duplicates.
if _project_root_entry in sys.path:
    sys.path.remove(_project_root_entry)
sys.path.insert(0, _project_root_entry)

import src.config as config

In [None]:
import pandas as pd
import src.loadProcessed as loadp

# Load the processed inputs for this notebook: the selected pedestrian counts
# and the sensor locations table.
counts: pd.DataFrame = loadp.load_selected_count()
locations: pd.DataFrame = loadp.load_processed_locations()

# Fail fast with a readable message if a column that later cells rely on is
# missing, rather than erroring deep inside the model fit or the merge.
_missing_count_columns = {'sensor_id', 'day', 'year', 'daily_count'} - set(counts.columns)
assert not _missing_count_columns, (
    f"counts is missing required columns: {sorted(_missing_count_columns)}"
)
assert 'sensor_id' in locations.columns, "locations is missing the 'sensor_id' column"

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy.highlevel as phl
import math

# Counts follow a power relation, so the model is fitted to log(daily_count).
# `assign` returns a new frame, leaving `counts` itself untouched.
log_data = counts.assign(
    daily_count=lambda frame: frame['daily_count'].apply(np.log),
    is_weekend=lambda frame: frame['day'].isin(['Saturday', 'Sunday']),
)

# Fixed part: response and population-level effects (day, year, day x year).
fixed_formula = 'daily_count ~ day*year'
# Random part: per-sensor deviations in the intercept, weekend, year and
# weekend x year terms.
random_formula = '~ is_weekend*year'

model_3 = smf.mixedlm(
    formula=fixed_formula,
    data=log_data,
    groups=log_data['sensor_id'],  # one cluster per sensor
    re_formula=random_formula,
)

# reml=False fits by maximum likelihood rather than REML.
result_3 = model_3.fit(reml=False)

print(result_3.summary())

In [None]:
# Turn the fitted mixed model into one row of effects per sensor. Everything is
# combined on the log scale first and exponentiated once at the end, where the
# additive log effects become multiplicative factors on the counts.

# --- Fixed effects ----------------------------------------------------------
log_fixed_intercept = result_3.params['Intercept']  # beta_0
log_fixed_year = result_3.params['year[T.2025]']  # beta_1

# Friday is deliberately absent: it is the reference level of `day`, so it has
# no `day[T.Friday]` coefficient of its own.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Saturday', 'Sunday']
log_fixed_day_effects = {}
for day_name in days:
    # A day without a coefficient contributes 0 on the log scale.
    log_fixed_day_effects[day_name] = result_3.params.get(f'day[T.{day_name}]', 0)

# --- Random effects ---------------------------------------------------------
# One row per group (sensor), one column per random-effect term.
random_effect_labels = {
    'Group': 'b0_intercept',
    'year[T.2025]': 'b1_year',
    'is_weekend[T.True]': 'b2_weekend',
    'is_weekend[T.True]:year[T.2025]': 'b3_interaction',
}
log_sensor_effects = pd.DataFrame.from_dict(result_3.random_effects, orient='index')
log_sensor_effects = log_sensor_effects.rename(columns=random_effect_labels)

weekend_days = ['Saturday', 'Sunday']

# --- Year effect per day ----------------------------------------------------
# year effect = (beta_1 + beta_{day x year}) + (b1_year + b3_interaction if weekend)
# Friday: reference day (no fixed interaction) and a weekday (no random interaction).
log_sensor_effects['friday_year_effect'] = log_fixed_year + log_sensor_effects['b1_year']

for day in days:
    fixed_day_interaction = f'day[T.{day}]:year[T.2025]'
    if fixed_day_interaction not in result_3.params:
        # No coefficient for this day: no column is produced for it.
        continue

    is_weekend = day in weekend_days

    # Fixed part: beta_1 + beta_{day x year}
    fixed_part = log_fixed_year + result_3.params[fixed_day_interaction]

    # Random part: every day gets b1_year, weekends additionally get b3_interaction.
    random_part = log_sensor_effects['b1_year']
    if is_weekend:
        random_part = random_part + log_sensor_effects['b3_interaction']

    log_sensor_effects[f'{day.lower()}_year_effect'] = fixed_part + random_part

# --- 2019 baseline per day --------------------------------------------------
# baseline = beta_0 + beta_day + b0_intercept (+ b2_weekend on weekends)
# Friday has no day coefficient, so it is just the two intercepts.
log_sensor_effects['friday_2019'] = log_fixed_intercept + log_sensor_effects['b0_intercept']

for day in days:
    is_weekend = day in weekend_days
    baseline = (
        log_fixed_intercept
        + log_fixed_day_effects[day]
        + log_sensor_effects['b0_intercept']
    )
    if is_weekend:
        baseline = baseline + log_sensor_effects['b2_weekend']

    log_sensor_effects[f'{day.lower()}_2019'] = baseline

# --- Weekly averages --------------------------------------------------------
week_order = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# The mean of the seven daily log counts; it becomes the geometric mean of the
# daily counts once exponentiated below.
baseline_columns = [f'{name}_2019' for name in week_order]
log_sensor_effects['geometric_mean_pedestrians_2019'] = (
    log_sensor_effects[baseline_columns].mean(axis=1)
)

# Same averaging for the per-day year effects.
year_effect_columns = [f'{name}_year_effect' for name in week_order]
log_sensor_effects['mean_year_effect'] = (
    log_sensor_effects[year_effect_columns].mean(axis=1)
)

# --- Back to the count scale ------------------------------------------------
# The raw random intercept is dropped before exponentiating; it is already
# folded into the *_2019 columns.
log_sensor_effects = log_sensor_effects.drop(columns=['b0_intercept'])

# exp(scale / 2), with `scale` the fitted residual variance: the factor that
# turns the exponentiated log-scale mean into an arithmetic mean.
convexivity_correction = np.exp(result_3.scale / 2)

sensor_effects = log_sensor_effects.apply(np.exp)
sensor_effects['arithmetic_mean_pedestrians_2019'] = (
    sensor_effects['geometric_mean_pedestrians_2019'] * convexivity_correction
)

In [None]:
# Compare how a handful of sensors changed between 2019 (x = 0) and 2025
# (x = 1) against the model's average sensor (all random effects equal to 0).
fig, ax = plt.subplots()

# Fixed effects for the reference day (Friday), on the count scale.
fixed_intercept = np.exp(log_fixed_intercept)
fixed_year_effect = np.exp(log_fixed_year)

# The per-sensor columns plotted below are averaged over all seven days, so the
# average-sensor line has to be averaged the same way; using the Friday-only
# intercept and year effect would compare a single day against weekly means.
n_days = len(days) + 1  # `days` omits the reference day (Friday)
mean_log_day_effect = sum(log_fixed_day_effects.values()) / n_days
mean_log_day_year_effect = sum(
    result_3.params.get(f'day[T.{d}]:year[T.2025]', 0) for d in days
) / n_days
average_baseline = fixed_intercept * np.exp(mean_log_day_effect) * convexivity_correction
average_year_effect = fixed_year_effect * np.exp(mean_log_day_year_effect)

# Never ask for more rows than exist, and fix the seed so that the figure is
# the same on every Restart & Run All.
n_shown = min(20, len(sensor_effects))
for _, sensor in sensor_effects.sample(n_shown, random_state=0).iterrows():
    baseline_2019 = sensor['arithmetic_mean_pedestrians_2019']
    ax.axline(
        (0, baseline_2019),
        (1, baseline_2019 * sensor['mean_year_effect']),
    )

ax.axline(
    (0, average_baseline),
    (1, average_baseline * average_year_effect),
    color='black',
    linewidth=2,
    label='Average sensor'
)

ax.set_xlim(0, 1)
ax.set_xticks([0, 1])
ax.set_xticklabels(['2019', '2025'])
ax.set_ylabel('Mean daily pedestrian count')
ax.set_title('Modelled change in pedestrian counts per sensor')
ax.legend();

In [None]:
# Sensors that took part in the model fit but have no row in `locations`.
modelled_sensor_ids = pd.Index(result_3.random_effects.keys())
bad_sensors = modelled_sensor_ids.difference(locations['sensor_id'])
print(f"Sensors in model but not in locations: {bad_sensors}")

# Keep only the count rows whose sensor can be placed on the map.
has_known_location = ~counts['sensor_id'].isin(bad_sensors)
data_clean = counts[has_known_location]

We see that a few sensors have no entry in our locations dataset, which means we cannot recover where they are. They still carry useful information for the regression, so they were kept when fitting the model, but we exclude them from this point on.

In [None]:
# Attach location metadata to each sensor's effects. `sensor_effects` is indexed
# by sensor id, while `locations` carries it in the `sensor_id` column; the
# inner join drops sensors that have no location row.
location_metadata = locations.drop(
    columns=['installation_date', 'note', 'status', 'location']
)
location_effects = sensor_effects.merge(
    location_metadata,
    how='inner',
    left_index=True,
    right_on='sensor_id',
)

# Express the multiplicative year effect as a percentage change (1.05 -> 5.0).
relative_change = location_effects['mean_year_effect'] - 1
location_effects['percentage_change'] = (relative_change * 100).round(3)

location_effects.to_parquet(config.PROCESSED_DATA_DIR / 'location_effects.parquet')

In [None]:
# Persist the counts restricted to sensors that have a known location.
analysis_data_path = config.PROCESSED_DATA_DIR / 'analysis_data.parquet'
data_clean.to_parquet(analysis_data_path)