In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.axes
import numpy as np
from pathlib import Path

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy.highlevel as phl

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src` package resolves.
# The insert is guarded so that re-running this cell does not keep stacking
# duplicate entries at the front of sys.path.
project_root = Path().resolve().parent
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

import src.config as config

In [None]:
import src.loadProcessed as loadp

# Load the selected daily counts, then derive a second frame whose
# 'daily_count' column holds the natural log of the counts.
# `counts` itself is left on the raw scale for the comparison plots below.
counts = loadp.load_selected_count()
log_data = counts.assign(daily_count=counts['daily_count'].apply(np.log))

## Justifying log transformation

In [None]:
# Stacked histograms: raw counts on top, log-transformed counts below, so the
# effect of the log transform on the shape of the distribution is visible.
fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 10))

# One entry per panel: (axes, data frame, bar colour, x-axis label, title).
hist_panels = (
    (ax1, counts, '#0d6efd', "Daily count", "Distribution of Raw Daily Counts"),
    (ax2, log_data, '#84ca66', "Log daily count", "Distribution of Log-Transformed Daily Counts"),
)
for panel_ax, frame, bar_colour, x_label, panel_title in hist_panels:
    panel_ax.hist(frame['daily_count'], bins=100, color=bar_colour, alpha=0.7, edgecolor='white', linewidth=0.5)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel("Frequency", fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, pad=15)
    panel_ax.grid(True, alpha=0.3, linestyle='--')
    # Hide the top and right frame lines for a cleaner look.
    for side in ('top', 'right'):
        panel_ax.spines[side].set_visible(False)

fig.tight_layout()

In [None]:
# Normal Q-Q plots of the pooled counts: raw scale on top, log scale below.
# line='s' draws the standardized reference line on each panel.
fig, (ax1, ax2) = plt.subplots(2, figsize=(4, 7))

# One entry per panel: (axes, data frame, marker colour, title).
qq_panels = (
    (ax1, counts, '#0d6efd', 'Q-Q Plot of Raw Daily Counts'),
    (ax2, log_data, '#84ca66', 'Q-Q Plot of Log-Transformed Daily Counts'),
)
for panel_ax, frame, point_colour, panel_title in qq_panels:
    sm.qqplot(frame['daily_count'], line='s', ax=panel_ax, marker='o', alpha=0.6, markerfacecolor=point_colour, markeredgecolor=point_colour, markersize=4)
    # Labels are set after qqplot so they replace the ones it applies itself.
    panel_ax.set_title(panel_title, fontsize=14, pad=15)
    panel_ax.set_xlabel('Theoretical Quantiles', fontsize=12)
    panel_ax.set_ylabel('Sample Quantiles', fontsize=12)
    panel_ax.grid(True, alpha=0.3, linestyle='--')
    for side in ('top', 'right'):
        panel_ax.spines[side].set_visible(False)

fig.tight_layout()

In [None]:
# Per-sensor normal Q-Q plots of the log counts on a 2x3 grid.
# Only the first six sensor ids (in order of first appearance) are shown,
# one per panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

sensor_marker_style = dict(marker='o', markerfacecolor='#0d6efd', markeredgecolor='#0d6efd', alpha=0.6, markersize=3)
first_six_sensors = counts['sensor_id'].unique()[:6]

for ax, sensor_id in zip(axes.flat, first_six_sensors):
    belongs_to_sensor = counts['sensor_id'] == sensor_id
    y = counts.loc[belongs_to_sensor, 'daily_count']
    sm.qqplot(np.log(y), line='s', ax=ax, **sensor_marker_style)
    ax.set_title(f'Sensor {sensor_id}', fontsize=12, pad=10)
    ax.set_xlabel('Theoretical Quantiles', fontsize=10)
    ax.set_ylabel('Sample Quantiles', fontsize=10)
    ax.grid(True, alpha=0.3, linestyle='--')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

fig.suptitle('Q-Q Plots by Sensor (Log-Transformed)', fontsize=16, y=0.995)
fig.tight_layout()

We can see from these plots that the raw pedestrian counts have a heavy positive skew. This motivates a log transformation, which improves the fit and gives an approximately normal distribution of counts. The log-transformed distribution does have a heavy left tail, which is reflected in the marginal Q-Q plots. Part of this tail arises from the variation in mean counts between sensors, so the effect is lessened when we consider each sensor separately; however, some of the heavy left tail persists within sensors. The effect on our analysis should be fairly minimal, as the fit is still quite good, but it should still be considered as a source of error.

## Model 1

$$\log(y_{i,t}) = \beta_0 + \beta_1 \text{Year}_t + \sum_{d \in \text{days} \setminus \{\text{Friday}\}} \beta_d \, 1\{t \in d\} + b_{0,i} + b_{1,i} \text{Year}_t + \varepsilon_{i,t}$$
* $\log(y_{i,t})$ : Natural log of the daily count of sensor $i$ on day $t$
* $\beta_0$ : Fixed intercept, the mean log daily count on a Friday in 2019
* $\text{Year}_t$ : Indicator variable for whether day $t$ is in 2025, $1\{t \in 2025\}$
* $\beta_1$ : Fixed slope for year (population-average log difference between 2019 and 2025)
* $\beta_d$ : Fixed effect of day $d$ (where Friday is the reference category)
* $b_{0,i}$ : Random intercept for mean log daily count by sensor $i$, that is the log deviation from $\beta_0$ for sensor $i$
* $b_{1,i}$ : Random slope for year for sensor $i$, that is the log deviation from $\beta_1$ for sensor $i$
* $\varepsilon_{i,t}$ : Random error term for the counts on a specific sensor on day $t$, this is assumed normal $\varepsilon_{i,t} \sim N(0, \sigma^2)$

In [None]:
# Model 1: linear mixed model on the log-scale counts.
#   formula    - response and fixed effects (year and day of week)
#   groups     - clusters the observations by sensor
#   re_formula - random effects that vary across those clusters
#                (intercept and year)
model_1 = smf.mixedlm(
    formula='daily_count ~ year + day',
    data=log_data,
    groups=log_data['sensor_id'],
    re_formula='~ year',
)

# reml=False requests a maximum-likelihood fit rather than REML.
result_1 = model_1.fit(reml=False)

print(result_1.summary())

### Regression Table Interpretation
* Intercept: corresponds to $\beta_0$ (mean log daily count on a Friday in 2019)
* year[T.2025]: corresponds to $\beta_1$ (population-average log difference 2025 vs 2019)
* day[T.Monday], day[T.Tuesday], day[T.Wednesday], day[T.Thursday], day[T.Saturday], day[T.Sunday]: correspond to the $\beta_d$ fixed effects for each day (Friday is the reference)
* Group Var (sensor_id): variance of $b_{0,i}$ (random intercepts by sensor)
* Group x year Cov: covariance between $b_{0,i}$ and $b_{1,i}$
* year Var: variance of $b_{1,i}$ (random slopes for year by sensor)
* Scale: $\sigma^2$ for $\varepsilon_{i,t}$ such that $\varepsilon_{i,t} \sim N(0, 0.289)$

## Model 2

$$\log(y_{i,t}) = \beta_0 + \beta_1 \text{Year}_t + \sum_{d \in \text{days} \setminus \{\text{Friday}\}} \beta_d \, 1\{t \in d\} + \sum_{d \in \text{days} \setminus \{\text{Friday}\}} \beta_{d \times \text{Year}} \, 1\{t \in d\} \, \text{Year}_t + b_{0,i} + b_{1,i} \text{Year}_t + \varepsilon_{i,t}$$
* $\log(y_{i,t})$ : Natural log of the daily count of sensor $i$ on day $t$
* $\beta_0$ : Fixed intercept, the mean log daily count on a Friday in 2019
* $\text{Year}_t$ : Indicator variable for whether day $t$ is in 2025, $1\{t \in 2025\}$
* $\beta_1$ : Fixed main effect of year, representing the population-average log difference between 2019 and 2025 on Fridays.
* $\beta_d$ : Fixed effect of day $d$ in 2019 (where Friday is the reference category)
* $\beta_{d \times \text{Year}}$ : Fixed interaction effect between day $d$ and year, representing the additional log change from 2019 to 2025 for day $d$ relative to Fridays.
* $b_{0,i}$ : Random intercept for mean log daily count by sensor $i$, that is the log deviation from $\beta_0$ for sensor $i$
* $b_{1,i}$ : Random slope for year for sensor $i$, that is the log deviation from $\beta_1$ for sensor $i$
* $\varepsilon_{i,t}$ : Random error term for the counts on a specific sensor on day $t$, this is assumed normal $\varepsilon_{i,t} \sim N(0, \sigma^2)$

In [None]:
# Model 2: same mixed-model structure as before, but the fixed part uses
# `day*year`, i.e. day and year main effects plus their interaction.
#   groups     - clusters the observations by sensor
#   re_formula - random effects that vary across those clusters
#                (intercept and year)
model_2 = smf.mixedlm(
    formula='daily_count ~ day*year',
    data=log_data,
    groups=log_data['sensor_id'],
    re_formula='~ year',
)

# reml=False requests a maximum-likelihood fit rather than REML.
result_2 = model_2.fit(reml=False)

print(result_2.summary())

## Model 3