In [1]:
import sys
from pathlib import Path

import matplotlib.axes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Make project-local packages (e.g. `src`) importable from this notebook by
# prepending the parent of the current working directory to the module search
# path. NOTE: this assumes the kernel is started one level below the project
# root — confirm if the notebook is moved.
project_root = Path.cwd().resolve().parent
sys.path[:0] = [str(project_root)]

In [2]:
# Project-local loader module; it resolves because the first cell put
# `project_root` at the front of sys.path.
import src.loadProcessed as loadp

# Processed sensor counts. The cells below read the columns `sensing_date`
# (datetime-like, accessed through `.dt`), `sensor_id` and `daily_count`.
counts: pd.DataFrame = loadp.load_processed_count()
# Processed sensor locations. Presumably per-sensor metadata keyed by
# sensor_id — TODO confirm against src/loadProcessed.py; this frame is not
# used in the cells visible here.
locations: pd.DataFrame = loadp.load_processed_locations()

In [3]:
# Compare 2019 with 2022 on a like-for-like basis: keep only sensors that have
# at least one record in BOTH years, so a sensor that was installed or retired
# in between cannot bias the year effect.
sensing_year = counts['sensing_date'].dt.year  # computed once, reused for both masks
common_sensors = (
    set(counts.loc[sensing_year == 2019, 'sensor_id'].unique())
    & set(counts.loc[sensing_year == 2022, 'sensor_id'].unique())
)

# Filter to only those sensors (all years are still present at this point)
counts_with_common_dates = counts[counts['sensor_id'].isin(common_sensors)]

# Limit to 2019 and 2022 with non-zero values, and attach `year` as a
# categorical column. The column is created with `.assign` inside the chain:
# writing `data['year'] = ...` onto the filtered frame afterwards can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data: pd.DataFrame = (
    counts_with_common_dates[
        counts_with_common_dates['sensing_date'].dt.year.isin([2019, 2022])
    ]
    .query('daily_count > 0')
    .assign(year=lambda frame: frame['sensing_date'].dt.year.astype('category'))
)

# Display only: `drop` returns a new frame and the result is not assigned, so
# `data` itself still carries `sensing_date` for any later cell that needs it.
data.drop(columns=['sensing_date'])


Unnamed: 0,sensor_id,day,daily_count,year
3034,1,Tuesday,26865,2019
3035,1,Wednesday,33517,2019
3036,1,Thursday,35470,2019
3037,1,Friday,22161,2019
3038,1,Saturday,31611,2019
...,...,...,...,...
178143,62,Thursday,4079,2022
178144,62,Friday,4310,2022
178145,62,Saturday,4027,2022
178146,62,Sunday,4150,2022


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy.highlevel as phl
import math

# We want to log transform since counts have a power relation. `data` was
# already restricted to daily_count > 0, so the logarithm is defined for every
# row. `np.log` is applied to the whole column at once (instead of calling
# `math.log` per element) and `.assign` returns a new frame, leaving `data`
# untouched.
log_data = data.assign(daily_count=np.log(data['daily_count']))

# Linear mixed-effects model on the log scale:
#   * fixed effects: year and day of week;
#   * grouping: one cluster per sensor;
#   * random effects: `~ year` gives every sensor its own intercept AND its own
#     2019 -> 2022 shift, i.e. sensors may differ both in baseline level and in
#     how much they changed between the two years.
model = smf.mixedlm(
    'daily_count ~ year + day',     # Response and fixed effects
    log_data,
    groups=log_data['sensor_id'],   # Cluster observations by sensor
    re_formula='~ year'             # Per-sensor random intercept + random year slope
)

result = model.fit()
print(result.summary())

# Per-sensor random-effect estimates (deviations from the fixed effects),
# keyed by sensor_id.
random_effects = result.random_effects
print(random_effects)

               Mixed Linear Model Regression Results
Model:                 MixedLM    Dependent Variable:    daily_count
No. Observations:      32289      Method:                REML       
No. Groups:            53         Scale:                 0.1850     
Min. group size:       132        Log-Likelihood:        -18913.5765
Max. group size:       669        Converged:             Yes        
Mean group size:       609.2                                        
--------------------------------------------------------------------
                         Coef.  Std.Err.    z    P>|z| [0.025 0.975]
--------------------------------------------------------------------
Intercept                 9.298    0.118  78.535 0.000  9.066  9.530
year[T.2022]             -0.415    0.044  -9.438 0.000 -0.501 -0.329
day[T.Monday]            -0.225    0.009 -25.126 0.000 -0.243 -0.208
day[T.Saturday]          -0.164    0.009 -18.293 0.000 -0.181 -0.146
day[T.Sunday]            -0.358    0.009 -39.946 0