In [16]:
import pandas as pd
from pygam import GammaGAM, LogisticGAM, s, te, l
from scipy.stats import gamma
import numpy as np

In [12]:
# 1. Load the two gzip-compressed CSV inputs with identical parser settings.
READ_CSV_KWARGS = dict(sep=',', compression='gzip', encoding='utf-8', low_memory=False)
weather_df = pd.read_csv(
    'processed_data/weather_data_filled_1950_2023.csv.gz',
    **READ_CSV_KWARGS,
)
temp_df = pd.read_csv(
    'processed_data/basin_wide_temperature_covariate.csv.gz',
    **READ_CSV_KWARGS,
)

# 2. Basin temperature covariate: trailing 30-day mean ending at each date.
# The frame is ordered by date and indexed by it first, because a time-offset
# rolling window ('30D') operates on a datetime-like index.
temp_df['Date'] = pd.to_datetime(temp_df['Date'])
temp_df = temp_df.sort_values('Date')
temp_df = temp_df.set_index('Date')
# NOTE: for offset windows pandas defaults to min_periods=1, so the earliest
# rows are averaged over fewer than 30 days instead of being NaN.
temp_df['temp_30d_avg'] = temp_df['Basin_Mean_Temp'].rolling('30D').mean()

In [None]:
# Build the modelling frame from the station-level weather records; work on a
# copy so that re-running this cell always starts from the untouched input.
model_df = weather_df.copy()
if 'Date' in model_df.columns:
    # weather_df is read from CSV without date parsing, so an existing 'Date'
    # column is plain text. It has to be datetime64 to be merged against
    # temp_df's datetime dates (pandas refuses to merge object keys with
    # datetime64 keys). This is a no-op when the column is already datetime.
    model_df['Date'] = pd.to_datetime(model_df['Date'])
else:
    # No 'Date' column: rebuild it from year + ordinal day encoded as YYYYDDD.
    model_df['Date'] = pd.to_datetime(model_df['year'] * 1000 + model_df['day_of_year'], format='%Y%j')

# Merge only necessary columns for the regression.
# temp_df carries 'Date' as its index; `on='Date'` matches the left column
# against that index level. The inner join keeps only dates present in both.
model_df = model_df.merge(temp_df[['temp_30d_avg']], on='Date', how='inner')
# Drop rows missing the response or any covariate used by the models below.
model_df = model_df.dropna(subset=['RR', 'temp_30d_avg', 'ALTI', 'LON', 'LAT'])

# Define Feature Matrix X.
# Column order matters: the GAM terms address features by position
# (0=LON, 1=LAT, 2=ALTI, 3=day_of_year, 4=temp_30d_avg).
X = model_df[['LON', 'LAT', 'ALTI', 'day_of_year', 'temp_30d_avg']].values
y = model_df['RR'].values

# Subset used to fit the Gamma "bulk" model: observations with RR above 10.
# NOTE(review): 10 is expressed in the units of RR -- confirm this is the
# intended cutoff for the bulk fit.
mask_bulk = y > 10
X_bulk = X[mask_bulk]
y_bulk = y[mask_bulk]

print("fitting gamma GAM...")
# Additive predictor, addressing the columns of X by position:
#   te(0, 1) -> tensor-product smooth over LON x LAT
#   s(2)     -> smooth in ALTI
#   s(3)     -> smooth in day_of_year on the 'cp' basis
#   l(4)     -> linear effect of temp_30d_avg
gamma_terms = (
    te(0, 1, n_splines=10)
    + s(2, n_splines=10)
    + s(3, basis='cp', n_splines=12)
    + l(4)
)
gam_gamma = GammaGAM(gamma_terms)
# Fitted on the bulk subset only.
gam_gamma.fit(X_bulk, y_bulk)

# Predicted mean for every row of X (not just the bulk subset).
mean_pred = gam_gamma.predict(X)

# Threshold u_{s,t}: the 90% quantile of a Gamma distribution per row.
# The shape is the reciprocal of the fitted distribution's scale attribute,
# and the per-row Gamma scale is chosen so that shape * scale == predicted mean.
dispersion = gam_gamma.distribution.scale
shape_param = 1 / dispersion
scale_params = mean_pred / shape_param
threshold_level = 0.90
model_df['threshold_u'] = gamma.ppf(threshold_level, a=shape_param, scale=scale_params)

# Occurrence model: logistic GAM for the event "RR exceeds its threshold".
# Binary response: 1 where RR is strictly above threshold_u on that row, else 0.
exceeds_threshold = model_df['RR'] > model_df['threshold_u']
model_df['is_exceedance'] = exceeds_threshold.astype(int)
y_logistic = model_df['is_exceedance'].values

print("fitting logistic GAM...")
# Same term structure as the Gamma model: te over columns 0-1, smooths on
# columns 2 and 3 (column 3 on the 'cp' basis), linear term on column 4.
logit_terms = (
    te(0, 1, n_splines=10)
    + s(2, n_splines=10)
    + s(3, basis='cp', n_splines=12)
    + l(4)
)
gam_logit = LogisticGAM(logit_terms)
# Fitted on all rows of X, unlike the Gamma model.
gam_logit.fit(X, y_logistic)

# Fitted probability of exceedance for every row.
model_df['prob_exceedance'] = gam_logit.predict_proba(X)

# GPD input.
# The paper fits a GPD on (Y - u | Y > u); only the excesses are prepared
# here for that next step.
# Rows flagged as exceedances get RR - threshold_u; every other row gets NaN.
excess = model_df['RR'] - model_df['threshold_u']
model_df['excess_over_threshold'] = excess.where(model_df['is_exceedance'] == 1)


print("Marginal modeling complete.")
# Keep only identifiers, the response and the marginal-model outputs.
output_columns = [
    'Date',
    'NUM_POSTE',
    'RR',
    'temp_30d_avg',
    'threshold_u',
    'prob_exceedance',
    'excess_over_threshold',
]
model_df = model_df[output_columns]


fitting gamma GAM...
fitting logistic GAM...
