# Model Training

## Context


**Data Source**
The data used in this notebook was extracted from the notebooks *model-selection.ipynb* and

- **Data:** 19/08/2025
- **Localização:** ../data/wrangle

## Set up

### Libraries

In [1]:
## Base
import os
import pickle
import numpy as np
import pandas as pd

## Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Model
from statsmodels.regression.quantile_regression import QuantReg
from sklearn.metrics import root_mean_squared_error

In [2]:
# Funções criadas
import sys
from pathlib import Path
sys.path.insert(1, Path.cwd().parents[1].as_posix())

from src.ts_utils import *

from config import *

In [3]:
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=['#003366'])

## Data


In [4]:
df = pd.read_parquet(os.path.join(DATA_PATH_WRANGLE, 'weather_sanitized.parquet'))

with open(os.path.join(DATA_PATH_WRANGLE, "top_100_features.pkl"), 'rb') as file:
    top_features = pickle.load(file)

In [5]:
len(top_features)

70

### New Features

#### Generating numerical

In [6]:
for feature_name in top_features:
    if ('diff' in feature_name) & ('resid' not in feature_name):
        column_name, shift, _ = feature_name.split('_')
        column = df[column_name].diff().shift(int(shift))
        df[feature_name] = column
        
    elif ('wcardinal' not in feature_name) & ('day' not in feature_name) & \
         ('month' not in feature_name) & ('season' not in feature_name) & \
         ('resid' not in feature_name):
        column_name, shift = feature_name.split('_')
        column = df[column_name].shift(int(shift))
        df[feature_name] = column

resid later

#### Generating wcardinal

In [7]:
for feature_name in top_features:
    if 'wcardinal' in feature_name:
        column_name, shift, cardinal = feature_name.split('_')
        column = (df[column_name] == cardinal).astype(int).shift(int(shift)) 
        df[column_name] = column

#### Generating date

In [8]:
for feature_name in top_features:
    if ('day' in feature_name) & ('month' in feature_name) & \
       ('season' in feature_name):
        print(feature_name)

### Data Split

In [9]:
df.set_index('time', inplace=True)
train = df.head(7*365).copy()
X_train = train.drop(columns=['tavg']).copy()
y_train = train[['tavg']].copy()

cp_test = df[df.index > train.index.max()].copy()

## Harmonic Regression

### Train

In [10]:
n = len(X_train)
t = np.arange(n)

omega1 = (2 * np.pi / 365.25)
omega2 = 2 * (2 * np.pi / 365.25)
omega3 = 3 * (2 * np.pi / 365.25)

In [11]:
X_harmonic = np.column_stack([
    np.ones(n), # Intercept           
    np.sin(omega1*t),
    np.cos(omega1*t),
    np.cos(omega2*t),
    np.sin(omega3*t)
])

model = QuantReg(y_train, X_harmonic)
res = model.fit(q=0.5)
res.summary()

0,1,2,3
Dep. Variable:,tavg,Pseudo R-squared:,0.6209
Model:,QuantReg,Bandwidth:,1.478
Method:,Least Squares,Sparsity:,10.15
Date:,"Tue, 02 Sep 2025",No. Observations:,2555.0
Time:,12:52:26,Df Residuals:,2550.0
,,Df Model:,4.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,285.0353,0.100,2839.873,0.000,284.838,285.232
x1,-8.3987,0.142,-59.190,0.000,-8.677,-8.121
x2,10.7541,0.142,75.738,0.000,10.476,11.033
x3,0.4052,0.142,2.854,0.004,0.127,0.684
x4,0.2539,0.142,1.789,0.074,-0.024,0.532


In [12]:
X_harmonic_train = np.column_stack([
    np.ones(n), # Intercept           
    np.sin(omega1*t),
    np.cos(omega1*t),
    np.cos(omega2*t)
])

quantM = QuantReg(y_train, X_harmonic_train)
harmonic_model = quantM.fit(q=0.5)
harmonic_model.summary()

0,1,2,3
Dep. Variable:,tavg,Pseudo R-squared:,0.6205
Model:,QuantReg,Bandwidth:,1.483
Method:,Least Squares,Sparsity:,10.09
Date:,"Tue, 02 Sep 2025",No. Observations:,2555.0
Time:,12:52:26,Df Residuals:,2551.0
,,Df Model:,3.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,285.0328,0.100,2855.221,0.000,284.837,285.229
x1,-8.3331,0.141,-59.045,0.000,-8.610,-8.056
x2,10.7551,0.141,76.154,0.000,10.478,11.032
x3,0.4578,0.141,3.242,0.001,0.181,0.735


### Test

In [14]:
n_forecast = len(cp_test)
t_forecast = np.arange(n_forecast) + max(t)

omega1 = (2 * np.pi / 365.25)
omega2 = 2 * (2 * np.pi / 365.25)
omega3 = 3 * (2 * np.pi / 365.25)

X_harmonic_forecast = np.column_stack([
    np.ones(n_forecast), # Intercept           
    np.sin(omega1*t_forecast),
    np.cos(omega1*t_forecast),
    np.cos(omega2*t_forecast)
])

In [15]:
train['tavg_forecast'] = harmonic_model.fittedvalues
cp_test['tavg_forecast'] = harmonic_model.predict(X_harmonic_forecast)

In [16]:
root_mean_squared_error(cp_test['tavg'], cp_test['tavg_forecast'])

4.867072238376374