In [1]:
import sys
sys.path.append('..')
import models_utils
import warnings
warnings.filterwarnings("ignore")
from time import sleep

In [2]:
import numpy
import pmdarima
# Record the library versions this notebook was executed with.
print(f"numpy: {numpy.__version__}")
print(f"pmdarima: {pmdarima.__version__}")


numpy: 1.26.4
pmdarima: 2.0.4


In [3]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import pmdarima as pm
import warnings

def train_ARIMA_n_ahead(data, target, date_col, year_test_start, n_ahead=1, auto_arima=True, arima_order=(1, 1, 1), min_predictions=104):
    """Walk-forward ARIMA evaluation, forecasting ``target`` ``n_ahead`` weeks ahead.

    The frame is re-indexed to a weekly Monday (``'W-MON'``) frequency and gaps
    in ``target`` are forward-filled. For every forecast origin in the test
    window a fresh ARIMA model is fitted on all observations up to and
    including the origin; the last step of an ``n_ahead``-step forecast is then
    scored against the value observed ``n_ahead`` weeks after the origin.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date_col`` and ``target``. NOTE: ``data[date_col]`` is
        converted to datetime in place on the caller's frame (kept for
        backward compatibility); everything else works on a copy.
    target : str
        Name of the column to forecast.
    date_col : str
        Name of the date column.
    year_test_start : str or pandas.Timestamp
        First forecast origin. Rows strictly before it form the initial
        training set.
    n_ahead : int, default 1
        Forecast horizon in weeks.
    auto_arima : bool, default True
        If true, the order is re-selected with ``pm.auto_arima`` at every
        origin; otherwise ``arima_order`` is used as given.
    arima_order : tuple, default (1, 1, 1)
        ``(p, d, q)`` order used when ``auto_arima`` is false.
    min_predictions : int, default 104
        Minimum number of forecast origins required; the returned predictions
        are also truncated to this many entries.

    Returns
    -------
    tuple
        ``(predictions, MAE, MSE)``. ``([], None, None)`` is returned when the
        target still has missing values after filling, when the train or test
        window is empty, when fewer than ``min_predictions`` origins exist, or
        when no origin produced a forecast.

    Notes
    -----
    The global warnings filter is set to ``"ignore"`` as a side effect.
    The last forecast origin is hard-coded as 2024-12-31 minus ``n_ahead``
    weeks. An origin whose fit or lookup raises is reported and skipped.
    """
    # Suppress convergence warnings for cleaner output
    warnings.filterwarnings("ignore")

    # Convert and sort dates (the conversion deliberately touches the caller's frame)
    data[date_col] = pd.to_datetime(data[date_col])
    data = data.sort_values(by=date_col).reset_index(drop=True)

    # Resample weekly, fill gaps
    data = data.set_index(date_col).asfreq('W-MON')
    data[target] = data[target].ffill()

    # ffill cannot fill leading gaps, so NaNs may survive at the start
    if data[target].isna().any():
        print("Warning: Missing data remains after resampling.")
        return [], None, None

    series = data[[target]]

    # Train-test split: origins stop n_ahead weeks before the end of 2024
    horizon = pd.DateOffset(weeks=n_ahead)
    last_origin = pd.to_datetime("2024-12-31") - horizon
    train = series[series.index < year_test_start]
    test = series[(series.index >= year_test_start) & (series.index <= last_origin)]

    if train.empty or test.empty:
        print("Train or test dataset is empty. Check your date range.")
        return [], None, None

    test_dates = test.index[:]
    if len(test_dates) < min_predictions:
        print(f"Not enough test dates to make {min_predictions} predictions. Only {len(test_dates)} available.")
        return [], None, None

    predictions = []
    actuals = []

    for current_date in test_dates:

        try:
            # Training data is everything up to and including the forecast origin
            current_loc = series.index.get_loc(current_date)
            train_series = series.iloc[:current_loc + 1][target]

            if auto_arima:
                # Automatically determine ARIMA order using Hyndman-Khandakar algorithm.
                # Only the non-seasonal (p, d, q) order of the selected model is reused.
                stepwise_model = pm.auto_arima(train_series, seasonal=True, error_action='ignore', suppress_warnings=True)
                arima_order = stepwise_model.order

            fitted_model = ARIMA(train_series, order=arima_order).fit()

            # The last forecast step corresponds to origin + n_ahead weeks.
            prediction_date = current_date + horizon
            forecast = fitted_model.forecast(steps=n_ahead)
            # Positional access must go through .iloc: the forecast is date-indexed.
            predicted_value = forecast.iloc[-1]

            # Score against the value observed on the forecast date, not the
            # origin (the origin is the last training point).
            actual_value = series.loc[prediction_date, target]

            predictions.append(predicted_value)
            actuals.append(round(actual_value))

        except Exception as e:
            print(f"Error at {current_date}: {e}")

    predictions = predictions[:min_predictions]
    actuals = actuals[:min_predictions]

    # Every origin failed: there is nothing to score, so mirror the other failure paths
    if not predictions:
        print("No predictions could be produced for any test date.")
        return [], None, None

    # Evaluation
    MAE = mean_absolute_error(actuals, predictions)
    MSE = mean_squared_error(actuals, predictions)

    return predictions, MAE, MSE



In [4]:
# Read the municipality names: one per line, surrounding whitespace removed.
municipals = []
with open("../municipals.txt", "r") as f:
    municipals.extend(line.strip() for line in f)

In [5]:
# Forecast horizons (in weeks) evaluated for every municipality.
n_weeks_ahead = list(range(1, 13))
for municipal in municipals:
    print(municipal)
    municipal_df = pd.read_csv(f"../../data/Merged Data/{municipal}_merged.csv")
    municipal_df["Year-Week"] = pd.to_datetime(municipal_df["Year-Week"])
    for n in n_weeks_ahead:
        # Start the test window n weeks before 2023-01-02, so the first
        # n-week-ahead forecast lands on 2023-01-02.
        n_date = pd.to_datetime("2023-01-02") - pd.DateOffset(weeks=n)
        print(n_date)
        predicted, MAE, MSE = train_ARIMA_n_ahead(
            municipal_df,
            target="Cases",
            n_ahead=n,
            date_col="Year-Week",
            year_test_start=n_date,
            auto_arima=False,
            arima_order=(12, 0, 0),
        )
        models_utils.save_data(municipal, n, MSE, MAE, predicted, municipal_df, type="(12,0,0)")

Ajuy
2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
2022-12-05 00:00:00
2022-11-28 00:00:00
2022-11-21 00:00:00
2022-11-14 00:00:00
2022-11-07 00:00:00
2022-10-31 00:00:00
2022-10-24 00:00:00
2022-10-17 00:00:00
2022-10-10 00:00:00
Alimodian
2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
2022-12-05 00:00:00
2022-11-28 00:00:00
2022-11-21 00:00:00
2022-11-14 00:00:00
2022-11-07 00:00:00
2022-10-31 00:00:00
2022-10-24 00:00:00
2022-10-17 00:00:00
2022-10-10 00:00:00
Anilao
2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
2022-12-05 00:00:00
2022-11-28 00:00:00
2022-11-21 00:00:00
2022-11-14 00:00:00
2022-11-07 00:00:00
2022-10-31 00:00:00
2022-10-24 00:00:00
2022-10-17 00:00:00
2022-10-10 00:00:00
Badiangan
2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
2022-12-05 00:00:00
2022-11-28 00:00:00
2022-11-21 00:00:00
2022-11-14 00:00:00
2022-11-07 00:00:00
2022-10-31 00:00:00
2022-10-24 00:00:00
2022-10-17 00:00:00
2022-10-10 00:00:00
Balasan


KeyboardInterrupt: 

In [5]:
# Sanity check: number of forecasts held in `predicted` (set by the most recently run training cell).
len(predicted)

104

In [18]:
# Load the merged data for the Ajuy municipality on its own for inspection.
Ajuy_df = pd.read_csv("../../data/Merged Data/Ajuy_merged.csv")

In [6]:
# Prepare each municipality's frame, then combine them into one summed frame.
summed_df_list = []
for municipal in municipals:
    # Read the merged CSV and pass it through prepare_dataframe (the trailing 1
    # is presumably the feature lag — confirm against models_utils).
    municipal_df = models_utils.prepare_dataframe(
        pd.read_csv(f"../../data/Merged Data/{municipal}_merged.csv"),
        ["Temperature", "Precipitation", "Humidity"],
        ["Year", "Week", "Month", "Population"],
        "Cases",
        "Year-Week",
        1,
    )
    summed_df_list.append(municipal_df)
summed_df = models_utils.prepare_dataframe_summed(summed_df_list)
summed_df

Unnamed: 0,Year-Week,Temperature,Humidity,Precipitation,Cases,Population,Year,Month,Week
0,2014-04-07,29.111821,69.364982,2.800000,18.0,53490.250000,2014,4,15
1,2014-04-14,28.551928,71.990313,30.800000,11.0,53490.250000,2014,4,16
2,2014-04-21,27.369892,81.277237,224.799993,16.0,53490.250000,2014,4,17
3,2014-04-28,28.856428,75.546539,61.600004,14.0,53490.250000,2014,4,18
4,2014-05-05,29.875322,75.951262,8.800000,20.0,53490.250000,2014,5,19
...,...,...,...,...,...,...,...,...,...
555,2024-12-02,29.520965,75.235959,40.450002,135.0,58740.704545,2024,12,49
556,2024-12-09,28.585107,77.225192,55.400004,166.0,58740.704545,2024,12,50
557,2024-12-16,26.923679,85.136271,117.550004,133.0,58740.704545,2024,12,51
558,2024-12-23,27.754071,80.010136,68.600001,96.0,58740.704545,2024,12,52


In [8]:
# Evaluate ARIMA(3,0,0) on the summed provincial series for horizons of 1-12 weeks.
n_head = range(1,13)
summed_df["Year-Week"] = pd.to_datetime(summed_df["Year-Week"])
for n in n_head:
    # Start the test window n weeks before 2023-01-02, so the first
    # n-week-ahead forecast lands on 2023-01-02.
    n_date = pd.to_datetime("2023-01-02") - pd.DateOffset(weeks=n)
    print(n_date)
    # Use the summed frame here: the previous code passed `municipal_df`, a
    # leftover loop variable holding only the last municipality's data.
    predicted, MAE, MSE = train_ARIMA_n_ahead(summed_df, target="Cases", n_ahead = n, date_col="Year-Week", year_test_start=n_date, auto_arima=False, arima_order=(3,0,0))
    models_utils.save_data("(3,0,0)", n, MSE, MAE, predicted, summed_df, type="Summed Provincial")

2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
2022-12-05 00:00:00
2022-11-28 00:00:00
2022-11-21 00:00:00
2022-11-14 00:00:00
2022-11-07 00:00:00
2022-10-31 00:00:00
2022-10-24 00:00:00
2022-10-17 00:00:00
2022-10-10 00:00:00


In [None]:
# Evaluate ARIMA(1,1,1) on the summed provincial series for horizons of 1-12 weeks.
n_head = range(1,13)
summed_df["Year-Week"] = pd.to_datetime(summed_df["Year-Week"])
for n in n_head:
    # Start the test window n weeks before 2023-01-02, so the first
    # n-week-ahead forecast lands on 2023-01-02.
    n_date = pd.to_datetime("2023-01-02") - pd.DateOffset(weeks=n)
    print(n_date)
    # Use the summed frame here: the previous code passed `municipal_df`, a
    # leftover loop variable holding only the last municipality's data.
    predicted, MAE, MSE = train_ARIMA_n_ahead(summed_df, target="Cases", n_ahead = n, date_col="Year-Week", year_test_start=n_date, auto_arima=False, arima_order=(1,1,1))
    models_utils.save_data("(1,1,1)", n, MSE, MAE, predicted, summed_df, type="Summed Provincial")

2022-12-26 00:00:00
2022-12-19 00:00:00
2022-12-12 00:00:00
