In [None]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.base import BaseEstimator, RegressorMixin
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences
from mlforecast.utils import generate_daily_series
from tqdm import tqdm


In [None]:
# Read data
DATA_PATH = '../data-preprocessing/VinhLong_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
data = data[data['Nhiệt độ'] <= 60]
data['Ngày'] = pd.to_datetime(data['Ngày'], format='%d/%m/%Y')
data['Giờ'] = pd.to_datetime(data['Giờ'], format='%H:%M').dt.time
data['Datetime'] = pd.to_datetime(data['Ngày'].astype(str) + ' ' + data['Giờ'].astype(str))
data.set_index('Datetime', inplace=True)
data.drop(['Ngày', 'Giờ'], axis=1, inplace=True)
data.replace(['-', ''], np.nan, inplace=True)
data.dropna(inplace=True)
data = data.apply(pd.to_numeric, errors='ignore')

In [None]:
hourly_data = data.resample('H').mean()
hourly_data.replace(['-', ''], np.nan, inplace=True)
hourly_data.dropna(inplace=True)
hourly_data = hourly_data.apply(pd.to_numeric, errors='ignore')
hourly_data = hourly_data.rename(columns={'Nhiệt độ': 'y'})
hourly_data = hourly_data.reset_index().rename(columns={'Datetime': 'ds'})

In [None]:
hourly_data['unique_id'] = 'series_1'


In [None]:
data_length = len(hourly_data)
train_idx = int(data_length * 0.80)
test_idx = int(data_length * 0.20)

train = hourly_data[:train_idx]
test = hourly_data[train_idx:]

In [None]:
print(f"Train Shape: {train.shape}")
print(f"Test Shape: {test.shape}")

In [None]:
class SARIMAXWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)):
        self.order = order
        self.seasonal_order = seasonal_order
        self.model_ = None

    def fit(self, X, y):
        self.model_ = SARIMAX(y, exog=X, order=self.order, seasonal_order=self.seasonal_order)
        self.results_ = self.model_.fit(disp=False)
        return self

    def predict(self, X):
        return self.results_.get_forecast(steps=len(X), exog=X).predicted_mean

In [None]:
sarimax_model = SARIMAXWrapper(order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))


In [26]:
from window_ops.rolling import rolling_mean

In [32]:
forecast = MLForecast(
    models=[sarimax_model],
    freq='H',
    lags=[1, 7, 14],
    lag_transforms={
        1: [(rolling_mean, 3), (rolling_mean, 7), (rolling_mean, 28)],
    },
    target_transforms=[Differences([1])]
)

forecast.fit(hourly_data, id_col='unique_id', time_col='ds', target_col='y')

horizon = 10

predictions = forecast.predict(h=horizon)

predicted_dates = predictions['ds']

actual_values = hourly_data[hourly_data['ds'].isin(predicted_dates)]['y']

results = predictions.copy()
results['Actual'] = actual_values.values if not actual_values.empty else None

print(results)


  unique_id                  ds  SARIMAXWrapper Actual
0  series_1 2024-02-21 08:00:00       23.749098   None
1  series_1 2024-02-21 09:00:00       24.108786   None
2  series_1 2024-02-21 10:00:00       24.167354   None
3  series_1 2024-02-21 11:00:00       23.109395   None
4  series_1 2024-02-21 12:00:00       22.258986   None
5  series_1 2024-02-21 13:00:00       21.896000   None
6  series_1 2024-02-21 14:00:00       20.816184   None
7  series_1 2024-02-21 15:00:00       19.901083   None
8  series_1 2024-02-21 16:00:00       19.185840   None
9  series_1 2024-02-21 17:00:00       18.849851   None
