In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset and keep only the rows whose 'Nhiệt độ' value is at most 60.
DATA_PATH = '../data-preprocessing/VinhLong_data.csv'
data = pd.read_csv(DATA_PATH).loc[lambda frame: frame['Nhiệt độ'] <= 60]

In [3]:
# Build the timestamp index with a single parse: the 'Ngày' (dd/mm/YYYY) and
# 'Giờ' (HH:MM) text columns are joined and parsed once with an explicit
# format, instead of parsing each column, converting both back to strings and
# parsing the concatenation again with an inferred format.
timestamps = pd.to_datetime(
    data['Ngày'] + ' ' + data['Giờ'],
    format='%d/%m/%Y %H:%M',
)

# Index the frame by timestamp and drop the two raw text columns. A new frame
# is assigned rather than mutating the existing one in place.
data = (
    data
    .assign(Datetime=timestamps)
    .set_index('Datetime')
    .drop(columns=['Ngày', 'Giờ'])
)
data.head()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-01 01:00:00,22.2,89,1013.7,1.3,225
2014-01-01 02:00:00,21.8,90,1012.9,1.0,270
2014-01-01 03:00:00,22.8,80,1012.5,3.1,320
2014-01-01 04:00:00,22.7,80,1012.5,1.2,235
2014-01-01 05:00:00,22.0,84,1012.5,3.7,319


In [4]:
# Convert every column to a numeric dtype and drop incomplete rows.
# errors='coerce' turns any value that cannot be parsed as a number (including
# the '-' and '' markers) into NaN, so the following dropna() removes those
# rows. The previous errors='ignore' is deprecated in pandas 2.2 and would
# silently leave a whole column as text if a single value failed to parse.
data = (
    data
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
data.tail()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-15 19:00:00,28.3,65,1011.8,2.2,154
2024-01-15 20:00:00,27.4,70,1012.1,2.1,152
2024-01-15 21:00:00,27.0,72,1012.5,0.2,149
2024-01-15 22:00:00,26.5,75,1012.2,1.3,153
2024-02-21 07:00:00,24.4,97,1012.6,1.4,142


In [5]:
# Sanity check: number of missing values left in each column after cleaning.
print(data.isna().sum())

Nhiệt độ    0
Độ ẩm       0
Khí áp      0
T.độ gió    0
H. gió      0
dtype: int64


In [6]:
# Aggregate to one row per clock hour (mean of the readings within the hour).
# resample() emits an all-NaN row for every hour that has no readings, and
# dropna() removes those rows again, so hours absent from the input stay absent.
# The lowercase 'h' alias is used because pandas 2.2 deprecates 'H'. The former
# replace()/to_numeric() steps were no-ops on the float output of mean().
hourly_data = data.resample('h').mean().dropna()
hourly_data.tail()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-15 19:00:00,28.3,65.0,1011.8,2.2,154.0
2024-01-15 20:00:00,27.4,70.0,1012.1,2.1,152.0
2024-01-15 21:00:00,27.0,72.0,1012.5,0.2,149.0
2024-01-15 22:00:00,26.5,75.0,1012.2,1.3,153.0
2024-02-21 07:00:00,24.4,97.0,1012.6,1.4,142.0


In [7]:
# Replace the Datetime index with a plain 0..n-1 integer index. The timestamps
# are discarded here, so later row-offset operations (e.g. shift) treat
# neighbouring rows as consecutive hours even where empty hours were dropped.
hourly_data.reset_index(drop=True, inplace=True)

In [8]:
# Preview the first rows after the index reset.
hourly_data.head()

Unnamed: 0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
0,22.2,89.0,1013.7,1.3,225.0
1,21.8,90.0,1012.9,1.0,270.0
2,22.8,80.0,1012.5,3.1,320.0
3,22.7,80.0,1012.5,1.2,235.0
4,22.0,84.0,1012.5,3.7,319.0


In [9]:
# Sanity check: number of missing values per column in the hourly frame.
print(hourly_data.isna().sum())

Nhiệt độ    0
Độ ẩm       0
Khí áp      0
T.độ gió    0
H. gió      0
dtype: int64


In [10]:
# data_length = len(hourly_data)

# train_idx = int(data_length * 0.80)
# test_idx = int(data_length * 0.20)

# train = hourly_data[:train_idx]
# test = hourly_data[train_idx:]

# print(f"Train Shape: {train.shape}")
# print(f"Test Shape: {test.shape}")

In [11]:
def train_test_split(data, n_test):
    """Split an ordered sequence into a leading train part and a trailing test part.

    No shuffling is done: the last ``n_test`` items form the test set and
    everything before them forms the train set.

    Previously the ``n_test`` argument was overwritten with 20% of the rows,
    so the value passed by the caller had no effect. It is now honoured; pass
    ``0.2`` to obtain the former fixed 20% hold-out.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``,
    which is imported at the top of the notebook.

    Parameters
    ----------
    data : sliceable sequence (list, NumPy array, ...)
        Items in the order they should be split.
    n_test : int or float
        An integer is the number of trailing items to hold out. A float
        strictly between 0 and 1 is the fraction of items to hold out.

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``data``.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if isinstance(n_test, float) and 0 < n_test < 1:
        n_test = int(len(data) * n_test)
    n_test = int(n_test)
    if n_test < 0:
        raise ValueError("n_test must be non-negative")
    # Slice on an explicit boundary: data[:-0] would be empty, not the full input.
    split_at = max(len(data) - n_test, 0)
    return data[:split_at], data[split_at:]

In [12]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [13]:
# testX = test['Nhiệt độ']
# testy = test.drop(columns=['Nhiệt độ'])
# trainX = train['Nhiệt độ']
# trainy = train.drop(columns=['Nhiệt độ'])

In [None]:
def create_features(data, n_lags=3):
    """Return a copy of ``data`` extended with lag features and next-row targets.

    Columns appended, in this order:

    * ``lag_1`` .. ``lag_{n_lags}``: the 'Nhiệt độ' column shifted down by
      1..n_lags rows (values from previous rows).
    * ``target_temp``, ``target_humidity``, ``target_baro``,
      ``target_windspeed``, ``target_winddirec``: the 'Nhiệt độ', 'Độ ẩm',
      'Khí áp', 'T.độ gió' and 'H. gió' columns shifted up by one row
      (values from the following row).

    Rows containing any NaN (including those introduced by the shifts at both
    ends of the frame) are dropped. The input frame is not modified.
    """
    target_sources = {
        'target_temp': 'Nhiệt độ',
        'target_humidity': 'Độ ẩm',
        'target_baro': 'Khí áp',
        'target_windspeed': 'T.độ gió',
        'target_winddirec': 'H. gió',
    }
    temperature = data['Nhiệt độ']
    lag_columns = {
        f'lag_{step}': temperature.shift(step)
        for step in range(1, n_lags + 1)
    }
    next_row_columns = {
        name: data[source].shift(-1)
        for name, source in target_sources.items()
    }
    # assign() works on a copy, so the caller's frame is left untouched.
    return data.assign(**lag_columns, **next_row_columns).dropna()

# Apply feature engineering.
# Note: this rebinds hourly_data to the engineered frame, so the frame without
# lag/target columns is no longer reachable under this name afterwards.
hourly_data = create_features(hourly_data, n_lags=3)


In [None]:
from sklearn.model_selection import GridSearchCV

def xgboost_grid_search(trainX, trainy):
    """Grid-search XGBRegressor hyper-parameters and return the best refit model.

    The search covers ``n_estimators``, ``max_depth`` and ``learning_rate`` and
    scores candidates by negative mean squared error.

    Cross-validation uses ``TimeSeriesSplit`` so that every validation fold
    lies after the rows it was trained on. The former ``cv=3`` resolves to an
    unshuffled ``KFold`` for regressors, which validates some folds on rows
    that precede part of their training data.
    NOTE(review): this assumes the rows of ``trainX`` are in chronological
    order -- confirm against the caller.

    Parameters
    ----------
    trainX : array-like
        Feature rows.
    trainy : array-like
        Target values aligned with ``trainX``.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # Imported locally so the function does not depend on which cells ran before it.
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

    param_grid = {
        'n_estimators': [100, 500, 1000],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.05, 0.1]
    }
    search = GridSearchCV(
        XGBRegressor(objective='reg:squarederror'),
        param_grid,
        scoring='neg_mean_squared_error',
        cv=TimeSeriesSplit(n_splits=3),
    )
    search.fit(trainX, trainy)
    print("Best parameters found: ", search.best_params_)
    return search.best_estimator_

In [None]:
from sklearn.multioutput import MultiOutputRegressor

def xgboost_forecast(train, testX):
    """Fit a multi-output model on ``train`` and predict one row of targets.

    ``train`` is converted to a 2-D array whose last five columns are the
    targets and whose remaining columns are the features. The estimator
    returned by ``xgboost_grid_search`` is wrapped in ``MultiOutputRegressor``,
    fitted on the same rows, and used to predict the targets for ``testX``.

    Returns the predicted target values for ``testX`` (first and only row of
    the prediction array).
    """
    history = np.asarray(train)
    features = history[:, :-5]
    targets = history[:, -5:]

    tuned_estimator = xgboost_grid_search(features, targets)
    forecaster = MultiOutputRegressor(tuned_estimator)
    forecaster.fit(features, targets)

    # predict() expects a 2-D input, hence the single-row list around testX.
    return forecaster.predict([testX])[0]


In [None]:
from tqdm import tqdm

In [None]:
def rolling_forecast(data, window_size, n_test):
    """Walk-forward evaluation with a rolling training window.

    ``data.values`` is split by ``train_test_split(data.values, n_test)``.
    For each test row, a model is fitted by ``xgboost_forecast`` on the most
    recent ``window_size`` rows of history and asked to predict that row's
    targets (its last five columns) from its features (all other columns).
    The test row is then appended to the history before the next step.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last five columns are the targets.
    window_size : int
        Number of trailing history rows used for each fit.
    n_test : int
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(error, expected, predictions)`` where ``error`` holds one RMSE per
        target column, ``expected`` is the target block of the test rows and
        ``predictions`` is the array of per-step predictions.
    """
    # Imported here because the notebook only imports mean_squared_error in a
    # later cell; without this the function fails with NameError when it is
    # called before that cell has run.
    from sklearn.metrics import mean_squared_error

    train, test = train_test_split(data.values, n_test)
    history = list(train)
    predictions = []
    for row in test:
        # Define the rolling window for training
        rolling_train = history[-window_size:]
        testX, testy = row[:-5], row[-5:]
        yhat = xgboost_forecast(rolling_train, testX)
        predictions.append(yhat)
        history.append(row)
        # Progress line reports the first target column only.
        print('>expected=%.1f, predicted=%.1f' % (testy[0], yhat[0]))
    predictions = np.array(predictions)
    expected = test[:, -5:]
    error = np.sqrt(mean_squared_error(expected, predictions, multioutput='raw_values'))
    return error, expected, predictions

In [None]:
# Preview the frame that is passed to the rolling forecast.
hourly_data.head()

In [None]:
# Sanity check: number of missing values per column before modelling.
print(hourly_data.isna().sum())

In [None]:
from sklearn.metrics import mean_squared_error

# Number of most recent rows each per-step model is trained on.
window_size = 100

# Evaluate the model
rmse, y, yhat = rolling_forecast(hourly_data, window_size, 12)

# Print RMSE for each target. The labels follow the order of the five target
# columns produced by create_features.
targets = ['Temperature', 'Humidity', 'Pressure', 'Wind Speed', 'Wind Direction']

# A plain loop is used here: wrapping five instantaneous print calls in a
# progress bar only interleaves bar updates with the printed results.
for target, score in zip(targets, rmse):
    print(f'RMSE for {target}: {score:.3f}')


In [None]:
import matplotlib.pyplot as plt

# One panel per target: expected vs. predicted series, plus a dashed line at
# the mean of the expected values.
targets = ['Temperature', 'Humidity', 'Pressure', 'Wind Speed', 'Wind Direction']
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(10, 15))
for i, (ax, title) in enumerate(zip(axes, targets)):
    expected = y[:, i]
    ax.plot(expected, label='Expected')
    ax.plot(yhat[:, i], label='Predicted')
    ax.axhline(y=np.mean(expected), color='r', linestyle='--', label='Mean')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# # Function to perform walk-forward validation
# def walk_forward_validation(data, n_test):
#     predictions = list()
#     train, test = train_test_split(data.values, n_test)
#     history = [x for x in train]
#     for i in range(len(test)):
#         testX, testy = test[i, :-1], test[i, -1]
#         yhat = xgboost_forecast(history, testX)
#         predictions.append(yhat)
#         history.append(test[i])
#         print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
#     error = mean_absolute_error(test[:, -1], predictions)
#     return error, test[:, -1], predictions

# # Evaluate the model
# mae, y, yhat = walk_forward_validation(hourly_data, 12)
# print('MAE: %.3f' % mae)

# # Plot expected vs predicted values
# plt.plot(y, label='Expected')
# plt.plot(yhat, label='Predicted')
# plt.legend()
# plt.show()
