In [135]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import seaborn as sns


In [136]:
from lightgbm import LGBMRegressor


In [137]:
import importlib.metadata
from prophet import Prophet

In [138]:
prophet_version = importlib.metadata.version('prophet')
print('Prophet %s' % prophet_version)

Prophet 1.1.5


In [139]:
# Load the dataset
DATA_PATH = '../data-preprocessing/VinhLong_data.csv'
data = pd.read_csv(DATA_PATH)
data = data[data['Nhiệt độ'] <= 60]

In [140]:
data['Ngày'] = pd.to_datetime(data['Ngày'], format='%d/%m/%Y')
data['Giờ'] = pd.to_datetime(data['Giờ'], format='%H:%M').dt.time
data['Datetime'] = pd.to_datetime(data['Ngày'].astype(str) + ' ' + data['Giờ'].astype(str))
data.set_index('Datetime', inplace=True)
data.drop(['Ngày', 'Giờ'], axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-01 01:00:00,22.2,89,1013.7,1.3,225
2014-01-01 02:00:00,21.8,90,1012.9,1.0,270
2014-01-01 03:00:00,22.8,80,1012.5,3.1,320
2014-01-01 04:00:00,22.7,80,1012.5,1.2,235
2014-01-01 05:00:00,22.0,84,1012.5,3.7,319


In [141]:
data.replace(['-', ''], np.nan, inplace=True)
data.dropna(inplace=True)
data = data.apply(pd.to_numeric, errors='ignore')
data.tail()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-15 19:00:00,28.3,65,1011.8,2.2,154
2024-01-15 20:00:00,27.4,70,1012.1,2.1,152
2024-01-15 21:00:00,27.0,72,1012.5,0.2,149
2024-01-15 22:00:00,26.5,75,1012.2,1.3,153
2024-02-21 07:00:00,24.4,97,1012.6,1.4,142


In [142]:
print(data.isna().sum())

Nhiệt độ    0
Độ ẩm       0
Khí áp      0
T.độ gió    0
H. gió      0
dtype: int64


In [143]:
hourly_data = data.resample('H').mean()
hourly_data.replace(['-', ''], np.nan, inplace=True)
hourly_data.dropna(inplace=True)
hourly_data = hourly_data.apply(pd.to_numeric, errors='ignore')
hourly_data.tail()

Unnamed: 0_level_0,Nhiệt độ,Độ ẩm,Khí áp,T.độ gió,H. gió
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-15 19:00:00,28.3,65.0,1011.8,2.2,154.0
2024-01-15 20:00:00,27.4,70.0,1012.1,2.1,152.0
2024-01-15 21:00:00,27.0,72.0,1012.5,0.2,149.0
2024-01-15 22:00:00,26.5,75.0,1012.2,1.3,153.0
2024-02-21 07:00:00,24.4,97.0,1012.6,1.4,142.0


In [144]:
hourly_data = hourly_data.rename(columns={'Nhiệt độ': 'y'})

In [145]:
hourly_data = hourly_data.reset_index().rename(columns={'Datetime': 'ds'})


In [146]:
hourly_data.set_index('ds', inplace=True)

In [147]:
hourly_data

Unnamed: 0_level_0,y,Độ ẩm,Khí áp,T.độ gió,H. gió
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-01 01:00:00,22.2,89.0,1013.7,1.3,225.0
2014-01-01 02:00:00,21.8,90.0,1012.9,1.0,270.0
2014-01-01 03:00:00,22.8,80.0,1012.5,3.1,320.0
2014-01-01 04:00:00,22.7,80.0,1012.5,1.2,235.0
2014-01-01 05:00:00,22.0,84.0,1012.5,3.7,319.0
...,...,...,...,...,...
2024-01-15 19:00:00,28.3,65.0,1011.8,2.2,154.0
2024-01-15 20:00:00,27.4,70.0,1012.1,2.1,152.0
2024-01-15 21:00:00,27.0,72.0,1012.5,0.2,149.0
2024-01-15 22:00:00,26.5,75.0,1012.2,1.3,153.0


In [148]:
data_length = len(hourly_data)

train_idx = int(data_length * 0.80)
test_idx = int(data_length * 0.20)

# Split the hourly_data into train, validation, and test
train = hourly_data[:train_idx]
test = hourly_data[train_idx:]

# Check the shape of the datasets to verify the splits
print(f"Train Shape: {train.shape}")
print(f"Test Shape: {test.shape}")

Train Shape: (57810, 5)
Test Shape: (14453, 5)


In [152]:
model = Prophet(interval_width=0.95)
model.add_regressor('Độ ẩm',standardize=False,mode='multiplicative')
model.add_regressor('Khí áp',standardize=False)
model.add_regressor('T.độ gió',standardize=False,mode='multiplicative')
model.add_regressor('H. gió',standardize=False,mode='multiplicative')
model.fit(prop_data)

15:54:58 - cmdstanpy - INFO - Chain [1] start processing
15:55:04 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x17a16e410>

In [153]:
as_df = hourly_data.copy()

In [154]:
as_df.reset_index()

Unnamed: 0,ds,y,Độ ẩm,Khí áp,T.độ gió,H. gió
0,2014-01-01 01:00:00,22.2,89.0,1013.7,1.3,225.0
1,2014-01-01 02:00:00,21.8,90.0,1012.9,1.0,270.0
2,2014-01-01 03:00:00,22.8,80.0,1012.5,3.1,320.0
3,2014-01-01 04:00:00,22.7,80.0,1012.5,1.2,235.0
4,2014-01-01 05:00:00,22.0,84.0,1012.5,3.7,319.0
...,...,...,...,...,...,...
72258,2024-01-15 19:00:00,28.3,65.0,1011.8,2.2,154.0
72259,2024-01-15 20:00:00,27.4,70.0,1012.1,2.1,152.0
72260,2024-01-15 21:00:00,27.0,72.0,1012.5,0.2,149.0
72261,2024-01-15 22:00:00,26.5,75.0,1012.2,1.3,153.0


In [155]:
print(as_df.isna().sum())

y           0
Độ ẩm       0
Khí áp      0
T.độ gió    0
H. gió      0
dtype: int64


In [156]:
future = model.make_future_dataframe(periods=len(test), freq='H')


In [161]:
# future = model.make_future_dataframe(periods=len(test))
future = future.merge(as_df[['ds', 'Độ ẩm', 'Khí áp', 'T.độ gió', 'H. gió']], on='ds', how='left')


KeyError: "['ds'] not in index"

In [159]:
print(future.isna().sum())

ds              0
Độ ẩm       72263
Khí áp      72263
T.độ gió    72263
H. gió      72263
dtype: int64


In [133]:
future = future.fillna(method='ffill')
forecast = model.predict(future)
model.plot(forecast)
plt.show()

ValueError: Found NaN in column 'Độ ẩm'