In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, GRU
import matplotlib.pyplot as plt


In [3]:
# Load dữ liệu
data = pd.read_csv('data/Tetuan City power consumption.csv')
data



Unnamed: 0,DateTime,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Zone 1 Power Consumption,Zone 2 Power Consumption,Zone 3 Power Consumption
0,1/1/2017 0:00,6.559,73.8,0.083,0.051,0.119,34055.69620,16128.87538,20240.96386
1,1/1/2017 0:10,6.414,74.5,0.083,0.070,0.085,29814.68354,19375.07599,20131.08434
2,1/1/2017 0:20,6.313,74.5,0.080,0.062,0.100,29128.10127,19006.68693,19668.43373
3,1/1/2017 0:30,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711
4,1/1/2017 0:40,5.921,75.7,0.081,0.048,0.085,27335.69620,17872.34043,18442.40964
...,...,...,...,...,...,...,...,...,...
52411,12/30/2017 23:10,7.010,72.4,0.080,0.040,0.096,31160.45627,26857.31820,14780.31212
52412,12/30/2017 23:20,6.947,72.6,0.082,0.051,0.093,30430.41825,26124.57809,14428.81152
52413,12/30/2017 23:30,6.900,72.8,0.086,0.084,0.074,29590.87452,25277.69254,13806.48259
52414,12/30/2017 23:40,6.758,73.0,0.080,0.066,0.089,28958.17490,24692.23688,13512.60504


In [5]:
# Chuyển đổi cột 'DateTime' thành kiểu datetime và đặt làm chỉ mục
data['DateTime'] = pd.to_datetime(data['DateTime'])
data = data.set_index('DateTime')

data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52416 entries, 2017-01-01 00:00:00 to 2017-12-30 23:50:00
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Temperature                52416 non-null  float64
 1   Humidity                   52416 non-null  float64
 2   Wind Speed                 52416 non-null  float64
 3   general diffuse flows      52416 non-null  float64
 4   diffuse flows              52416 non-null  float64
 5   Zone 1 Power Consumption   52416 non-null  float64
 6   Zone 2  Power Consumption  52416 non-null  float64
 7   Zone 3  Power Consumption  52416 non-null  float64
dtypes: float64(8)
memory usage: 3.6 MB


In [7]:
data.head()

Unnamed: 0_level_0,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Zone 1 Power Consumption,Zone 2 Power Consumption,Zone 3 Power Consumption
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386
2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434
2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373
2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711
2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964


In [9]:
# Hàm tạo đặc trưng lag giữ lại DateTime
def create_lag_features_with_datetime(values, date_values, n_lags=10):
    X, y, dates = [], [], []
    for i in range(n_lags, len(values)):
        X.append(values[i-n_lags:i, 0])  # Sử dụng n_lags giá trị trước để tạo đặc trưng
        y.append(values[i, 0])          # Giá trị mục tiêu hiện tại
        dates.append(date_values[i])    # Lưu DateTime của giá trị hiện tại
    return np.array(X), np.array(y), np.array(dates)

# Chuẩn bị dữ liệu
n_lags = 10  # Số lượng bước lag
target_column = 'Zone 1 Power Consumption'  # Cột cần dự đoán
values = data[[target_column]].values  # Lấy giá trị cột mục tiêu
date_values = data.index.to_numpy()  # Lấy mốc thời gian

# Gọi hàm để tạo đặc trưng lag
X, y, dates = create_lag_features_with_datetime(values, date_values, n_lags)


In [11]:
from sklearn.model_selection import train_test_split

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(
    X, y, dates, test_size=0.2, random_state=42
)

# Kiểm tra dữ liệu
print("Dates for test set:")
print(dates_test[:10])  # Hiển thị 10 giá trị đầu tiên của DateTime trong tập kiểm tra


Dates for test set:
['2017-11-15T14:50:00.000000000' '2017-12-22T23:40:00.000000000'
 '2017-09-16T04:30:00.000000000' '2017-06-28T21:10:00.000000000'
 '2017-07-06T20:00:00.000000000' '2017-05-25T06:10:00.000000000'
 '2017-02-06T07:00:00.000000000' '2017-07-23T14:50:00.000000000'
 '2017-05-17T22:00:00.000000000' '2017-03-24T19:50:00.000000000']


In [19]:
y_test

array([33680.     , 28946.0076 , 27066.90265, ..., 26486.15385,
       33334.46809, 24665.9081 ])

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Khởi tạo bộ chuẩn hóa
scaler = MinMaxScaler()

# Chuẩn hóa dữ liệu
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Xây dựng mô hình DNN
model = Sequential()
model.add(Dense(64, input_dim=n_lags, activation='relu'))  # Lớp đầu vào
model.add(Dense(32, activation='relu'))                   # Lớp ẩn
model.add(Dense(1))                                       # Lớp đầu ra (dự đoán giá trị liên tục)

# Biên dịch mô hình
model.compile(optimizer='adam', loss='mse')

# Huấn luyện mô hình
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 968020544.0000 - val_loss: 69302672.0000
Epoch 2/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 30690926.0000 - val_loss: 16323442.0000
Epoch 3/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 15392214.0000 - val_loss: 13599686.0000
Epoch 4/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 12359054.0000 - val_loss: 9671221.0000
Epoch 5/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 8519583.0000 - val_loss: 5728951.0000
Epoch 6/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 4918678.5000 - val_loss: 3280391.2500
Epoch 7/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 2842705.2500 - val_loss: 2097383.5000
Epoch 8/50
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━

In [16]:
# Dự báo trên tập kiểm tra
y_pred = model.predict(X_test)

# Tạo DataFrame với DateTime, Actual và Predicted
results = pd.DataFrame({
    'DateTime': dates_test,   # DateTime tương ứng với tập kiểm tra
    'Actual': y_test.flatten(),  # Giá trị thực tế
    'Predicted': y_pred.flatten()  # Giá trị dự đoán
})

# Hiển thị 10 dòng đầu tiên
print("Dữ liệu thực tế và dự đoán:")
print(results.head(10))


[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Dữ liệu thực tế và dự đoán:
             DateTime       Actual     Predicted
0 2017-11-15 14:50:00  33680.00000  33954.429688
1 2017-12-22 23:40:00  28946.00760  29714.841797
2 2017-09-16 04:30:00  27066.90265  26548.478516
3 2017-06-28 21:10:00  43117.35099  42972.312500
4 2017-07-06 20:00:00  46271.36213  46973.687500
5 2017-05-25 06:10:00  21138.88525  20973.722656
6 2017-02-06 07:00:00  21386.44068  22203.048828
7 2017-07-23 14:50:00  35146.84385  34998.195312
8 2017-05-17 22:00:00  42705.83607  42689.136719
9 2017-03-24 19:50:00  42758.80851  42553.457031


In [23]:
dates_test

array(['2017-11-15T14:50:00.000000000', '2017-12-22T23:40:00.000000000',
       '2017-09-16T04:30:00.000000000', ...,
       '2017-11-27T00:10:00.000000000', '2017-03-18T15:20:00.000000000',
       '2017-10-21T07:10:00.000000000'], dtype='datetime64[ns]')

In [26]:
len(dates_test)

10482