In [1]:
### [0] 설정 ###

# 라이브러리
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # tensorflow message 안뜨게 하기

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

In [2]:
# tensorflow 결과 재현성을 위한 seed 설정
'''
▪️ https://ballentain.tistory.com/84
▪️ https://junstar92.tistory.com/98 (initializer 설정, 모델에서 레이어 지정 시 뒤에 붙인다.)

▪️ 값 변경
- random_seed -> 결과값 달라짐

▪️ 동일한 결과를 위해 seed를 선언하고 실행하는 것이 좋다(Restart -> Run)
''' 

import os
import random
import numpy as np
import tensorflow as tf

random_seed = 9
tf_version = tf.__version__

tf.keras.utils.set_random_seed(random_seed)
tf.config.experimental.enable_op_determinism()
np.random.seed(random_seed)
random.seed(random_seed)

initializer = tf.keras.initializers.GlorotUniform(seed=1)

In [3]:
### [1] dataset 가공 ###

# 1) import data
'''
▪️ 값 변경
- target(Y)가 달라지면 y의 위치를 옮겨줘야 한다. 두번째 line 변경
'''

green_data = pd.read_csv("C:/Users/yoelK/OneDrive/문서/Time-series/Data/green_innout_final(1).csv")
green_data = green_data.drop(columns=["CO2air"]).assign(CO2air=green_data["CO2air"]) # y를 맨 마지막 위치로 이동
green_data.head()  # 2019-12-16 ~ 2020-05-30

Unnamed: 0,time,T_out,RH_out,I_glob,Winddir,Windsp,Tair,Rhair,VentLee,Ventwind,AssimLight,EnScr,CO2air
0,2019-12-16 00:00:00,6.9,81.0,0.0,32.0,4.7,21.3,60.7,0.8,0.0,0.0,96.0,468.0
1,2019-12-16 00:05:00,6.9,81.0,0.0,32.0,4.7,21.5,62.4,0.8,0.0,0.0,96.0,508.999999
2,2019-12-16 00:10:00,6.9,80.0,0.0,32.0,4.7,22.0,61.4,0.8,0.0,0.0,96.0,498.000001
3,2019-12-16 00:15:00,6.9,80.0,0.0,32.0,4.7,21.3,64.8,0.8,0.0,0.0,96.0,491.000001
4,2019-12-16 00:20:00,6.7,81.0,0.0,32.0,4.7,21.6,64.6,0.8,0.0,0.0,95.4,514.999999


In [4]:
# 2) Data 변수 설정 및 조정

original_temp = green_data["CO2air"].values
dates = green_data["time"]    # 그래프를 위한 time 정보 지정

cols = list(green_data)[1:13] # time column 제외
green_data = green_data[cols].astype(float)
green_data

Unnamed: 0,T_out,RH_out,I_glob,Winddir,Windsp,Tair,Rhair,VentLee,Ventwind,AssimLight,EnScr,CO2air
0,6.9,81.0,0.0,32.0,4.7,21.3,60.7,0.8,0.0,0.0,96.0,468.000000
1,6.9,81.0,0.0,32.0,4.7,21.5,62.4,0.8,0.0,0.0,96.0,508.999999
2,6.9,80.0,0.0,32.0,4.7,22.0,61.4,0.8,0.0,0.0,96.0,498.000001
3,6.9,80.0,0.0,32.0,4.7,21.3,64.8,0.8,0.0,0.0,96.0,491.000001
4,6.7,81.0,0.0,32.0,4.7,21.6,64.6,0.8,0.0,0.0,95.4,514.999999
...,...,...,...,...,...,...,...,...,...,...,...,...
47804,15.1,71.0,0.0,2.0,4.3,18.3,77.8,8.8,5.7,0.0,0.0,493.000000
47805,15.0,71.0,0.0,2.0,4.3,18.3,78.5,8.3,4.8,0.0,0.0,493.000000
47806,15.0,71.0,0.0,2.0,3.8,18.3,79.1,8.3,4.8,0.0,0.0,485.000000
47807,15.0,71.0,0.0,2.0,3.8,18.0,79.9,10.6,5.7,0.0,0.0,502.000000


In [5]:
#3) Train, test data 분리
'''
* train:test = 0.95:0.05
* train: 2019-12-16 ~ 2020-05-21
* test: 2020-05-21 16:50:00 ~ 2020-05-30 00:00:00 (약 9일)
'''

n_train = int(0.95 * green_data.shape[0])

train_data = green_data[:n_train]
train_dates = dates[:n_train]

test_data = green_data[n_train:]
test_dates = dates[n_train:]

In [6]:
#4) 정규화
'''
* StandardScaler 사용 -> 성능이 가장 잘 나왔음
* Data leakage 방지를 위해 train과 test 각각 다른 스케일러 적용
'''

scaler1 = StandardScaler()
scaler2 = StandardScaler()

train = scaler1.fit_transform(train_data)
test = scaler2.fit_transform(test_data)

In [7]:
### [2] LSTM 모델에 넣기 위한 데이터 가공(Window dataset) ###
'''
▪️ 변수
- pred_days: 예측하고자 하는 기간.
- seq_len: 예측을 위해 학습시킬 sequence의 길이.
'''

pred_days = 1
seq_len = 156

trainX = []
trainY = []
testX = []
testY =[]

for i in range(seq_len, n_train - pred_days + 1):
    trainX.append(train[i - seq_len:i, 0:-1])
    trainY.append(train[i + pred_days - 1:i + pred_days,-1])

for i in range(seq_len, len(test) - pred_days + 1):
    testX.append(test[i - seq_len:i, 0:-1])
    testY.append(test[i + pred_days - 1:i + pred_days,-1])

trainX, trainY = np.array(trainX), np.array(trainY)
testX, testY = np.array(testX), np.array(testY)

print(trainX.shape, trainY.shape, testX.shape, testY.shape)

(45262, 156, 11) (45262, 1) (2235, 156, 11) (2235, 1)


In [8]:
### [3] 모델 구성 ###

# model 정의
model = Sequential([
    LSTM(11, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True, kernel_initializer=initializer),
    LSTM(11, return_sequences=False, kernel_initializer=initializer),
    Dense(trainY.shape[1], kernel_initializer=initializer),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 156, 11)           1012      
                                                                 
 lstm_1 (LSTM)               (None, 11)                1012      
                                                                 
 dense (Dense)               (None, 1)                 12        
                                                                 
Total params: 2036 (7.95 KB)
Trainable params: 2036 (7.95 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# Compile & train(학습)

optimizer = Adam()
model.compile(optimizer=optimizer, loss='mse', metrics=["mse"])

from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(patience=20)
model.fit(trainX, trainY, validation_split=0.1, verbose=1, batch_size=256, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<keras.src.callbacks.History at 0x1cb591935e0>

In [10]:
### [4] 모델 성능 예측 ###

model.evaluate(testX, testY)

prediction = model.predict(testX)
print(prediction.shape, testY.shape)

 1/70 [..............................] - ETA: 9s - loss: 0.2703 - mse: 0.2703

(2235, 1) (2235, 1)


In [11]:
### [5] Scale값 돌려놓기 ###
'''
▪️ link : https://pasus.tistory.com/266

1. generate array filled with means for prediction
2. substitute predictions into the first column
3. inverse transform
'''

# y_pred
mean_values_pred = np.repeat(scaler2.mean_[np.newaxis, :], prediction.shape[0], axis=0)
mean_values_pred[:, 0] = np.squeeze(prediction)
y_pred = scaler2.inverse_transform(mean_values_pred)[:,0]
print(y_pred.shape)

# testY_original

mean_values_testY = np.repeat(scaler2.mean_[np.newaxis, :], testY.shape[0], axis=0)
mean_values_testY[:, 0] = np.squeeze(testY)
testY_original = scaler2.inverse_transform(mean_values_testY)[:,0]
print(testY_original.shape)

(2235,)
(2235,)


In [12]:
### [6] 평가지표 ###

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mse = mean_squared_error(testY, prediction)
mae = mean_absolute_error(testY, prediction)
rmse = np.sqrt(mean_squared_error(testY, prediction))
r2 = r2_score(testY, prediction)

print('CO2 MSE: {:.4f}'.format(mse)) 
print('CO2 MAE: {:.4f}'.format(mae)) 
print('CO2 RMSE: {:.4f}'.format(rmse)) 
print('CO2 R2: {:.4f}'.format(r2))

CO2 MSE: 0.9299
CO2 MAE: 0.7747
CO2 RMSE: 0.9643
CO2 R2: 0.0425


In [20]:
### [8] 시각화 ###

import plotly.graph_objects as go

# Create a DataFrame object with the data to plot
df = pd.DataFrame({'test': testY, 'predict': prediction})

# Create a Figure object and add a trace for each column in the DataFrame
fig = go.Figure()
for col in df.columns:
    fig.add_trace(go.Scatter(x=test_dates[seq_len:].values, y=df[col].values, mode='lines', name=col))

# Customize the layout of the plot
fig.update_layout(title='CO2: Actual and Predicted', xaxis_title='Date', yaxis_title='CO2 (30min)')

# Display the plot
fig.show()