In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as  tf
%matplotlib inline

In [None]:
import os

# Restrict CUDA to the first GPU. The variable is only honoured if it is set
# before the GPU runtime is initialised, so it must come before the device
# queries below (setting it after them, as before, had no effect).
# NOTE(review): the safest place is before `import tensorflow` — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Name of a GPU device if one is available, otherwise an empty string.
print(tf.test.gpu_device_name())
# List of physical GPU devices visible to the host runtime.
print(tf.config.list_physical_devices("GPU"))

In [None]:
# Read the raw records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="./PRSA_data_2010.1.1-2014.12.31.csv")

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Display descriptive statistics (count, mean, std, quantiles) per numeric column.
data.describe()

In [None]:
# Display the rows whose 'pm2.5' value is missing.
data.loc[data['pm2.5'].isna()]

In [None]:
# Discard the first 24 rows and keep an independent copy, so that the in-place
# edits in later cells never operate on a view of the loaded frame.
# NOTE(review): re-running this cell drops another 24 rows each time.
data = data.iloc[slice(24, None)].copy()

In [None]:
# Fill every missing value with the last valid observation above it.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling and produces the same result.
data = data.ffill()

In [None]:
# Remove the 'No' column from the frame in place.
data.drop(columns=['No'], inplace=True)

In [None]:
import datetime


In [None]:
# Combine the year / month / day / hour columns into a single timestamp column.
# pd.to_datetime assembles the whole column at once from these named fields,
# which avoids one Python-level datetime() call per row and does not depend on
# how the row values happen to be typed.
data['time'] = pd.to_datetime(data[['year', 'month', 'day', 'hour']])

In [None]:
# Use the 'time' column as the row index (modifies the frame in place).
data.set_index(keys='time', inplace=True)

In [None]:
# Remove the separate calendar columns from the frame in place.
data.drop(['year', 'month', 'day', 'hour'], axis='columns', inplace=True)

In [None]:
# 'cbwd' is an object-typed column, so it is one-hot encoded before modelling.
# dtype=float keeps every column numeric: recent pandas versions emit boolean
# dummies by default, which would turn `data.values` into an object array that
# the model cannot consume.
data = data.join(pd.get_dummies(data['cbwd'], dtype=float))


In [None]:
# Remove the original 'cbwd' column from the frame in place.
data.drop(columns='cbwd', inplace=True)


In [None]:
# Line plot of the 'pm2.5' column against the frame's index.
data['pm2.5'].plot.line()

# 生成数据集

时间序列数据考虑的是前一些时刻的数据与当前时刻的输出相关。时间序列的数据要进行形式变换。

如用前5小时的数据预测后1小时的数据，就要按6小时的滑动窗口依次取出数据，其中前5小时为X，后1小时为Y。

如0：5，1：6，2：7...，n-5：n，直到最后一个样本n，每次取6小时数据。

其中0：4共5小时为第一个样本的X，第5小时为第一个样本的Y；1：5共5小时为第二个样本的X，第6小时为第二个样本的Y。依此类推，共n-4个样本。

In [None]:
sequence_length = 5*24  # number of past rows fed to the model; tune based on results
pred = 1                # number of future rows to predict

# Cut the frame into consecutive, overlapping windows of
# `sequence_length + pred` rows (rows 0..W-1, 1..W, 2..W+1, ...). Each window
# is one sample for the LSTM.
# The frame is converted to a single float array first: slicing a NumPy array
# is far cheaper than creating one DataFrame per window, and the explicit dtype
# guarantees a numeric array even if some columns are bool/uint8.
values = data.to_numpy(dtype=np.float32)
window = sequence_length + pred

# The last valid start index is len(values) - window, hence the "+ 1"
# (without it the final window, and with it the final row, is never used).
n_windows = len(values) - window + 1
if n_windows <= 0:
    raise ValueError(
        f"need at least {window} rows to build one window, got {len(values)}"
    )

# Resulting shape: (n_windows, sequence_length + pred, n_features)
data_ = np.stack([values[i:i + window] for i in range(n_windows)])

# 数据集划分

In [None]:
# Shuffle the windows in place along the first axis. A fixed seed makes the
# train/validation/test split reproducible across kernel restarts.
# NOTE(review): consecutive windows overlap in all but one row, so shuffling
# before the split puts near-duplicate windows into train, validation and test.
# Consider splitting chronologically first and shuffling only the training part.
rng = np.random.default_rng(42)
rng.shuffle(data_)

In [None]:
# Model inputs: every row of each window except the final `pred` row(s),
# with all feature columns kept.
x = data_[:, :-pred]

In [None]:
# Targets: feature column 0 of the final `pred` row(s) of each window
# (presumably the pm2.5 reading — confirm the column order).
y = data_[..., -pred:, 0]

In [None]:
# Boundary between training and validation samples: the first 70% of windows.
split_0 = int(0.7 * len(data_))

In [None]:
# Boundary between validation and test samples: the first 90% of windows.
split_1 = int(0.9 * len(data_))

In [None]:
# Cut the inputs at the two boundaries along the sample axis:
# [0, split_0) -> train, [split_0, split_1) -> validation, [split_1, end) -> test.
train_x, val_x, test_x = np.split(x, [split_0, split_1])

In [None]:
# Cut the targets at the same two boundaries so they stay aligned with the inputs.
train_y, val_y, test_y = np.split(y, [split_0, split_1])

# 建模
activation="tanh"是LSTM使用cuDNN加速的必要条件之一；若改用其他激活函数，LSTM仍可运行，只是不会使用cuDNN。

**cuDNN**（CUDA Deep Neural Network library）：是NVIDIA打造的用于深度神经网络的GPU加速库。

In [None]:
# Stacked LSTM regressor: three LSTM layers (32 -> 128 -> 32 units) followed by
# a single-unit Dense output. The first two LSTMs return the full sequence so
# the next LSTM receives one vector per time step; the last returns only its
# final state. The input shape is taken from the training windows.
model = keras.Sequential([
    layers.LSTM(32,
                input_shape=train_x.shape[1:],
                activation="tanh",
                return_sequences=True),
    layers.LSTM(128, activation="tanh", return_sequences=True),
    layers.LSTM(32, activation="tanh"),
    layers.Dense(1),
])

注：数据集按上述方式划分后，本质上用前馈神经网络或CNN建模也都可以，只是效果上有差异。

In [None]:
# Alternative baseline, disabled by wrapping it in a string literal.
# Originally labelled "CNN network", but the layers below are Flatten + Dense
# only, i.e. a fully connected feed-forward network with no convolutions.
# Because the string is the last expression of the cell, the notebook echoes it
# as the cell output.
'''
model_D = keras.Sequential()
model_D.add(layers.Flatten(input_shape=(train_x.shape[1:]))) #拍平
model_D.add(layers.Dense(16, activation='relu'))
model_D.add(layers.Dense(4, activation='relu'))
model_D.add(layers.Dense(1))
model_D.compile(optimizer=keras.optimizers.Adam(), loss='mse')
history = model_D.fit(train_x, train_y,
                    batch_size = 128,
                    epochs=40,
                    validation_data=(val_x, val_y),
                    use_multiprocessing=True
                    )
'''

# 编译   

In [None]:
# Configure training: mean absolute error as the loss, minimised with an Adam
# optimizer left at its default settings.
model.compile(
    loss="mae",
    optimizer=keras.optimizers.Adam(),
)

# 训练

In [None]:
# Train for 200 epochs with mini-batches of 128 samples, so one epoch takes
# ceil(len(train_x) / 128) steps. The validation set is evaluated after every
# epoch, and the per-epoch losses are kept in `history`.
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=128,
    epochs=200,
    validation_data=(val_x, val_y),
)

In [None]:
# Training vs. validation loss per epoch. `val_loss` is computed on the
# validation_data passed to fit(), not on the held-out test set, so it is
# labelled as validation loss here.
plt.plot(history.epoch, history.history['loss'], 'y', label='Training loss')
plt.plot(history.epoch, history.history['val_loss'], 'b', label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
plt.legend()

# 预测

In [None]:
# Run the trained model on the held-out test inputs.
pred_y = model.predict(x=test_x)

# 评价

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so swapping them reports a different score.
# The ground truth for the test split is `test_y` (there is no `y_test`).
print(r2_score(test_y, pred_y))
print(mean_squared_error(test_y, pred_y))