In [16]:
from math import sqrt

import matplotlib.pyplot as pyplot
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [17]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (multivariate) time series as a supervised-learning table.

    Each output row holds the ``n_in`` previous observations followed by the
    current observation and the next ``n_out - 1`` observations.

    Parameters
    ----------
    data : list, 1-D/2-D array-like or DataFrame
        Observations ordered in time, one row per time step.
    n_in : int, default 1
        Number of lagged steps (t-n_in ... t-1) to use as inputs.
    n_out : int, default 1
        Number of steps (t ... t+n_out-1) to use as outputs.
    dropnan : bool, default True
        Drop the rows whose lag/lead values fall outside the series.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count the variables from the frame itself: this is correct for flat
    # lists, lists of rows, 1-D arrays and 2-D arrays alike, whereas reading
    # data.shape[1] fails for 1-D arrays and a list of rows is not 1 variable.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values (introduced by the shifts at both ends)
    if dropnan:
        agg = agg.dropna()
    return agg

In [18]:
# Read the CSV (its first column becomes the index) and order the rows by date.
dataset = pd.read_csv("data/dataset_2.csv", index_col=0)
dataset = dataset.sort_values(by="date")

## run that back turbo

In [24]:
# Group once per date, then derive both aggregates from the same grouping.
rows_by_date = dataset.groupby("date")
mean = rows_by_date.agg("mean")
sum_data = rows_by_date.agg("sum")

In [25]:
# Preview the first rows of the per-date means.
mean.head()

Unnamed: 0_level_0,cases,deaths,county,state,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,mobility_from_baseline
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.006426,0.000494,652.321816,25.34156,6.462212,2.558644,8.211147,2.472698,0.495147,-0.429389,146.52192
1,0.006442,0.000496,652.91143,25.361702,8.219363,2.613845,8.589086,2.105062,0.536272,-0.50398,115.856601
2,0.00641,0.000986,653.007878,25.343673,4.839157,-0.045305,13.399946,1.129206,-17.462207,3.260798,121.993245
3,0.006907,0.000987,653.07734,25.335468,0.047434,-1.27959,2.457412,1.153269,-0.16494,0.612342,110.104315
4,0.006903,0.000986,653.007878,25.343673,2.466831,0.702194,3.939562,1.342641,0.792211,0.134514,110.882506


In [26]:
# Model table: the first two columns of the per-date sums (the targets)
# joined with the per-date means from column position 3 onward (the features).
time_series_data = sum_data.iloc[:, :2].join(mean.iloc[:, 3:])
# Number of feature columns, i.e. everything except the two target columns.
num_feat = len(time_series_data.columns) - 2
values = time_series_data.values.astype('float32')
# Use the explicitly imported class: `sklearn.preprocessing` is only reachable
# as an attribute of `sklearn` when some other import happened to load it.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later held out for testing — confirm this leakage is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(values)
scaled_data[:5]

array([[0.00000000e+00, 0.00000000e+00, 4.24591064e-01, 8.38449121e-01,
        6.36066318e-01, 3.93262863e-01, 8.39181244e-01, 9.56668735e-01,
        5.94731793e-02, 7.50361919e-01],
       [0.00000000e+00, 0.00000000e+00, 6.04263306e-01, 8.60021293e-01,
        6.36898041e-01, 4.00935650e-01, 8.28178585e-01, 9.57248747e-01,
        5.53382896e-02, 5.19006729e-01],
       [0.00000000e+00, 1.00539901e-05, 4.43435669e-01, 8.18523169e-01,
        5.96830308e-01, 4.98604238e-01, 7.98973083e-01, 7.03426600e-01,
        2.64037013e-01, 5.65304875e-01],
       [5.97297003e-07, 1.00539901e-05, 3.70239258e-01, 7.59696066e-01,
        5.78232229e-01, 2.76452303e-01, 7.99693286e-01, 9.47359920e-01,
        1.17221095e-01, 4.75608617e-01],
       [5.97297003e-07, 1.00539901e-05, 4.43435669e-01, 7.89398551e-01,
        6.08093560e-01, 3.06542456e-01, 8.05360794e-01, 9.60858107e-01,
        9.07328799e-02, 4.81479675e-01]], dtype=float32)

In [27]:
# Check the scaled array's dimensions (rows, columns).
scaled_data.shape

(105, 10)

In [28]:

# Frame the scaled series for supervised learning: 14 lagged steps as inputs
# and the current step as the output; then preview the first five rows.
reframed = series_to_supervised(data=scaled_data, n_in=14, n_out=1)
reframed.head(5)

Unnamed: 0,var1(t-14),var2(t-14),var3(t-14),var4(t-14),var5(t-14),var6(t-14),var7(t-14),var8(t-14),var9(t-14),var10(t-14),...,var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t),var9(t),var10(t)
14,0.0,0.0,0.424591,0.838449,0.636066,0.393263,0.839181,0.956669,0.059473,0.750362,...,1.3e-05,2e-05,0.50824,0.940196,0.723849,0.452588,0.926964,0.993303,0.0,0.78295
15,0.0,0.0,0.604263,0.860021,0.636898,0.400936,0.828179,0.957249,0.055338,0.519007,...,1.6e-05,2e-05,1.0,1.0,0.774084,0.526353,0.966422,1.0,0.013352,0.525368
16,0.0,1e-05,0.443436,0.818523,0.59683,0.498604,0.798973,0.703427,0.264037,0.565305,...,2.9e-05,7e-05,0.443436,0.889694,0.682608,0.34312,0.817749,0.998923,0.061615,0.486291
17,5.97297e-07,1e-05,0.370239,0.759696,0.578232,0.276452,0.799693,0.94736,0.117221,0.475609,...,4.1e-05,8e-05,0.370239,0.95593,0.762172,0.429333,0.841521,0.974291,0.049124,0.510139
18,5.97297e-07,1e-05,0.443436,0.789399,0.608094,0.306542,0.805361,0.960858,0.090733,0.48148,...,6.2e-05,0.000121,0.370239,0.898504,0.700523,0.411969,0.845387,0.97725,0.058258,0.510334


In [29]:
# Check the supervised frame's dimensions after the NaN rows were dropped.
reframed.shape

(91, 150)

In [30]:
# Keep every lagged input plus the two targets at time t (var1, var2) and
# discard the time-t values of the remaining num_feat variables.
# The columns are dropped by name instead of "the last num_feat columns" so
# that re-running this cell is a no-op rather than silently removing further
# columns (including the targets).
feature_cols_at_t = ['var%d(t)' % (j + 1) for j in range(2, num_feat + 2)]
reframed = reframed.drop(columns=feature_cols_at_t, errors='ignore')
values = reframed.values

In [31]:
# Chronological split: the first `train_size` rows train the model, the
# remaining rows are held out for testing.
train_size = 80
train = values[:train_size]
test = values[train_size:]

# The last two columns are the targets; everything before them is input.
train_X, train_y = train[:, :-2], train[:, -2:]
test_X, test_y = test[:, :-2], test[:, -2:]

# Insert a length-1 middle axis so the inputs are 3-D: (rows, 1, columns).
train_X = np.expand_dims(train_X, axis=1)
test_X = np.expand_dims(test_X, axis=1)
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(80, 1, 140) (80, 2) (11, 1, 140) (11, 2)


In [None]:
# One 50-unit LSTM layer followed by a 2-unit dense output layer.
n_steps, n_inputs = train_X.shape[1], train_X.shape[2]
model = Sequential([
    LSTM(50, input_shape=(n_steps, n_inputs)),
    Dense(2),
])
model.compile(loss='mae', optimizer='adam')

# fit network (rows are fed in order; the held-out rows act as validation data)
history = model.fit(
    train_X,
    train_y,
    epochs=100,
    batch_size=1,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)

# plot history
losses = history.history
pyplot.plot(losses['loss'], label='train')
pyplot.plot(losses['val_loss'], label='test')
pyplot.legend()
pyplot.show()

Train on 80 samples, validate on 11 samples
Epoch 1/100
 - 12s - loss: 0.0368 - val_loss: 0.0509
Epoch 2/100
 - 1s - loss: 0.0876 - val_loss: 0.0454
Epoch 3/100
 - 1s - loss: 0.0657 - val_loss: 0.0291
Epoch 4/100
 - 1s - loss: 0.0528 - val_loss: 0.0700
Epoch 5/100
 - 1s - loss: 0.0672 - val_loss: 0.0667
Epoch 6/100
 - 1s - loss: 0.0285 - val_loss: 0.0699
Epoch 7/100
 - 1s - loss: 0.0226 - val_loss: 0.0555
Epoch 8/100
 - 2s - loss: 0.0430 - val_loss: 0.0327
Epoch 9/100
 - 1s - loss: 0.0304 - val_loss: 0.0316
Epoch 10/100
 - 1s - loss: 0.0374 - val_loss: 0.0462
Epoch 11/100
 - 1s - loss: 0.0511 - val_loss: 0.0400
Epoch 12/100
 - 1s - loss: 0.0272 - val_loss: 0.0480
Epoch 13/100
 - 1s - loss: 0.0267 - val_loss: 0.0207
Epoch 14/100
 - 1s - loss: 0.0277 - val_loss: 0.0434
Epoch 15/100
 - 1s - loss: 0.0323 - val_loss: 0.0475
Epoch 16/100
 - 1s - loss: 0.0252 - val_loss: 0.0390
Epoch 17/100
 - 1s - loss: 0.0269 - val_loss: 0.0539
Epoch 18/100
 - 1s - loss: 0.0263 - val_loss: 0.0298
Epoch 19/1

In [None]:
yhat = model.predict(test_X)
# Flatten into a separate variable so test_X keeps its 3-D shape; rebinding
# test_X itself made this cell fail when it was run a second time.
test_X_flat = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# The scaler was fitted on 2 target columns + num_feat feature columns, so the
# two-column arrays are padded to that width before inverting. Only the first
# two columns of the inverted result are kept.
padding = test_X_flat[:, 2:(num_feat + 2)]
# invert scaling for forecast
inv_yhat = np.concatenate((yhat, padding), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, :2]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 2))
inv_y = np.concatenate((test_y, padding), axis=1)
full = scaler.inverse_transform(inv_y)
inv_y = full[:, :2]
# calculate RMSE (averaged over both target columns)
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
# Wrap the inverted actual values in a DataFrame (default integer index/columns).
inv_y = pd.DataFrame(inv_y)

In [None]:
# Wrap the inverted predictions in a DataFrame (default integer index/columns).
inv_yhat = pd.DataFrame(inv_yhat)

In [None]:
# Dates of the held-out rows. Framing the series dropped its first rows (their
# lags are undefined), so the test rows are the LAST len(inv_y) rows of
# time_series_data — not the rows from position 80 onward, which would give
# more dates than predictions and pair each prediction with the wrong date.
n_test = len(inv_y)
dates = pd.Series(time_series_data.index[len(time_series_data) - n_test:])
dates

In [None]:
# Place dates, actual values and predictions side by side (aligned on the
# shared 0..n-1 index). The former empty-DataFrame initialisation was dead
# code: it was overwritten immediately by this assignment.
results = pd.concat([dates, inv_y, inv_yhat], axis=1)

In [None]:
# Name the columns in concat order: date, the two actual columns, then the
# two predicted columns.
results.columns = ["date", "cases", "deaths", "cases_hat", "deaths_hat"]

In [None]:
# Actual vs. predicted cases over the test dates.
results.plot(x="date", y=["cases", "cases_hat"])

In [None]:
# Actual vs. predicted deaths over the test dates.
results.plot(x="date", y=["deaths", "deaths_hat"])