# LTSM for foot traffic
## setup imports


In [16]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import pandas_datareader as web

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout, LSTM

import datetime as dt

import plotly
import chart_studio.plotly as py
import plotly.tools as tls
import plotly.express as px
import plotly.graph_objects as go


# Utils

In [17]:
def list_to_np_array(tmp_list, label=""):
    result = np.array(tmp_list)
    result = np.reshape(result, (result.shape[0], result.shape[1], 1))
    print(f"{label}: row: {result.shape[0]} col: {result.shape[1]}")
    return result

### constants

In [18]:
#  call these brands since they are agregated info
company = 'FIVE_GUYS_BURGERS_AND_FRIES'
ROW_AXIS = 0
COL_AXIS = 1

## Load Data


In [19]:
start = dt.datetime(2020, 4, 1)
end = dt.datetime(2020, 12, 31)
# end = dt.datetime.now() - dt.timedelta(days=1)
data = pd.read_csv('/Users/joe.lau/Downloads/bar.csv',index_col = 'Date')


# Creates the datetime object 
data.index = pd.to_datetime(data.index)

## Gets the training data from the start to the first month of January
train_data = data.loc[start:end].copy(deep = True)

# train_data = web.DataReader(company, 'yahoo', start, end)
train_company_data = train_data[company]

# train_data_close = train_data[CLOSE].values.reshape(-1,1)


In [20]:
data

Unnamed: 0_level_0,ARBYS,BURGER_KING,CARLS_JR,CHICK_FIL_A,CHIPOTLE_MEXICAN_GRILL,DAIRY_QUEEN,DUNKIN_DONUTS,FIVE_GUYS_BURGERS_AND_FRIES,HARDEES,IN_N_OUT_BURGER,...,KFC,MCDONALDS,PANERA_BREAD,PEETS_COFFEE_AND_TEA,SONIC,STARBUCKS,SUBWAY,TACO_BELL,WENDYS,WINGSTOP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-01,23787.0,70305.0,10480.0,,70267.0,29642.0,212825.0,28065.0,,2040.0,...,42653.0,219436.0,36544.0,16760.0,18644.0,421716.0,554700.0,52921.0,41854.0,14606.0
2019-02-02,20965.0,63182.0,8878.0,,67573.0,31738.0,175872.0,28037.0,,1987.0,...,38071.0,204060.0,34672.0,12372.0,17223.0,394936.0,497659.0,49214.0,36205.0,13853.0
2019-02-03,15957.0,48316.0,6802.0,,49140.0,21795.0,141970.0,19584.0,,1668.0,...,30743.0,173091.0,24039.0,9328.0,13448.0,314069.0,383732.0,36416.0,28014.0,11463.0
2019-02-04,15898.0,48960.0,7872.0,,51022.0,19296.0,152418.0,17851.0,,1505.0,...,29829.0,165435.0,25106.0,11959.0,12482.0,301276.0,386249.0,36902.0,31007.0,9444.0
2019-02-05,18564.0,58377.0,10308.0,,73828.0,22728.0,185770.0,22315.0,,1593.0,...,40717.0,216642.0,30576.0,14543.0,14832.0,368852.0,472673.0,43790.0,36222.0,11235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-13,13666.0,40076.0,7573.0,42183.0,16030.0,26992.0,145809.0,8776.0,5200.0,1577.0,...,27686.0,128383.0,13364.0,4295.0,25404.0,243757.0,288740.0,47581.0,24766.0,10994.0
2021-01-14,9999.0,31672.0,5507.0,35514.0,11453.0,18457.0,127685.0,6323.0,4797.0,1081.0,...,19383.0,108859.0,10934.0,3011.0,20153.0,173356.0,211754.0,35417.0,19945.0,7885.0
2021-01-15,12533.0,38204.0,9411.0,43293.0,16429.0,26768.0,145873.0,9193.0,5499.0,2175.0,...,27155.0,129200.0,13384.0,4587.0,24826.0,226515.0,263942.0,49468.0,25125.0,11294.0
2021-01-16,11536.0,39361.0,8555.0,47550.0,18173.0,35953.0,143189.0,10880.0,5084.0,2743.0,...,25433.0,136130.0,13921.0,3933.0,24087.0,296270.0,303428.0,48177.0,22960.0,11794.0


## Prepare Data

In [21]:
# scale date to fit between 0 - 1
scalar = MinMaxScaler(feature_range=(0,1))
scaled_train_data = scalar.fit_transform(train_company_data.values.reshape(-1,1))
print(f"scaled_train_data.shape: {scaled_train_data.shape}")

# how many days to look into past
# window of data to use for pridiction
# larger window is less sensative to recent stock prices
# smaller window is more sensitive to recent stock prices
prediction_days = 5

size_data = len(scaled_train_data)

# x_train is a sliding window of 60 days of data
x_train = []

# y_train is the closing day value of stocks on 60th day
y_train = []

for x in range(prediction_days, size_data):
    prev_x = x - prediction_days
    x_training_data = scaled_train_data[prev_x:x, 0]
    # x_training_data shape = (60,)
    x_train.append(x_training_data)

    y_training_data = scaled_train_data[x, 0]
    # y_training_data shape = ()
    y_train.append(y_training_data)

x_train = list_to_np_array(x_train, "x_train")
y_train = np.array(y_train)


scaled_train_data.shape: (275, 1)
x_train: row: 270 col: 5


## Build model

In [22]:
model = Sequential()
# you can tweak this parameter
units = 15
# x_train.shape[1] = 30
model.add(LSTM(units=units, return_sequences= True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))

model.add(LSTM(units=units, return_sequences= True))
model.add(Dropout(0.2))

model.add(LSTM(units=units))
model.add(Dropout(0.2))

model.add(Dense(units=1)) # predict the closing price

model.compile(optimizer="adam", loss="mean_squared_error", metrics=[RootMeanSquaredError(name="rmse")])
model.fit(x_train, y_train, epochs=25, batch_size=32)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x16d34c810>

## Load test data

In [23]:
test_start = dt.datetime(2020,1,1)
test_end = dt.datetime(2021,1,17)

test_data = data.loc[test_start:test_end].copy(deep = True)
# test_data = web.DataReader(company, 'yahoo', test_start, test_end)
y_test = test_data[company].values
print(f"train_data[company].shape: {train_data[company].shape}")
print(f"test_data[company].shape: {test_data[company].shape}")

total_dataset = pd.concat((train_data[company], test_data[company]), axis=ROW_AXIS)
print("total_dataset.shape", total_dataset.shape, type(total_dataset))
start_index = len(total_dataset) - len(test_data) - prediction_days
print(f"start_index: {start_index}")

# model_inputs is a list
model_inputs = total_dataset[start_index:].values
model_inputs_date = total_dataset[start_index:].index
# print("total_dataset[start_index:].values", model_inputs.shape, model_inputs[:-1])

# model_inputs is a column
model_inputs = model_inputs.reshape(-1,1)
# print("model_inputs.reshape(-1,1)", model_inputs.shape, model_inputs[:-1])

# model_input is a column of scaled values
model_inputs = scalar.transform(model_inputs)
# print("scalar.transform(model_inputs)", model_inputs.shape, model_inputs[:-1])

train_data[company].shape: (275,)
test_data[company].shape: (383,)
total_dataset.shape (658,) <class 'pandas.core.series.Series'>
start_index: 270


## Make predictions on test data

In [24]:
x_test = []
x_test_date = []
for x in range(prediction_days, len(model_inputs)):
    x_test.append(model_inputs[x-prediction_days:x, 0])
    x_test_date.append(model_inputs_date[x])

x_test = list_to_np_array(x_test, "x_test")

predicted_prices = model.predict(x_test)
# reverse the scaling
predicted_prices = scalar.inverse_transform(predicted_prices)


x_test: row: 383 col: 5


In [25]:
np.shape(x_test_date)

(383,)

## Plot test predictions

In [26]:
predicted_prices_length = predicted_prices.size
fig = go.Figure()
x_range = np.linspace(0, predicted_prices_length, predicted_prices_length)

fig.add_trace(go.Scatter(name="predicted", x=x_test_date, y=predicted_prices.reshape(predicted_prices_length)))
fig.add_trace(go.Scatter(name="actual", x=x_test_date, y=y_test.reshape(predicted_prices_length)))
fig.update_layout(title=f'{company} (LSTM)', xaxis_title="time", yaxis_title=f"{company} price")
fig.show()

## Predict next day

In [27]:
row_selection = len(model_inputs) + 1 - prediction_days
col_selection = len(model_inputs + 1)
real_data = [model_inputs[row_selection: col_selection, 0]]
real_data = list_to_np_array(real_data, "real_data")

prediction = model.predict(real_data)
prediction = scalar.inverse_transform(prediction)[0][0]

prediction_str = "{:0.2f}".format(prediction)
test_end_str = test_end.strftime("%m-%d-%Y")
print(f"Prediction for {test_end_str}:  visits {prediction_str}")


real_data: row: 1 col: 4
Prediction for 01-17-2021:  visits 5800.38


## RMSE how good is it?

In [28]:
se = np.square(y_test - predicted_prices)
mse = np.mean(se)
rmse = np.sqrt(mse)
rmse_str = "{:0.2f}".format(rmse)
print(f'RMSE: {rmse_str}')

# update rmse calc for looking at the last 100 days 

RMSE: 4972.29


## RMSE: last 100 days

In [29]:
se = np.square(y_test[-100:] - predicted_prices[-100:])
mse = np.mean(se)
rmse = np.sqrt(mse)
rmse_str = "{:0.2f}".format(rmse)
print("rmse: " + rmse_str)

rmse: 1864.36
