In [30]:
from pymongo import MongoClient
import json
import pandas as pd
import numpy as np
import requests
import argparse
import pickle
import yfinance as yf

from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from math import *
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import ModelCheckpoint
from keras.losses import MeanSquaredError
from keras.metrics import RootMeanSquaredError
from keras.optimizers import Adam

In [31]:
def preprocess(symbol, start, end, lookback):

    try:
        data = yf.download(symbol, start=start, end=end, progress=False)
        data['date'] = pd.to_datetime(data.index)
        df = data[['date', 'Close']]

        scaler = MinMaxScaler(feature_range=(0,1))
        dataset = df.filter(['Close']).values
        dataset = scaler.fit_transform(dataset)

        dataX, dataY = [], []
        for i in range(len(dataset) - lookback):
            row = [a for a in dataset[i:i+lookback]]
            dataX.append(row)
            dataY.append(dataset[i + lookback][0])
        X, y = np.array(dataX), np.array(dataY)

        # Train-test split
        split_point = int(len(dataset)*0.9)
        X_train, y_train = X[:split_point], y[:split_point]
        X_val, y_val = X[split_point:len(X)], y[split_point:len(y)]

    
    except Exception as e:
        print('ERROR: Loading data failed - ', e)

    return X_train, y_train, X_val, y_val

In [32]:
X_train, y_train, X_val, y_val = preprocess('AAPL', start='2015-01-01', end='2023-02-01', lookback=60)

In [33]:
X_train

[[array([0.0297789]),
  array([0.02494904]),
  array([0.02496473]),
  array([0.02730125]),
  array([0.03379333]),
  array([0.03398151]),
  array([0.02965345]),
  array([0.03117454]),
  array([0.03051593]),
  array([0.02584288]),
  array([0.02454132]),
  array([0.02882234]),
  array([0.03012389]),
  array([0.03459308]),
  array([0.0355026]),
  array([0.03569077]),
  array([0.02948095]),
  array([0.03915635]),
  array([0.04478596]),
  array([0.04205741]),
  array([0.04436256]),
  array([0.04439393]),
  array([0.04582092]),
  array([0.04641682]),
  array([0.044833]),
  array([0.04607183]),
  array([0.04967853]),
  array([0.0541634]),
  array([0.05664106]),
  array([0.05761331]),
  array([0.05878941]),
  array([0.06018505]),
  array([0.05976165]),
  array([0.06140819]),
  array([0.06689667]),
  array([0.06559511]),
  array([0.06029481]),
  array([0.06285087]),
  array([0.05977734]),
  array([0.06076525]),
  array([0.06118866]),
  array([0.05990277]),
  array([0.05656266]),
  array([0.05686

In [None]:
def read_mongo(_symbol):

    '''
    '''
    try:
        client = MongoClient(MONGO_URI)
        db = client.stock
        collection = db[_symbol]

        list_cursor = list(collection.find())

        to_df = pd.DataFrame(list_cursor)
        df = list_to_dataframe(to_df)
    
        print('Connect MongoDB successfully!')

    except Exception as e:
        print('ERROR: ', e)
        return

    return df

In [None]:
def input_dataset(dataset, lookback):

    dataX, dataY = [], []

    for i in range(len(dataset) - lookback):
        row = [a for a in dataset[i:i+lookback]]
        dataX.append(row)
        dataY.append(dataset[i + lookback][0])

    return np.array(dataX), np.array(dataY)

In [None]:
def array_to_input(array, lookback):

    dataX = []

    for i in range(len(array) - lookback + 1):
        row = [[a] for a in array[i:i+lookback]]
        dataX.append(row)

    return np.array(dataX)

In [21]:
# Create a new dataframe with only the 'Close column 
df = read_mongo(_symbol=symbol)
data = df.filter(['close'])
# Convert the dataframe to a numpy array
dataset = data.values
print(len(dataset))
print(50*'~')
# Get the number of rows to train the model on
split_point = int(len(dataset)*0.9)

# Min Max Scale
scaler = MinMaxScaler(feature_range=(0,1))
scaled = scaler.fit_transform(dataset)

# Input dataset
X, y = input_dataset(scaled, lookback)

print('Data train shape: {}'.format(X.shape))
print('Data test shape: {}'.format(y.shape))
print(50*'~')

# Train-test split
X_train, y_train = X[:split_point], y[:split_point]
X_val, y_val = X[split_point:len(X)], y[split_point:len(y)]


## Modeling
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

# Checkpoint
cp = ModelCheckpoint(f'checkpoint/{symbol}/', save_best_only=True)

# Compile the model
model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0001), metrics=[RootMeanSquaredError()])

# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val), callbacks=[cp])

# Save
pickle.dump(model, open(f'model/LTSM_{symbol}.sav', 'wb'))


Connect MongoDB successfully!
2034
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Data train shape: (1974, 60, 1)
Data test shape: (1974,)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 1/5



INFO:tensorflow:Assets written to: training_history/assets


INFO:tensorflow:Assets written to: training_history/assets


Epoch 2/5



INFO:tensorflow:Assets written to: training_history/assets


INFO:tensorflow:Assets written to: training_history/assets


Epoch 3/5
Epoch 4/5



INFO:tensorflow:Assets written to: training_history/assets


INFO:tensorflow:Assets written to: training_history/assets


Epoch 5/5
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......root_mean_squared_error
.........vars
............0
............1
...vars
Keras model archive saving:
File Name                                             Modified             Size
metadata.json                                  2023-02-09 14:11:09           64
config.json                                    2023-02-09 14:11:09         3635
variables.h5                                   2023-02-09 14:11:09       494920


In [29]:
## Result
model = pickle.load(open(f'model/LTSM_{symbol}.sav', 'rb'))
y_predict = model.predict(X_val)

predictY_inverse = scaler.inverse_transform(y_predict.reshape(-1, 1))
valY_inverse = scaler.inverse_transform(y_val.reshape(-1, 1))

result = pd.DataFrame({
    'timestamp': df.datetime[-len(y_val):],
    'true': np.reshape(valY_inverse, len(valY_inverse)),
    'predict': np.reshape(predictY_inverse, len(predictY_inverse)),
})

rmse = sqrt(mean_squared_error(valY_inverse, predictY_inverse))
print('Test RMSE: %.3f' % rmse)
print(50*'~')


# forecast = recursive_forecasting(
#     model = model,
#     scaler = scaler,
#     input = y_val, 
#     list_timestamp = df.datetime[-len(y_val):].values, 
#     step = 1
# )
# print(forecast.tail(10))

Keras model archive loading:
File Name                                             Modified             Size
metadata.json                                  2023-02-09 14:11:08           64
config.json                                    2023-02-09 14:11:08         3635
variables.h5                                   2023-02-09 14:11:08       494920
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......root_mean_squared_error
.........vars
............0
............1
...vars
Test RMSE: 9.296
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [28]:
forecast

Unnamed: 0,timestamp,predict
0,2022-07-06,142.920000
1,2022-07-07,146.350010
2,2022-07-08,147.039990
3,2022-07-11,144.870000
4,2022-07-12,145.860000
...,...,...
140,2023-01-25,141.860000
141,2023-01-26,143.960010
142,2023-01-27,145.929990
143,2023-01-30,143.000000


In [23]:
def recursive_forecasting(model, scaler, input, list_timestamp, step, **kwargs):

    for i in range(step):
        tmp = model.predict(array_to_input(input, lookback))
        input = np.append(input, tmp[-1])
    
    while len(list_timestamp) < len(input):
        list_timestamp = np.append(list_timestamp, list_timestamp[-1] + np.timedelta64(1, 'D'))
    
    output_predict = scaler.inverse_transform(input.reshape(-1, 1))
    
    return pd.DataFrame({
        'timestamp': list_timestamp,
        'predict': np.reshape(output_predict, len(output_predict))
    })