## Imported Libraries

In [47]:
# Linear Algebra
import numpy as np

# Data Processing
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# For LSTM Architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [2]:
# Load the data
dataset = pd.read_csv('fyp.csv')

## Data Exploration/Analysis

In [3]:
dataset.head()

Unnamed: 0,id,date,open,low,high,close,volume
0,1,883905300,170.09,170.09,170.21001,170.21001,12985.0
1,2,883991700,162.64999,162.64999,170.23,170.23,4825.0
2,3,884078100,162.87,162.87,170.39,170.39,2170.0
3,4,884164500,165.47,165.47,170.32001,170.32001,13375.0
4,5,884250900,180.46001,170.69,180.46001,170.69,9420.0


In [4]:
dataset.tail()

Unnamed: 0,id,date,open,low,high,close,volume
4930,4931,1584440036,1316.40002,1266.43994,1321.5,1272.06995,1465185000.0
4931,4932,1584526445,1272.07,1233.97,1287.38,1255.8,1184417000.0
4932,4933,1584612845,1255.8,1222.56,1279.32,1269.31,1903142000.0
4933,4934,1584872046,1269.31,1246.2,1275.42,1251.46,194643000.0
4934,4935,1589278500,1250.85559,1226.16,1251.4,1226.159,99570200.0


In [5]:
dataset.shape

(4935, 7)

In [6]:
# Let's check missing data
dataset.describe()

Unnamed: 0,id,date,open,low,high,close,volume
count,4935.0,4935.0,4935.0,4935.0,4935.0,4935.0,4935.0
mean,2468.0,1229119000.0,625.601454,621.878734,629.585177,625.739177,92585070.0
std,1424.756119,203191100.0,446.232494,443.86234,448.741565,446.160783,288900400.0
min,1.0,883905300.0,146.92999,146.92999,156.42999,156.42999,5.0
25%,1234.5,1054070000.0,275.645005,274.625,279.84999,277.834995,10568.0
50%,2468.0,1220347000.0,433.92999,430.76999,435.48999,432.39001,22804.0
75%,3701.5,1398374000.0,949.984985,947.725005,951.755005,949.32501,2387848.0
max,4935.0,1589278000.0,1881.44995,1866.28003,1888.35999,1881.44995,4847437000.0


In [7]:
dataset.count(0)

id        4935
date      4935
open      4935
low       4935
high      4935
close     4935
volume    4935
dtype: int64

## Pre-processing Data

###### Since there no missing data(null) so data cleaning is not necessary

In [8]:
train_num = int(len(dataset)*0.8)
training_data = dataset.iloc[:train_num,5:6].values
testing_data = dataset.iloc[train_num:,5:6].values

In [51]:
# Normalization
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler(feature_range=(0,1))
scaled_training_data = min_max_scaler.fit_transform(training_data)
scaled_testing_data = min_max_scaler.fit_transform(testing_data)

In [10]:
print('Scaled Data: \n')
print(scaled_training_data)
print(type(scaled_training_data), scaled_training_data.shape)

Scaled Data: 

[[0.01292709]
 [0.01294584]
 [0.01309594]
 ...
 [0.99053446]
 [0.99738266]
 [1.        ]]
<class 'numpy.ndarray'> (3948, 1)


In [41]:
def create_dataset(dataset, time_step=1):
    x_data, y_data = [], []
    for i in range(time_step, len(dataset)):
        x_data.append(dataset[i-time_step:i,0])
        y_data.append(dataset[i,0])
    return np.array(x_data), np.array(y_data)

In [52]:
time_step = 10
X_train, y_train = create_dataset(scaled_training_data, time_step)
X_test, y_test = create_dataset(scaled_testing_data, time_step)

###### Convert into 3D (samples, time step, features) which is required for LSTM architecture.

In [53]:
X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))
X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))

## Model Creation

In [54]:
# Stacked LSTM model architecture
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1],1)))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))

In [55]:
# Optimizer
model.compile(loss='mean_squared_error', optimizer='adam')

In [56]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 10, 50)            10400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 10, 50)            20200     
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 50,851
Trainable params: 50,851
Non-trainable params: 0
_________________________________________________________________


In [57]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x13a06a490>

In [58]:
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

In [59]:
train_predict = min_max_scaler.inverse_transform(train_predict)
test_predict = min_max_scaler.inverse_transform(test_predict)

In [60]:
# Calculate RMSE performance merices
import math
from sklearn.metrics import mean_squared_error
math.sqrt(mean_squared_error(y_train, train_predict))

1318.9560843542995

In [61]:
# Test Data RMSE
math.sqrt(mean_squared_error(y_test, test_predict))

1393.0167145100284

In [70]:
df = dataset.reset_index()['close']
scaled_df = min_max_scaler.fit_transform(np.array(df).reshape(-1,1))

In [80]:
# look_back = time_step
# trainPredictPlot = np.empty_like(scaled_df)
# trainPredictPlot[:, :] = np.nan
# trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
# # shift test predictions for plotting
# testPredictPlot = np.empty_like(scaled_df)
# testPredictPlot[:, :] = np.nan
# testPredictPlot[len(train_predict)+(look_back)+1:len(scaled_df)-1, :] = test_predict
# plot baseline and predictions
# plt.plot(min_max_scaler.inverse_transform(scaled_df))
# plt.plot(trainPredictPlot)
# plt.plot(testPredictPlot)
# plt.show()

ValueError: could not broadcast input array from shape (977,1) into shape (985,1)