In [3]:
# Recurrent Neural Network

# Part 1 - Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the training set
dataset_train = pd.read_csv('Data/Train_data.csv')
# Select the opening price by column name instead of by position, so the
# selection does not silently change if the column order of the CSV changes.
# The double brackets keep the result 2-D (n_rows, 1), which is the layout
# the scaler below expects; '.values' converts it to a NumPy array.
training_set = dataset_train[['Open']].values

# Feature Scaling
# Use Normalization (versus Standardization) for RNNs with Sigmoid Activation Functions
# 'MinMaxScaler' is scikit-learn's min-max normalization transformer
from sklearn.preprocessing import MinMaxScaler
# 'feature_range = (0, 1)' makes sure that training data is scaled to have values between 0 and 1
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)

# Creating a data structure with TIMESTEPS timesteps (look back 60 days) and 1 output
# This tells the RNN what to remember (number of timesteps) when predicting the next stock price
# The wrong number of timesteps can lead to overfitting or bogus results
TIMESTEPS = 60

# Derive the upper bound from the data instead of hard-coding a row count:
# a fixed bound silently ignores every row beyond it when the CSV is longer,
# and raises an IndexError when the CSV is shorter.
n_samples = len(training_set_scaled)
if n_samples <= TIMESTEPS:
    raise ValueError(
        f"Need more than {TIMESTEPS} rows to build one training window, got {n_samples}"
    )

# 'X_train' input: for each day i, the TIMESTEPS previous days' scaled prices
X_train = np.array(
    [training_set_scaled[i - TIMESTEPS:i, 0] for i in range(TIMESTEPS, n_samples)]
)
# 'y_train' output: the scaled price of day i itself (the value to predict)
y_train = training_set_scaled[TIMESTEPS:n_samples, 0].copy()

# Reshaping (add more dimensions)
# This lets you add more indicators that may potentially have correlation with stock prices
# Keras RNNs expect an input shape (batch size, timesteps, input_dim)
# '.shape[0]' is the number of rows (batch size)
# '.shape[1]' is the number of columns (timesteps)
# 'input_dim' is the number of factors that may affect stock prices (1: the opening price)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Show the dataset we're working with
display(dataset_train)

Unnamed: 0,Date,Open,Close,High,Low,Volume
0,1/3/2012 16:00:00,326.19,332.70,334.07,326.18,3677850
1,1/4/2012 16:00:00,332.62,334.14,335.12,330.31,2864365
2,1/5/2012 16:00:00,330.87,329.50,331.98,328.11,3283277
3,1/6/2012 16:00:00,329.57,325.01,330.00,324.89,2694024
4,1/9/2012 16:00:00,323.25,311.23,323.50,310.61,5825720
...,...,...,...,...,...,...
2511,12/23/2021 16:00:00,2941.79,2942.85,2971.45,2939.02,690934
2512,12/27/2021 16:00:00,2949.27,2961.28,2968.53,2945.00,662966
2513,12/28/2021 16:00:00,2967.49,2928.96,2967.49,2918.71,931792
2514,12/29/2021 16:00:00,2928.59,2930.09,2943.68,2910.09,851236


In [4]:
import pandas as pd

# Create the dataframe.
# The path uses a forward slash, which resolves on every platform; the
# backslash form only worked on Windows and '\T' is an invalid escape sequence.
df = pd.read_csv('Data/Train_data.csv')


# Clean the Date column: remove the ' 16:00:00' time-of-day suffix so only
# the calendar date remains. 'regex=False' treats the pattern as literal text.
df['Date'] = df['Date'].str.replace(' 16:00:00', '', regex=False)