<a href="https://colab.research.google.com/github/mjmaher987/Stock-Prediction-Using-ML/blob/main/Codes/Stock-09012023-2346.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stock Price Prediction

## Dependencies

In [1]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, Reshape, Flatten, LSTM
import yfinance as yf
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configs

In [2]:
Configs = {
    "training_percent" : 0.8,
    "epoch_number" : 50,
    "sequence_length" : 10,
    "num_features" : 5
}

## Preprocessing

In [3]:
class ETL:
    """
    ticker: str
    period: string
    test_size: float betwee 0 and 1
    n_input: int
    timestep: int
    Extracts data for stock with ticker `ticker` from yf api,
    splits the data into train and test sets by date,
    reshapes the data into np.array of shape [#weeks, 5, 1],
    converts our problem into supervised learning problem.
    """
    def __init__(self, ticker, test_size=0.2, period='max', n_input=5, timestep=5) -> None:
        self.ticker = ticker
        self.period = period
        self.test_size = test_size
        self.n_input = n_input
        self.df = self.extract_historic_data()
        self.timestep = timestep
        self.train, self.test = self.etl()
        self.X_train, self.y_train = self.to_supervised(self.train)
        self.X_test, self.y_test = self.to_supervised(self.test)

    def get_history_from_yf(self):
        t = yf.Ticker(self.ticker)
        return t.history(period=self.period)

    def extract_historic_data(self) -> pd.Series:
        """
        gets historical data from yf api.
        """
        t = yf.Ticker(self.ticker)
        history = t.history(period=self.period)
        return history.Close

    def split_data(self) -> tuple:
        """
        Splits our pd.Series into train and test series with
        test series representing test_size * 100 % of data.
        """
        data = self.extract_historic_data()
        if len(data) != 0:
            train_idx = round(len(data) * (1-self.test_size))
            train = data[:train_idx]
            test = data[train_idx:]
            train = np.array(train)
            test = np.array(test)
            return train[:, np.newaxis], test[:, np.newaxis]
        else:
            raise Exception('Data set is empty, cannot split.')

    def window_and_reshape(self, data) -> np.array:
        """
        Reformats data into shape our model needs,
        namely, [# samples, timestep, # feautures]
        samples
        """
        NUM_FEATURES = 1
        samples = int(data.shape[0] / self.timestep)
        result = np.array(np.array_split(data, samples))
        return result.reshape((samples, self.timestep, NUM_FEATURES))

    def transform(self, train, test) -> np.array:
        train_remainder = train.shape[0] % self.timestep
        test_remainder = test.shape[0] % self.timestep
        if train_remainder != 0 and test_remainder != 0:
            train = train[train_remainder:]
            test = test[test_remainder:]
        elif train_remainder != 0:
            train = train[train_remainder:]
        elif test_remainder != 0:
            test = test[test_remainder:]
        return self.window_and_reshape(train), self.window_and_reshape(test)

    def etl(self) -> tuple[np.array, np.array]:
        """
        Runs complete ETL
        """
        train, test = self.split_data()
        return self.transform(train, test)

    def to_supervised(self, train, n_out=5) -> tuple:
        """
        Converts our time series prediction problem to a
        supervised learning problem.
        """
        # flatted the data
        data = train.reshape((train.shape[0]*train.shape[1], train.shape[2]))
        X, y = [], []
        in_start = 0
        # step over the entire history one time step at a time
        for _ in range(len(data)):
            # define the end of the input sequence
            in_end = in_start + self.n_input
            out_end = in_end + n_out
            # ensure we have enough data for this instance
            if out_end <= len(data):
                x_input = data[in_start:in_end, 0]
                x_input = x_input.reshape((len(x_input), 1))
                X.append(x_input)
                y.append(data[in_end:out_end, 0])
                # move along one time step
                in_start += 1
        return np.array(X), np.array(y)

In [4]:
def preprocess():
  etl_data = ETL('EURUSD=X')
  history = etl_data.get_history_from_yf()

  data = pd.DataFrame(
      {
          'open': pd.Series([x for x in history.Open]),
          'high': pd.Series([x for x in history.High]),
          'low': pd.Series([x for x in history.Low]),
          'close': pd.Series([x for x in history.Close]),
          'volume': pd.Series([x for x in history.Volume]),
      }
  )

  # Preprocessing
  scaler = MinMaxScaler()
  data['close'] = scaler.fit_transform(data['close'].values.reshape(-1, 1))

  # Splitting into training and testing sets
  train_size = int(Configs['training_percent'] * len(data))
  train_data, test_data = data[:train_size], data[train_size:]
  return train_data, test_data

In [5]:
def create_seqs(train_data, test_data):
  # Define the sequence length
  sequence_length = Configs['sequence_length']
  def create_sequences(data, sequence_length):
      sequences = []
      targets = []
      for i in range(len(data) - sequence_length):
          sequences.append(data[i:i+sequence_length])
          targets.append(data.iloc[i+sequence_length])
      return np.array(sequences), np.array(targets)
  train_sequences, train_targets = create_sequences(train_data, sequence_length)
  test_sequences, test_targets = create_sequences(test_data, sequence_length)
  return train_sequences, train_targets, test_sequences, test_targets

## Experiments

### NBEATS

In [6]:
class NBeatsModel(tf.keras.Model):
    def __init__(self):
        super(NBeatsModel, self).__init__()
        self.flatten = tf.keras.layers.Flatten(input_shape=(Configs['sequence_length'], 5))
        self.dense = tf.keras.layers.Dense(5)

    def call(self, inputs, training):
        x = self.flatten(inputs)
        x = self.dense(x)
        return x

def get_nbeats_model():
  return NBeatsModel()

def run_nbeats(train_sequences, train_targets, test_sequences, test_targets):
  nbeats_model = get_nbeats_model()
  nbeats_model.compile(optimizer='adam', loss='mean_squared_error')
  nbeats_model.fit(train_sequences, train_targets, epochs=Configs['epoch_number'], batch_size=32, validation_split=0.2, verbose=0)

  # Make predictions
  predictions_nbeats = nbeats_model.predict(test_sequences)
  mse = mean_squared_error(test_targets, predictions_nbeats)
  mae = mean_absolute_error(test_targets, predictions_nbeats)
  rmse = mean_squared_error(test_targets, predictions_nbeats, squared=False)
  print('--------------------------------\nNBeats')
  print(f"Mean Squared Error: {mse}")
  print(f"Mean Absolute Error: {mae}")
  print(f"Root Mean Squared Error: {rmse}")
  return predictions_nbeats, mse, mae, rmse

### NHits

In [7]:
def build_nhits(input_shape, output_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=input_shape),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(output_shape)
    ])
    return model

def run_nhits(train_sequences, train_targets, test_sequences, test_targets):
  nhits_model = build_nhits((Configs['sequence_length'], 5), 5)
  nhits_model.compile(optimizer='adam', loss='mean_squared_error')
  nhits_model.fit(train_sequences, train_targets, epochs=Configs['epoch_number'], batch_size=32, validation_split=0.2, verbose=0)


  nhits_predictions = nhits_model.predict(test_sequences)
  mse = mean_squared_error(test_targets, nhits_predictions)
  mae = mean_absolute_error(test_targets, nhits_predictions)
  rmse = mean_squared_error(test_targets, nhits_predictions, squared=False)
  print('--------------------------------\nNHits')
  print(f"Mean Squared Error: {mse}")
  print(f"Mean Absolute Error: {mae}")
  print(f"Root Mean Squared Error: {rmse}")
  return nhits_predictions, mse, mae, rmse

### RNN

In [8]:
def run_rnn(train_sequences, train_targets, test_sequences, test_targets):
  model_rnn = tf.keras.Sequential([
      tf.keras.layers.SimpleRNN(64, input_shape=(Configs['sequence_length'], 5), return_sequences=False),
      tf.keras.layers.Dense(5)
  ])

  model_rnn.compile(optimizer='adam', loss='mean_squared_error')
  model_rnn.fit(train_sequences, train_targets, epochs=Configs['epoch_number'], batch_size=32, validation_split=0.2, verbose=0)


  # Make predictions
  predictions_rnn = model_rnn.predict(test_sequences)
  mse = mean_squared_error(test_targets, predictions_rnn)
  mae = mean_absolute_error(test_targets, predictions_rnn)
  rmse = mean_squared_error(test_targets, predictions_rnn, squared=False)
  print('--------------------------------\nRNN')
  print(f"Mean Squared Error: {mse}")
  print(f"Mean Absolute Error: {mae}")
  print(f"Root Mean Squared Error: {rmse}")
  return predictions_rnn, mse, mae, rmse

### LSTM

In [9]:
def run_lstm(train_sequences, train_targets, test_sequences, test_targets):
  lstm_model = tf.keras.Sequential([
      tf.keras.layers.LSTM(64, input_shape=(Configs['sequence_length'], 5), return_sequences=False),
      tf.keras.layers.Dense(5)
  ])

  lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

  lstm_model.fit(train_sequences, train_targets, epochs=Configs['epoch_number'], batch_size=32, verbose=0)


  lstm_predictions = lstm_model.predict(test_sequences)
  mse = mean_squared_error(test_targets, lstm_predictions)
  mae = mean_absolute_error(test_targets, lstm_predictions)
  rmse = mean_squared_error(test_targets, lstm_predictions, squared=False)
  print('--------------------------------\nLSTM')
  print(f"Mean Squared Error: {mse}")
  print(f"Mean Absolute Error: {mae}")
  print(f"Root Mean Squared Error: {rmse}")
  return lstm_predictions, mse, mae, rmse

### Transformers

In [10]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0, epsilon=1e-6, attention_axes=None, kernel_size=1):
  """
  Creates a single transformer block.
  """
  x = layers.LayerNormalization(epsilon=epsilon)(inputs)
  x = layers.MultiHeadAttention(
      key_dim=head_size, num_heads=num_heads, dropout=dropout,
      attention_axes=attention_axes
      )(x, x)
  x = layers.Dropout(dropout)(x)
  res = x + inputs

    # Feed Forward Part
  x = layers.LayerNormalization(epsilon=epsilon)(res)
  x = layers.Conv1D(filters=ff_dim, kernel_size=kernel_size, activation="relu")(x)
  x = layers.Dropout(dropout)(x)
  x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=kernel_size)(x)
  return x + res

def build_transfromer(head_size, num_heads, ff_dim, num_trans_blocks, mlp_units, dropout=0, mlp_dropout=0, attention_axes=None, epsilon=1e-6, kernel_size=1):
  """
  Creates final model by building many transformer blocks.
  """
  n_timesteps, n_features, n_outputs = Configs['sequence_length'], 5, 5
  inputs = tf.keras.Input(shape=(n_timesteps, n_features))
  x = inputs
  for _ in range(num_trans_blocks):
    x = transformer_encoder(x, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout, attention_axes=attention_axes, kernel_size=kernel_size, epsilon=epsilon)

  x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
  for dim in mlp_units:
    x = layers.Dense(dim, activation="relu")(x)
    x = layers.Dropout(mlp_dropout)(x)

  outputs = layers.Dense(n_outputs)(x)
  return tf.keras.Model(inputs, outputs)



def fit_transformer(transformer: tf.keras.Model, train_sequences, train_targets):
  """
  Compiles and fits our transformer.
  """
  transformer.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["mae", 'mape']
  )

  callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)]
  hist = transformer.fit(train_sequences, train_targets, batch_size=32, epochs=Configs['epoch_number'], verbose=0, callbacks=callbacks)
  return hist

def run_transformer(train_sequences, train_targets, test_sequences, test_targets):
  transformer_model = build_transfromer(head_size=128, num_heads=4, ff_dim=2, num_trans_blocks=4, mlp_units=[256], mlp_dropout=0.10, dropout=0.10, attention_axes=1)
  hist = fit_transformer(transformer_model, train_sequences, train_targets)
  transformer_predictions = transformer_model.predict(test_sequences)
  mse = mean_squared_error(test_targets, transformer_predictions)
  mae = mean_absolute_error(test_targets, transformer_predictions)
  rmse = mean_squared_error(test_targets, transformer_predictions, squared=False)
  print('--------------------------------\nTransformer')
  print(f"Mean Squared Error: {mse}")
  print(f"Mean Absolute Error: {mae}")
  print(f"Root Mean Squared Error: {rmse}")
  return transformer_predictions, mse, mae, rmse


## Visualizations

In [11]:
import os
def visualize_all(test_targets, predictions_nbeats, nhits_predictions, predictions_rnn, lstm_predictions, transformer_predictions, folder):
  test_targets_close = [test_target[3] for test_target in test_targets]
  nbeats_close = [predictions_nbit[3] for predictions_nbit in predictions_nbeats]
  nhits_close = [nhits_prediction[3] for nhits_prediction in nhits_predictions]
  rnn_close = [prediction_rnn[3] for prediction_rnn in predictions_rnn]
  lstm_close = [lstm_prediction[3] for lstm_prediction in lstm_predictions]
  transformer_close = [transformer_prediction[3] for transformer_prediction in transformer_predictions]

  plt.figure(figsize=(24, 18), dpi=400)
  plt.plot(test_targets_close, label='Actual')
  plt.plot(nbeats_close, label='NBeats')
  plt.plot(nhits_close, label='NHits')
  plt.plot(rnn_close, label='RNN')
  plt.plot(lstm_close, label='LSTM')
  plt.plot(transformer_close, label='Transformer')
  plt.legend()

  # Check if the folder exists
  if not os.path.exists(folder):
    os.mkdir(folder)

  plt.savefig(os.path.join(folder, 'plot.png'))


def visualize_one(test_targets_close, predicts_close, folder, name):
  plt.figure(figsize=(8, 6), dpi=80)
  plt.plot(test_targets_close, label='Actual')
  plt.plot(predicts_close, label=name)
  plt.legend()
  plt.savefig(os.path.join(folder, name + '.png'))

In [None]:
def start_simulation(train_data, test_data):
   print('--------------------------')
   print('--------------------------')
   print('--------------------------')
   print('Configs:')
   print('Sequence Length:' + str(Configs['sequence_length']))
   print('Epoch Number: ' + str(Configs['epoch_number']))

   start_time = time.time()


   train_sequences, train_targets, test_sequences, test_targets = create_seqs(train_data, test_data)

   preds_nbeats, mse_nbeats, mae_nbeats, rmse_nbeats = run_nbeats(train_sequences, train_targets, test_sequences, test_targets)
   preds_nhits, mse_nhits, mae_nhits, rmse_nbeats = run_nhits(train_sequences, train_targets, test_sequences, test_targets)
   preds_rnn, mse_rnn, mae_rnn, rmse_rnn = run_rnn(train_sequences, train_targets, test_sequences, test_targets)
   preds_lstm, mse_lstm, mae_lstm, rmse_lstm = run_lstm(train_sequences, train_targets, test_sequences, test_targets)
   preds_transformer, mse_transformer, mae_transformer, rmse_transformer = run_transformer(train_sequences, train_targets, test_sequences, test_targets)

   end_time = time.time()
   simulation_time = end_time - start_time
   print(f"The simulation took {simulation_time} seconds to run.")

   folder = "/content/drive/MyDrive/Experiments/" + str(Configs['epoch_number']) + '_' + str(Configs['sequence_length'])

   visualize_all(test_targets, preds_nbeats, preds_nhits, preds_rnn, preds_lstm, preds_transformer, folder)

   test_targets_close = [test_target[3] for test_target in test_targets]
   nbeats_close = [predictions_nbit[3] for predictions_nbit in preds_nbeats]
   nhits_close = [nhits_prediction[3] for nhits_prediction in preds_nhits]
   rnn_close = [prediction_rnn[3] for prediction_rnn in preds_rnn]
   lstm_close = [lstm_prediction[3] for lstm_prediction in preds_lstm]
   transformer_close = [transformer_prediction[3] for transformer_prediction in preds_transformer]

   visualize_one(test_targets_close, nbeats_close, folder, 'NBeats')
   visualize_one(test_targets_close, nhits_close, folder, 'NHits')
   visualize_one(test_targets_close, rnn_close, folder, 'RNN')
   visualize_one(test_targets_close, lstm_close, folder, 'LSTM')
   visualize_one(test_targets_close, transformer_close, folder, 'Transformer')

folder = "/content/drive/MyDrive/Experiments/"
if not os.path.exists(folder):
  os.mkdir(folder)

train_data, test_data = preprocess()
seqs = [2, 5, 10, 50]
epochs = [10, 50, 100, 200]
for seq in seqs:
  for epoch in epochs:
    Configs['epoch_number'] = epoch
    Configs['sequence_length'] = seq
    start_simulation(train_data, test_data)

--------------------------
--------------------------
--------------------------
Configs:
Sequence Length:2
Epoch Number: 10
--------------------------------
NBeats
Mean Squared Error: 0.009725912731752811
Mean Absolute Error: 0.08241733537309906
Root Mean Squared Error: 0.08654909527163698
--------------------------------
NHits
Mean Squared Error: 0.00012395235586687049
Mean Absolute Error: 0.005331743865119399
Root Mean Squared Error: 0.008326176158181074
--------------------------------
RNN
Mean Squared Error: 0.00018687849560518978
Mean Absolute Error: 0.009750583174286495
Root Mean Squared Error: 0.01269763100166765
--------------------------------
LSTM
Mean Squared Error: 8.83010746499111e-05
Mean Absolute Error: 0.006036458233529071
Root Mean Squared Error: 0.007670908120358463
--------------------------------
Transformer
Mean Squared Error: 0.0007820178411302913
Mean Absolute Error: 0.016171927195087515
Root Mean Squared Error: 0.022712460141350265
The simulation took 63.749182

  plt.figure(figsize=(8, 6), dpi=80)


--------------------------
--------------------------
--------------------------
Configs:
Sequence Length:5
Epoch Number: 10
--------------------------------
NBeats
Mean Squared Error: 0.007650107612028718
Mean Absolute Error: 0.06712035891969731
Root Mean Squared Error: 0.07157857674729001
--------------------------------
NHits
Mean Squared Error: 0.0002017875709521904
Mean Absolute Error: 0.00776775622630015
Root Mean Squared Error: 0.011584879519960956
--------------------------------
RNN
Mean Squared Error: 0.0001683997342380746
Mean Absolute Error: 0.009784722596073525
Root Mean Squared Error: 0.012280191686465727
--------------------------------
LSTM
Mean Squared Error: 0.0001106943710479305
Mean Absolute Error: 0.007414731941169837
Root Mean Squared Error: 0.009682135838410679
--------------------------------
Transformer
Mean Squared Error: 7.643183938662074e-05
Mean Absolute Error: 0.006087411917281084
Root Mean Squared Error: 0.00803850804972345
The simulation took 122.9664299

### NBeats

In [None]:
visualize_one(test_targets_close, nbeats_close)

### NHits

In [None]:
visualize_one(test_targets_close, nhits_close)

### LSTM

In [None]:
visualize_one(test_targets_close, rnn_close)

## RNN

In [None]:
visualize_one(test_targets_close, lstm_close)

### Transformer

In [None]:
visualize_one(test_targets_close, transformer_close)

Save and serialize models for the bot

In [None]:
def save_models():
  # save models for bot usage
  nbeats_model.save('./model_dumps/nbeats.tf')
  nhits_model.save('./model_dumps/nhits.keras')
  model_rnn.save('./model_dumps/rnn.keras')
  lstm_model.save('./model_dumps/lstm.keras')
  transformer_model.save('./model_dumps/transformer.keras')