Required files in the working directory:


1.   main.csv
2.   nasdaq_screener.csv



# Import Libraries

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import os
import math
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
import json

# Set Working Directory

In [None]:
# Directory the input CSV (main.csv) is read from by the "Load Data" cell.
# NOTE(review): `<base_path>` looks like a placeholder that must be replaced with a real folder - confirm.
work_dir="/content/drive/MyDrive/<base_path>/Dataset"
# Directory passed to model_run / post_processing, which write the loss plot and the JSON result files into it.
result_dir="/content/drive/MyDrive/<base_path>/Results"

# Load Data

In [None]:
# Read the raw table and discard every column whose name mentions the fiscal-quarter key.
data = pd.read_csv(f"{work_dir}/main.csv")
cols_to_drop = data.columns[data.columns.str.contains('fiscal_quarter_key')]
data = data.drop(columns=cols_to_drop)
grouped = data.groupby('Symbol')

# Row count of every symbol, ordered from the shortest history to the longest.
lengths = [(name, frame.shape[0]) for name, frame in grouped]
lengths_sorted = list(lengths)
lengths_sorted.sort(key=lambda pair: pair[1])

# Number of symbols that have at least 80 rows.
count = len([n_rows for _, n_rows in lengths_sorted if n_rows >= 80])
print("Number of companies with more than 80 samples:", count)
stockno = count

# Keep only the symbols that have at least 80 rows.
filtered_groups = [(name, frame) for name, frame in grouped if frame.shape[0] >= 80]


def truncate_to_80(group):
    """Return the first 80 rows of ``group``."""
    return group.head(80)


# Cut every kept symbol down to exactly 80 rows and stack them in group order.
truncated_groups = [(name, truncate_to_80(frame)) for name, frame in filtered_groups]
filtered_data = pd.concat([frame for _, frame in truncated_groups])
companies = filtered_data['Symbol'].unique()

# Remove the identifier / calendar columns so that only model inputs remain.
for pattern in ('Symbol', 'period_end_date', 'fiscal_quarter_number'):
    cols_to_drop = data.columns[data.columns.str.contains(pattern)]
    filtered_data = filtered_data.drop(columns=cols_to_drop)

# From here on `data` is the plain 2-D array (80 consecutive rows per company).
data = filtered_data.to_numpy()
print(data.shape[1])

Number of companies with more than 80 samples: 1658
109


# Pre Processing

In [None]:
def _normalize_by_train_stats(train_blocks, test_blocks):
  """Z-score each company's rows with the statistics of its own training rows.

  Both inputs have shape (companies, rows, columns). Columns that are constant
  over a company's training rows get a std of 1 and are forced to 0 in both
  splits. Returns ``(train_norm, test_norm, mean, std)`` where ``mean`` and
  ``std`` have shape (companies, columns).
  """
  mean = train_blocks.mean(axis=1, keepdims=True)
  std = train_blocks.std(axis=1, keepdims=True)
  constant = std == 0
  std[constant] = 1  # avoid division by zero
  train_norm = np.where(constant, 0.0, (train_blocks - mean) / std)
  test_norm = np.where(constant, 0.0, (test_blocks - mean) / std)
  return train_norm, test_norm, mean[:, 0, :], std[:, 0, :]


def _sliding_windows(blocks, sequence_length, step):
  """Cut (companies, rows, columns) blocks into windows and next-step targets.

  A window starts at every multiple of ``step`` for which the row right after
  the window still exists. Features are all columns but the last; the target
  is the last column of the row that follows the window. Results are ordered
  company by company, then by window start.
  """
  n_companies, n_rows, n_cols = blocks.shape
  starts = np.arange(0, n_rows - sequence_length, step)
  if starts.size == 0:
    raise ValueError(
        f"sequence_length={sequence_length} leaves no window in a split of {n_rows} rows")
  row_idx = starts[:, None] + np.arange(sequence_length)[None, :]
  features = blocks[:, row_idx, :-1].reshape(
      n_companies * starts.size, sequence_length, n_cols - 1)
  targets = blocks[:, :, -1][:, starts + sequence_length].reshape(-1)
  return features, targets


def pre_process(data,sequence_length,overlap,batch_size,samples_per_company=80,test_samples=10):
  """Turn the stacked per-company table into train/val/test DataLoaders.

  Args:
    data: 2-D numeric array holding ``samples_per_company`` consecutive rows
      per company; the last column is the prediction target.
    sequence_length: number of rows in one input window.
    overlap: rows shared by two consecutive windows (must be smaller than
      ``sequence_length``).
    batch_size: batch size of all three loaders.
    samples_per_company: rows per company in ``data`` (default 80).
    test_samples: trailing rows of each company held out for testing
      (default 10); the leading rows are used for training/validation.

  Returns:
    ``(train_loader, val_loader, test_loader, train_mean_ref, train_std_ref)``.
    The two reference lists hold one per-column vector per company, computed
    on that company's training rows, and are what is needed to undo the
    normalization.

  Raises:
    ValueError: on inconsistent sizes or when a split is too short to yield
      a single window.
  """
  step = sequence_length - overlap
  if sequence_length < 1 or step < 1:
    raise ValueError("need sequence_length >= 1 and overlap < sequence_length")
  if not 0 < test_samples < samples_per_company:
    raise ValueError("test_samples must lie strictly between 0 and samples_per_company")

  # Work in float so normalized values are never truncated by an integer dtype.
  values = np.asarray(data, dtype=np.float64)
  if values.ndim != 2 or values.shape[0] == 0 or values.shape[0] % samples_per_company != 0:
    raise ValueError(
        "data must be a non-empty 2-D array whose row count is a multiple of samples_per_company")

  # The company count is derived from the data instead of a notebook global.
  n_companies = values.shape[0] // samples_per_company
  blocks = values.reshape(n_companies, samples_per_company, values.shape[1])
  split = samples_per_company - test_samples

  # Normalize group-wise; the test rows reuse the training statistics.
  train_norm, test_norm, mean_ref, std_ref = _normalize_by_train_stats(
      blocks[:, :split], blocks[:, split:])
  train_mean_ref = list(mean_ref)
  train_std_ref = list(std_ref)

  features, targets = _sliding_windows(train_norm, sequence_length, step)
  features_test, targets_test = _sliding_windows(test_norm, sequence_length, step)
  print(features.shape, features_test.shape)

  print(targets.shape)
  features = torch.tensor(features, dtype=torch.float32)
  targets = torch.tensor(targets, dtype=torch.float32)

  print(targets_test.shape)
  features_test = torch.tensor(features_test, dtype=torch.float32)
  targets_test = torch.tensor(targets_test, dtype=torch.float32)

  # NOTE(review): the split is random over windows, so with overlap > 0 windows
  # sharing rows can land on both sides of the train/validation boundary.
  X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=42)
  train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
  test_loader = DataLoader(TensorDataset(features_test, targets_test), batch_size=batch_size, shuffle=False)

  return train_loader,val_loader,test_loader,train_mean_ref,train_std_ref

# Models

## MLP

In [None]:
class MLP(nn.Module):
    """Feed-forward regressor applied to a flattened input window.

    The window is flattened to ``input_size * sequence_length`` values and
    passed through linear layers of width 256, 128, 64, 32 and 16 (ReLU after
    each, dropout with p=0.1 after the first four) before the output layer.
    """

    def __init__(self, input_size, output_size, sequence_length):
        super().__init__()
        flat_features = input_size * sequence_length
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(flat_features, 256)
        self.dropout1 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.1)
        self.fc3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(0.1)
        self.fc4 = nn.Linear(64, 32)
        self.dropout4 = nn.Dropout(0.1)
        self.fc5 = nn.Linear(32, 16)
        self.fc6 = nn.Linear(16, output_size)

    def forward(self, x):
        """Return the prediction for a batch of windows, shape (batch, output_size)."""
        hidden = self.flatten(x)
        regularized_stages = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
            (self.fc4, self.dropout4),
        )
        for linear, drop in regularized_stages:
            hidden = drop(torch.relu(linear(hidden)))
        # The last hidden layer has no dropout.
        hidden = torch.relu(self.fc5(hidden))
        return self.fc6(hidden)

## LSTM

In [None]:
class LSTM(nn.Module):
    """Single-layer LSTM whose final hidden state feeds a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, steps, input_size) tensor to (batch, output_size)."""
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (1, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(*state_shape, device=x.device),
            torch.zeros(*state_shape, device=x.device),
        )
        _, (last_hidden, _) = self.lstm(x, initial_state)
        return self.fc(last_hidden[-1])

## LSTM-AM

In [None]:
# Define the model
class LSTMAttentionRegressor(nn.Module):
    """LSTM followed by softmax attention over time steps and a linear head."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, steps, input_size) tensor to (batch, output_size)."""
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (1, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(*state_shape, device=x.device),
            torch.zeros(*state_shape, device=x.device),
        )
        step_outputs, _ = self.lstm(x, initial_state)
        # One score per time step, normalized across the time axis (dim=1).
        scores = self.attention(step_outputs)
        weights = torch.softmax(scores, dim=1)
        context = (weights * step_outputs).sum(dim=1)
        return self.fc(context)

## BiLSTM-AM

In [None]:
class BiLSTM_AM(nn.Module):
    """Two stacked bidirectional LSTMs, attention over time steps, linear head."""

    def __init__(self, input_dim, hidden_dim, output_dim, sequence_length, dropout_prob):
        super().__init__()
        bidirectional_width = hidden_dim * 2
        self.lstm_1 = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True, bias=True)
        self.lstm_2 = nn.LSTM(bidirectional_width, hidden_dim, batch_first=True, bidirectional=True, bias=True)
        self.attention_linear = nn.Linear(bidirectional_width, 1, bias=True)
        self.linear = nn.Linear(bidirectional_width, output_dim, bias=True)
        self.sequence_length = sequence_length
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Map a (batch, steps, input_dim) tensor to (batch, output_dim)."""
        # Unpacking doubles as a check that the input is 3-D.
        _batch, _steps, _features = x.size()

        # Recurrent encoder followed by dropout.
        encoded, _ = self.lstm_1(x)
        encoded, _ = self.lstm_2(encoded)
        encoded = self.dropout(encoded)

        # Attention: softmax over the time axis, then a weighted sum of the steps.
        scores = self.attention_linear(encoded)
        weights = torch.softmax(scores, dim=1)
        attended = (weights * encoded).sum(dim=1)

        return self.linear(attended)

## CNN-BiLSTM-AM

In [None]:
#Model Implementation

class CNN_BiLSTM_AM(nn.Module):
    """Three 1x1 Conv1d stages, two bidirectional LSTMs, attention, linear head."""

    def __init__(self, input_dim, hidden_dim, output_dim, sequence_length, dropout_prob):
        super().__init__()
        bidirectional_width = hidden_dim * 2
        self.conv1d_1 = nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=1, padding=0, bias=True)
        self.conv1d_2 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=1, padding=0, bias=True)
        self.conv1d_3 = nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=1, padding=0, bias=True)
        self.pool = nn.MaxPool1d(kernel_size=1, padding=0)
        self.lstm_1 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True, bias=True)
        self.lstm_2 = nn.LSTM(bidirectional_width, hidden_dim, batch_first=True, bidirectional=True, bias=True)
        self.attention_linear = nn.Linear(bidirectional_width, 1, bias=True)
        self.linear = nn.Linear(bidirectional_width, output_dim, bias=True)
        self.sequence_length = sequence_length
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Map a (batch, steps, input_dim) tensor to (batch, output_dim)."""
        # Unpacking doubles as a check that the input is 3-D.
        _batch, _steps, _features = x.size()

        # Conv1d expects channels first: (batch, input_dim, steps).
        feature_maps = x.transpose(1, 2)
        for conv in (self.conv1d_1, self.conv1d_2, self.conv1d_3):
            feature_maps = self.pool(torch.relu(conv(feature_maps)))
        feature_maps = self.dropout(feature_maps)

        # Back to (batch, steps, channels) for the recurrent layers.
        encoded = feature_maps.transpose(1, 2)
        encoded, _ = self.lstm_1(encoded)
        encoded, _ = self.lstm_2(encoded)
        encoded = self.dropout(encoded)

        # Attention: softmax over the time axis, then a weighted sum of the steps.
        weights = torch.softmax(self.attention_linear(encoded), dim=1)
        attended = (weights * encoded).sum(dim=1)

        return self.linear(attended)

# Model Run

In [None]:
def _evaluate(model, loader, criterion, device):
  """Score ``model`` on ``loader`` without tracking gradients.

  Returns ``(mean_batch_loss, predictions, true_values)``. The two lists hold
  one entry per sample, in loader order, still in normalized units. The loss
  is NaN when the loader yields no batches.
  """
  model.eval()
  total_loss = 0.0
  predictions = []
  true_values = []
  with torch.no_grad():
      for inputs, labels in loader:
          inputs, labels = inputs.to(device), labels.to(device)
          normalized_outputs = model(inputs.float())
          # squeeze(-1) drops only the output dimension, so a batch of size 1
          # keeps the same shape as its label tensor.
          loss = criterion(normalized_outputs.squeeze(-1), labels.float())
          total_loss += loss.item()
          predictions.extend(normalized_outputs.cpu().numpy())
          true_values.extend(labels.cpu().numpy())
  n_batches = len(loader)
  mean_loss = total_loss / n_batches if n_batches else float('nan')
  return mean_loss, predictions, true_values


def model_run(model,train_loader,test_loader,val_loader,model_details,work_dir,num_epochs=100,learning_rate=0.0001):
  """Train ``model`` with Adam/MSE, plot the loss curves and evaluate on the test set.

  Args:
    model: network mapping a batch of windows to one value per sample.
    train_loader, test_loader, val_loader: loaders yielding (inputs, labels).
    model_details: label used in log lines, the plot title and the plot file name.
    work_dir: directory the loss plot ``model_loss_<model_details>.png`` is written to.
    num_epochs: number of training epochs (default 100).
    learning_rate: Adam learning rate (default 0.0001).

  Returns:
    ``(predictions_test_data, true_values_test_data)`` from the final pass over
    ``test_loader``; both are per-sample lists in normalized units.
  """
  criterion = nn.MSELoss()
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  # Make sure the parameters live where the batches are sent (no-op if already there).
  model.to(device)
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  train_losses = []
  val_losses = []

  for epoch in range(num_epochs):
      model.train()
      running_train_loss = 0.0
      progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False, mininterval=1)
      for batch_index, (inputs, labels) in enumerate(progress_bar, start=1):
          inputs, labels = inputs.to(device), labels.to(device)
          optimizer.zero_grad()

          # Forward pass
          outputs = model(inputs.float())
          loss = criterion(outputs.squeeze(-1), labels.float())

          # Backward pass and optimization
          loss.backward()
          optimizer.step()

          running_train_loss += loss.item()
          # Running mean over the batches seen so far in this epoch.
          progress_bar.set_postfix({'loss': running_train_loss / batch_index})

      # Record training loss for this epoch
      n_train_batches = len(train_loader)
      train_losses.append(running_train_loss / n_train_batches if n_train_batches else float('nan'))

      # Validation loss for this epoch (losses are between normalized outputs and normalized labels)
      val_loss, _, _ = _evaluate(model, val_loader, criterion, device)
      val_losses.append(val_loss)

      # Per-epoch test loss, kept for the log only; accumulated separately from the validation loss.
      test_loss, _, _ = _evaluate(model, test_loader, criterion, device)
      print(f"Test Loss for {model_details}:", test_loss)

      # Print average loss for this epoch
      print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}")

  # Plot training and validation losses
  epochs = range(1, len(train_losses) + 1)
  fig, ax = plt.subplots()
  ax.plot(epochs, train_losses, label='Train Loss')
  ax.plot(epochs, val_losses, label='Validation Loss')
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Loss')
  ax.set_title(f'Training and Validation Losses for {model_details}')
  ax.legend()
  # Save before showing: once shown, the figure may already be released and the file would be blank.
  fig.savefig(f'{work_dir}/model_loss_{model_details}.png')
  plt.show()

  # Final evaluation on the test set; these are the values handed back to the caller.
  test_loss, predictions_test_data, true_values_test_data = _evaluate(model, test_loader, criterion, device)
  print(f"Test Loss for {model_details}:", test_loss)

  return predictions_test_data,true_values_test_data

# Post Processing

In [None]:
def post_processing(test_mean_ref,test_std_ref,predictions_test_data,true_values_test_data,sequence_length,model_details,work_dir, companies):
  """De-normalize the test predictions per company, score them and write JSON files.

  Args:
    test_mean_ref, test_std_ref: one per-column vector per company; only the
      last entry (the target column) of each vector is used to undo the
      normalization.
    predictions_test_data: per-sample model outputs; the first element of
      each entry is taken as the prediction.
    true_values_test_data: per-sample normalized targets, same order and
      length as ``predictions_test_data``.
    sequence_length: kept for backward compatibility; the number of windows
      per company is now derived from the input lengths.
    model_details: label used in the output file names.
    work_dir: directory the JSON files are written to.
    companies: company identifiers, in the same order as the reference lists.

  Writes ``mae_``, ``pct_``, ``pred_`` and ``act_<model_details>.json`` into
  ``work_dir``, each keyed by company.

  Raises:
    ValueError: if the input lengths are inconsistent.
  """
  n_companies = len(test_mean_ref)
  if n_companies == 0:
    raise ValueError("test_mean_ref is empty")
  if len(test_std_ref) != n_companies:
    raise ValueError("test_mean_ref and test_std_ref must have the same length")
  if len(companies) < n_companies:
    raise ValueError("companies has fewer entries than there are reference vectors")
  if len(predictions_test_data) != len(true_values_test_data):
    raise ValueError("predictions and true values must have the same length")

  # Samples are stored company by company with an equal number of windows each.
  windows_per_company, leftover = divmod(len(predictions_test_data), n_companies)
  if windows_per_company == 0 or leftover:
    raise ValueError("number of test samples is not a positive multiple of the number of companies")

  pct_value_error = {}
  maerr = {}
  predictions = {}
  actual = {}
  for i in range(n_companies):
    target_mean = float(test_mean_ref[i][-1])
    target_std = float(test_std_ref[i][-1])
    rows = slice(i * windows_per_company, (i + 1) * windows_per_company)

    # Back to original units; plain floats keep the values JSON-serializable.
    pred_tmp = [float(x[0]) * target_std + target_mean for x in predictions_test_data[rows]]
    act_tmp = [float(x) * target_std + target_mean for x in true_values_test_data[rows]]

    company = str(companies[i])
    pct_value_error[company] = float(mean_absolute_percentage_error(act_tmp, pred_tmp))
    maerr[company] = float(mean_absolute_error(act_tmp, pred_tmp))
    predictions[company] = pred_tmp
    actual[company] = act_tmp

  outputs = (('mae', maerr), ('pct', pct_value_error), ('pred', predictions), ('act', actual))
  for prefix, payload in outputs:
    with open(f'{work_dir}/{prefix}_{model_details}.json', 'w') as convert_file:
      json.dump(payload, convert_file)


# Experiment Setup

In [None]:
# Windowing and model hyper-parameters.
sequence_length = 4
overlap = 3
batch_size = 25
input_dim = len(data[0]) - 1  # every column except the last one (the target)
hidden_dim = 32
output_dim = 1
dropout_prob = 0.2

# Fix the seeds so weight initialization, dropout and batch shuffling repeat across runs.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

#initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

MLP_model = MLP(input_dim, output_dim, sequence_length).to(device)
# NOTE(review): the two plain LSTM variants use a hidden size of 2 while the
# other recurrent models use hidden_dim - confirm this is intentional.
LSTM_model = LSTM(input_dim, 2, output_dim).to(device)
LSTM_AT_model = LSTMAttentionRegressor(input_dim, 2, output_dim).to(device)
BiLSTM_AM_model = BiLSTM_AM(input_dim, hidden_dim, output_dim, sequence_length, dropout_prob).to(device)
CNN_BiLSTM_AM_model = CNN_BiLSTM_AM(input_dim, hidden_dim, output_dim, sequence_length, dropout_prob).to(device)

# The keys are used in log lines and in the names of the result files.
models={
        'MLP':MLP_model,
        'LSTM':LSTM_model,
        'LSTM_AT':LSTM_AT_model,
        'BiLSTM_AM':BiLSTM_AM_model,
        'CNN_LSTM_AM':CNN_BiLSTM_AM_model}

# The loaders depend only on the data and the windowing settings, so build them
# once and share them across all models. The returned mean/std references are
# the per-company training statistics used to undo the normalization.
train_loader,val_loader,test_loader,test_mean_ref,test_std_ref = pre_process(data,sequence_length,overlap,batch_size)

# Model implementation
for model_name,model in models.items():
  print("Model:",model_name)
  predictions_test_data,true_values_test_data = model_run(model,train_loader,test_loader,val_loader,model_name,result_dir)
  post_processing(test_mean_ref,test_std_ref,predictions_test_data,true_values_test_data,sequence_length,model_name,result_dir, companies)