In [13]:
import torch
from torch import nn
from torchvision import models, transforms, datasets
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' #checking for gpu

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/capstone_ml

/content/drive/MyDrive/capstone_ml


## Defining Dataset


In [6]:
#defining custom dataset to input to model
class EarningsDataset(data.Dataset):
  def __init__(self, csv_file, root_dir, transform=None):
    self.annotations = pd.read_csv(csv_file)
    self.root_dir = root_dir
    self.transform = transform

  def __len__(self):
    return len(self.annotations)

  def __getitem__(self,index):
    name = str(self.annotations.iloc[index,0])
    y_label = self.annotations.iloc[index,1]
    example = torch.load(self.root_dir+name) #loading saved torch tensor representing file
    time_series = example["time_series"]
    volatility = example["volatility"] if not torch.isnan(example["volatility"]).any() else torch.tensor(1)
    volume = example["volume"] if not torch.isnan(example["volume"]).any() else torch.tensor(1)
    marketcap = example["marketcap"] if not torch.isnan(example["marketcap"]).any() else torch.tensor(1)
    sector = example["sector"]
    industry = example["industry"]

    return (time_series, volatility, volume, marketcap, sector, industry), y_label

##Creating Train Test Split

In [15]:
dataset = EarningsDataset(csv_file="annotations3.csv", root_dir="training_data3/")
total_size = len(dataset)
print(len(dataset))
train_size = int(0.8 * total_size)
test_size = total_size - train_size

train_dataset, test_dataset = data.random_split(dataset, [train_size, test_size])

batch_size = 64
trainloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
testloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)

2189


##RNN Model

In [16]:
class RNNModel(nn.Module):
  def __init__(self):
      super(RNNModel, self).__init__()
      self.lstm = nn.LSTM(input_size=6, hidden_size=20, num_layers=2, batch_first=True)
      self.fc1 = nn.Linear(34, 16)
      self.fc2 = nn.Linear(16, 1)
      self.activation = nn.Tanh()

  def forward(self, time_series, scalar):
      h0 = torch.zeros(2, batch_size, 20).to(device)
      c0 = torch.zeros(2, batch_size, 20).to(device)
      output, (hn, cn) = self.lstm(time_series, (h0, c0))
      embedding = hn[-1]
      combined = torch.cat((embedding, scalar), dim=1)
      combined = self.activation(self.fc1(combined))
      output = self.fc2(combined)
      output = self.activation(output)
      return output

##CNN Model

In [17]:
class CNNModel(nn.Module):
  def __init__(self):
      super(CNNModel, self).__init__()
      self.conv1 = nn.Conv1d(in_channels=6, out_channels=10, kernel_size=2)
      self.flatten = nn.Flatten()
      self.fc1 = nn.Linear(64, 16)
      self.fc2 = nn.Linear(16, 1)
      self.activation = nn.Tanh()

  def forward(self, time_series, scalar):
      embedding = self.conv1(time_series)
      embedding = self.flatten(embedding)
      combined = torch.cat((embedding, scalar), dim=1)
      x = self.fc1(combined)
      x = self.activation(x)
      return x

##Train, Test Loop

In [23]:
model = RNNModel()
model.to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    train_loss = 0
    for inputs, labels in trainloader:
      labels = labels.to(device)
      time_series, volatility, volume, marketcap, sector, industry = inputs
      time_series = time_series.to(device)
      scalar_inputs = torch.cat([volatility.unsqueeze(1).to(device), volume.unsqueeze(1).to(device), marketcap.unsqueeze(1), sector.to(device)], dim=1)

      optimizer.zero_grad()
      outputs = model(time_series.float(), scalar_inputs.float())
      loss = loss_function(outputs, labels.unsqueeze(1).float())
      loss.backward()
      optimizer.step()
      train_loss += loss.item()*batch_size

    test_loss = 0
    with torch.no_grad():
      for inputs, labels in testloader:
        labels = labels.to(device)
        time_series, volatility, volume, marketcap, sector, industry = inputs
        time_series = time_series.to(device)
        scalar_inputs = torch.cat([volatility.unsqueeze(1).to(device), volume.unsqueeze(1).to(device), marketcap.unsqueeze(1), sector.to(device)], dim=1)

        outputs = model(time_series.float(), scalar_inputs.float())
        loss = loss_function(outputs, labels.unsqueeze(1).float())
        test_loss += loss.item()*batch_size

    print(f'Epoch {epoch+1}, Train Loss: {train_loss/len(trainloader.dataset)}, Test Loss:{test_loss/len(testloader.dataset)}')

Epoch 1, Train Loss: 0.16144619064287483, Test Loss:0.1070354420300488
Epoch 2, Train Loss: 0.12772095319136834, Test Loss:0.10095937175837826
Epoch 3, Train Loss: 0.1252231262943392, Test Loss:0.09729819428430844
Epoch 4, Train Loss: 0.12422918769579352, Test Loss:0.09957278917913567
Epoch 5, Train Loss: 0.1224068960824876, Test Loss:0.08923574343119582
