In [1]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
# !unzip /content/drive/MyDrive/stocknet-dataset.zip

In [3]:
# install dependencies
# !pip install wandb pytorch_lightning transformers

#### 1. Dependencies

In [1]:
# utils
import gc
import os
import tqdm
import torch
import json

# data
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# model
import torch.nn as nn
from transformers import AutoModel

# training, logging and evaluation
import torch.nn.functional as F
import torch.optim as optim
import wandb
import pytorch_lightning as pl
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

In [2]:
# Compute device shared by the interactive cells: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#### 2. Datasets
- A generic dataset class to create pytorch dataset
- No **Text Cleaning** is being performed

In [3]:
class StonkDataset(Dataset):
    """Dataset yielding one (stock, day) sample with all of that day's tweets tokenized.

    The CSV index must provide the columns ``Trend`` (label), ``Stock``,
    ``Tweet`` (name of the tweet file) and ``Date``. The tweet file is read
    from ``<root>/<Stock>/<Tweet>`` and contains one JSON object per line
    whose ``text`` field is a list of tokens.

    Args:
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        file: path of the CSV index.
        root: directory containing one sub-directory of tweet files per stock.
        max_len: every tweet is padded / truncated to this many tokens.
    """

    def __init__(self, tokenizer, file, root, max_len=512):

        self.tokenizer = tokenizer
        self.data = pd.read_csv(file) # slow but it's ok since we have only few thousand entries
        self.max_len = max_len
        self.root = root

    def read_file(self, file_name, stock, root):
        """Return the de-duplicated tweets of one tweet file, in file order.

        Each non-blank line is parsed as JSON and its ``text`` tokens are
        joined with single spaces.
        """
        path = os.path.join(root, stock, file_name)
        tweets = []
        # Explicit encoding: tweet text is Unicode and must not depend on the
        # platform's default locale.
        with open(path, encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    # tolerate blank / trailing lines instead of crashing in json.loads
                    continue
                data = json.loads(line)
                tweets.append(" ".join(data['text']))

        # dict keys keep insertion order, so this drops duplicates while
        # preserving the order of first occurrence
        return list(dict.fromkeys(tweets))

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Build the sample for row ``idx``.

        Returns:
            dict with ``stock``, ``date``, ``tweets`` (list of str),
            ``input_ids`` and ``attention_masks`` (both ``[num_tweets, max_len]``,
            with ``num_tweets`` possibly 0) and ``label``.
        """
        # get the label, stock name, date and tweets from the dataframe
        label = self.data['Trend'].iloc[idx]
        stock = self.data['Stock'].iloc[idx]
        file = self.data['Tweet'].iloc[idx]
        date = self.data['Date'].iloc[idx]

        tweets = self.read_file(file_name=file, stock=stock, root=self.root)

        # Encode every tweet first and stack once; growing a tensor with
        # vstack inside the loop re-copies all previous rows on every step.
        encoded = [self.encoder(text=tweet) for tweet in tweets]

        if encoded:
            input_ids = torch.stack([ids for ids, _ in encoded])
            attention_masks = torch.stack([mask for _, mask in encoded])
        else:
            # a day without tweets still yields well-formed (0, max_len) tensors
            input_ids = torch.empty((0, self.max_len), dtype=torch.long)
            attention_masks = torch.empty((0, self.max_len), dtype=torch.long)

        return {
            "stock":stock,
            "date":date,
            "tweets":tweets,
            "input_ids":input_ids,
            "attention_masks":attention_masks,
            "label":label
        }

    def encoder(self, text):
        """Tokenize one tweet and return ``(input_ids, attention_mask)``, each of shape ``[max_len]``."""
        encode = self.tokenizer.encode_plus(
            text=text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
            return_token_type_ids=True,
        )

        # squeeze(0) removes only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the token axis when max_len == 1
        return encode['input_ids'].squeeze(0), encode['attention_mask'].squeeze(0)
        
        

#### 3. StonkNet Model

In [41]:
class StonkNet(nn.Module):
    """Frozen transformer encoder + bidirectional GRU over one day's tweets + MLP classifier.

    The GRU's final hidden state is carried over to the next call as long as
    consecutive calls are for the same stock; it is reset when the stock changes.

    Args:
        config: dict providing ``model_name``, ``hidden_size``, ``num_layers``
            and ``num_classes``.
    """

    def __init__(self, config=None):
        super(StonkNet, self).__init__()

        if config is None:
            # fail with a clear message instead of "'NoneType' object is not subscriptable"
            raise TypeError(
                "StonkNet requires a config dict with 'model_name', 'hidden_size', "
                "'num_layers' and 'num_classes'"
            )

        self.config = config

        self.base = AutoModel.from_pretrained(pretrained_model_name_or_path=self.config['model_name'])

        # freeze the pretrained encoder
        for param in self.base.parameters():
            param.requires_grad = False

        # GRU to process tweets for current day
        self.gru = nn.GRU(
            input_size=self.config['hidden_size'],
            hidden_size=self.config['hidden_size'],
            num_layers=self.config['num_layers'],
            batch_first=True,
            bidirectional=True,
        )

        # classifier on top of GRU's last hidden state
        self.classifier = nn.Sequential(*[
            nn.Linear(in_features=2*self.config['hidden_size'], out_features=256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=self.config['num_classes'])
        ])

        # hidden state and stock name remembered from the previous call
        self.hx = None
        self.previous_stock = None

    def forward(self, input_ids, attention_masks=None, current_stock=None, attention_mask=None):
        """Classify one day of tweets.

        Args:
            input_ids: ``[num_tweets, max_len]`` token ids.
            attention_masks: ``[num_tweets, max_len]`` attention mask.
            current_stock: identifier of the stock the tweets belong to; the
                carried hidden state is reused only when it equals the value
                seen on the previous call.
            attention_mask: alias of ``attention_masks`` so the module can be
                called with the keyword used by ``LightningModel.forward``.

        Returns:
            ``[1, num_classes]`` logits.
        """
        if attention_masks is None:
            attention_masks = attention_mask

        outputs = self.base(input_ids=input_ids, attention_mask=attention_masks)

        # pooler.shape = [num_tweets, hidden_size]
        pooler = outputs['pooler_output']

        # treat the day's tweets as a single sequence: [1, num_tweets, hidden_size]
        batch = pooler.unsqueeze(0)

        # the data is ordered per stock, so the carried state is only
        # meaningful while we stay on the same stock
        carried = self.hx if self.previous_stock == current_stock else None
        _, hidden = self.gru(batch, carried)

        self.hx = hidden.detach()
        self.previous_stock = current_stock

        # hidden.shape = [num_layers * 2, 1, hidden_size]; the last two entries
        # are the forward / backward states of the LAST layer. Indexing 0 and 1
        # would pick the first layer whenever num_layers > 1.
        x = torch.cat((hidden[-2], hidden[-1]), dim=1)

        logits = self.classifier(x)

        return logits
        
        
        

#### Context Aware Attention Model

In [4]:
class TweetRNN(nn.Module):
    """Frozen pretrained encoder followed by an RNN that runs across the encoded tweets."""

    def __init__(self, model_name="roberta-base", hidden_size=768, bidirectional=True, num_layers=1):
        super().__init__()

        # The pretrained transformer plays the role of the embedding layer;
        # all of its weights are frozen.
        self.base = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)
        for weight in self.base.parameters():
            weight.requires_grad = False

        rnn_options = dict(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.rnn = nn.RNN(**rnn_options)

    def forward(self, input_ids, attention_mask):
        """
            input_ids / attention_mask are handed to the encoder unchanged.
            Its 'pooler_output' (one vector per tweet) is given a leading
            axis of size 1 and fed to the RNN as a single sequence.
            Returns the RNN's per-step outputs.
        """
        encoded = self.base(input_ids, attention_mask)

        tweet_vectors = encoded['pooler_output'].unsqueeze(0)

        step_outputs = self.rnn(tweet_vectors)[0]
        return step_outputs

In [53]:
# Smoke test: instantiate the tweet encoder on the selected device.
trnn = TweetRNN().to(device)

In [79]:
# NOTE(review): `batch` is only assigned in a later cell of this notebook
# (`batch = valid_dataset[389]`); that cell must be run before this one.
outputs = trnn(input_ids=batch['input_ids'].to(device), attention_mask=batch['attention_masks'].to(device))

In [80]:
# Inspect the encoder output shape.
outputs.shape

torch.Size([1, 6, 1536])

In [5]:
class ContextAwareAttention(nn.Module):
    """Context-aware structured self-attention over a sequence of tweet states.

    Args:
        hidden_size: feature size of ``hidden_states``; the context vector is
            expected to have ``hidden_size // 2`` features.
        output_size: size of the intermediate attention space.
        seq_len: number of attention heads, i.e. the length of the sequence
            returned by ``forward`` (defaults to 128).
    """

    def __init__(self, hidden_size=1536, output_size=768, seq_len=128):
        super(ContextAwareAttention, self).__init__()

        # context aware self attention
        self.fc_1 = nn.Linear(in_features=hidden_size, out_features=output_size, bias=False)
        self.fc_3 = nn.Linear(in_features=hidden_size//2, out_features=output_size, bias=True)
        # one output per attention head; previously hard-coded to 128 while
        # the seq_len argument was silently ignored
        self.fc_2 = nn.Linear(in_features=output_size, out_features=seq_len, bias=False)

        # linear projection
        self.linear_projection = nn.Linear(in_features=hidden_size, out_features=1, bias=True)

    def forward(self, hidden_states, h_forward):
        """
            hidden_states.shape = [batch, num_tweets, hidden_size]
            h_forward.shape = [batch, hidden_size // 2]; a leading singleton axis
                              ([1, batch, hidden_size // 2]) is accepted as well

            returns a tensor of shape [batch, seq_len, 1]
        """
        # Bring the context to [batch, 1, hidden_size // 2] whatever its rank, so
        # it broadcasts over the tweet axis. This replaces a trailing
        # `.squeeze(1)` that silently removed the tweet axis when a 2-D context
        # was combined with a single tweet.
        context = h_forward.reshape(-1, h_forward.size(-1)).unsqueeze(1)

        # compute the energy
        S = self.fc_2(torch.tanh(self.fc_1(hidden_states) + self.fc_3(context)))
        # S.shape = [batch, num_tweets, seq_len]

        # compute the attention: every head distributes a total weight of 1
        # over the tweets, hence the softmax runs over the tweet axis (dim=1),
        # not over the heads
        A = S.softmax(dim=1)

        # Compute the sentence representation: [batch, seq_len, hidden_size]
        M = torch.matmul(A.permute(0, 2, 1), hidden_states)

        # linear projection of the sentence
        x = self.linear_projection(M)

        return x

In [6]:
# Random stand-in for the context vector passed as `h_forward`; shape (1, 1, 768).
h_forward = torch.randn((1, 1, 768), device=device)

In [59]:
# Attention module with its default sizes (hidden_size=1536, output_size=768).
attn = ContextAwareAttention().to(device)

In [90]:
# `outputs` is the TweetRNN result computed in the earlier smoke-test cell.
x = attn(hidden_states=outputs, h_forward=h_forward)

In [91]:
# Inspect the shape of the projected attention output.
x.shape

torch.Size([1, 128, 1])

In [7]:
class DailyTweetsRNN(nn.Module):
    """Batch-first RNN that exposes only its final hidden state."""

    def __init__(self, input_size=1, hidden_size=768, bidirectional=True, num_layers=1):
        super().__init__()

        rnn_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.rnn = nn.RNN(**rnn_options)

    def forward(self, input_, hx=None):
        """
            input_: batch-first sequence handed to the RNN
                    (the notebook feeds the attention output, shape [1, 128, 1])
            hx:     optional initial hidden state, forwarded to the RNN as-is

            returns the RNN's final hidden state; the per-step outputs are discarded
        """
        final_state = self.rnn(input_, hx)[1]
        return final_state

In [65]:
# Smoke test: day-level RNN with its default sizes.
dtrnn = DailyTweetsRNN().to(device)

In [93]:
# `x` is the attention output computed in the earlier smoke-test cell.
hidden = dtrnn(input_=x)

In [94]:
# Inspect the shape of the final hidden state.
hidden.shape

torch.Size([2, 1, 768])

In [8]:
class ContextAwareStonkNet(nn.Module):
    """Tweet encoder -> context-aware attention -> day-level RNN -> MLP classifier.

    The day-level hidden state is carried from one call to the next while the
    calls are for the same stock and is reset to its initial value when the
    stock changes.

    Args:
        model_name: pretrained encoder used by the tweet encoder.
        hidden_size: hidden size of the encoder and of both RNNs.
        num_classes: number of output classes.
    """

    def __init__(self, model_name="roberta-base", hidden_size=768, num_classes=2):

        super(ContextAwareStonkNet, self).__init__()

        self.in_features = 2*hidden_size

        # utterance encoder model
        self.tweet_rnn = TweetRNN(model_name=model_name, hidden_size=hidden_size)

        # context aware self attention module
        self.context_aware_attention = ContextAwareAttention(hidden_size=2*hidden_size, output_size=hidden_size, seq_len=128)

        # conversation level rnn
        self.daily_tweets = DailyTweetsRNN(input_size=1, hidden_size=hidden_size)

        # classifier on top of feature extractor
        self.classifier = nn.Sequential(*[
            nn.Linear(in_features=self.in_features, out_features=256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=num_classes)
        ])

        # Initial hidden state. Registered as buffers so it follows the module
        # across devices (.to() / .cuda()) instead of depending on a global
        # `device`; non-persistent so the state_dict layout is unchanged.
        initial_hx = torch.randn((2, 1, hidden_size))
        self.register_buffer("_hx_init", initial_hx, persistent=False)
        self.register_buffer("hx", initial_hx.clone(), persistent=False)

        # stock seen on the previous call, used to decide when to reset `hx`
        self.previous_stock = None

    def forward(self, input_ids, attention_mask, current_stock):
        """
            input_ids.shape = attention_mask.shape = [num_tweets, max_len]
            current_stock: identifier of the stock the tweets belong to

            returns logits of shape [1, num_classes]
        """

        outputs = self.tweet_rnn(input_ids=input_ids, attention_mask=attention_mask)
        # outputs.shape = [1, tweets, 2*hidden_size]

        # Carry the hidden state only within one stock; previously the
        # `current_stock` argument was ignored and state leaked across stocks.
        if current_stock != self.previous_stock:
            hx = self._hx_init
        else:
            hx = self.hx
        self.previous_stock = current_stock

        # get sentence representation as 2d-matrix and project it linearly
        m = self.context_aware_attention(hidden_states=outputs, h_forward=hx[0].detach().unsqueeze(0))

        # apply rnn on linearly projected vector
        hx = self.daily_tweets(input_=m, hx=hx.detach())

        # concatenate forward and backward final states: [batch, 2*hidden_size]
        features = hx.permute(1, 0, 2).reshape(hx.size(1), -1)

        # remember the state for the next day of the same stock
        self.hx = hx.detach()

        logits = self.classifier(features)

        return logits

#### 4. PyTorch Lightning Trainer 

In [9]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper that trains / evaluates a day-level stock trend classifier.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., current_stock=...)``
            and returning ``[1, num_classes]`` logits.
        config: dict with at least ``model_name``, ``lr``, ``root``,
            ``train_file``, ``valid_file``, ``batch_size``, ``num_workers``
            and ``average``. Optional keys: ``max_len`` (default 512) and
            ``test_file`` (defaults to ``valid_file``).
    """

    def __init__(self, model, config):
        super(LightningModel, self).__init__()

        self.config = config
        self.model = model #StonkNet(config=config)

        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

    def forward(self, input_ids, attention_masks, current_stock):
        # The loader adds a leading batch axis of size 1 and wraps the stock
        # name in a list; strip both before calling the wrapped model.
        logits = self.model(input_ids=input_ids.squeeze(0), attention_mask=attention_masks.squeeze(0), current_stock=current_stock[0])
        return logits

    def configure_optimizers(self):
        return optim.Adam(params=self.parameters(), lr=self.config['lr'])

    # ------------------------------------------------------------------ helpers

    def _make_loader(self, file):
        """Build an un-shuffled loader over the samples listed in ``file``."""
        dataset = StonkDataset(
            tokenizer=self.tokenizer,
            file=file,
            root=self.config['root'],
            # honour the configured sequence length instead of the dataset default
            max_len=self.config.get('max_len', 512),
        )
        # shuffle stays False: the wrapped models carry hidden state between
        # consecutive samples, which relies on the order of the file
        return DataLoader(dataset=dataset, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.config['num_workers'])

    def _eval_step(self, batch, loss_key):
        """Run one evaluation batch; return its loss plus CPU predictions and labels."""
        labels = batch['label']
        logits = self(input_ids=batch['input_ids'], attention_masks=batch['attention_masks'], current_stock=batch['stock'])
        loss = F.cross_entropy(logits, labels)
        return {
            loss_key: loss,
            "preds": logits.argmax(dim=1).detach().cpu(),
            "labels": labels.detach().cpu(),
        }

    def _epoch_scores(self, outputs, loss_key):
        """Pool all step outputs and compute loss / accuracy / F1 / precision / recall once.

        Precision, recall and F1 are computed over the whole epoch: averaging
        per-batch macro scores is meaningless with the single-sample batches
        used here (each batch scores either 0 or 1).
        """
        avg_loss = torch.stack([x[loss_key] for x in outputs]).mean()
        preds = torch.cat([x['preds'] for x in outputs]).numpy()
        labels = torch.cat([x['labels'] for x in outputs]).numpy()
        average = self.config['average']
        return {
            "loss": avg_loss,
            "accuracy": torch.tensor(accuracy_score(labels, preds)),
            "f1": torch.tensor(f1_score(labels, preds, average=average)),
            "precision": torch.tensor(precision_score(labels, preds, average=average)),
            "recall": torch.tensor(recall_score(labels, preds, average=average)),
        }

    # ----------------------------------------------------------------- training

    def train_dataloader(self):
        return self._make_loader(self.config['train_file'])

    def training_step(self, batch, batch_idx):

        labels = batch['label']
        logits = self(input_ids=batch['input_ids'], attention_masks=batch['attention_masks'], current_stock=batch['stock'])
        loss = F.cross_entropy(logits, labels)

        targets = labels.cpu()
        preds = logits.argmax(dim=1).cpu()
        acc = accuracy_score(targets, preds)
        f1 = f1_score(targets, preds, average=self.config['average'])

        wandb.log({"loss":loss, "accuracy":acc, "f1_score":f1})
        return {"loss":loss, "accuracy":acc, "f1_score":f1}

    # --------------------------------------------------------------- validation

    def val_dataloader(self):
        return self._make_loader(self.config['valid_file'])

    def validation_step(self, batch, batch_idx):
        return self._eval_step(batch, loss_key="val_loss")

    def validation_epoch_end(self, outputs):
        scores = self._epoch_scores(outputs, loss_key="val_loss")
        metrics = {
            "val_loss":scores["loss"],
            "val_accuracy":scores["accuracy"],
            "val_f1":scores["f1"],
            "val_precision":scores["precision"],
            "val_recall":scores["recall"],
        }
        wandb.log(metrics)
        return metrics

    # --------------------------------------------------------------------- test

    def test_dataloader(self):
        # falls back to the validation split when no dedicated test file is configured
        return self._make_loader(self.config.get('test_file', self.config['valid_file']))

    def test_step(self, batch, batch_idx):
        return self._eval_step(batch, loss_key="test_loss")

    def test_epoch_end(self, outputs):
        scores = self._epoch_scores(outputs, loss_key="test_loss")
        return {
            "test_loss":scores["loss"],
            "test_precision":scores["precision"],
            "test_recall":scores["recall"],
            "test_acc":scores["accuracy"],
            "test_f1":scores["f1"],
        }

In [10]:
# Single configuration dict shared by the dataset, the model and the trainer.
config = {}

# data
config.update(
    root="./stocknet-dataset/tweet/preprocessed/",
    train_file="./stocknet-dataset/train.csv",
    valid_file="./stocknet-dataset/valid.csv",
    max_len=512,
    batch_size=1,
    num_workers=4,
)

# model
config.update(
    model_name="roberta-base",  # 'distilbert-base-uncased',
    hidden_size=768,
    num_classes=2,
    num_layers=1,
)

# training
config.update(
    save_dir="./",
    project="stonk-net",
    run_name="context-aware-attention-1",
    lr=1e-5,
    monitor="val_f1",
    min_delta=0.001,
    filepath="./checkpoints/{epoch}-{val_f1:4f}",
    precision=32,
    average="macro",
    epochs=5,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

In [26]:
valid_dataset = StonkDataset(tokenizer=tokenizer, file=config['valid_file'], root=config['root'])
valid_loader = DataLoader(dataset=valid_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=config['num_workers'])

In [35]:
batch = valid_dataset[389]

In [34]:
# batch['input_ids'].shape

##### Logging

In [12]:
# Direction in which the monitored metric improves. Scores such as F1 or
# accuracy must be maximised; without an explicit mode the callbacks would
# treat a metric named "val_f1" as lower-is-better and keep the worst model.
monitor_mode = "min" if "loss" in config["monitor"] else "max"

# Weights & Biases logger; log_model=True also uploads the checkpoints.
logger = WandbLogger(
    name=config['run_name'],
    save_dir=config["save_dir"],
    project=config["project"],
    log_model=True,
)
# Stop when the monitored metric has not improved by min_delta for 5 validation runs.
early_stopping = EarlyStopping(
    monitor=config["monitor"],
    min_delta=config["min_delta"],
    patience=5,
    mode=monitor_mode,
)
# Keep only the single best checkpoint according to the monitored metric.
checkpoints = ModelCheckpoint(
    filepath=config["filepath"],
    monitor=config["monitor"],
    save_top_k=1,
    mode=monitor_mode,
)



In [13]:
# Trainer wired to the logger and callbacks defined above. The GPU is requested
# only when one is available, matching the CPU fallback used for `device`;
# a hard-coded gpus=[0] fails on CPU-only runtimes.
trainer = pl.Trainer(
    logger=logger,
    gpus=[0] if torch.cuda.is_available() else None,
    checkpoint_callback=checkpoints,
    callbacks=[early_stopping],
    default_root_dir="./models/",
    max_epochs=config["epochs"],
    precision=config["precision"],
    automatic_optimization=True
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [14]:
net = ContextAwareStonkNet()

In [15]:
model = LightningModel(model=net, config=config)

In [None]:
trainer.fit(model)

[34m[1mwandb[0m: Currently logged in as: [33mmacab[0m (use `wandb login --relogin` to force relogin)



  | Name  | Type                 | Params
-----------------------------------------------
0 | model | ContextAwareStonkNet | 130 M 
-----------------------------------------------
5.8 M     Trainable params
124 M     Non-trainable params
130 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
# Evaluate on the loader returned by LightningModel.test_dataloader.
trainer.test(model)

In [None]:
# Baseline model (encoder + GRU) built from the same config, for a manual forward pass.
m = StonkNet(config=config)

In [None]:
# NOTE(review): the recorded output has a leading batch axis of size 1, so
# `batch` here presumably comes from a DataLoader rather than from
# `valid_dataset[389]` — confirm which cell produced it.
print(batch['input_ids'].shape, batch['attention_masks'].shape, len(batch['stock']))

torch.Size([1, 36, 512]) torch.Size([1, 36, 512]) 1


In [None]:
logits = m.forward(input_ids=batch['input_ids'].squeeze(0), attention_masks=batch['attention_masks'].squeeze(0), current_stock=batch['stock'])

In [None]:
logits.shape

torch.Size([1, 2])

In [None]:
# Inspect the label shape for comparison with the logits' leading dimension.
batch['label'].shape

torch.Size([1])