In [56]:
import numpy as np
import pandas as pd
import json
import nltk
from data_utils import *
import torch.utils.data as data 
import time
import torch

In [57]:
def pretty_print(epoch, loss):
    print('Epoch {} completed with loss {:.4f}'.format(epoch + 1, loss * (0.9 ** epoch)))

In [58]:
with open('data.json') as f:
    dataset = json.load(f)

In [59]:
vocab = BuildVocab()
start = time.time()
for i in range(len(dataset)):
    text = TextToSentences(dataset[i]['news'].lower())
    vocab.addText(text)
print('Vocabulary built in {:.4f} sec, with size: {}'.format(time.time() - start, vocab.n_words))

Vocabulary built in 0.4098 sec, with size: 6326


In [60]:
pos = BuildPOS()
start = time.time()
for i in range(len(dataset)):
    p = dataset[i]['pos']
    pos.addPOS(p)
print('POS built in {:.4f} sec, with size: {}'.format(time.time() - start, pos.npos))

POS built in 0.0004 sec, with size: 18


In [61]:
class StockPredictionDataset(data.Dataset):
    def __init__(self, dataset, vocab, pos):
        self.data = dataset
        self.vocab = vocab
        self.pos = pos
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        news_, pos_, stock_ = self.data[i]['news'], self.data[i]['pos'], self.data[i]['stock']
        news_ = torch.Tensor(self.process_sent(news_)).long()
        pos_ = torch.Tensor(self.process_pos(news_, pos_)).long()
        stock_ = np.array(stock_)
        y = int(np.sum((stock_[-1,:6] - stock_[-2,:6])) > 0) if len(stock_) >= 2 else 1
        stock_ = torch.Tensor(stock_)
        return news_, pos_, stock_, y
    
    def process_sent(self, sent):
        tokens = nltk.word_tokenize(sent.lower())
        out = []
        out.append(self.vocab.word2index['<SOS>'])
        out.extend([self.vocab.word2index[token] for token in tokens if token in self.vocab.word2index])
        out.append(self.vocab.word2index['<EOS>'])
        return out
    
    def process_pos(self, sent, pos):
        out = [self.pos.pos2index['<SOS>']]
        out.extend([self.pos.pos2index[token] for token in pos])
        if len(sent) <= (len(pos) + 1):
            pos = pos[:len(sent) - 2]
        out.extend([self.pos.pos2index[pos[-1]] for _ in range(len(sent) - len(pos) - 2) ])
        out.append(self.pos.pos2index['<EOS>'])
        return out
            

In [62]:
def collate_fn(data):
    def merge(sequences, is_token):
        lengths = [len(seq) for seq in sequences]
        if is_token:
            padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
        else:
            padded_seqs = torch.zeros(len(sequences), max(lengths), 120)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            if end == 0:
                continue
            padded_seqs[i,:end] = seq[:]
        return padded_seqs, lengths
              
    news, pos, stock, y = [], [], [], []
    for i, (news_i, pos_i, stock_i, y_i) in enumerate(data):
        news.append(news_i)
        pos.append(pos_i)
        stock.append(stock_i)
        y.append(y_i)
    news.sort(key = lambda x: len(x), reverse=True)
    pos.sort(key = lambda x: len(x), reverse=True)
    stock.sort(key = lambda x: len(x), reverse=True)
    news, news_lengths = merge(news, True)
    pos, _ = merge(pos, True)
    stock, stock_lengths = merge(stock, False)
    y = torch.Tensor(y)
    return news, pos, stock, y, news_lengths, stock_lengths
    

In [63]:
dataset = StockPredictionDataset(dataset, vocab, pos)
dataloader = torch.utils.data.DataLoader(dataset = dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

In [64]:
from model import SSPM, MSSPM
import torch.nn as nn

In [65]:
word_emb_dim = 256
hidden_dim = 256
vocab_size = vocab.n_words
event_size = pos.npos
num_heads = 4
num_epochs = 10
criterion = nn.BCELoss()

In [66]:
model = SSPM(vocab_size, word_emb_dim, hidden_dim, event_size, num_heads)
model.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

for e in range(num_epochs):
    for i, (news, pos, stock, y, sent_lengths, stock_lengths) in enumerate(dataloader):
        bsize = len(news)
        for i in range(bsize):
            if sent_lengths[i] == 0:
                sent_lengths[i] += 1
            elif stock_lengths[i] == 0:
                stock_lengths[i] += 1
        optimizer.zero_grad()"""
        news = news.cuda()
        pos = pos.cuda()
        stock = stock.cuda()
        y = y.cuda().view(-1, 1)
        probs = model(news, pos, stock, sent_lengths, stock_lengths)
        loss = criterion(probs, y)
        loss.backward()
        optimizer.step()
    pretty_print(e, loss.item())      

Epoch 1 completed with loss 0.5446
Epoch 2 completed with loss 0.7044
Epoch 3 completed with loss 0.6008
Epoch 4 completed with loss 0.4893
Epoch 5 completed with loss 0.4904
Epoch 6 completed with loss 0.3683
Epoch 7 completed with loss 0.4055
Epoch 8 completed with loss 0.3225
Epoch 9 completed with loss 0.2598
Epoch 10 completed with loss 0.2482


In [54]:
word_emb_dim = 256
hidden_dim = 256
vocab_size = vocab.n_words
event_size = pos.npos
num_heads = 4
num_epochs = 10
eta = 0.5
criterion = nn.BCELoss()

In [55]:
model = MSSPM(vocab_size, word_emb_dim, hidden_dim, event_size, num_heads)
model.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
for e in range(num_epochs):
    for i, (news, pos, stock, y, sent_lengths, stock_lengths) in enumerate(dataloader):
        bsize = len(news)
        for i in range(bsize):
            if sent_lengths[i] == 0:
                sent_lengths[i] += 1
            elif stock_lengths[i] == 0:
                stock_lengths[i] += 1
        optimizer.zero_grad()
        news = news.cuda()
        pos = pos.cuda()
        stock = stock.cuda()
        y = y.cuda().view(-1, 1)
        probs, loss_crf = model(news, pos, stock, sent_lengths, stock_lengths)
        loss_bce = criterion(probs, y)
        loss = eta * loss_bce + (1 - eta) * loss_crf
        loss.backward()
        optimizer.step()
    pretty_print(e, loss.item())     

Epoch 1 completed with loss 3.0225
Epoch 2 completed with loss 2.1889
Epoch 3 completed with loss 2.4880
Epoch 4 completed with loss 2.1528
Epoch 5 completed with loss 1.5973
Epoch 6 completed with loss 1.6378
Epoch 7 completed with loss 1.5195
Epoch 8 completed with loss 1.3947
Epoch 9 completed with loss 1.2478
Epoch 10 completed with loss 1.0738
