In [9]:
#Imports

from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import time
import tqdm

In [10]:
import data_processing as dp

In [11]:
# Load the raw training examples, build the vocabulary and batch the dataset.
import json

# Read the JSON inside a context manager so the file handle is closed as soon
# as parsing finishes instead of staying open for the lifetime of the kernel.
with open('json_datasets/train.json', 'r') as f:
	raw_train = json.load(f)

# Build the vocabulary from the raw examples.
vocab = dp.Vocabulary()
vocab.parseText(raw_train)

# Wrap the raw examples in the text-to-graph dataset and split it into
# fixed-order batches of 8 (no shuffling, so dataloader[0] is reproducible).
dataset = dp.text2GraphDataset(raw_json_file = raw_train)
dataloader = dp.getBatches(vocab, dataset, batch_size = 8, shuffle = False)

Creating empty vocabulary object
Finished Parsing Text
Creating custom dataset for T2G task
Creating empty vocabulary object
Finished Parsing Text
Finished processing raw json file


In [12]:
# For every sentence of the first batch, print the token ids covered by each
# of its entity spans (each span is used as a [start, end) slice).
for token_ids, spans in zip(dataloader[0]['text'], dataloader[0]['entity_inds']):
	entity_tokens = []
	for span in spans:
		entity_tokens.append(token_ids[span[0]:span[1]])
	print(entity_tokens)

[tensor([1493, 1497]), tensor([1492, 1493, 1494, 1495, 1496])]
[tensor([1493, 1497]), tensor([1492, 1493, 1494, 1495, 1496])]
[tensor([1493, 1497]), tensor([1493])]
[tensor([1493, 1497]), tensor([1498])]
[tensor([1493, 1497]), tensor([1498])]
[tensor([1493, 1497]), tensor([1498])]
[tensor([1493, 1497]), tensor([1499])]
[tensor([1493, 1497]), tensor([1499])]


In [13]:
# Combined size of the entity and text word lists; passed to ModelLSTM as the
# number of rows of its embedding table.
inp_types = sum(len(part.wordlist) for part in (vocab.entities, vocab.text))
# Size of the relation word list; passed to ModelLSTM as the decoder output width.
rel_types = len(vocab.relations.wordlist)

In [14]:
# Display the first batch to inspect its fields: per-sentence entity spans,
# text/entity lengths and the zero-padded token-id tensor.
dataloader[0]

{'entity_inds': [[(2, 4), (5, 10)],
  [(1, 3), (7, 12)],
  [(1, 3), (7, 8)],
  [(1, 3), (4, 5)],
  [(1, 3), (8, 9)],
  [(1, 3), (4, 5)],
  [(1, 3), (6, 7)],
  [(4, 6), (7, 8)]],
 'text_lengths': [12, 14, 10, 11, 15, 12, 9, 10],
 'entity_lengths': [2, 2, 2, 2, 2, 2, 2, 2],
 'text': tensor([[   2,    4, 1493, 1497,    6, 1492, 1493, 1494, 1495, 1496,    8,    3,
             0,    0,    0],
         [   2, 1493, 1497,    9,    4,   10,    6, 1492, 1493, 1494, 1495, 1496,
             8,    3,    0],
         [   2, 1493, 1497,    9,    4,   10,    6, 1493,    8,    3,    0,    0,
             0,    0,    0],
         [   2, 1493, 1497,   11, 1498,   12,   13,   14,   15,    8,    3,    0,
             0,    0,    0],
         [   2, 1493, 1497,   11,   16,   17,   18,    6, 1498,   12,   13,   19,
            15,    8,    3],
         [   2, 1493, 1497,   11, 1498,   12,   13,    4,   14,   15,    8,    3,
             0,    0,    0],
         [   2, 1493, 1497,   11,   20,   21, 1499,  

In [33]:
# NOTE: the cell defining ModelLSTM must have been executed before this one.
# Build the relation model: input vocabulary size, number of relation types,
# and a model dimension of 100.
model = ModelLSTM(inp_types, rel_types, 100)

# Call the module itself rather than .forward() so that nn.Module.__call__
# runs (it dispatches to forward and also triggers any registered hooks).
model(dataloader[0])

tensor([[[[-6.0135, -5.8261, -6.2549,  ..., -6.1588, -5.7649, -5.5606],
          [-5.3612, -5.8228, -6.2137,  ..., -5.6133, -5.3899, -6.0846]],

         [[-6.1191, -6.3883, -6.9742,  ..., -6.0371, -5.3482, -6.7017],
          [-5.8671, -4.4141, -6.6187,  ..., -5.7058, -5.9226, -5.3751]]],


        [[[-5.8942, -5.7228, -5.9620,  ..., -6.1625, -5.5074, -6.6184],
          [-6.2515, -6.4347, -6.7804,  ..., -5.3424, -6.0351, -7.0967]],

         [[-5.7410, -6.1617, -6.5203,  ..., -4.9812, -5.9386, -6.5096],
          [-6.4004, -5.8797, -6.4620,  ..., -5.7474, -5.8288, -5.9985]]],


        [[[-6.5456, -5.9010, -6.6531,  ..., -4.8142, -5.2187, -5.4348],
          [-6.2245, -5.9202, -7.1903,  ..., -5.6096, -5.7418, -5.5788]],

         [[-6.4174, -6.5166, -6.9035,  ..., -5.0348, -5.3933, -5.6015],
          [-5.7416, -6.4307, -6.0652,  ..., -5.5198, -5.5212, -6.7901]]],


        ...,


        [[[-5.8710, -5.2130, -5.7292,  ..., -6.1913, -5.8534, -6.5523],
          [-6.3254, -5.6742, -6

In [17]:
class ModelLSTM(nn.Module):
	"""Bidirectional-LSTM model that scores a relation type for every ordered entity pair.

	Each sentence is embedded and encoded by a 2-layer bidirectional LSTM. Every
	entity is represented by the mean of the LSTM states inside its token span,
	and each pair of entities is mapped to a log-probability distribution over
	``relation_types`` classes.

	Args:
		input_types: number of rows of the embedding table (size of the input vocabulary).
		relation_types: number of relation classes produced by the decoder.
		model_dim: embedding / hidden width. Must be even, because the two LSTM
			directions each get ``model_dim // 2`` units and are concatenated.
		dropout: probability passed to ``nn.Dropout``.

	Raises:
		ValueError: if ``model_dim`` is odd.
	"""

	def __init__(self, input_types, relation_types, model_dim, dropout = 0.5):
		super().__init__()

		# With an odd width the concatenated LSTM output would be model_dim - 1
		# wide and every forward pass would fail inside LayerNorm; fail early.
		if model_dim % 2 != 0:
			raise ValueError("model_dim must be even, got {}".format(model_dim))

		self.word_types = input_types
		self.relation_types = relation_types
		self.dropout = dropout
		self.model_dim = model_dim

		# One embedding row per input id.
		self.emb = nn.Embedding(input_types, self.model_dim)
		# Two directions of model_dim//2 units each -> model_dim features per token.
		self.lstm = nn.LSTM(self.model_dim, self.model_dim//2, batch_first=True, bidirectional=True, num_layers=2)
		self.relation_layer1 = nn.Linear(self.model_dim , self.model_dim)
		self.relation_layer2 = nn.Linear(self.model_dim , self.model_dim)
		self.drop = nn.Dropout(self.dropout)
		self.projection = nn.Linear(self.model_dim , self.model_dim)
		self.decoder = nn.Linear(self.model_dim , self.relation_types)
		self.layer_norm = nn.LayerNorm(self.model_dim)

		self.init_params()

	def init_params(self):
		"""Xavier-normal initialise the linear layers' weights and zero their biases."""
		for layer in (self.relation_layer1, self.relation_layer2, self.projection, self.decoder):
			nn.init.xavier_normal_(layer.weight.data)
		for layer in (self.relation_layer1, self.relation_layer2, self.projection, self.decoder):
			nn.init.constant_(layer.bias.data , 0)

	def _encode(self, batch):
		"""Run embedding + LSTM and return per-token states of shape (bs, seq_len, hidden)."""
		tokens = batch['text']
		embedded = self.emb(tokens)

		if 'text_lengths' not in batch or batch['text_lengths'] is None:
			# No length information: encode the padded tensor as-is.
			states, _ = self.lstm(embedded)
			return states

		seq_len = tokens.shape[1]
		# pack_padded_sequence needs CPU int64 lengths in [1, seq_len].
		lengths = torch.as_tensor(batch['text_lengths'], dtype=torch.int64).cpu()
		lengths = lengths.clamp(min=1, max=seq_len)

		# Packing stops each direction of the LSTM at the true sentence end, so
		# the backward pass no longer starts from padding tokens and the result
		# does not depend on how much padding the batch happens to contain.
		packed = nn.utils.rnn.pack_padded_sequence(
			embedded, lengths, batch_first=True, enforce_sorted=False)
		packed_states, _ = self.lstm(packed)
		states, _ = nn.utils.rnn.pad_packed_sequence(
			packed_states, batch_first=True, total_length=seq_len)
		return states

	def forward(self, batch):
		"""Score every ordered entity pair of every sentence in ``batch``.

		Args:
			batch: dict with
				``'text'``: integer id tensor of shape (bs, seq_len);
				``'entity_inds'``: per sentence, a sequence of spans whose first
				two items are used as a ``[start, end)`` slice into the sentence;
				``'text_lengths'`` (optional): true length of each sentence,
				used to ignore padding inside the LSTM.

		Returns:
			Tensor of shape (bs, max_ents, max_ents, relation_types) holding
			log-probabilities over relation types. Entry ``[b, i, j]`` combines
			``relation_layer2`` of entity ``i`` with ``relation_layer1`` of
			entity ``j``. Pairs involving a missing (padded or empty-span)
			entity have their logits zeroed and therefore come out as a uniform
			distribution.
		"""
		sents = self._encode(batch)

		bs, _, hidden_dim = sents.shape
		entity_spans = batch['entity_inds']
		max_ents = max(len(spans) for spans in entity_spans)

		cont_word_mask = sents.new_zeros(bs, max_ents)
		cont_word_embs = sents.new_zeros(bs, max_ents, hidden_dim)

		for b, (sent, spans) in enumerate(zip(sents, entity_spans)):
			for n_ent, span in enumerate(spans):
				span_states = sent[span[0]:span[1]]
				if span_states.shape[0] == 0:
					# The mean of an empty slice is NaN and NaN survives the
					# mask multiplication below; keep the entity masked instead.
					continue
				cont_word_embs[b, n_ent] = span_states.mean(dim = 0)
				cont_word_mask[b, n_ent] = 1

		# bs x max_ents x model_dim
		cont_word_embs = self.layer_norm(cont_word_embs)

		rel1 = self.relation_layer1(cont_word_embs)
		rel2 = self.relation_layer2(cont_word_embs)

		# bs x max_ents x max_ents x model_dim (broadcast sum over all pairs)
		out = rel1.unsqueeze(1) + rel2.unsqueeze(2)

		out = F.relu(self.drop(out))
		out = F.relu(self.projection(out))
		out = self.decoder(out)

		# Zero the logits of every pair that involves a missing entity.
		out = out * cont_word_mask.view(bs,max_ents,1,1) * cont_word_mask.view(bs,1,max_ents,1)

		return torch.log_softmax(out, -1)