#**Preliminary information**#

This notebook contains a "naive" baseline model for **task a**. It is mainly built upon notebook #6, as suggested in the course slides.

#**Imports and set up**#

In [None]:
!pip install torchtext --upgrade
!pip install pytorch_lightning

In [None]:
#to check if gpu is available
import torch
torch.cuda.is_available()

'/device:GPU:0'

In [None]:
from typing import Tuple, List, Dict
import json
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import itertools
from torch.utils.data import Dataset
from tqdm import tqdm
import torchtext
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torch
from torch import nn
import pprint
import torch.optim as optim
import string
import random
import numpy as np
import os

In [None]:
#for the repeatability of the experiments: every random number generator used in the notebook is seeded with the same value
SEED = 1234

# Python's built-in RNG
random.seed(SEED)
# NumPy's global RNG
np.random.seed(SEED)
# PyTorch RNG
torch.manual_seed(SEED)
# NOTE(review): looks like a second seeding of the same PyTorch generator as the line above - confirm it is redundant
torch.random.manual_seed(SEED)
# request deterministic cuDNN kernels
torch.backends.cudnn.deterministic = True

#**Data and preprocessing**#

Two ways of creating the vocabulary were used: one is based on the dataset and the other one employs pretrained word embeddings (GloVe, 100d).

In [None]:
#get GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
#rm all except for 100d
!rm glove.6B.50d.txt
!rm glove.6B.200d.txt
!rm glove.6B.300d.txt
!rm glove.6B.zip

In [None]:
#create dictionary that maps words as strings to their embedding vectors
def load_embeddings(embeddings_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-empty line is expected to contain a word followed by its
    space-separated float components.

    Args:
        embeddings_path: path of the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D float ``torch.Tensor``.
    """
    word_vectors = dict()
    # explicit encoding so that reading does not depend on the platform default
    # (assumes the embeddings file is UTF-8 encoded, as the GloVe releases are)
    with open(embeddings_path, encoding='utf-8') as f:

        for line in tqdm(f):

            line = line.strip()
            if not line:
                # a blank line would otherwise register '' with an empty vector
                continue

            word, *vector = line.split(' ')
            word_vectors[word] = torch.tensor([float(c) for c in vector])

    return word_vectors
word_vectors = load_embeddings('glove.6B.100d.txt')

In [None]:
#vocabulary maps words to indices, word_vectors maps words to embeddings; to feed the neural network we need to map indices to embeddings
def get_embeddings(vocabulary, word_vectors):
    """Build the embedding matrix whose rows are aligned with the vocabulary indices.

    Rows 0 and 1 (the special tokens inserted by ``build_vocab``) and words that
    have no pretrained vector are initialised with uniform random values.

    Args:
        vocabulary: torchtext vocabulary exposing ``get_stoi()`` and ``len()``.
        word_vectors: dict mapping words to their pretrained 1-D vectors.

    Returns:
        Float ``torch.Tensor`` of shape ``(len(vocabulary), embedding_dim)``.
    """
    # infer the dimensionality from the loaded vectors instead of hard-coding it;
    # 100 is kept as the fallback used by the original implementation
    embedding_dim = len(next(iter(word_vectors.values()))) if word_vectors else 100

    vectors_store = np.zeros((len(vocabulary), embedding_dim))
    for word, index in tqdm(vocabulary.get_stoi().items()):
        vector = None
        if index != 0 and index != 1:
            # direct dict lookup: O(1) instead of rebuilding a list of all keys per word
            vector = word_vectors.get(word)
            if vector is None:
                # the vocabulary keeps the original casing while the glove.6B vectors
                # are uncased, so retry with the lowercased form
                vector = word_vectors.get(word.lower())

        if vector is None:
            vectors_store[index] = torch.rand(embedding_dim)
        else:
            vectors_store[index] = vector

    return torch.from_numpy(vectors_store).float()


In [None]:
#get data
!git clone https://github.com/SapienzaNLP/nlp2021-hw2/

Cloning into 'nlp2021-hw2'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 32 (delta 10), reused 18 (delta 3), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


In [None]:
#datasets paths
restaurants_dev = 'nlp2021-hw2/data/restaurants_dev.json'
restaurants_train = 'nlp2021-hw2/data/restaurants_train.json'

laptops_dev = 'nlp2021-hw2/data/laptops_dev.json'
laptops_train = 'nlp2021-hw2/data/laptops_train.json'

In [None]:
#get dataset from path
def read_dataset(path: str) -> List[Dict]:
    """Load a dataset stored as a JSON file.

    Args:
        path: location of the JSON file.

    Returns:
        The deserialised content of the file (a list of sample dictionaries
        for the datasets used in this notebook).
    """
    # JSON is read as UTF-8 explicitly so that the result does not depend on the locale
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

Different experiments with the datasets were performed. The function below allows us to choose which dataset is used for training and which one for evaluation.

**r** stands for the restaurants, **l** - for the laptops, and **m** means mixed.

In [None]:
def datasets_setup(mode):
    """Select the training and evaluation samples for one experiment.

    Args:
        mode: two-letter code; the first letter picks the training data and the
            second one the evaluation data. 'r' = restaurants, 'l' = laptops,
            'm' = mixed (restaurants followed by laptops).

    Returns:
        Tuple ``(train_samples, dev_samples)`` of sample lists.

    Raises:
        ValueError: if ``mode`` is not one of the nine supported codes.
    """
    train_paths = {
        'r': [restaurants_train],
        'l': [laptops_train],
        'm': [restaurants_train, laptops_train],
    }
    dev_paths = {
        'r': [restaurants_dev],
        'l': [laptops_dev],
        'm': [restaurants_dev, laptops_dev],
    }

    # fail loudly instead of silently returning None for an unknown mode
    if (not isinstance(mode, str) or len(mode) != 2
            or mode[0] not in train_paths or mode[1] not in dev_paths):
        raise ValueError(
            "mode must be two letters from 'r', 'l', 'm' (e.g. 'mr'), got {!r}".format(mode))

    def load(paths):
        # read only the files the selected mode needs, keeping their order
        samples = []
        for path in paths:
            samples.extend(read_dataset(path))
        return samples

    return load(train_paths[mode[0]]), load(dev_paths[mode[1]])
        
train_samples, dev_samples = datasets_setup('mr')
print(len(train_samples), len(dev_samples))

5000 541


In [None]:
class AspectExtractionDataset(Dataset):
    """Token-level dataset for aspect-term extraction, split into fixed-size windows.

    Every sample is tokenised, each token gets a binary flag telling whether it
    belongs to one of the ground-truth aspect terms, and the flagged sentence is
    cut into windows of ``window_size`` items (short windows are padded with
    ``None``). ``index_dataset`` must be called before items can be fetched.

    NOTE: flags are assigned by token membership, so every occurrence of a word
    that appears in any aspect term of the sentence is flagged.
    """

    # translation table that deletes every ASCII punctuation character
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self,
                 samples: List[Dict],
                 window_size: int,
                 window_shift: int = -1,
                 device = 'cpu'):
        """
        Args:
            samples: sample dictionaries; ``sample["text"]`` is the sentence and
                the second element of every entry of ``sample["targets"]`` is
                read as an aspect term.
            window_size: number of items in every window.
            window_shift: step between window starts; any non-positive value
                falls back to ``window_size`` (non-overlapping windows).
            device: device the encoded tensors are moved to.

        Raises:
            ValueError: if ``window_size`` is not positive.
        """
        if window_size <= 0:
            raise ValueError("window_size must be a positive integer, got {}".format(window_size))

        self.samples = samples
        self.window_size = window_size
        self.window_shift = window_shift if window_shift > 0 else window_size
        self.device = device

        examples = []

        for sample in samples:

            #information that we need for task a from each sample dictionary is a set of ground truth aspects and sentence by itself
            gt_terms = {term_gt[1] for term_gt in sample["targets"]}

            #punctuation removal followed by tokenization
            tokenized_sentence = word_tokenize(self._strip_punctuation(sample["text"]))

            #aspect terms go through the same punctuation removal as the sentence,
            #otherwise a term such as "Wi-Fi" could never match the sentence token "WiFi"
            target_tokens = set(itertools.chain.from_iterable(
                word_tokenize(self._strip_punctuation(term)) for term in gt_terms))

            #attach flags to the corresponding tokens: 1 for words that belong to the gt set and 0 otherwise
            examples.append([{"token": token, "gt_flag": int(token in target_tokens)}
                             for token in tokenized_sentence])

        #to provide processing by windowing
        self.data = self.create_window(examples)
        #final set up of data, filled in by index_dataset
        self.encoded_data = None

    @classmethod
    def _strip_punctuation(cls, text):
        """Return ``text`` with all ASCII punctuation characters removed."""
        return text.translate(cls._PUNCTUATION_TABLE)

    def index_dataset(self, vocabulary, label_vocabulary):
        """Encode every window as index tensors and store them in ``encoded_data``.

        Args:
            vocabulary: maps tokens to indices; must contain "<pad>" and "<unk>".
            label_vocabulary: maps "0", "1" and "<pad>" to label indices.
        """
        self.encoded_data = []

        for window in self.data:

            indices = torch.LongTensor(self.sentence2indices(window, vocabulary)).to(self.device)

            # Labels are binary, so they could be used directly without a label
            # vocabulary; the label vocabulary exists to give padded positions
            # their own "<pad>" label.
            labels = torch.LongTensor([
                label_vocabulary[str(d["gt_flag"])] if d is not None else label_vocabulary["<pad>"]
                for d in window
            ]).to(self.device)

            self.encoded_data.append({"inputs": indices, "outputs": labels})

    def create_window(self, examples):
        """Cut every example into windows of ``window_size`` items, padding with ``None``.

        An empty example produces no window at all.
        """
        data = []

        for example in examples:

            for start in range(0, len(example), self.window_shift):

                window = example[start:start + self.window_size]
                if len(window) < self.window_size:
                    window = window + [None] * (self.window_size - len(window))
                assert len(window) == self.window_size
                data.append(window)

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.encoded_data is None:
            raise RuntimeError("""Data are not encoded""")
        return self.encoded_data[idx]

    def get_elem(self, idx):
        """Return the raw (not encoded) window at position ``idx``."""
        return self.data[idx]

    #create input
    @staticmethod
    def sentence2indices(sentence: list, vocabulary):
        """Map the items of one window to vocabulary indices.

        ``None`` items become "<pad>", tokens missing from the vocabulary become "<unk>".
        """
        indices = []

        for item in sentence:

            if item is None:
                indices.append(vocabulary["<pad>"])
            elif item["token"] in vocabulary:
                indices.append(vocabulary[item["token"]])
            else:
                indices.append(vocabulary["<unk>"])

        return indices

    # NOTE: we do not create any decoding function here since decoding is done implicitly later



The corresponding part of notebook #6 does not execute for me. Judging by the nature of the errors, the reason lies in library updates. The main goal here was to keep the same idea but make it work; the `vocab(OrderedDict(counter))` solution was found.

In [None]:
def build_vocab(dataset, min_freq):
    """Create the token vocabulary from the windows of ``dataset``.

    Tokens are counted over all non-padding items; ``min_freq`` is forwarded to
    torchtext's ``vocab`` factory. "<pad>" is placed at index 0 and "<unk>" at
    index 1, and "<unk>" becomes the default index for unknown tokens.
    """
    token_counts = Counter(
        item["token"]
        for i in tqdm(range(len(dataset)))
        for item in dataset.get_elem(i)
        if item is not None
    )

    pad_token, unk_token = '<pad>', '<unk>'

    vocabulary = vocab(OrderedDict(token_counts), min_freq)
    vocabulary.insert_token(pad_token, 0)
    vocabulary.insert_token(unk_token, 1)
    # out-of-vocabulary lookups resolve to the <unk> index
    vocabulary.set_default_index(vocabulary[unk_token])

    return vocabulary

def build_label_vocab(dataset):
    """Create the label vocabulary from the windows of ``dataset``.

    The labels are the string forms of the ``gt_flag`` values found in the
    non-padding items, indexed in sorted order; "<pad>" is added as the last
    entry.
    """
    counter = Counter()

    for i in tqdm(range(len(dataset))):
        for item in dataset.get_elem(i):
            if item is not None:
                counter[str(item["gt_flag"])] += 1

    # sort the labels so that their indices do not depend on which label is seen first
    vocabulary = vocab(OrderedDict(sorted(counter.items())))
    pad_token = '<pad>'
    # append after the existing labels: a hard-coded index of 2 is out of range
    # whenever the dataset does not contain both labels
    vocabulary.insert_token(pad_token, len(vocabulary))

    return vocabulary


# window size and shift are equal, so windows do not overlap; the progress bars
# recorded for this cell count 5000 windows for the 5000 training samples
window_size, window_shift = 100, 100
dataset = AspectExtractionDataset(train_samples, window_size, window_shift)
# both vocabularies are built from the training windows only
# NOTE(review): min_freq = 2 presumably drops tokens seen only once - confirm against torchtext's vocab()
vocabulary = build_vocab(dataset, min_freq = 2)
label_vocabulary = build_label_vocab(dataset)
# encode the windows as index tensors; required before dataset[i] can be used
dataset.index_dataset(vocabulary, label_vocabulary)

100%|██████████| 5000/5000 [00:00<00:00, 114282.48it/s]
100%|██████████| 5000/5000 [00:00<00:00, 109634.94it/s]


In [None]:
embeddings = get_embeddings(vocabulary, word_vectors)

100%|██████████| 3566/3566 [00:23<00:00, 153.62it/s]


In the DataModule everything is implemented in a standard way, except for the val and test dataloaders, which in our case are the same: since we do not have access to the secret test set, the dev set is used both for validation and for evaluation.

In [None]:
class DataModuleAspectExtraction(pl.LightningDataModule):
    """Lightning data module wrapping the train and dev ``AspectExtractionDataset``s.

    The dev set is served by both the validation and the test dataloader.
    """

    def __init__(self, train_samples, dev_samples, window_size, window_shift, vocabulary, label_vocabulary, batch_size = 128):
        super().__init__()
        self.train_samples = train_samples
        self.dev_samples = dev_samples
        self.window_size = window_size
        self.window_shift = window_shift
        self.vocabulary = vocabulary
        self.label_vocabulary = label_vocabulary
        self.batch_size = batch_size

    def setup(self, stage = None):
        """Build and encode both datasets; ``stage`` is accepted but not used."""
        self.train = AspectExtractionDataset(self.train_samples, self.window_size, self.window_shift)
        self.dev = AspectExtractionDataset(self.dev_samples, self.window_size, self.window_shift)

        # both splits are encoded with the vocabularies passed to the constructor
        self.train.index_dataset(self.vocabulary, self.label_vocabulary)
        self.dev.index_dataset(self.vocabulary, self.label_vocabulary)

    def train_dataloader(self):
        # shuffle the training windows every epoch so that batches are not always
        # made of the same consecutive sentences (in 'm' mode the two domains are
        # simply concatenated)
        return DataLoader(self.train, batch_size = self.batch_size, shuffle = True)

    def val_dataloader(self):
        return DataLoader(self.dev, batch_size = self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.dev, batch_size = self.batch_size)



##**Some useful prints for dataset exploration**##

In [None]:
#print some examples for the more convenient representation and debug of the AspectExtractionDataset class
def show_dataset():
    """Print the first ten windows of the training data as 'token: flag' strings."""
    window_size = window_shift = 30
    preview = AspectExtractionDataset(train_samples, window_size, window_shift)

    print('Dataset test:')
    for sample_idx in range(10):
        rendered = []
        for item in preview.get_elem(sample_idx):
            # padding positions are stored as None and are not shown
            if item is None:
                continue
            rendered.append(item["token"] + ": " + str(item["gt_flag"]))
        print('  sample {}: {}'.format(sample_idx, rendered))

show_dataset()

Dataset test:
  sample 0: ['A: 0', 'hearty: 0', 'two: 0', 'thumbs: 0', 'up: 0']
  sample 1: ['They: 0', 'outshine: 0', 'HH: 0']
  sample 2: ['From: 0', 'the: 0', 'moment: 0', 'you: 0', 'enter: 0', 'till: 0', 'the: 0', 'moment: 0', 'you: 0', 'walk: 0', 'out: 0', 'the: 0', 'friendly: 0', 'and: 0', 'helpful: 0', 'staff: 1', 'was: 0', 'was: 0', 'just: 0', 'Fantastic: 0']
  sample 3: ['My: 0', 'girlfriend: 0', 'convinced: 0', 'me: 0', 'to: 0', 'go: 0', 'in: 0', 'the: 0', 'other: 0', 'night: 0', 'for: 0', 'a: 0', 'quick: 0', 'bite: 0']
  sample 4: ['The: 0', 'sangria: 1', 'was: 0', 'pretty: 0', 'tasty: 0', 'and: 0', 'good: 0', 'on: 0', 'a: 0', 'hot: 0', 'muggy: 0', 'day: 0']
  sample 5: ['We: 0', 'go: 0', 'on: 0', 'Mondays: 0', 'for: 0', 'the: 0', 'prix: 1', 'fixe: 1', 'and: 0', 'our: 0', 'experience: 0', 'with: 0', 'the: 0', 'food: 1', 'has: 0', 'been: 0', 'comparable: 0', 'to: 0', 'Blue: 0', 'Ribbon: 0']
  sample 6: ['First: 0', 'went: 0', 'here: 0', 'to: 0', 'enjoy: 0', 'their: 0', 'garde

In [None]:
#to see what vocabulary does
print(torchtext.__version__)
print("vocab size:", len(vocabulary))
print([f"{x}:{y}" for x, y in list(vocabulary.get_stoi().items())[:10]])
print("this index: ", vocabulary["this"])
print("<pad> index: ", vocabulary["<pad>"])
print("<unk> index", vocabulary["<unk>"])
print("word at index 52: ", vocabulary.get_itos()[52])
print("unknown words are indexed at: ", vocabulary["alskfj"])

0.10.0
vocab size: 3566
['walmart:3565', 'crawling:3564', 'Buyer:3563', 'decently:3562', 'Pros:3558', 'discontinued:3557', 'safari:3555', 'Many:3554', 'subwoofer:3553', 'scary:3552']
this index:  109
<pad> index:  0
<unk> index 1
word at index 52:  been
unknown words are indexed at:  1


In [None]:
#to see label_vocabulary; note that this way of handling <pad> turns the binary classification problem into a multi-class one
print("vocab size:", len(label_vocabulary))
print("string to index")
print(label_vocabulary.get_stoi())
print()
print("index to string")
print(label_vocabulary.get_itos())

vocab size: 3
string to index
{'<pad>': 2, '0': 0, '1': 1}

index to string
['0', '1', '<pad>']


In [None]:
"""one more auxiliary printing of data for debug purposes"""
#inputs from the first (could be any) sentence
input_tensor = dataset[0]["inputs"]

#forms of the first (could be any) sentence
print([d["token"] if d is not None else "None" for d in dataset.get_elem(0)])
# Let’s print each token with the index in the vocabulary, ‘None’ stands for padding.
print(list(zip([d["token"] if d is not None else "None" for d in dataset.get_elem(0)], input_tensor.tolist())))
print("appetizers has index: ", vocabulary["appetizers"])

['A', 'hearty', 'two', 'thumbs', 'up', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None']
[('A', 2), ('hearty', 1), ('two', 3), ('thumbs', 4), ('up', 5), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None', 0), ('None'

In [None]:
#the most of labels correspond to the padding, but they are going to be cut off, i.e. ignored
output_tensor = dataset[0]["outputs"]
output_tensor

tensor([0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2])

In [None]:
#prints for better understanding of appropriate vocabulary/embedded vocabulary manipulations
"""print(vocabulary.get_stoi()['appetizers'])
print(word_vectors['appetizers'])
print(embeddings[vocabulary.get_stoi()['appetizers']])"""

553
tensor([-5.7418e-01,  4.9870e-01,  3.1178e-01,  6.6689e-01,  3.1680e-01,
         1.0406e-01,  9.0021e-01,  6.3674e-02, -5.0363e-01, -3.3445e-01,
         1.7486e-01,  6.1917e-02, -1.2625e-04,  6.4889e-01,  9.3980e-01,
        -2.4753e-02, -5.7799e-01, -4.2463e-01, -4.0425e-01,  5.1427e-01,
         5.8695e-02,  1.8698e-01,  8.5402e-02, -9.9113e-02, -8.2900e-02,
         6.0906e-01, -7.5863e-01, -8.1057e-02, -2.2382e-01, -1.0357e+00,
        -2.1335e-02,  1.1643e-01,  6.3951e-02, -1.3832e+00,  1.4857e-01,
         1.1152e+00,  3.2367e-03, -4.0337e-01, -1.8049e-01, -5.7761e-01,
         7.1744e-01, -8.9309e-02, -1.5069e-01, -5.7849e-01,  3.1301e-01,
         2.5030e-01, -6.7838e-01, -3.8537e-01,  1.1833e-02,  5.6701e-01,
         7.9209e-02,  2.0186e-01, -4.9630e-02, -2.5854e-01, -1.1691e+00,
         5.7579e-01,  4.4603e-01,  3.3672e-01, -1.0553e-01,  4.2730e-01,
        -9.3159e-01,  9.4412e-01,  5.3920e-01,  3.6955e-01, -2.0122e-01,
        -6.8636e-01,  6.3220e-02, -2.1345e-01, 

In [None]:
embeddings.size(0), embeddings.size(1)

(3566, 100)

In [None]:
#prints to check embedded representation
embedding_layer = nn.Embedding(len(vocabulary), 10)
print(embedding_layer)
# x_ids was not defined anywhere in the notebook (it only existed as leftover kernel
# state), so it is built here: the non-padding indices of the first encoded window
x_ids = [idx for idx in dataset[0]["inputs"].tolist() if idx != vocabulary["<pad>"]]
x_embeddings = embedding_layer(torch.LongTensor(x_ids))
print(x_embeddings.shape)
print(x_embeddings[0])
print(x_embeddings[0].detach().numpy())
print(x_embeddings[0].tolist())

Embedding(3566, 10)
torch.Size([12, 10])
tensor([-0.5353,  0.5259,  0.3934, -0.9987, -1.8888,  0.5878,  0.5776,  0.2579,
         0.3110, -1.1750], grad_fn=<SelectBackward>)
[-0.53532803  0.5259088   0.3933937  -0.99874914 -1.8888019   0.5877501
  0.5775815   0.2579008   0.3110446  -1.1749836 ]
[-0.5353280305862427, 0.5259088277816772, 0.39339369535446167, -0.9987491369247437, -1.8888019323349, 0.5877500772476196, 0.577581524848938, 0.25790080428123474, 0.3110446035861969, -1.1749836206436157]


#**Model**#

In [None]:
class AspectExtractionModel(pl.LightningModule):
    """Embedding -> dropout -> LSTM -> dropout -> linear token classifier.

    The model works with or without pretrained word embeddings. It is trained
    with cross-entropy over ``hparams.num_classes`` labels; positions carrying
    the "<pad>" label are ignored by the loss.
    """

    def __init__(self, hparams, embeddings = None, *args, **kwargs):
        """
        Args:
            hparams: object exposing ``vocab_size``, ``embedding_dim``,
                ``hidden_dim``, ``bidirectional``, ``num_layers``, ``dropout``
                and ``num_classes``. An optional ``pad_label_index`` attribute
                overrides the label index ignored by the loss.
            embeddings: optional tensor copied into the embedding weights
                (e.g. GloVe vectors); when ``None`` the random initialisation
                is kept.
        """
        super().__init__()

        #random embedding
        self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)

        #if we want we can substitute random values with pretrained e.g. glove
        if embeddings is not None:
            self.word_embedding.weight.data.copy_(embeddings)

        # inter-layer LSTM dropout only exists between stacked layers; passing a
        # non-zero value with a single layer has no effect and triggers a warning
        lstm_dropout = hparams.dropout if hparams.num_layers > 1 else 0.0

        #layers architecture is the most basic one
        self.lstm = nn.LSTM(input_size = hparams.embedding_dim, hidden_size = hparams.hidden_dim,
                            bidirectional = hparams.bidirectional,
                            num_layers = hparams.num_layers,
                            dropout = lstm_dropout,
                            batch_first = True)

        # truthiness instead of `is False`, so that values such as 0 stay consistent
        # with what nn.LSTM itself does with the flag
        hidden_dim = hparams.hidden_dim * 2 if hparams.bidirectional else hparams.hidden_dim

        self.dropout = nn.Dropout(hparams.dropout)

        self.lin = nn.Linear(hidden_dim, hparams.num_classes)

        # the pad label can be supplied through hparams; otherwise fall back to the
        # notebook-level label_vocabulary as before
        pad_label_index = getattr(hparams, 'pad_label_index', None)
        if pad_label_index is None:
            pad_label_index = label_vocabulary['<pad>']

        #we need to apply CrossEntropyLoss since the problem formulation is not binary, but padding part is going to be ignored as it should
        self.loss_fn = nn.CrossEntropyLoss(ignore_index = pad_label_index)

    def forward(self, x):
        """Return ``(logits, predictions)`` for a batch of token indices.

        ``predictions`` is the argmax of ``logits`` over the last (class) dimension.
        """
        embedding_out = self.word_embedding(x)
        dropout_out = self.dropout(embedding_out)
        o, (h, c) = self.lstm(dropout_out)
        o = self.dropout(o)
        logits = self.lin(o)
        predictions = torch.argmax(logits, -1)

        return logits, predictions

    def _batch_loss(self, batch):
        """Cross-entropy of one batch, with all positions flattened into a single dimension."""
        inputs = batch["inputs"]
        labels = batch["outputs"]

        logits, _ = self.forward(inputs)

        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)

        return self.loss_fn(logits, labels)

    def training_step(self, batch, batch_nb):
        loss = self._batch_loss(batch)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_nb):
        sample_loss = self._batch_loss(batch)
        self.log('valid_loss', sample_loss, prog_bar=True)

    def test_step(self, batch, batch_nb):
        sample_loss = self._batch_loss(batch)
        self.log('test_loss', sample_loss, prog_bar=True)

    def configure_optimizers(self):
        # Adam with its default settings
        return optim.Adam(self.parameters())

#**Hyperparameters and Training**#

In [None]:
class HParams():
    """Hyperparameters read by AspectExtractionModel."""
    vocab_size = len(vocabulary)
    hidden_dim = 128
    # must match the dimensionality of the pretrained vectors passed as `embeddings`
    embedding_dim = 100
    # the label vocabulary includes the <pad> label, so this is 3 and not 2
    num_classes = len(label_vocabulary)
    bidirectional = False
    num_layers = 1
    dropout = 0.0
hparams = HParams()
window_size, window_shift = 100, 100
data_module = DataModuleAspectExtraction(train_samples, dev_samples, window_size, window_shift, vocabulary, label_vocabulary, batch_size = 128)
# use the GPU only when one is actually available, so the cell also runs on a CPU-only runtime
trainer = pl.Trainer(gpus = 1 if torch.cuda.is_available() else 0, val_check_interval = 1.0, max_epochs = 100)
model = AspectExtractionModel(hparams, embeddings=embeddings)
trainer.fit(model, datamodule = data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | word_embedding | Embedding        | 356 K 
1 | lstm           | LSTM             | 117 K 
2 | dropout        | Dropout          | 0     
3 | lin            | Linear           | 387   
4 | loss_fn        | CrossEntropyLoss | 0     
----------------------------------------------------
474 K     Trainable params
0         Non-trainable params
474 K     Total params
1.899     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




#**Test and Predict**#

In [None]:
dev_loss = trainer.test(model, test_dataloaders = data_module.test_dataloader())
print("dev set loss: {}".format(dev_loss))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.5051995515823364}
--------------------------------------------------------------------------------
dev set loss: [{'test_loss': 0.5051995515823364}]


The function below gets predictions and allows us to see the input sentence, its encoding, the predictions for this sentence and their decoding.

In [None]:
def print_predictions(model, dev, num_outputs, vocabulary, label_vocabulary):
    """Print a token-by-token comparison of gold and predicted labels.

    For each of the first ``num_outputs`` items of ``dev`` the model is run on
    the encoded sentence and a table is printed with one row per token:
    the gold aspect token (blank unless ``gt_flag == 1``), the vocabulary entry
    of the input index, the gold label, the predicted label and the token again
    if it was predicted as ``'1'``.

    Args:
        model: callable returning ``(logits, predictions)`` for a batch of one
            sentence; must provide ``freeze()`` and ``unfreeze()``.
        dev: indexable dataset whose items are dicts with ``"inputs"`` and
            ``"outputs"`` and which exposes ``get_elem(i)`` returning the raw
            per-token dicts (keys ``'token'`` and ``'gt_flag'``).
        num_outputs: number of sentences to show; capped at ``len(dev)``.
        vocabulary: input vocabulary exposing ``get_itos()``.
        label_vocabulary: label vocabulary exposing ``get_itos()``.

    Returns:
        None. The tables are written to standard output.
    """
    # Build the index -> string tables once instead of once per printed token.
    token_itos = vocabulary.get_itos()
    label_itos = label_vocabulary.get_itos()

    model.freeze()
    try:
        # Never index past the end of the dataset.
        for i in range(min(num_outputs, len(dev))):

            print(f"sentence {i}\n")
            sentence = dev[i]

            inputs, labels = sentence["inputs"], sentence["outputs"]
            # The model is fed a batch of one sentence, so only row 0 is used.
            _, predictions = model(inputs.unsqueeze(0))
            predicted_labels = [label_itos[j] for j in predictions.tolist()[0]]

            print("token\t\tinput\t\tgold\t\tprediction\t\tdecoded")
            print("-"*100)
            rows = zip(dev.get_elem(i), inputs.tolist(), labels.tolist(), predicted_labels)
            for raw_elem, idx, label, predicted_label in rows:
                # Stop at the first input index 0 (presumably padding -- TODO confirm).
                if idx == 0:
                    break
                gold_token = raw_elem['token'] if raw_elem['gt_flag'] == 1 else ' '
                decoded_token = raw_elem['token'] if predicted_label == '1' else ' '
                print(f"{gold_token}\t\t{token_itos[idx]}\t\t{label_itos[label]}\t\t{predicted_label}\t\t{decoded_token}")
            print("="*30)
    finally:
        # Always restore the model, even if inference or printing failed.
        model.unfreeze()


# Show the token-level gold labels and predictions for the first 3 items of data_module.dev.
print_predictions(model, data_module.dev, 3, vocabulary, label_vocabulary)

sentence 0

token		input		gold		prediction		decoded
----------------------------------------------------------------------------------------------------
 		Seriously		0		0		 
 		this		0		0		 
 		place		0		0		 
 		<unk>		0		0		 
 		<unk>		0		0		 
sentence 1

token		input		gold		prediction		decoded
----------------------------------------------------------------------------------------------------
 		Not		0		0		 
 		enough		0		0		 
wines		wines		1		1		wines
by		by		1		1		by
the		the		1		0		 
glass		glass		1		0		 
 		either		0		0		 
sentence 2

token		input		gold		prediction		decoded
----------------------------------------------------------------------------------------------------
 		My		0		0		 
 		wife		0		0		 
 		and		0		0		 
 		I		0		0		 
 		always		0		0		 
 		enjoy		0		0		 
 		the		0		0		 
 		young		0		0		 
 		not		0		0		 
 		always		0		0		 
 		well		0		0		 
 		<unk>		0		0		 
 		but		0		0		 
 		<unk>		0		0		 
 		friendly		0		0		 
staff		staff		1		1		staff
 		all		0		0		 
 		of		0		0

The function below returns the set of predicted words for each input sentence.

In [None]:
def predict(model, dataset, label_vocab=None):
    """Return, for every item of ``dataset``, the set of tokens predicted as ``'1'``.

    Each encoded sentence is fed to the model as a batch of one; the predicted
    label indices are mapped to strings through the label vocabulary and the
    raw tokens whose predicted label is ``'1'`` are collected.

    Args:
        model: callable returning ``(logits, predictions)``; must provide
            ``freeze()`` and ``unfreeze()``.
        dataset: sized, indexable dataset whose items are dicts with an
            ``"inputs"`` entry and which exposes ``get_elem(i)`` returning the
            raw per-token dicts (key ``'token'``).
        label_vocab: label vocabulary exposing ``get_itos()``. When omitted,
            the notebook-level ``label_vocabulary`` is used, which keeps
            existing two-argument calls working.

    Returns:
        A list with one ``set`` of predicted tokens per dataset item.
    """
    if label_vocab is None:
        # Backward-compatible fallback to the notebook-level global.
        label_vocab = label_vocabulary
    # Build the index -> label table once instead of once per sentence.
    label_itos = label_vocab.get_itos()

    decoded = []
    model.freeze()
    try:
        for i in range(len(dataset)):
            inputs = dataset[i]["inputs"]
            # Batch of one sentence, so only row 0 of the predictions is used.
            _, predictions = model(inputs.unsqueeze(0))
            predicted_labels = [label_itos[j] for j in predictions.tolist()[0]]

            decoded_elem = []
            for raw_elem, idx, predicted_label in zip(dataset.get_elem(i), inputs.tolist(), predicted_labels):
                # Stop at the first input index 0 (presumably padding -- TODO confirm).
                if idx == 0:
                    break
                if predicted_label == '1':
                    decoded_elem.append(raw_elem['token'])

            decoded.append(set(decoded_elem))
    finally:
        # Always restore the model, even if inference failed part-way.
        model.unfreeze()
    return decoded


# One set of predicted aspect words per item of data_module.dev.
predictions = predict(model, data_module.dev)

The ground-truth aspects are not only single words; an aspect can consist of several words, whereas the model returns its predictions word by word. Therefore, we need a utility that collects the predicted words into the same form as the aspects. This simple utility builds the predictions for a given sentence by joining the current word with the previous one if they directly follow each other and are both predicted as 1. The result is converted to a set, as required by the **evaluate_extraction** function provided on GitHub.

In [None]:
def utility(sentence, predicted_set):
    """Merge adjacent predicted words of ``sentence`` into multi-word terms.

    The sentence is tokenized with ``word_tokenize``; every maximal run of
    consecutive tokens that all belong to ``predicted_set`` is joined with
    single spaces into one term.

    Args:
        sentence: raw sentence text.
        predicted_set: collection of words predicted as aspect words; only
            membership tests (``in``) are performed on it.

    Returns:
        List of terms in order of appearance in the sentence.
    """
    concats = []
    current_run = []
    for word in word_tokenize(sentence):
        if word in predicted_set:
            current_run.append(word)
        elif current_run:
            # A non-predicted token closes the current run.
            concats.append(' '.join(current_run))
            current_run = []

    # Flush a run that reaches the end of the sentence; without this the last
    # term is lost whenever no non-predicted token (e.g. punctuation) follows it.
    if current_run:
        concats.append(' '.join(current_run))

    return concats

In [None]:
"""some useful prints for the prediction section"""
# samples = read_dataset(restaurants_dev)
# predictions = predict(model, data_module.dev)
# for i in range(30, 40):
#   print(samples[i]['text'])
#   print(predictions[i])
#   print(set(utility(samples[i]['text'], predictions[i])))
#   print(predictions[i] == set(utility(samples[i]['text'], predictions[i])))
#   print('-'*30)

In [None]:
def evaluate_extraction(samples, predictions_b):
    """Print precision, recall and F1 (in percent) for aspect-term extraction.

    For every sample the predicted words are merged into terms with
    ``utility`` and compared, as sets, against the gold terms taken from
    element ``[1]`` of each entry of ``sample["targets"]``. True positives,
    false positives and false negatives are summed over all samples.
    The predicted and gold term sets of every 10th sample are printed.

    Args:
        samples: iterable of dicts with the keys ``'text'`` and ``"targets"``.
        predictions_b: iterable, aligned with ``samples``, holding the
            predicted words for each sample.

    Returns:
        None. Precision, recall and F1 are printed, one per line. A metric
        whose denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    scores = {"tp": 0, "fp": 0, "fn": 0}
    for sample_idx, (sample, prediction) in enumerate(zip(samples, predictions_b)):
        pred_terms = set(utility(sample['text'], prediction))
        gt_terms = {term_gt[1] for term_gt in sample["targets"]}
        # Show every 10th sample as a sanity check.
        if sample_idx % 10 == 0:
            print("Predicted: ", pred_terms)
            print("True: ", gt_terms)
            print('-'*30)

        scores["tp"] += len(pred_terms & gt_terms)
        scores["fp"] += len(pred_terms - gt_terms)
        scores["fn"] += len(gt_terms - pred_terms)

    predicted_total = scores["tp"] + scores["fp"]
    gold_total = scores["tp"] + scores["fn"]
    # Guard every division: no predictions, no gold terms or no overlap at all
    # would otherwise raise ZeroDivisionError.
    precision = 100 * scores["tp"] / predicted_total if predicted_total else 0.0
    recall = 100 * scores["tp"] / gold_total if gold_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    print(precision)
    print(recall)
    print(f1)

# Recompute the word-level predictions (same call as in the cell above) and
# print precision, recall and F1 against the gold terms in dev_samples.
predictions = predict(model, data_module.dev)
evaluate_extraction(dev_samples, predictions)

Predicted:  set()
True:  set()
------------------------------
Predicted:  set()
True:  set()
------------------------------
Predicted:  {'food'}
True:  {'food'}
------------------------------
Predicted:  set()
True:  set()
------------------------------
Predicted:  set()
True:  set()
------------------------------
Predicted:  {'table', 'reservation'}
True:  {'table', 'reservation'}
------------------------------
Predicted:  set()
True:  set()
------------------------------
Predicted:  {'dishes', 'Pad thai', 'lad nar', 'on paper', 'thai food'}
True:  {'places', 'dishes', 'Pad thai', 'lad nar', 'thai food'}
------------------------------
Predicted:  {'flavors'}
True:  {'flavors'}
------------------------------
Predicted:  {'ambience', 'Service'}
True:  {'ambience', 'Service'}
------------------------------
Predicted:  set()
True:  set()
------------------------------
Predicted:  {'service', 'food', 'decor'}
True:  {'service', 'food', 'decor'}
------------------------------
Predicted:  {'

#**Save and Load**#

In [None]:
#to save with custom name
#torch.save(model.state_dict(), os.path.join(PATH_OUTPUT_FOLDER, 'best_weights.pt'))

In [None]:
# !rm -rf sample_data
# !mkdir data

In [None]:
#%cd data

In [None]:
#%cd ..

In [None]:
#directory to save checkpoints and path to it
# root_folder = '/content/data/' 
# !mkdir output_folder
# PATH_OUTPUT_FOLDER = os.path.join(root_folder, 'output_folder/')

In [None]:
# model_loaded = AspectExtractionModel(hparams, embeddings=embeddings)
# device = 'cuda'
# model_loaded.load_state_dict(torch.load(os.path.join(PATH_OUTPUT_FOLDER, 'best_weights.pt'), 
#                                      map_location=torch.device(device)))
# predictions_loaded = predict(model_loaded, data_module.dev)
# evaluate_extraction(dev_samples, predictions_loaded)