<a href="https://colab.research.google.com/github/luou-wen/NLP-learning/blob/main/20200424_NLG_Model_evaluation_with_BLEU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

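ML pipeline for an undergraduate dissertation: this notebook loads a trained encoder–decoder NLG model, generates text for the E2E test set, and scores the output with the `e2e-metrics` scorer (BLEU, NIST, METEOR, ROUGE-L, CIDEr).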

In [None]:
# Mount Google Drive at /content/gdrive; later cells copy the checkpoint and
# the dataset out of "/content/gdrive/My Drive".
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!curl -L https://cpanmin.us | perl - App::cpanminus  # install cpanm
!cpanm XML::Twig

!gdown https://raw.githubusercontent.com/tuetschek/e2e-metrics/master/measure_scores.py
!gdown https://github.com/tuetschek/e2e-metrics/archive/master.zip

!unzip master.zip

In [None]:
# Smoke test: run the scorer on the example reference/output files that ship
# in the e2e-metrics archive.
!./e2e-metrics-master/measure_scores.py ./e2e-metrics-master/example-inputs/devel-conc.txt ./e2e-metrics-master/example-inputs/baseline-output.txt

Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 2198 tokens at 17069.01 tokens per second.
PTBTokenizer tokenized 162 tokens at 1864.15 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.480
computing Rouge score...
ROUGE_L: 0.791
computing CIDEr score...
CIDEr: 2.304
Creating temp directory  /tmp/e2e-eval-81afan6g
Running MTEval to compute BLEU & NIST...
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14; at /content/e2e-metrics-master/mteval/mteval-v13a-sig.pl line 993.
MT evaluation scorer began on 2020 May 1 at 19:58:53
command line:  /content/e2e-metrics-master/mteval/mteval-v13a-sig.pl -r /tmp/e2e-eval-81afan6g/mteval_ref.sgm -s /tmp/e2e-eval-81afan6g/mteval_src.sgm -t /tmp/e2e-eval-81afan6g/mteval_sys.sgm -f /tmp/e2e-eval-81afan6g/mteval_l

In [None]:
# utils
# Forward mapping: punctuation character -> placeholder token. Every token is
# padded with a space on both sides so that it becomes a separate word once
# the text is split on whitespace.
punc_tokens = {
    "!": " <EXCLAIM> ",
    ".": " <PERIOD> ",
    "?": " <QMARK> ",
    ",": " <COMMA> ",
    "(": " <LPAREN> ",
    ")": " <RPAREN> ",
    '"': " <QUOTE> ",
    ";": " <SEMICOLON> ",
    "\n": " <RETURN> ",
    "\t": " <TAB> ",
    "~": " <TILDE> ",
    "-": " <HYPHEN> ",
    "'": " <APOST> ",
    ":": " <COLON> ",
}


def replace_punctuation(dataset):
    """ Replace punctuation characters with their placeholder tokens.

        Each string in `dataset` is scanned character by character; characters
        that are keys of the module-level `punc_tokens` mapping are swapped for
        the mapped token, all other characters are kept. Returns a new list
        with one converted string per input string.
    """
    converted = []
    for seq in dataset:
        pieces = (punc_tokens.get(symbol, symbol) for symbol in seq)
        converted.append(''.join(pieces))
    return converted


def extract_ngrams(sequence, n=2):
    """ Return the n-grams of `sequence` as a list of n-tuples.

        Built by zipping the sequence against copies of itself shifted by
        0, 1, ..., n-1 positions; zip stops at the shortest copy, so a
        sequence shorter than `n` yields an empty list.
    """
    shifted_views = (sequence[offset:] for offset in range(n))
    return list(zip(*shifted_views))


def corrupt(dataset, p_drop=0.6, min_count=100):
    """ Corrupt sequences in a dataset by randomly dropping frequent words.

        Only tokens that occur more than `min_count` times across the whole
        dataset can be dropped; each occurrence of such a token is removed
        with probability `p_drop`. Every other token is always kept.

        Arguments
        ---------
        dataset: sequence of token sequences
        p_drop: probability of dropping one occurrence of a frequent token
        min_count: a token is "frequent" when its total count exceeds this
                   value (default 100, the previously hard-coded threshold)

        Returns
        -------
        A list with one list of surviving tokens per input sequence. An empty
        dataset returns an empty list instead of raising from np.concatenate.
    """
    if len(dataset) == 0:
        return []

    values, counts = np.unique(np.concatenate(dataset), return_counts=True)
    to_drop = set(values[counts > min_count])

    # The random draw only happens for droppable tokens, so a rare token can
    # never be removed by an unlucky draw of exactly 0.0.
    out_seq = [[each for each in seq
                if each not in to_drop or np.random.rand() > p_drop]
               for seq in dataset]

    return out_seq


def shuffle(original_seq, corrupted):
    """ Randomly reorder a corrupted sequence chunk by chunk.

        Neighbouring tokens of `corrupted` stay together when the pair also
        occurs as a bigram in `original_seq`; the resulting chunks are then
        shuffled with random.shuffle and flattened back into one list.
        An empty `corrupted` is returned unchanged.

        NOTE(review): the final token always starts a chunk of its own, even
        when it forms a known bigram with the token before it -- confirm this
        is intended.
    """
    if not corrupted:
        return corrupted

    known_bigrams = extract_ngrams(original_seq)

    # Work on a copy so the caller's list is left untouched.
    tokens = corrupted.copy()
    final_position = len(tokens) - 1

    chunks = [[tokens[0]]]
    for position in range(1, len(tokens)):
        token = tokens[position]
        extends_chunk = (position != final_position
                         and (chunks[-1][-1], token) in known_bigrams)
        if extends_chunk:
            chunks[-1].append(token)
        else:
            chunks.append([token])

    random.shuffle(chunks)
    return [token for chunk in chunks for token in chunk]


def get_tokens(dataset):
    """ Build the word <-> integer id lookup tables for a text dataset.

        The strings in `dataset` are joined and split on whitespace. Words
        are numbered from 3 upwards in order of decreasing frequency (ties
        keep first-seen order). Ids 0, 1 and 2 are reserved for <SOS>, <EOS>
        and <UNK>.

        Returns (vocab_to_int, int_to_vocab).
    """
    word_counts = Counter(" ".join(dataset).split())

    # most_common() sorts by descending count and is stable for equal counts.
    ranked_words = [word for word, _ in word_counts.most_common()]

    # Ids start at 3; the first three are taken by the special tokens below.
    vocab_to_int = {word: idx for idx, word in enumerate(ranked_words, start=3)}
    vocab_to_int.update({
        "<SOS>": 0,  # Start of sentence
        "<EOS>": 1,  # End of sentence
        "<UNK>": 2,  # Unknown word
    })

    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [None]:
# Copy the trained model checkpoint from Drive to the local runtime disk.
!cp "/content/gdrive/My Drive/nlg_2020-04-24_08:43:49.989893.pth" /content/

# Copy the E2E dataset folder (read later from /content/e2e-data/e2e-dataset/).
!cp -r "/content/gdrive/My Drive/_Dissertation/e2e-data" /content/

In [None]:
from collections import Counter
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datetime import datetime

In [None]:
class Encoder(nn.Module):
    """ Embedding layer followed by a bidirectional, multi-layer LSTM. """

    def __init__(self, vocab_size, embedding_size=300, hidden_size=256, num_layers=2, drop_p=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_p,
            bidirectional=True,
        )

    def forward(self, input, hidden):
        """ Embed the token ids in `input` and run them through the LSTM.

            Returns the LSTM's (output, hidden) pair unchanged.
        """
        return self.lstm(self.embedding(input), hidden)

    def init_hidden(self, device='cpu'):
        """ Create zero-filled hidden and cell states, each with shape
            (num_layers * num_directions, batch, hidden_size), for a batch of 1.
        """
        num_directions = 2
        state_shape = (num_directions * self.num_layers, 1, self.hidden_size)
        # One allocation holding both states; index 0 is h_0, index 1 is c_0.
        both_states = torch.zeros(2, *state_shape, device=device)
        return both_states[0], both_states[1]

In [None]:
# Attention network from http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
class Decoder(nn.Module):
    """ Single-step attention decoder built around a bidirectional LSTM.

        Each call to forward() consumes one token, attends over a fixed
        window of `max_length` encoder outputs and returns log-probabilities
        over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_size=300, hidden_size=256,
                       num_layers=2, drop_p=0.1, max_length=50):

        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = max_length

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # Scores one weight per encoder position from [embedding ; hidden state].
        self.attn = nn.Linear(self.hidden_size + embedding_size, self.max_length)
        # Projects [embedding ; attended encoder output] down to the LSTM input size.
        self.attn_combine = nn.Linear(self.hidden_size * 2 + embedding_size, self.hidden_size)
        self.dropout = nn.Dropout(drop_p)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            dropout=drop_p,
            bidirectional=True,
        )

        self.out = nn.Linear(2 * hidden_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, encoder_outputs):
        """ Run one decoding step.

            Returns (log-probabilities reshaped to one row, updated LSTM
            state, attention weights over the encoder positions).
        """
        token_embedding = self.dropout(self.embedding(input))[0]
        # Entry 0 of the hidden-state tensor is used as the attention query.
        query_state = hidden[0][0]

        # Attention distribution over the encoder positions.
        attn_scores = self.attn(torch.cat((token_embedding, query_state), 1))
        attn_weights = F.softmax(attn_scores, dim=1)

        # Weighted sum of the encoder outputs under that distribution.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))[0]

        # Merge the token embedding with the context and feed it to the LSTM.
        combined = self.attn_combine(torch.cat((token_embedding, context), 1))
        lstm_input = F.relu(combined.unsqueeze(0))

        lstm_output, hidden = self.lstm(lstm_input, hidden)
        log_probs = self.log_softmax(self.out(lstm_output).view(1, -1))

        return log_probs, hidden, attn_weights

    def init_hidden(self, device='cpu'):
        """ Create zero-filled hidden and cell states, each with shape
            (num_layers * num_directions, batch, hidden_size), for a batch of 1.
        """
        num_directions = 2
        state_shape = (num_directions * self.num_layers, 1, self.hidden_size)
        both_states = torch.zeros(2, *state_shape, device=device)
        return both_states[0], both_states[1]

In [None]:
def predict(input_tensor, encoder, decoder, max_length=None, sos_token=0, eos_token=None):
  """Greedily decode an output token sequence for one encoded input.

  Arguments
  ---------
  input_tensor: LongTensor of token ids shaped (sequence length, 1); every
                tensor created here is placed on this tensor's device, so the
                function no longer relies on a global `device`.
  encoder, decoder: the trained modules; both are switched to eval mode.
  max_length: size of the encoder-output window given to the decoder and the
              maximum number of decoding steps. Defaults to
              `decoder.max_length` (50 if the decoder has no such attribute),
              which is the size the attention layer expects.
  sos_token: id fed to the decoder at the first step (default 0).
  eos_token: id that stops decoding. Defaults to the global
             `vocab_to_int['<EOS>']`.

  Returns
  -------
  List of (1, 1) LongTensors, one per generated token. The end-of-sentence
  token is included when it was produced.
  """
  if max_length is None:
    max_length = getattr(decoder, 'max_length', 50)
  if eos_token is None:
    eos_token = vocab_to_int['<EOS>']
  device = input_tensor.device

  encoder.eval()
  decoder.eval()

  with torch.no_grad():
    h, c = encoder.init_hidden(device=device)
    # Call the modules rather than .forward() so registered hooks still run.
    enc_outputs, enc_hidden = encoder(input_tensor, (h, c))

    # Fixed-size window of encoder outputs: longer inputs are truncated,
    # shorter ones are zero-padded.
    encoder_outputs = torch.zeros(max_length, 2 * encoder.hidden_size, device=device)
    used = min(enc_outputs.shape[0], max_length)
    encoder_outputs[:used] = enc_outputs[:used, 0, :]

    # First decoder input is the <SOS> token
    dec_input = torch.tensor([[sos_token]], dtype=torch.long, device=device)
    dec_hidden = enc_hidden

    dec_outputs = []
    for _ in range(max_length):
      dec_out, dec_hidden, _attn = decoder(dec_input, dec_hidden, encoder_outputs)
      # Greedy choice: keep the single most likely token.
      _, out_token = dec_out.topk(1)
      dec_outputs.append(out_token)

      if out_token.item() == eos_token:
        break

      dec_input = out_token

  return dec_outputs

In [None]:
def get_test_data(dataset,trainset_size=1000,testset_size=100):
  """Collect the (input, target) tensors of the held-out slice of `dataset`.

  The slice covers rows trainset_size .. trainset_size + testset_size. It is
  passed to `dataloader`, and every yielded pair is moved to the global
  `device`. Returns two parallel lists: input tensors and target tensors.

  NOTE(review): `dataloader` is not defined in the visible part of this
  notebook -- confirm it exists before calling this function.
  """
  held_out = dataset[trainset_size:trainset_size+testset_size]
  pairs = [(source.to(device), target.to(device))
           for source, target in dataloader(held_out)]
  input_tensors = [source for source, _ in pairs]
  target_tensors = [target for _, target in pairs]
  return input_tensors, target_tensors

In [None]:
# Copy corpus.csv from the Drive dissertation folder to the local runtime disk.
!cp "/content/gdrive/My Drive/_Dissertation/dissertation/e2e-submission-papers/challenge_submissions/corpus.csv"  /content/

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sizes must match the saved checkpoint. 2943 is presumably the vocabulary
# size used at training time -- TODO confirm against len(vocab_to_int).
encoder = Encoder(2943, hidden_size=512, drop_p=0.1).to(device)
decoder = Decoder(2943, hidden_size=512, drop_p=0.1, max_length=50).to(device)

# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint holding CUDA tensors can still be restored on a CPU-only runtime.
checkpoint = torch.load("/content/nlg_2020-04-24_08:43:49.989893.pth", map_location=device)
encoder.load_state_dict(checkpoint['encoder_sd'])
decoder.load_state_dict(checkpoint['decoder_sd'])

# Inference only: eval mode disables dropout.
encoder.eval()
decoder.eval()

Decoder(
  (embedding): Embedding(2943, 300)
  (attn): Linear(in_features=812, out_features=50, bias=True)
  (attn_combine): Linear(in_features=1324, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (lstm): LSTM(512, 512, num_layers=2, dropout=0.1, bidirectional=True)
  (out): Linear(in_features=1024, out_features=2943, bias=True)
  (log_softmax): LogSoftmax()
)

In [None]:
# Rebuild the vocabulary from the training references so that token ids can
# be mapped to and from words for the loaded model.
trainset = (
    pd.read_csv('/content/e2e-data/e2e-dataset/trainset.csv')
    .assign(clean=lambda frame: replace_punctuation(frame['ref']))
)
vocab_to_int, int_to_vocab = get_tokens(trainset['clean'])

In [None]:
# Hand-written example input: look up the id of every word.
# Raises KeyError for a word that is not in the vocabulary.
temp_input = "Blue Spice coffee shop city centre"
temp_tokens = [vocab_to_int[item] for item in temp_input.split(" ")]
temp_tokens

[82, 118, 26, 27, 31, 32]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# view(-1, 1) turns the id list into a column: one row per token, one column.
test_tensor = torch.tensor(temp_tokens).view(-1,1).type(torch.LongTensor)
dec_outputs = predict(test_tensor.to(device), encoder, decoder)
# Map the generated ids back to words for inspection.
print([int_to_vocab[each.item()] for each in dec_outputs])

['Blue', 'Spice', 'is', 'a', 'low', '<HYPHEN>', 'priced', 'coffee', 'shop', 'in', 'the', 'city', 'centre', 'centre', 'near', '<EOS>']


In [None]:
# Test-set meaning representations; the "MR" column is read by the generation loop.
testset = pd.read_csv("/content/e2e-data/e2e-dataset/testset.csv")

In [None]:
import re
# mr = "name[The Vaults], eatType[pub], food[Italian], priceRange[more than £30], customer rating[high], area[city centre], familyFriendly[yes], near[Rainbow Vegetarian Café]"
def mr_to_tokens(mr, vocab=None):
  """Convert a meaning representation (MR) string into a list of token ids.

  The MR is split on ", " into slots of the form `key[value]`. Only the
  bracketed values are kept; they are joined, split on spaces, and every word
  found in the vocabulary is replaced by its id. Words missing from the
  vocabulary are skipped (they are not mapped to <UNK>).

  Arguments
  ---------
  mr: the MR string (see the example above the function).
  vocab: word -> id mapping. Defaults to the global `vocab_to_int`.

  Returns
  -------
  List of ids, in slot order. Slots without a bracketed value are ignored
  instead of raising AttributeError.
  """
  if vocab is None:
    vocab = vocab_to_int

  # Raw string: "\[" is an invalid escape sequence in a normal string literal.
  bracketed = re.compile(r"\[.*\]")

  input_string = []
  for slot in mr.split(", "):
    match = bracketed.search(slot)
    if match is None:
      continue
    input_string.append(match.group(0).replace("[", "").replace("]", ""))

  words = " ".join(input_string).split(" ")
  return [vocab[word] for word in words if word in vocab]

In [None]:
# Generate one utterance per test-set MR, in the row order of testset["MR"].
output_words = []
for mr in testset["MR"]:
  test_tokens = mr_to_tokens(mr)
  # Column of token ids: one row per token, one column.
  test_tensor = torch.tensor(test_tokens).view(-1,1).type(torch.LongTensor)
  dec_outputs = predict(test_tensor.to(device), encoder, decoder)
  # Store the decoded words as one space-separated string (placeholder tokens included).
  output_words.append(" ".join([int_to_vocab[each.item()] for each in dec_outputs]))

In [None]:
# Show the last ten generated utterances (still containing placeholder tokens).
output_words[-10:]

['The Wrestlers is a low <HYPHEN> priced coffee shop near near the riverside Cuisine near the riverside <EOS>',
 'The coffee shop near The Wrestlers is a friendly coffee shop near Raja Indian Cuisine <PERIOD> <EOS>',
 'Zizzi is a coffee shop shop shop the riverside area <PERIOD> <EOS>',
 'The coffee shop shop The The near The near The Portland <PERIOD> The average customer rating and is family friendly <EOS>',
 'The coffee shop shop The The near The Portland near The Portland Arms <PERIOD> <EOS>',
 'The coffee shop shop The The Sorrento is The The is <PERIOD> The Sorrento <PERIOD> <EOS>',
 'Zizzi is a family friendly pub pub the riverside area <PERIOD> <EOS>',
 'The a average shop shop near near near The near The riverside <PERIOD> <EOS>',
 'The near The Portland Arms <COMMA> a high friendly coffee shop shop high customer rating <PERIOD> <EOS>',
 'The a pub friendly pub The The The The The The The <PERIOD> <EOS>']

In [None]:
# Inverse mapping: placeholder token -> punctuation character.
# It has its own name so that it does not overwrite `punc_tokens`, the
# character -> token mapping that replace_punctuation() reads. The '<QUOTE>'
# key previously had a trailing space and could never match a split word.
token_to_punc = {'<EXCLAIM>': '!',
 '<PERIOD>': '.',
 '<QMARK>': '?',
 '<COMMA>': ',',
 '<LPAREN>': '(',
 '<RPAREN>': ')',
 '<QUOTE>': '"',
 '<SEMICOLON>': ';',
 '<RETURN>': '\n',
 '<TAB>': '\t',
 '<TILDE>': '~',
 '<HYPHEN>': '-',
 '<APOST>': '\'',
 '<COLON>': ':'
}


def return_punctuation(dataset):
    """ Turn placeholder tokens back into punctuation characters.

        Every string in `dataset` is split on single spaces. A word that is a
        placeholder token is replaced by its punctuation character and joined
        to its neighbours without spaces (e.g. 'low <HYPHEN> priced' becomes
        'low-priced'). All other words keep a single space between them, and
        '<EOS>' is removed. Tokens at the start or end of a string, or right
        next to another token, are restored too.

        NOTE(review): punctuation is glued to both neighbours, so 'a <COMMA> b'
        becomes 'a,b'; the space that normally follows a comma or full stop
        is not restored.

        Returns a new list with one restored string per input string.
    """
    ret_punc = []
    for seq in dataset:
        pieces = []
        # True at the start and right after punctuation: no separating space.
        glue_next = True
        for word in seq.split(" "):
            punctuation = token_to_punc.get(word)
            if punctuation:
                pieces.append(punctuation)
                glue_next = True
                continue
            if not glue_next:
                pieces.append(" ")
            pieces.append(word.replace('<EOS>', ''))
            glue_next = False
        ret_punc.append("".join(pieces))
    return ret_punc

# Restore punctuation in the generated utterances; `testso` is what gets
# written to the system-output file.
test_outputw = output_words
testso = return_punctuation(test_outputw)
test_df = pd.DataFrame(testso)

In [None]:
# Test set with human references; the "mr" and "ref" columns are used to
# build the reference file.
test_out = pd.read_csv("/content/e2e-data/e2e-dataset/testset_w_refs.csv")

In [None]:
# Reference file for the scorer: all references of one MR on consecutive
# lines, with an empty line after each MR group.
with open("test_out.txt", "w", encoding="utf-8") as f:
  # sort=False keeps the groups in order of first appearance in the CSV.
  # The default (sort=True) orders them alphabetically by MR string, which
  # breaks the line-by-line pairing with the model outputs generated in
  # testset["MR"] row order.
  # NOTE(review): assumes testset.csv lists the MRs in the same order as they
  # first appear in testset_w_refs.csv -- confirm.
  for name, group in test_out.groupby("mr", sort=False):
    for item in group["ref"]:
      f.write(item+"\n")
    f.write("\n")
    

In [None]:
# System-output file for the scorer: one generated utterance per line.
with open("test_model_out.txt", "w") as f:
  f.writelines(line + "\n" for line in testso)

In [None]:
# Score the model outputs against the references (arguments: reference file, then system output file).
!./e2e-metrics-master/measure_scores.py ./test_out.txt ./test_model_out.txt

Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 129948 tokens at 309448.71 tokens per second.
PTBTokenizer tokenized 14221 tokens at 65655.75 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.210
computing Rouge score...
ROUGE_L: 0.409
computing CIDEr score...
CIDEr: 0.489
Creating temp directory  /tmp/e2e-eval-vusvbtt7
Running MTEval to compute BLEU & NIST...
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14; at /content/e2e-metrics-master/mteval/mteval-v13a-sig.pl line 993.
MT evaluation scorer began on 2020 May 1 at 20:00:29
command line:  /content/e2e-metrics-master/mteval/mteval-v13a-sig.pl -r /tmp/e2e-eval-vusvbtt7/mteval_ref.sgm -s /tmp/e2e-eval-vusvbtt7/mteval_src.sgm -t /tmp/e2e-eval-vusvbtt7/mteval_sys.sgm -f /tmp/e2e-eval-vusvbtt7/mt