# BERT Word Embeddings with Regressors

**Author**: Harry Coppock, Faidon Mitzalis, Maleakhi Wijaya  
**Date**: 20 February 2020

The file contains the following items:
- BERT Pre-processing and embedding algorithms
- Regressors
  - Feed Forward Neural Network
  - LSTM

## BERT Embeddings

### Library

In [1]:
from os.path import exists
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
!pip install pytorch-pretrained-bert
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertAdam
import logging
import torch
from scipy.stats.stats import pearsonr
from sklearn.metrics import mean_squared_error
import time 



### Importing Data

In [0]:
if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

In [3]:
# English-Chinese
# Checking Data
print("---EN-ZH---")
print()

with open("./train.enzh.src", "r") as enzh_src:
  print("Source: ",enzh_src.readline())
with open("./train.enzh.mt", "r") as enzh_mt:
  print("Translation: ",enzh_mt.readline())
with open("./train.enzh.scores", "r") as enzh_scores:
  print("Score: ",enzh_scores.readline())

---EN-ZH---

Source:  The last conquistador then rides on with his sword drawn.

Translation:  最后的征服者骑着他的剑继续前进.

Score:  -1.5284005772625449



In [0]:
# Read scores (train)
with open("./train.enzh.scores") as f:
    content = f.readlines()

scores_train = [float(string.replace("\n", "")) for string in content]

# Read scores (dev)
with open("./dev.enzh.scores") as f:
    content = f.readlines()

scores_dev = [float(string.replace("\n", "")) for string in content]

In [0]:
# Read english (train)
with open("./train.enzh.src") as f:
    content = f.readlines()

english_train = content

# Read english (dev)
with open("./dev.enzh.src") as f:
    content = f.readlines()

english_dev = content

# Read english (test)
with open("./test.enzh.src") as f:
    content = f.readlines()

english_test = content

In [0]:
# Read Chinese (train)
with open("./train.enzh.mt") as f:
    content = f.readlines()

zh_train = content

# Read Chinese (dev)
with open("./dev.enzh.mt") as f:
    content = f.readlines()

zh_dev = content

# Read Chinese (test)
with open("./test.enzh.mt") as f:
    content = f.readlines()
  
zh_test = content

### BERT Pre-processing

In [8]:
# Calculating the maximum sentence length for the english and the german corpus
# before tokenization
max_sent_length_english = max([len(sentence.split()) for sentence in english_train])
max_sent_length_zh = max([len(sentence.split()) for sentence in zh_train])
max_sent_length = max(max_sent_length_english, max_sent_length_zh)
print("Maximum sentence length:", max_sent_length)

Maximum sentence length: 48


In [9]:
# Initialise multilingual bert model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [0]:
def prep_corpus(corpus_en, corpus_zh, max_len):
    """
    Parameters: 
          - corpus: list of sentences
          - max_len: maximum sentence length
    Returns:
          - tokenised ids tensors (#sentences, #max_sentence_length)
          - segment flag for BERT (#sentences, #max_sentence_length)
    """
    indexed_corpus = []
    ids_corpus = []
    
    for counter, sentence in enumerate(corpus_en):
        # Mark beginning and end of sentence    
        marked_sentence_en = "[CLS] " + sentence + " [SEP]" 
        marked_sentence_zh = corpus_zh[counter]   
        tokenized_sentence_en = tokenizer.tokenize(marked_sentence_en)    
        tokenized_sentence_zh = tokenizer.tokenize(marked_sentence_zh)    
        tokenized_sentence = tokenized_sentence_en + tokenized_sentence_zh

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentence)
        # Add PADDING (id=0) to achieve fixed sentence length
        padding_num = (max_len*2 - np.shape(indexed_tokens)[0])
        indexed_tokens.extend([0]*padding_num)
        # Append tokenized sentence to corpus
        indexed_corpus.append(indexed_tokens)
        # Set BERT setting
        segments_ids = ([0]*len(tokenized_sentence_en) + [1]*len(tokenized_sentence_zh)
        + [0]*padding_num )
        ids_corpus.append(segments_ids)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_corpus)
    segments_tensors = torch.tensor(ids_corpus)
  
    return tokens_tensor, segments_tensors

In [17]:
# Get BERT token ids example
emb = prep_corpus(english_train, zh_train, 75)
print(emb[0][0])

tensor([  101, 10117, 12469, 25735, 11849, 11059, 48543, 10107, 10135, 10169,
        10226, 79400, 34788,   119,   102,  4458,  2775,  5718,  3763,  4463,
         6457,  8575,  5778,  2196,  5718,  2570,  6352,  6356,  2568,  7701,
          119,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

## Regressors

This sections contains the Feed Forward Neural Network used to get the best result and commented LSTM.

### Setup

In [12]:
# Torch Setup
print('Torch version: {}, CUDA: {}'.format(torch.__version__, torch.version.cuda))
cuda_available = torch.cuda.is_available()
if not torch.cuda.is_available():
  print('WARNING: You may want to change the runtime to GPU!')
  DEVICE = 'cpu'
else:
  DEVICE = 'cuda:0'

Torch version: 1.4.0, CUDA: 10.1


In [0]:
def set_seed(seed):
    """ Set all seeds to make results reproducible (deterministic mode).
        When seed is a false-y value or not supplied, disables deterministic mode. """

    if seed:
        logging.info(f"Running in deterministic mode with seed {seed}")
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
    else:
        logging.info(f"Running in non-deterministic mode")

set_seed(4)

### Model Implementation

In [0]:
class EvaluationModel(nn.Module):
    """
    Main model class for the project. (FFNN based)
    """
    def __init__(self):   
        super(EvaluationModel, self).__init__()      
    
        # Embedding layer (BERT) which will be freezed later 
        self.bert_layer = BertModel.from_pretrained('bert-base-multilingual-cased')
        
        ########################################################################
        # For LSTM, run this code and replace self.fc1 below, the rest is same
        # self.lstm = nn.LSTM(768, 200, num_layers=1, bidirectional=True)
        # self.fc1 = nn.Linear(200, 200) 
        ########################################################################

        # Linear layers
        self.fc1 = nn.Linear(768, 200) 
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 1)  

        # Create the loss, don't sum or average, we'll take care of it
        # in the training loop for logging purposes
        self.loss = nn.MSELoss()

    def forward(self, tokens_tensor, segments_tensors):
        """
        Responsible for forward pass.
        """
        out, _ = self.bert_layer(tokens_tensor, segments_tensors)
        x = (out[-1])[:,0,:] # gets the last layer of embeddings
        
        ########################################################################
        # For LSTM, run this code
        # x = out[-1]
        # x = x.permute(1, 0, 2)
        # x,(h_n, c_n) = self.lstm(x)
        # x = h_n[-1]
        ########################################################################

        # Skip this line for rnn, the rest is the same
        x = x.view(tokens_tensor.size(0), -1)

        # Pass to linear layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
    
        return x
  
    def train_model(self, optim, train_tokens, train_segments, train_scores,
                  val_tokens, val_segments, val_scores, test_tokens, test_segments,
                  n_epochs=100, batch_size=64, shuffle=False):
        """Trains the model."""
        # Get batches for the training data
        tokens_batches, segments_batches, labels_batches = \
                self.get_batches(train_tokens,
                                 train_segments,
                                  train_scores, batch_size)

        for eidx in range(1, n_epochs + 1):
            start_time = time.time()
            epoch_loss = 0
            epoch_items = 0

            # Enable training mode
            self.train()

            # Shuffle the batch order or not
            if shuffle:
                batch_order = torch.randperm(tokens_batches.size(0))
            else:
                batch_order = torch.arange(tokens_batches.size(0))

            # Start training
            for iter_count, idx in enumerate(batch_order):
                tokens_batch = tokens_batches[idx].to(DEVICE)
                segments_batch = segments_batches[idx].to(DEVICE)
                labels_batch = labels_batches[idx].to(DEVICE)
        
                # Clear the gradients
                optim.zero_grad()

                # Get output vector of size (batch size, 1)
                x = self.forward(tokens_batch, segments_batch)

                # Apply loss function
                loss = self.loss(x.view(-1), labels_batch.view(-1))

                # Backprop the average loss and update parameters
                loss.backward()
                optim.step()

                # Sum the loss for reporting, along with the denominator
                batch_loss = loss.detach().mean()

                time_spent = time.time() - start_time

            # Evaluate on valid set every epoch
            rmse, pearson = self.evaluate(val_tokens, val_segments, val_scores, batch_size=batch_size)
            print(f'[Epoch {eidx:<3}] ended with valid rmse: {rmse:6.2f}, pearson: {pearson[0]:6.3f}')

            # Write prediction result every epoch
            self.test_model(test_tokens, test_segments, batch_size=10, epoch=eidx)

    def evaluate(self, test_tokens, test_segments, test_scores, batch_size=32):
        """Evaluates given data set in evaluation mode."""
    
        # Initialise results tensor
        out = torch.tensor([])
    
        # Split tokens into batches
        tokens_batches, segments_batches, labels_batches = \
                self.get_batches(test_tokens, test_segments,
                                 test_scores, batch_size)
        # Eval mode
        self.eval()

        with torch.no_grad():
            batch_order = torch.arange(tokens_batches.size(0))

            # Start training
            for iter_count, idx in enumerate(batch_order):

                tokens_batch = tokens_batches[idx].to(DEVICE)
                segments_batch = segments_batches[idx].to(DEVICE)
        
                # Get results from network
                results = (self.forward(tokens_batch, segments_batch)).cpu()
                temp = torch.cat((out,results),0)
                out = temp

        # Normalize by the number of tokens in the test set
        RMSE = mean_squared_error(labels_batches.view(-1), out.view(-1), squared=False)
        pears = pearsonr(labels_batches.view(-1), out.view(-1))
    
        # Switch back to training mode
        self.train()

        # return the perplexity and loss
        return RMSE, pears

    def test_model(self, test_tokens, test_segments, batch_size=10, epoch=0):
        """Test returns a txt file of the predicted test scores."""
    
        # Initialise results tensor
        out = torch.tensor([])
    
        # Split tokens into batches
        tokens_batches, segments_batches = \
                self.get_batches(test_tokens, test_segments,
                                None, batch_size, test=True)
        # Eval mode
        self.eval()

        with torch.no_grad():
            batch_order = torch.arange(tokens_batches.size(0))

      
            for iter_count, idx in enumerate(batch_order):
                tokens_batch = tokens_batches[idx].to(DEVICE)
                segments_batch = segments_batches[idx].to(DEVICE)
        
                # Get results from network
                results = (self.forward(tokens_batch, segments_batch)).cpu()
                temp = torch.cat((out,results), 0)
                out = temp

        # Write scores to txt file for prediction
        path = "/content/drive/My Drive/Colab Notebooks/NLP_group/en-zh/predictions"\
              + str(epoch) + ".txt"
        np.savetxt(path,out.numpy())
    
        # Switch back to training mode
        self.train()

    def get_batches(self, tokens_tensor, segments_tensor,
              labels, batch_size=64, test=False):
        """
        Parameters:
            - tokens_tensor: (number of sentence pairs, max_sentece_length*2)
            - segments_tensor: (number of sentence pairs, max_sentece_length*2)
            - labels: (number of sentence pairs, max_sentece_length*2)    
        Returns:
            - tokens_tensor_batch: (batch_size, max_sentece_length*2)
            - segments_tensor_batch: (batch_size, max_sentece_length*2)
            - labels_tensor_batch: (batch_size, max_sentece_length*2)
        """
        # Get the number of training sentences
        n_samples = tokens_tensor.size(0)
        n_batches = n_samples // batch_size
        n_samples = n_batches * batch_size

        if not test:
            # Get random sequence of samples
            permutation = torch.randperm(n_samples)

            tokens_tensor = tokens_tensor[permutation, :]
            segments_tensor = segments_tensor[permutation, :]
            labels = labels[permutation]

            # Re-arrange into batches
            tokens_batch = tokens_tensor.view(n_batches, batch_size,
                                                        tokens_tensor.size(1))
            segments_tensor_batch = segments_tensor.view(n_batches, batch_size,
                                                        segments_tensor.size(1))
            labels_tensor_batch = labels.view(n_batches, batch_size)

            return tokens_batch, segments_tensor_batch, labels_tensor_batch
        else:
            order = torch.arange(0,n_samples).long()
            tokens_tensor = tokens_tensor[order, :]
            segments_tensor = segments_tensor[order, :]

            # Re-arrange into batches
            tokens_batch = tokens_tensor.view(n_batches, batch_size,
                                                        tokens_tensor.size(1))
            segments_tensor_batch = segments_tensor.view(n_batches, batch_size,
                                                        segments_tensor.size(1))
            return tokens_batch, segments_tensor_batch

In [0]:
# Get the input in the right format
# Test prep corpus and the rest
max_sent_length = 75
train_tokens, train_segments = prep_corpus(english_train, zh_train, max_sent_length)
train_scores = torch.tensor(scores_train)

val_tokens, val_segments = prep_corpus(english_dev, zh_dev, max_sent_length)
val_scores = torch.tensor(scores_dev)

test_tokens, test_segments = prep_corpus(english_test, zh_test, max_sent_length)

In [22]:
# Create model
model = EvaluationModel()

# Freeze bert layers
for param in model.bert_layer.parameters():
    param.requires_grad = False

# Move model to device
model = model.to(DEVICE)

# Create the optimizer (no need for frozen parameters)
model_optimizer = BertAdam(filter(lambda p: p.requires_grad, model.parameters()),
                     lr=2e-5,
                     warmup=.1)

print('Begin training:')
# Train model:
model.train_model(model_optimizer, train_tokens, train_segments,
                  train_scores, val_tokens, val_segments, val_scores,
                  test_tokens, test_segments, n_epochs=20, shuffle=False, batch_size=16)



Begin training:
[Epoch 1  ] ended with valid rmse:   0.91, pearson:  0.251
[Epoch 2  ] ended with valid rmse:   0.90, pearson:  0.277
[Epoch 3  ] ended with valid rmse:   0.89, pearson:  0.290
[Epoch 4  ] ended with valid rmse:   0.88, pearson:  0.294
[Epoch 5  ] ended with valid rmse:   0.88, pearson:  0.300
[Epoch 6  ] ended with valid rmse:   0.88, pearson:  0.306
[Epoch 7  ] ended with valid rmse:   0.88, pearson:  0.313
[Epoch 8  ] ended with valid rmse:   0.88, pearson:  0.319
[Epoch 9  ] ended with valid rmse:   0.88, pearson:  0.323
[Epoch 10 ] ended with valid rmse:   0.88, pearson:  0.327
[Epoch 11 ] ended with valid rmse:   0.88, pearson:  0.332
[Epoch 12 ] ended with valid rmse:   0.88, pearson:  0.336
[Epoch 13 ] ended with valid rmse:   0.88, pearson:  0.339
[Epoch 14 ] ended with valid rmse:   0.87, pearson:  0.342
[Epoch 15 ] ended with valid rmse:   0.87, pearson:  0.345
[Epoch 16 ] ended with valid rmse:   0.87, pearson:  0.346
[Epoch 17 ] ended with valid rmse:   0.8