# Baseline Word Embeddings with Regressors

**Author**: Maleakhi Wijaya, Faidon Mitzalis, Harry Coppock  
**Date**: 20 February 2020

The file contains the following items:
- Baseline word embeddings techniques mentioned in the report (Word2Vec & GloVe)
- Regressors
  - Random Forest
  - Support Vector Machine (SVR)
  - Feed Forward Neural Network

## Baseline Word Embeddings


### Library

In [80]:
!pip install pytorch-pretrained-bert
from pytorch_pretrained_bert import BertAdam
from os.path import exists
import torchtext
import spacy
import numpy as np
import torch
from nltk import download
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import string
import jieba
import gensim 
from torchtext import data
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
import torch.nn as nn
import torch.nn.functional as F
import logging
from sklearn.metrics import mean_squared_error
import time
import torch.optim as optim



### Importing Data

In [0]:
if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

In [54]:
# English-Chinese
# Checking Data
print("---EN-ZH---")
print()

with open("./train.enzh.src", "r") as enzh_src:
  print("Source: ",enzh_src.readline())
with open("./train.enzh.mt", "r") as enzh_mt:
  print("Translation: ",enzh_mt.readline())
with open("./train.enzh.scores", "r") as enzh_scores:
  print("Score: ",enzh_scores.readline())

---EN-ZH---

Source:  The last conquistador then rides on with his sword drawn.

Translation:  最后的征服者骑着他的剑继续前进.

Score:  -1.5284005772625449



### Pre-processing English with GloVe

In [55]:
# Downloading spacy models for english
!spacy download en_core_web_md
!spacy link en_core_web_md en300

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')

[38;5;1m✘ Link 'en300' already exists[0m
To overwrite an existing link, use the --force flag



In [0]:
# Embeddings
glove = torchtext.vocab.GloVe(name='6B', dim=100)

# tokenizer model
nlp_en =spacy.load('en300')

In [62]:
# Function related to generated English embeddings
# Setup stop words
download('stopwords') #stopwords dictionary, run once
stop_words_en = set(stopwords.words('english'))

def preprocess(sentence, nlp):
    """
    Pre-process sentence by normalisation, tokenizing, removing stop words, and only restrict 
    string to alphabetic characters only.

    Parameters:
    - sentence: sentence in a corpus
    - nlp: tokenizer used

    Returns:
    - list of words that has been preprocess
    """
    text = sentence.lower()
    doc = [token.lemma_ for token in  nlp.tokenizer(text)]
    doc = [word for word in doc if word not in stop_words_en]
    doc = [word for word in doc if word.isalpha()]
    return doc

def get_word_vector(embeddings, word):
    """
    Get vector representation for a given word (word embedding).

    Parameters:
    - embeddings: embedding object
    - word: string (word) to be converted to vector representation
    """
    try:
      vec = embeddings.vectors[embeddings.stoi[word]]
      return vec
    except KeyError:
      pass

def get_sentence_vector(embeddings, line):
    """
    Get sentence embedding by averaging word embeddings in sentence.

    Parameters:
    - embeddings: embedding object
    - line: sentence (list of words)

    Returns:
    - sentence embedding
    """
    vectors = []
    for w in line:
        emb = get_word_vector(embeddings,w)
        
        # Only add when word is in the dictionary
        if emb is not None:
            vectors.append(emb)
   
    return torch.mean(torch.stack(vectors), dim=0)


def get_embeddings(f, embeddings, lang):
    """
    Main methods to open file, and perform above functions.

    Parameters
    - f: file name
    - embeddings: embeddings object
    - lang: language

    Returns:
    - list of sentence embedding
    """
    file = open(f) 
    lines = file.readlines() 
    sentences_vectors =[]

    for l in lines:
        sentence= preprocess(l,lang)
        try:
            vec = get_sentence_vector(embeddings,sentence)
            sentences_vectors.append(vec)
        except:
            sentences_vectors.append(np.zeros((100,)))

    return sentences_vectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Pre-processing Chinese with Word2Vec



In [59]:
!wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

!wget -O zh.zip http://vectors.nlpl.eu/repository/20/35.zip

!unzip zh.zip 

--2020-02-27 21:43:03--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [<=>                 ]       0  --.-KB/s               chinese_stop_words.     [ <=>                ] 419.55K  --.-KB/s    in 0.05s   

2020-02-27 21:43:04 (7.84 MB/s) - ‘chinese_stop_words.txt’ saved [429623]

--2020-02-27 21:43:05--  http://vectors.nlpl.eu/repository/20/35.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1458485917 (1.4G) [application/zip]
Saving to: ‘zh.zip’


2020-02-27 21:44:13 (20.8 MB/s) - ‘zh.zip’ saved [1458485917/1458485917]

Arc

In [63]:
# Load pre-trained word2vec using gensim
wv_from_bin = KeyedVectors.load_word2vec_format("model.bin", binary=True) 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Function related to generating chinese embeddings.
stop_words = [line.rstrip() for line in open('./chinese_stop_words.txt',"r", encoding="utf-8")]

def get_sentence_vector_zh(line):
    """
    Generate sentence embeddings.

    Parameters:
    - line: list of words
    
    Returns:
    - sentence embeddings
    """
    vectors = []
    for w in line:
        try:
            emb = wv_from_bin[w]
            vectors.append(emb)
        except:
            pass # Do not add if the word is out of vocabulary
    if vectors:
        vectors = np.array(vectors)
        return np.mean(vectors, axis=0)  
    else:
        return np.zeros((100,))

def processing_zh(sentence):
    """
    Tokenization and preprocessing.

    Parameters:
    - sentence: string of words

    Returns:
    - list of tokens
    """
    seg_list = jieba.lcut(sentence,cut_all=True)
    doc = [word for word in seg_list if word not in stop_words]
    docs = [e for e in doc if e.isalnum()]
    return docs

def get_sentence_embeddings_zh(f):
    """
    Main function that call functions from above.

    Parameters:
    - f: file name

    Returns:
    - list of sentence embeddings
    """
    file = open(f) 
    lines = file.readlines() 
    sentences_vectors =[]
    for l in lines:
        sent  = processing_zh(l)
        vec = get_sentence_vector_zh(sent)

        if vec is not None:
            sentences_vectors.append(vec)
        else:
            print(l)
    
    return sentences_vectors

In [0]:
# Get sentence embeddings
zh_train_mt = get_sentence_embeddings_zh("./train.enzh.mt")
zh_train_src = get_embeddings("./train.enzh.src",glove,nlp_en)
f_train_scores = open("./train.enzh.scores",'r')
zh_train_scores = f_train_scores.readlines()

zh_val_src = get_embeddings("./dev.enzh.src",glove,nlp_en)
zh_val_mt = get_sentence_embeddings_zh("./dev.enzh.mt")
f_val_scores = open("./dev.enzh.scores",'r')
zh_val_scores = f_val_scores.readlines()

zh_test_mt = get_sentence_embeddings_zh("./test.enzh.mt")
zh_test_src = get_embeddings("./test.enzh.src",glove,nlp_en)

# Convert into required format for input to the statistical regressors
zh_train_src = np.array([arr.tolist() for arr in zh_train_src])
zh_train_mt = np.array([arr.tolist() for arr in zh_train_mt])
zh_val_src = np.array([arr.tolist() for arr in zh_val_src])
zh_val_mt = np.array([arr.tolist() for arr in zh_val_mt])
zh_test_src = np.array([arr.tolist() for arr in zh_test_src])
zh_test_mt = np.array([arr.tolist() for arr in zh_test_mt])

In [66]:
# Ensure the right number of training data, testing data
print(f"Training mt: {len(zh_train_mt)} Training src: {len(zh_train_src)}")
print()
print(f"Validation mt: {len(zh_val_mt)} Validation src: {len(zh_val_src)}")
print()
print(f"Test mt: {len(zh_test_mt)} Test src: {len(zh_test_src)}")

Training mt: 7000 Training src: 7000

Validation mt: 1000 Validation src: 1000

Test mt: 1000 Test src: 1000


### Concatenate GloVe and Word2Vec

The cell below are responsible to generate the baseline word embeddings of 200 dimensions.

In [0]:
# Generate X_train embeddings
X_train = np.concatenate((zh_train_src, zh_train_mt), axis=1)
X_train = X_train.flatten()
X_train_zh = X_train.reshape(7000, 200)

# Generate X_val embeddings
X_val = np.concatenate((zh_val_src, zh_val_mt), axis=1)
X_val = X_val.flatten()
X_val_zh = X_val.reshape(1000, 200)

# Generate X_test embeddings
X_test = np.concatenate((zh_test_src, zh_test_mt), axis=1)
X_test = X_test.flatten()
X_test_zh = X_test.reshape(1000, 200)

# Scores
train_scores = np.array(zh_train_scores).astype(float)
y_train_zh =train_scores

val_scores = np.array(zh_val_scores).astype(float)
y_val_zh =val_scores

## SVM


In [0]:
def rmse(predictions, targets):
    """
    Calculate root mean square.
    """
    return np.sqrt(((predictions - targets) ** 2).mean())

In [70]:
# Run SVM with different kernels
for k in ['linear','poly','rbf','sigmoid']:
    clf_t = SVR(kernel=k)
    clf_t.fit(X_train_zh, y_train_zh)
    print(k)
    predictions = clf_t.predict(X_val_zh)
    pearson = pearsonr(y_val_zh, predictions)
    print(f'RMSE: {rmse(predictions,y_val_zh)} Pearson {pearson[0]}')
    print("-"*50)


linear
RMSE: 0.9044962563186333 Pearson 0.3017781690203462
--------------------------------------------------
poly
RMSE: 0.8990697909416231 Pearson 0.3032902746054339
--------------------------------------------------
rbf
RMSE: 0.8900985622788053 Pearson 0.3403404558003603
--------------------------------------------------
sigmoid
RMSE: 7.152607007355879 Pearson -0.03977439348067312
--------------------------------------------------


In [0]:
# Predict (run this to make predictions to test)
clf_zh = SVR(kernel='rbf')
clf_zh.fit(X_train_zh, y_train_zh)

predictions_zh = clf_zh.predict(X_test_zh)

## Random Tree Forest

In [73]:
# Random forest training and evaluation using validation
rf = RandomForestRegressor(n_estimators = 100, random_state = 666)
rf.fit(X_train_zh, y_train_zh);
predictions = rf.predict(X_val_zh)

pearson = pearsonr(y_val_zh, predictions)
print('RMSE:', rmse(predictions,y_val_zh))
print(f"Pearson {pearson[0]}")

RMSE: 0.8830922225183105
Pearson 0.24458603542177823


In [0]:
# Predict (run this to make predictions to test)
predictions_zh = rf.predict(X_test_zh)

## Feed Forward Neural Network

This section contains code for feed forward neural network along with code to get the appropriate input for a neural network.

### Setup & Input Pre-processing

In [76]:
# Enable GPU
print('Torch version: {}, CUDA: {}'.format(torch.__version__, torch.version.cuda))
cuda_available = torch.cuda.is_available()
if not torch.cuda.is_available():
  print('WARNING: You may want to change the runtime to GPU!')
  device = 'cpu'
else:
  device = 'cuda:0'

Torch version: 1.4.0, CUDA: 10.1


In [0]:
def set_seed(seed):
    """ Set all seeds to make results reproducible (deterministic mode).
        When seed is a false-y value or not supplied, disables deterministic mode. """

    if seed:
        logging.info(f"Running in deterministic mode with seed {seed}")
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
    else:
        logging.info(f"Running in non-deterministic mode")

set_seed(4)

In [0]:
# Convert data from numpy -> torch tensor
X_train_zh = torch.Tensor(X_train_zh) 
X_val_zh = torch.Tensor(X_val_zh)

y_train_zh = torch.Tensor(y_train_zh)
y_val_zh = torch.Tensor(y_val_zh)

X_test_zh = torch.Tensor(X_test_zh)

### Model Implementation

In [0]:
class FFNN(nn.Module):
    """
    Vanilla Feed Forward Neural Network.
    """
    def __init__(self):
        super(FFNN, self).__init__()

        # Try 4 linear layers
        self.fc1 = nn.Linear(200, 800)
        self.fc2 = nn.Linear(800, 200)
        self.fc3 = nn.Linear(200, 100)
        self.fc4 = nn.Linear(100, 1)

        # Loss function
        self.loss = nn.MSELoss()
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)

        return x
    
    def get_batches(self, X, y=None, batch_size=64, test=False):
        """
        Method to separate data into batches.
        """
        # Get the number of training samples
        n_samples = X.size(0)
        n_batches = n_samples // batch_size
        n_samples = n_batches * batch_size
        
        # If it is not test, then shuffle the order
        if not test:
            permutation = torch.randperm(n_samples) # Return indices randomly ordered

            # Shuffle the dataset
            X = X[permutation, :]
            y = y[permutation]

            # Get this into batches
            X_batch = X.view(n_batches, batch_size, X.size(1))
            y_batch = y.view(n_batches, batch_size)

            return X_batch, y_batch
        else:
            order = torch.arange(0, n_samples).long()
            X = X[order, :]
            
            X_batch = X.view(n_batches, batch_size, X.size(1))

            return X_batch
    
    def train_model(self, optim, X_train, y_train, X_val, y_val, X_test, n_epochs=10, batch_size=64, shuffle=False):
        """
        Train FFNN model for n epochs.
        """
        # Get batches for training data
        X_batches, y_batches = self.get_batches(X_train, y_train, batch_size, False)

        # Iterate over n epochs
        for eidx in range(1, n_epochs+1):
            self.train() # enable training mode

            # Shuffle batch order
            if shuffle:
                batch_order = torch.randperm(X_batches.size(0))
            else:
                batch_order = torch.arange(X_batches.size(0))
            
            # Start training
            for iter_count, idx in enumerate(batch_order):
                X_batch = X_batches[idx].to(device)
                y_batch = y_batches[idx].to(device)

                # Clear gradient
                optim.zero_grad()

                # Forward pass
                preds = self.forward(X_batch)
                loss = self.loss(preds.view(-1), y_batch.view(-1))

                # Backward pass
                loss.backward()
                optim.step()

            # At the end of the epochs, evaluate on dev set
            rmse, pearson = self.evaluate(X_val, y_val, batch_size=batch_size)
            print(f"[Epoch {eidx:<3}] ended with valid rmse: {rmse:6.2f}, pearson: {pearson[0]:6.3f}")
            self.test_model(X_test, batch_size=10, epoch=eidx)

    def evaluate(self, X_val, y_val, batch_size=64):
        """
        Evaluate pearson and rmse (on test or validation).
        """
        # Initialise result tensor
        out = torch.tensor([])

        # Split into batches
        X_batches, y_batches = self.get_batches(X_val, y_val, batch_size)

        self.eval() # evaluation mode

        with torch.no_grad():
            batch_order = torch.arange(X_batches.size(0))

            for iter_count, idx in enumerate(batch_order):
                X_batch = X_batches[idx].to(device)
                
                results = (self.forward(X_batch)).cpu()
                temp = torch.cat((out, results), 0)
                out = temp
        
        # Normalise by the number of tokens in the test set
        RMSE = mean_squared_error(y_batches.view(-1), out.view(-1), squared=False)
        pears = pearsonr(y_batches.view(-1), out.view(-1))

        self.train() # switch back to training mode

        # Return metrics
        return RMSE, pears
    
    def test_model(self, X_test, batch_size=10, epoch=1):
        """
        Test model and generate text files.
        """
        # Initialise results tensor
        out = torch.tensor([])

        # Split tokens into batches
        X_batches = self.get_batches(X_test, batch_size=batch_size, test=True)

        # Evaluation mode
        self.eval()

        with torch.no_grad():
            batch_order = torch.arange(X_batches.size(0))

            for iter_count, idx in enumerate(batch_order):
                X_batch = X_batches[idx].to(device)

                # Get results
                results = (self.forward(X_batch)).cpu()
                temp = torch.cat((out, results), 0)
                out = temp
        
        # Write scores to text file for predictions every epochs
        path = "/content/drive/My Drive/Colab Notebooks/NLP_group/en-zh/predictions_new"\
              + str(epoch) + ".txt"
        np.savetxt(path,out.numpy())
    
        # Switch back to training mode
        self.train()

In [82]:
# Create model
model = FFNN()
model = model.to(device)

# Create optimizer
optimizer = BertAdam(model.parameters(), lr=2e-5, warmup=.1)

# Train the model for 50 epochs
model.train_model(optimizer, X_train_zh, y_train_zh, X_val_zh, y_val_zh, X_test_zh, n_epochs=50)



[Epoch 2  ] ended with valid rmse:   0.87, pearson:  0.270
[Epoch 3  ] ended with valid rmse:   0.86, pearson:  0.291
[Epoch 4  ] ended with valid rmse:   0.86, pearson:  0.305
[Epoch 5  ] ended with valid rmse:   0.85, pearson:  0.314
[Epoch 6  ] ended with valid rmse:   0.85, pearson:  0.322
[Epoch 7  ] ended with valid rmse:   0.85, pearson:  0.327
[Epoch 8  ] ended with valid rmse:   0.85, pearson:  0.331
[Epoch 9  ] ended with valid rmse:   0.85, pearson:  0.334
[Epoch 10 ] ended with valid rmse:   0.85, pearson:  0.336
[Epoch 11 ] ended with valid rmse:   0.85, pearson:  0.338
[Epoch 12 ] ended with valid rmse:   0.84, pearson:  0.339
[Epoch 13 ] ended with valid rmse:   0.84, pearson:  0.340
[Epoch 14 ] ended with valid rmse:   0.84, pearson:  0.341
[Epoch 15 ] ended with valid rmse:   0.84, pearson:  0.342
[Epoch 16 ] ended with valid rmse:   0.84, pearson:  0.342
[Epoch 17 ] ended with valid rmse:   0.84, pearson:  0.343
[Epoch 18 ] ended with valid rmse:   0.85, pearson:  0.3