# Import and preprocess

In this section, we will load packages and models required for this NLP task and the dataset of English and German sentences.


In [0]:
# Pip install required NLP packages
!pip install bert
!pip install -U spacy

In [0]:
# Download spacy language models
!spacy download en_core_web_md
!spacy link en_core_web_md en300

!spacy download de_core_news_md
!spacy link de_core_news_md de300

In [0]:
# Import required packages
import numpy as np
import torch
import scipy
import spacy
import bert
import random
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import re
import itertools

In [0]:
# Download stopwords for each language
nlp_de =spacy.load('de300')
nlp_en =spacy.load('en300')

nltk.download('stopwords')
stop_words_en = set(stopwords.words('english'))
stop_words_de = set(stopwords.words('german'))

In [0]:
# Get training, validation and test datasets
from os.path import exists
if not exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip

When reading the sentences from the dataset, we perform basic preprocessing which include lowercasing the words, splitting connected words and remove punctuation or symbols in the remove_list.


In [0]:
remove_list = ['\n', '.', ',', '/', '\'', '-', '_']

# Wrapping method that extracts and preprocess sentences in a file.
# Parameters: file. Returns: list of sentences.
def get_sentences(f):
    file = open(f)
    lines = file.readlines()
    sentences = []
    
    for l in lines:
        sentences.append(preprocess(l))
        
    return sentences

# Method to preprocess individual sentences.
# Parameters: a single string representing a sentence. 
# Returns: a single string after pre-processing
def preprocess(line):
    text = line.lower()
    l = text.split(" ")
    
    for i, word in enumerate(l):
        if '_' in word:
            del l[i]
            l[i:i] = re.sub('_', ' ', word).split()
        elif '/' in word:
            del l[i]
            l[i:i] = re.sub('/', ' / ', word).split()
        elif len(' '.join(word.split('-')).split())>=2:
            del l[i]
            l[i:i] = re.sub('-', ' ', word).split()
    
    l = ' '.join([word for word in l if word not in remove_list])
    return l

In [0]:
en_train_sentences = get_sentences("./train.ende.src")
de_train_sentences = get_sentences("./train.ende.mt")
en_val_sentences = get_sentences("./dev.ende.src")
de_val_sentences = get_sentences("./dev.ende.mt")

with open("./train.ende.scores", "r") as f:
  train_scores = [l.rstrip('\n') for l in f.readlines()]
with open("./dev.ende.scores", "r") as f:
  val_scores = [l.rstrip('\n') for l in f.readlines()]

In [0]:
en_test_sentences = get_sentences("./test.ende.src")
de_test_sentences = get_sentences("./test.ende.mt")

# Word Embedding

This section contains the different types of word embedding models used, which are separate language BERT models or a multilingual BERT model.
To choose a specific type of word-embedding, proceed to the respective session and run the cells.

## English and German BERT tokenizer

With this tokenizer, English and German sentences are converted to word embeddings using separate models. Words within sentences are first converted to their word vectors and are then processed to ensure that the model is able to accurately identify the context of the words.
Additionally, with this word embedding method, the Named-Entity-Recognition (NER) tag can be used as part of the pre-processing method

In [0]:
symbol_list = ['\n', '$', '+', '-PRON-', '<', '=', '>', '°', '¥', '£', '\'s']

# Method that converts words in a sentence to word-embeddings. The sentence is first
# converted to tokens and then stopwords, punctuations, symbols are removed.
# Parameters: List of sentences, spacy language model, string value of language (either 'EN' or 'DE')
# and a boolean to identify if entity should be replaced (if True) or removed (if False)
# Returns: list of list of word-embeddings in each sentence
def spacy_word_emb(corpus, nlp, lang, replace_ent=True):
  corpus_doc = []
  if replace_ent:
    vec_length = 25
  else:
    vec_length = 23

  for r, sentence in enumerate(corpus):
    doc = nlp(sentence)
    t = []
    for c, token in enumerate(doc):
      if token.is_punct or token.lemma_ in symbol_list:
        continue
      elif re.search('\d', token.lemma_) is not None:
        continue

      if token.ent_type_ is "":
        if lang == 'en':
          if token.lemma_ in stop_words_en:
            continue
          else:
            vocabulary_en.append(token.lemma_)
        elif lang == 'de':
          if token.lemma_ in stop_words_de:
            continue
          else:
            vocabulary_de.append(token.lemma_)
        t.append(token.vector_norm)
      elif replace_ent:
        if token.ent_type_ not in ent_to_vec:
          random_vec = np.random.uniform(-1, 1, (300,))
          ent_to_vec[token.ent_type_] = np.linalg.norm(random_vec)
        t.append(ent_to_vec[token.ent_type_])
    
    if len(t) < vec_length:
        t.extend([0.0 for i in range(vec_length-len(t))])
    
    corpus_doc.append(t)
  return corpus_doc

In [0]:
ent_to_vec = {} # dictionary mapping entity tag type to it's vector norm
vocabulary_en = [] # All words in english corpus
vocabulary_de = [] # All words in german corpus

X_train_en = spacy_word_emb(en_train_sentences, nlp_en, 'en')
X_train_de = spacy_word_emb(de_train_sentences, nlp_de, 'de')
X_val_en = spacy_word_emb(en_val_sentences, nlp_en, 'en')
X_val_de = spacy_word_emb(en_val_sentences, nlp_de, 'de')

In [0]:
X_test_en = spacy_word_emb(en_test_sentences, nlp_en, 'en')
X_test_de = spacy_word_emb(de_test_sentences, nlp_de, 'de')

### Evaluating tokenization

This **optional** section checks how well words are tokenized for each language.

In [0]:
import pandas as pd
from google.colab import files

pd.set_option('display.max_colwidth', -1)

In [0]:
vocabulary_total = vocabulary_en.copy()
vocabulary_total.extend(vocabulary_de)

vocabulary = len(set(vocabulary_total)) + len(ent_to_vec) # total words in vocabulary

In [0]:
unique_en_words, unique_en_counts = np.unique(vocabulary_en, return_counts=True)
df_unique_en = pd.DataFrame({'word':unique_en_words, 'count':unique_en_counts}, columns=['word', 'count'])
unique_de_words, unique_de_counts = np.unique(vocabulary_de, return_counts=True)
df_unique_de = pd.DataFrame({'word':unique_de_words, 'count':unique_de_counts}, columns=['word', 'count'])

with pd.ExcelWriter('Unique_counts.xlsx') as writer:  
    df_unique_en.to_excel(writer, sheet_name='EN') # represents words and counts of unique english words
    df_unique_de.to_excel(writer, sheet_name='DE') # represents words and counts of unique german words

files.download('Unique_counts.xlsx')

## Multilingual bert tokenizer

With this tokenizer, English and German sentences are converted to word embeddings using the same model. Words within sentences are first converted to their word vectors and are then processed to ensure that the model is able to accurately identify the context of the words.

In [0]:
from sentence_transformers import SentenceTransformer
s_tokenizer = SentenceTransformer('distiluse-base-multilingual-cased')

In [0]:
X_train_en = s_tokenizer.encode(en_train_sentences)
X_train_de = s_tokenizer.encode(de_train_sentences)
X_val_en = s_tokenizer.encode(en_val_sentences)
X_val_de = s_tokenizer.encode(de_val_sentences)

In [0]:
X_test_en = s_tokenizer.encode(en_test_sentences)
X_test_de = s_tokenizer.encode(de_test_sentences)

## Part-of-Speech tag

In this section, the Part-of-Speech (POS) tags are extracted from each sentence and saved in lists. The tags can then be embedded to be input into the regression model as a vector.

In [0]:
# Identify POS tags with respect to each sentence in the file.
# Parameters: List of sentences, language of sentences.
# Return: A list of list of POS tags within a sentence.
def get_pos_tag(corpus, nlp):
    pos_list = []
    for sentence in corpus:
        doc = nlp(sentence)
        pos_list.append([token.pos_ for token in doc])
    return pos_list

In [0]:
en_train_pos = get_pos_tag(en_train_sentences, nlp_en)
de_train_pos = get_pos_tag(de_train_sentences, nlp_de)
en_val_pos = get_pos_tag(en_val_sentences, nlp_en)
de_val_pos = get_pos_tag(de_val_sentences, nlp_de)

In [0]:
en_test_pos = get_pos_tag(en_test_sentences, nlp_en)
de_test_pos = get_pos_tag(de_test_sentences, nlp_de)

# Prepare training samples

This section does the final processing of the separate English and German training vectors, scaling (if required) and PCA (if required).

In [0]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

scaled = False

## Word Vectors

Currently, the word vectors for the English and German training corpuses are in separate arrays. Either the difference in the vector embeddings or the concatenated forms of the two vectors can be taken, by running the respective sections. Additionally, the validation set is merged with the training set due to the presence of KFold cross-validation during training.

### Difference in vector embeddings

In [0]:
X_train_diff = np.subtract(np.array(X_train_en), np.array(X_train_de))
X_val_diff = np.subtract(np.array(X_val_en), np.array(X_val_de))

X = np.concatenate((X_train_diff, X_val_diff), axis=0)
y = np.concatenate((np.array(train_scores), np.array(val_scores)), axis=0).astype('float')

print(X.shape)
print(y.shape)

In [0]:
XTest = np.subtract(np.array(X_test_en), np.array(X_test_de))
print(XTest.shape)

### Concatenate vector embeddings

In [0]:
X_en = np.concatenate((np.array(X_train_en), np.array(X_val_en)), axis=0)
X_de = np.concatenate((np.array(X_train_de), np.array(X_val_de)), axis=0)

X = np.concatenate((X_en, X_de), axis=1)
print(X.shape)
y = np.concatenate((np.array(train_scores), np.array(val_scores)), axis=0).astype('float')
print(y.shape)

In [0]:
XTest = np.concatenate((X_test_en, X_test_de), axis=1)
print(XTest.shape)

## Standardise data

The data can **optionally** be scaled to improve the performance of the regression model or if PCA is performed at a later stage.

In [0]:
scaled = True

scaler = {} # Save all scalers in a dictionary

if X.ndim == 2:
  scaler[0] = StandardScaler()
  X = scaler[0].fit_transform(X)
else:
  for i in range(X.shape[1]): # Process each layer differently is X is 3 dimensional
    scaler[i] = StandardScaler()
    X[:, i, :] = scaler[i].fit_transform(X[:, i, :]) 

scaler['y'] = StandardScaler()
y = np.squeeze(scaler['y'].fit_transform(y.reshape(-1,1)))

In [0]:
if X.ndim == 2:
  XTest = scaler[0].transform(XTest)
else:
  for i in range(X.shape[1]):
    XTest[:, i, :] = scaler[i].transform(XTest[:, i, :]) 

## Concatenate POS tag vectors

Similar to the one-hot-encoded vector, the vector length is set to be the length of unique POS tags. However, for each sentence, the counts of the respective POS tags are added into the vector instead of the value 1 to indicate if it is present or not. 

Run this section if you intend for the **POS tags to be input** in the regression model

In [0]:
def pos_to_ohe(l1, l2):
    unique_tags_l1 = np.unique(list(itertools.chain.from_iterable(l1)))
    unique_tags_l2 = np.unique(list(itertools.chain.from_iterable(l2)))
    global_unique_tags = np.unique(np.concatenate((unique_tags_l1, unique_tags_l2), axis=0))
    
    ohe_tokens_l1 = np.empty((len(l1), len(global_unique_tags)))
    for r, tok in enumerate(l1):
        for c, tag in enumerate(global_unique_tags):
            ohe_tokens_l1[r, c] = tok.count(tag)
    ohe_tokens_l2 = np.empty((len(l2), len(global_unique_tags)))
    for r, tok in enumerate(l2):
        for c, tag in enumerate(global_unique_tags):
            ohe_tokens_l2[r, c] = tok.count(tag)
    
    return ohe_tokens_l1, ohe_tokens_l2

In [0]:
en_tags = list(itertools.chain(en_train_pos, en_val_pos))
de_tags = list(itertools.chain(de_train_pos, de_val_pos))
en_tags, de_tags = pos_to_ohe(en_tags, de_tags)

X_tags = np.concatenate((en_tags, de_tags), axis=1)
X = np.concatenate((X, X_tags), axis=1)
print(X.shape)

In [0]:
en_test_tags, de_test_tags = pos_to_ohe(en_test_pos, de_test_pos)
X_test_tags = np.concatenate((en_test_tags, de_test_tags), axis=1)
XTest = np.concatenate((XTest, X_test_tags), axis=1)
print(XTest.shape)

## Principal Component Analysis

Principal Component Analysis (PCA) can **optionally** be performed when facing a dataset with a large number of features.

*Note: Data needs to be scaled before performing PCA*

In [0]:
from sklearn.decomposition import PCA

In [0]:
pca = PCA(0.95) # Keep 95% of variance of dataset
pca.fit(X[:len(en_train_sentences)])
print("Number of remaining components {}".format(pca.n_components_))
X = pca.transform(X)

In [0]:
XTest = pca.transform(XTest)

# Training regression model

This section contains all the regression models used for predicting the translation score. All Keras models present will first have their parameters optimised by the package talos or a simple grid-search if the model is an sklearn model.


Run the subsection with the model of your choice.

In [0]:
!pip install talos

In [0]:
import talos
from scipy.stats.stats import pearsonr
from sklearn.model_selection import KFold

In [0]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

## SVM

In [0]:
from sklearn.svm import SVR

testPredictions = np.empty((8, XTest.shape[0])) #Empty array for outputing predictions on test
i = 0

for k in ['linear', 'sigmoid', 'poly', 'rbf']:
    kf = KFold(n_splits=8, shuffle=True, random_state=42) # Having 8 splits in cross validation model
    pearson_arr = []
    print(k)
    for train_index, val_index in kf.split(X):
        Xtrain, Xval = X[train_index], X[val_index]
        Ytrain, Yval = y[train_index], y[val_index]
        
        clf_t = SVR(kernel=k)
        clf_t.fit(Xtrain, Ytrain)
        predictions = clf_t.predict(Xval)
        testPrediction = clf_t.predict(XTest)
        
        if scaled: # Transform data to original scale if data was scaled
          predictions = scaler['y'].inverse_transform(predictions)
          Yval = scaler['y'].inverse_transform(Yval)
          testPrediction = scaler['y'].inverse_transform(testPrediction)
        
        testPredictions[i] = testPrediction
        i += 1
        
        pearson = pearsonr(Yval, predictions)
        pearson_arr.append(pearson[0])
        print(f'RMSE: {rmse(predictions,Yval)} Pearson {pearson[0]}')

    print(f'Average Pearson: {np.mean(pearson_arr)}')
    print()

## RFR

In [0]:
from sklearn.ensemble import RandomForestRegressor

testResults = np.empty((8, XTest.shape[0]))
i=0

kf = KFold(n_splits=8, shuffle=True, random_state=42)
pearson_arr = []
for train_index, val_index in kf.split(X):
    Xtrain, Xval = X[train_index], X[val_index]
    Ytrain, Yval = y[train_index], y[val_index]
    
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 666)
    rf.fit(Xtrain, Ytrain)
    predictions = rf.predict(Xval)
    testResults[i] = rf.predict(XTest)
    i += 1

    pearson = pearsonr(Yval, predictions)
    pearson_arr.append(pearson[0])
    print(f'RMSE: {rmse(predictions,Yval)} Pearson {pearson[0]}')

print()
print(f'Average Pearson: {np.mean(pearson_arr)}')

RMSE: 0.8761084233518699 Pearson 0.09034672501500221
RMSE: 0.7914148332181788 Pearson 0.04846836762064428
RMSE: 0.8190332842803869 Pearson -0.00610533394767424
RMSE: 0.8347071630107122 Pearson 0.020459422385660862
RMSE: 0.908480919291526 Pearson 0.08223520718842349
RMSE: 0.7979303088893454 Pearson -5.409202845798458e-05
RMSE: 0.8631800254681851 Pearson -0.027683724937758416
RMSE: 0.8991916843025259 Pearson 0.0005034948950817501

Average Pearson: 0.026021258273865245


## XGBoost

In [0]:
import xgboost as xgb

In [0]:
# Method that builds and trains model
def xgb_model(x_train, y_train, x_val, y_val=None, params):
    model = xgb.XGBRegressor(colsample_bytree=params['colsample_bytree'],
                 gamma=0.3,                 
                 learning_rate=params['learning_rate'],
                 max_depth=3,
                 min_child_weight=params['min_child_weight'],
                 n_estimators=10000,                                                                    
                 reg_alpha=params['reg_alpha'],
                 reg_lambda=params['reg_lambda'],
                 subsample=0.6)
    
    if y_val is not None: # Return to talos function
      history = model.fit(x_train, y_train, validation_data=(x_val, y_val))
      return history, model
    else: # Return to predict output
      model.fit(x_train, y_train)
      return model.predict(x_val), model

xgb_params = {
    'colsample_bytree':[0.4,0.6],
    'min_child_weight':[1.5,6],
    'learning_rate':[0.1,0.07],
    'reg_alpha':[1e-5, 1e-2],
    'reg_lambda':[1e-5, 0.45],
}

In [0]:
t = talos.Scan(x=X, y=y, model=xgb_model, fraction_limit=0.05,
               params=xgb_params, experiment_name='XGB')
r = talos.Reporting(t)
xgb_best_params = r.best_params
# r.data.sort_values(by=['val_mean_squared_error']) #run if you want to receive table of tests

In [0]:
kf = KFold(n_splits=8, shuffle=True, random_state=42)
pearson_arr = []
testResults = np.empty((8, XTest.shape[0]))
i=0

for train_index, val_index in kf.split(X):
    Xtrain, Xval = X[train_index], X[val_index]
    Ytrain, Yval = y[train_index], y[val_index]
    
    predictions, xgb_model = xgb_model(Xtrain, Ytrain, Xval, None, xgb_best_params)
    testResult = xgb_model.predict(XTest)

    if scaled:
      predictions = scaler['y'].inverse_transform(predictions)
      Yval = scaler['y'].inverse_transform(Yval)
      testResult = scaler['y'].inverse_transform(testResult)
    
    testResults[i] = testResult
    i += 1

    pearson = pearsonr(Yval, predictions)
    pearson_arr.append(pearson[0])
    print(f'RMSE: {rmse(predictions,Yval)} Pearson {pearson[0]}')

print()
print(f'Average Pearson: {np.mean(pearson_arr)}')

# Evaluating Training

This **optional** section evaluates the predictions by the model and identifies the best and worst performing sentences in terms of guaging the accuracy of translation and range of scores predicted.

In [0]:
en_corpus = en_train_sentences.copy()
en_corpus.extend(en_val_sentences)

de_corpus = de_train_sentences.copy()
de_corpus.extend(de_val_sentences)

In [0]:
# Methhod that finds the most well-predicted and worst-predicted scores of translation
# Parameters: list of model predicted scores, list of actual scores, indexes within the training corpus,
# the number of best/worst predicted sentences to output, to get the worst (True) or best (False) scores
# Returns a dataframe of the best/worst English and German sentences with the actual and predicted score
def extremes(predictons, target, val_ind, n=5, mistakes=True):
    diff = np.array(predictions)-np.array(target)
    a, b, c, d = (list(t) for t in zip(*sorted(zip(np.abs(diff), val_ind, predictions, target))))
    df = pd.DataFrame(columns=['EN', 'DE', 'Actual', 'Pred'])
    
    for i in range(1, n+1):
        if mistakes:
          i = -i
        pred = c[i]
        actual = d[i]
        ind = b[i]
        en_sentence = en_corpus[ind]
        de_sentence = de_corpus[ind]
        df = df.append(pd.Series([en_sentence, de_sentence, actual, pred], index=df.columns), 
                       ignore_index=True)
        
    return df

In [0]:
plt.hist(predictions, bins=20) # To plot range of values output

In [0]:
extremes(predictions, Yval, val_index, 50, True)

# Output Results

This section takes the mean out of the 8 predictions during the KFold cross validation and writes it into a text file.

In [0]:
testResult = np.mean(testResults, axis=0)
plt.hist(testResult, bins=20)
plt.show()

In [0]:
from zipfile import ZipFile

def writeScores(scores):
    print("")
    with open(fn, 'w') as output_file:
        for idx,x in enumerate(scores):
            output_file.write(f"{x}\n")

fn = "predictions.txt"
writeScores(testResult)
files.download(fn)

In [0]:
with ZipFile("en-de_svr.zip","w") as newzip:
	newzip.write("predictions.txt")