## 1. Downloads and imports

In [None]:
# Fetch and unpack the dataset archive; skipped when ende_data.zip is already present.
from os.path import exists
if not exists('ende_data.zip'):
    # -O pins the local file name so it matches the existence check above.
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip

In [None]:
!spacy download en_core_web_md
!spacy link en_core_web_md en300

!spacy download de_core_news_md
!spacy link de_core_news_md de300

In [None]:
import spacy
import numpy as np

In [None]:
# Load both pipelines through the aliases created by `spacy link` above.
nlp_de = spacy.load('de300')
nlp_en = spacy.load('en300')

In [None]:
from nltk import download
from nltk.corpus import stopwords
download('stopwords') #stopwords dictionary, run once

# Kept as sets because the embedding functions test membership once per token.
stop_words_en = set(stopwords.words('english'))
stop_words_de = set(stopwords.words('german'))

In [None]:
!pip install keras_self_attention

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, GlobalMaxPooling1D, SpatialDropout1D, Bidirectional, Flatten, Input, Concatenate
from keras_self_attention import SeqSelfAttention
import tensorflow as tf

In [None]:
from scipy.stats import pearsonr 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import KFold

In [None]:
from sklearn.utils import shuffle

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
import matplotlib.pyplot as plt

## 2. Utilities

In [None]:
def get_embeddings(lines, nlp, stopwords, lang):
  """Turn each sentence into a list of spaCy word vectors.

  Args:
    lines: iterable of raw sentence strings.
    nlp: loaded spaCy pipeline; it tokenises the lines and supplies the vectors.
    stopwords: collection of stop words to drop (NLTK's lists are lower-case).
    lang: language code; unused here, kept so every embedding getter shares
      the same signature.

  Returns:
    One entry per input line: the list of vectors of the tokens that survived
    filtering. The list is empty when every token was filtered out.
  """
  # Tokens without a vector fall back to whatever the vocab stores for 'unk'.
  unknown = nlp.vocab['unk'].vector
  punctuation = {',', '.', '...', '\'', '"', '(', ')', '[', ']'}
  lines_embs = []

  # `n_threads` is no longer passed: spaCy deprecated it and later removed it.
  for doc in nlp.pipe(lines, batch_size=32):
    vectors = []
    for token in doc:
      # Compare lower-cased so capitalised stop words ("The", "Der") are dropped too.
      if token.text.lower() in stopwords or token.text in punctuation:
        continue
      vectors.append(token.vector if token.has_vector else unknown)
    lines_embs.append(vectors)
  return lines_embs

In [None]:
def pad_sent(lst, pad=35, dim=300):
    """Pad (or truncate) every sentence to exactly `pad` vectors.

    Args:
      lst: list of sentences, each a list of equally sized 1-D vectors.
      pad: target sentence length; the default is the maximum sentence
        length for train, validation and test data.
      dim: vector width used for sentences with no vectors at all; for
        non-empty sentences the width is taken from the data.

    Returns:
      A list of arrays of shape `(pad, width)`, zero-padded at the end.
    """
    arr = []
    for sent in lst:
      vectors = np.asarray(sent)
      if vectors.size == 0:
        # An empty sentence has no width of its own; use the declared one.
        vectors = np.zeros((0, dim))
      else:
        # Longer sentences are cut so every output has the same shape.
        vectors = vectors[:pad]
      filler = np.zeros((pad - len(vectors), vectors.shape[1]))
      arr.append(np.concatenate((vectors, filler), axis=0))
    return arr

In [None]:
# Converts scores file to list of floats
def get_scores(f):
  """Read one float per line from the file at path `f`.

  Blank lines (for example a trailing newline at the end of the file) are
  skipped instead of raising `ValueError`.

  Args:
    f: path of the scores file.

  Returns:
    List of floats in file order.
  """
  with open(f, 'r') as scores_file:
    return [float(line) for line in scores_file if line.strip()]

## 3. Shuffling and splitting

In [None]:
def _read_lines(path):
  """Return all lines of a text file, closing the file afterwards."""
  with open(path) as handle:
    return handle.readlines()


# Combines training and validation data, shuffles and splits 87.5% / 12.5%
def shuffle_and_split():
  """Merge the train and dev sets and re-split them reproducibly.

  Returns:
    `(X_train, X_val, y_train, y_val)`, where the X lists hold
    `(english_line, german_line)` tuples and the y lists hold float scores.

  Raises:
    ValueError: if source lines, translations and scores differ in number.
  """
  lines_en = _read_lines('./train.ende.src') + _read_lines('./dev.ende.src')
  lines_de = _read_lines('./train.ende.mt') + _read_lines('./dev.ende.mt')
  scores = get_scores('./train.ende.scores') + get_scores('./dev.ende.scores')

  # A silent mismatch would pair sentences with the wrong scores.
  if not (len(lines_en) == len(lines_de) == len(scores)):
    raise ValueError(
        'Mismatched data: %d source lines, %d translations, %d scores'
        % (len(lines_en), len(lines_de), len(scores)))

  data = list(zip(lines_en, lines_de))

  X_train, X_val, y_train, y_val = train_test_split(data, scores, train_size=0.875, random_state=42, shuffle=True)

  return X_train, X_val, y_train, y_val

In [None]:
# Split once, then separate each (english, german) pair into per-language lists
X_train, X_val, y_train, y_val = shuffle_and_split()
english_train = [pair[0] for pair in X_train]
german_train = [pair[1] for pair in X_train]
english_val = [pair[0] for pair in X_val]
german_val = [pair[1] for pair in X_val]

## 4. Sentence averages to MLP

Get embeddings and pad training and validation data

In [None]:
# English training sentences -> one list of token vectors per sentence.
english_embs = get_embeddings(english_train, nlp_en, stop_words_en, 'en')

In [None]:
# Zero-pad every English training sentence to the fixed length used by pad_sent.
english_embs = pad_sent(english_embs)

In [None]:
# German training sentences -> one list of token vectors per sentence.
german_embs = get_embeddings(german_train, nlp_de, stop_words_de, 'de')

In [None]:
# Zero-pad every German training sentence to the fixed length used by pad_sent.
german_embs = pad_sent(german_embs)

In [None]:
# Collapse each padded English training sentence to its column-wise mean
# (the zero padding rows are part of the mean).
english_embs = [np.array(sentence).mean(axis=0) for sentence in english_embs]

In [None]:
# Collapse each padded German training sentence to its column-wise mean
# (the zero padding rows are part of the mean).
german_embs = [np.array(sentence).mean(axis=0) for sentence in german_embs]

In [None]:
# English validation sentences -> one list of token vectors per sentence.
english_embs2 = get_embeddings(english_val, nlp_en, stop_words_en, 'en')

In [None]:
# German validation sentences -> one list of token vectors per sentence.
german_embs2 = get_embeddings(german_val, nlp_de, stop_words_de, 'de')

In [None]:
# Zero-pad every English validation sentence to the fixed length used by pad_sent.
english_embs2 = pad_sent(english_embs2)

In [None]:
# Zero-pad every German validation sentence to the fixed length used by pad_sent.
german_embs2 = pad_sent(german_embs2)

Average the padded validation embeddings into one mean vector per sentence

In [None]:
# Collapse each padded German validation sentence to its column-wise mean.
german_embs2 = [np.array(sentence).mean(axis=0) for sentence in german_embs2]

In [None]:
# Collapse each padded English validation sentence to its column-wise mean.
english_embs2 = [np.array(sentence).mean(axis=0) for sentence in english_embs2]

In [None]:
# Training features: English mean vector followed by the German one, per sentence pair.
X_train = [
    np.concatenate((np.array(english_embs[idx]), np.array(german_embs[idx])))
    for idx in range(len(english_embs))
]

In [None]:
# Validation features: English mean vector followed by the German one, per sentence pair.
X_val = [
    np.concatenate((np.array(english_embs2[idx]), np.array(german_embs2[idx])))
    for idx in range(len(english_embs2))
]

MLP Model

In [None]:
# Regression MLP over the concatenated English + German sentence vectors.
m = Sequential()
# Only the first layer declares the input width; later layers take their
# input size from the layer before them.
m.add(Dense(64, activation='relu', input_dim=600))
m.add(Dense(128, activation='relu'))
m.add(Dense(64, activation='relu'))
m.add(Dense(1))  # single linear output: the predicted score
m.summary()
m.compile(loss='mse',
    optimizer='Adam',
    metrics=['mae'])

In [None]:
# Validation targets are converted to an array as well, so every input and
# target passed to fit has the same container type.
m.fit(np.array(X_train), np.array(y_train), epochs=10, validation_data=(np.array(X_val), np.array(y_val)), verbose=1)

In [None]:
# Correlation between the MLP's validation predictions and the gold scores.
val_predictions = m.predict(np.array(X_val)).squeeze()
pearson_score, _ = pearsonr(val_predictions, y_val)
print("Pearson score: ", pearson_score)

## 5. Parameter tuning with cross validation for LSTM

In [None]:
def param_tuning():
  """Cross-validate every combination of the LSTM hyper-parameter grid.

  The grid is the Cartesian product of LSTM units, LSTM dropout, dense layer
  widths and dense activations. Combinations are numbered in iteration order
  and that number is passed on as `model_id`.
  """
  lstm_units = [32, 64, 128]
  lstm_dropouts = [0.1, 0.2, 0.01]
  dense_neurons = [[64, 128], [32, 64], [128, 256]]
  dense_activations = [["relu", "relu"], ["tanh", "tanh"], ["relu", "tanh"]]

  # Same nesting order as four nested loops: units outermost, activations innermost.
  grid = [
      (units, dropout, neurons, activations)
      for units in lstm_units
      for dropout in lstm_dropouts
      for neurons in dense_neurons
      for activations in dense_activations
  ]

  for model_id, (units, dropout, neurons, activations) in enumerate(grid):
    print(dropout, neurons)
    cross_validation(units, dropout, neurons, activations, model_id, get_embeddings)

In [None]:
def get_baseline_lstm_model(lstm_units=64, lstm_dropout=0.1, num_of_dense=3, dense_neurons=(64, 128), dense_activations=("relu", "relu"), seq_len=None, emb_dim=300):
  """Build and compile the two-branch self-attention + BiLSTM regressor.

  Each input goes through `SeqSelfAttention` and a bidirectional LSTM; the two
  branch outputs are concatenated and fed to the dense layers and a single
  linear output unit.

  Args:
    lstm_units: LSTM units per direction.
    lstm_dropout: dropout passed to each LSTM.
    num_of_dense: total number of dense layers including the output layer;
      `max(1, num_of_dense - 1)` hidden layers are created.
    dense_neurons: widths of the hidden dense layers, in order.
    dense_activations: activations of the hidden dense layers, in order.
    seq_len: number of vectors per sentence. `None` (the default) accepts any
      padded length, so the model no longer has to be kept in step with the
      padding length by hand.
    emb_dim: width of each word vector.

  Returns:
    The compiled Keras model (loss MSE, optimizer Adam, metric MAE).

  Raises:
    ValueError: if fewer widths or activations are given than hidden layers.
  """
  hidden_layers = max(1, num_of_dense - 1)
  if hidden_layers > min(len(dense_neurons), len(dense_activations)):
    raise ValueError(
        'num_of_dense=%d needs %d hidden layer settings, got %d widths and %d activations'
        % (num_of_dense, hidden_layers, len(dense_neurons), len(dense_activations)))

  # LSTM Approach
  inputA = Input(shape=(seq_len, emb_dim))
  inputB = Input(shape=(seq_len, emb_dim))

  # first branch for first input
  x = SeqSelfAttention()(inputA)
  x = Bidirectional(LSTM(units=lstm_units, return_sequences=False, dropout=lstm_dropout))(x)
  # second branch for second input
  y = SeqSelfAttention()(inputB)
  y = Bidirectional(LSTM(units=lstm_units, return_sequences=False, dropout=lstm_dropout))(y)
  # combines the two branches
  z = Concatenate(axis=-1)([x, y])
  # FC layers
  for width, activation in zip(dense_neurons[:hidden_layers], dense_activations[:hidden_layers]):
    z = Dense(width, activation=activation)(z)
  z = Dense(1)(z)

  model = Model(inputs=[inputA, inputB], outputs=z)
  model.summary()
  model.compile(
      loss='mse',
      optimizer='Adam',
      metrics=['mae']
  )
  return model

In [None]:
def cross_validation(units, dropout, neurons, activations, model_id, get_embeddings):
  """Run 8-fold cross-validation of the LSTM model on train + dev data.

  The train and dev files are concatenated and shuffled (seed 42). For every
  fold a fresh model is trained on the remaining folds. The first 500 samples
  of the held-out fold drive early stopping; the rest of that fold is the
  test set on which Pearson correlation and MSE are reported.

  Args:
    units: LSTM units per direction.
    dropout: dropout passed to the LSTMs.
    neurons: widths of the hidden dense layers.
    activations: activations of the hidden dense layers.
    model_id: identifier of the configuration; not used in this function.
    get_embeddings: callable `(lines, nlp, stopwords, lang)` returning one
      list of vectors per line. Inside this body it shadows the module-level
      function of the same name.

  Returns:
    `(average_pearson, average_mse)` over all folds.

  Raises:
    ValueError: if source lines, translations and scores differ in number.
  """
  n_splits = 8
  kf = KFold(n_splits=n_splits, shuffle=False, random_state=None)

  def read_lines(path):
    # Context manager so no file handle outlives the read.
    with open(path) as handle:
      return handle.readlines()

  # Combine training and validation data
  lines_en = read_lines('./train.ende.src') + read_lines('./dev.ende.src')
  lines_de = read_lines('./train.ende.mt') + read_lines('./dev.ende.mt')
  scores = [float(score) for score in
            read_lines('./train.ende.scores') + read_lines('./dev.ende.scores')]

  if not (len(lines_en) == len(lines_de) == len(scores)):
    raise ValueError(
        'Mismatched data: %d source lines, %d translations, %d scores'
        % (len(lines_en), len(lines_de), len(scores)))

  # sklearn's shuffle returns shuffled copies; the result must be kept,
  # otherwise the folds are cut from the unshuffled data.
  lines_en, lines_de, scores = shuffle(lines_en, lines_de, scores, random_state=42)

  # Each sentence is embedded independently of the fold it lands in, so embed
  # and pad everything once and only index per fold.
  en_all = np.array(pad_sent(get_embeddings(lines_en, nlp_en, stop_words_en, 'en')))
  de_all = np.array(pad_sent(get_embeddings(lines_de, nlp_de, stop_words_de, 'de')))
  scores = np.array(scores)

  average_pearson = 0
  average_mse = 0
  for train_index, held_out_index in kf.split(en_all):
    # Keep 500 samples for validation, the rest of the fold for testing
    val_index = held_out_index[:500]
    test_index = held_out_index[500:]

    # Train model
    model = get_baseline_lstm_model(lstm_units=units, lstm_dropout=dropout, dense_neurons=neurons, dense_activations=activations)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    model.fit(
        [en_all[train_index], de_all[train_index]],
        scores[train_index],
        epochs=10,
        validation_data=([en_all[val_index], de_all[val_index]], scores[val_index]),
        verbose=1,
        batch_size=512,
        callbacks=[es])

    # Get metrics for test predictions
    test_inputs = [en_all[test_index], de_all[test_index]]
    y_test = scores[test_index]
    predictions = model.predict(test_inputs)
    (pearson, _) = pearsonr(predictions.squeeze(), y_test)
    average_pearson += pearson
    print("Pearson score: ", pearson)
    # evaluate returns [loss, mae] for this model; the loss is the MSE.
    mse, _ = model.evaluate(test_inputs, y_test)
    average_mse += mse
    print("MSE: ", mse)

  average_pearson /= n_splits
  average_mse /= n_splits
  print("Average pearson score: ", average_pearson)
  print("Average mse: ", average_mse)
  return average_pearson, average_mse

In [None]:
# Full grid: 3 * 3 * 3 * 3 = 81 configurations, each cross-validated over 8 folds.
param_tuning()

## 6. Different embeddings with LSTM

In [None]:
# Install FastText
!git clone https://github.com/facebookresearch/fastText.git
!pip install ./fastText/.
import fasttext
import fasttext.util

Embeddings class

In [None]:
# Embeddings class for FastText and Muse
class Embedding:
  """Loader for the fastText and MUSE word vectors.

  Attributes:
    ft_en: English fastText model (also reachable as `ft`).
    ft_de: German fastText model.
    wvecs: dict word -> list of floats with the English MUSE vectors.
    german_wvecs: dict word -> list of floats with the German MUSE vectors.
    nlp_en, nlp_de: placeholders, not set by any method of this class.
  """

  # Reduced (100-d) fastText models. The paths are relative, so they resolve
  # against the working directory; load_fast_text mounts Drive at
  # /content/drive, which matches when the working directory is /content.
  FT_EN_PATH = 'drive/My Drive/cc.en.100.bin'
  FT_DE_PATH = 'drive/My Drive/cc.de.100.bin'

  def __init__(self):
    self.ft = None
    self.ft_en = None
    self.ft_de = None
    self.nlp_de = None
    self.nlp_en = None
    self.wvecs = None
    self.german_wvecs = None

  @staticmethod
  def _fetch(url, gunzip=False):
    """Download `url` into the working directory unless it is already there.

    With `gunzip=True` a `.gz` download is decompressed and the archive is
    removed afterwards. Returns the local file name.
    """
    import gzip
    import os
    import shutil
    import urllib.request

    downloaded = url.rsplit('/', 1)[-1]
    if gunzip and downloaded.endswith('.gz'):
      target = downloaded[:-len('.gz')]
    else:
      target = downloaded
    if os.path.exists(target):
      return target

    if not os.path.exists(downloaded):
      # Write to a temporary name first so an interrupted download is never
      # mistaken for a complete file on the next run.
      partial = downloaded + '.part'
      with urllib.request.urlopen(url) as response, open(partial, 'wb') as out:
        shutil.copyfileobj(response, out)
      os.replace(partial, downloaded)

    if target != downloaded:
      partial = target + '.part'
      with gzip.open(downloaded, 'rb') as packed, open(partial, 'wb') as out:
        shutil.copyfileobj(packed, out)
      os.replace(partial, target)
      os.remove(downloaded)
    return target

  @staticmethod
  def _read_vec(path):
    """Parse a text `.vec` file into a dict word -> list of floats."""
    vectors = {}
    with open(path, "r", encoding="utf-8") as vec_file:
      for line in vec_file:
        # rstrip drops the newline and any trailing space before splitting.
        fields = line.rstrip().split(" ")
        # Skips blank lines and the "<count> <dim>" header line.
        if len(fields) <= 2:
          continue
        vectors[fields[0]] = [float(a) for a in fields[1:]]
    return vectors

  def download_fast_text(self):
    """Download the 300-d fastText models, reduce them to 100-d and save them.

    The target directory of FT_EN_PATH / FT_DE_PATH must already exist; this
    method does not mount Drive itself.
    """
    en_bin = self._fetch('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz', gunzip=True)
    de_bin = self._fetch('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz', gunzip=True)

    self.ft_en = fasttext.load_model(en_bin)
    self.ft = self.ft_en  # older attribute name, kept for existing users
    self.ft_de = fasttext.load_model(de_bin)

    fasttext.util.reduce_model(self.ft_en, 100)
    self.ft_en.save_model(self.FT_EN_PATH)

    fasttext.util.reduce_model(self.ft_de, 100)
    # Each model is saved to its own file.
    self.ft_de.save_model(self.FT_DE_PATH)

  def load_fast_text(self):
    """Mount Google Drive and load the reduced fastText models from it."""
    from google.colab import drive
    drive.mount('/content/drive')
    self.ft_en = fasttext.load_model(self.FT_EN_PATH)
    self.ft = self.ft_en
    self.ft_de = fasttext.load_model(self.FT_DE_PATH)

  def load_muse(self):
    """Download (if needed) and parse the English and German MUSE vectors."""
    en_vec = self._fetch('https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec')
    de_vec = self._fetch('https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec')

    self.wvecs = self._read_vec(en_vec)
    self.german_wvecs = self._read_vec(de_vec)

In [None]:
def get_fasttext_embeddings(lines, nlp, stopwords, lang):
  """Turn each sentence into a list of fastText word vectors.

  Args:
    lines: iterable of raw sentence strings.
    nlp: loaded spaCy pipeline, used only for tokenisation.
    stopwords: collection of stop words to drop (NLTK's lists are lower-case).
    lang: 'en' selects the English fastText model, anything else the German one.

  Returns:
    One entry per input line: the list of vectors of the tokens that survived
    filtering.
  """
  punctuation = {',', '.', '...', '\'', '"', '(', ')', '[', ']'}

  # load_fast_text mounts Drive and reads two model files; do that once and
  # reuse the loaded models on later calls.
  embedding = getattr(get_fasttext_embeddings, '_embedding', None)
  if embedding is None:
    embedding = Embedding()
    embedding.load_fast_text()
    get_fasttext_embeddings._embedding = embedding
  model = embedding.ft_en if lang == 'en' else embedding.ft_de

  lines_embs = []
  # `n_threads` is no longer passed: spaCy deprecated it and later removed it.
  for doc in nlp.pipe(lines, batch_size=32):
    embs = []
    for token in doc:
      # Compare lower-cased so capitalised stop words are dropped too.
      if token.text.lower() in stopwords or token.text in punctuation:
        continue
      embs.append(model.get_word_vector(token.text))
    lines_embs.append(embs)
  return lines_embs

In [None]:
def get_fasttext_embedding_results():
  """Cross-validate the best LSTM configuration on fastText embeddings."""
  # cross_validation takes six arguments and builds a fresh model per fold
  # itself, so no model is constructed here.
  cross_validation(64, 0.1, [64,128], ['relu','relu'], 0, get_fasttext_embeddings)


In [None]:
# Runs the 8-fold cross-validation with fastText vectors and prints the per-fold and average metrics.
get_fasttext_embedding_results()

In [None]:
!pip install bert-embedding

In [None]:
from bert_embedding import BertEmbedding

In [None]:
def get_bert_embeddings(lines, nlp, stopwords, lang):
  """Turn each sentence into a list of multilingual BERT token embeddings.

  The sentences are tokenised with spaCy, stop words and punctuation are
  removed, and the remaining tokens are re-joined with single spaces before
  being passed to `BertEmbedding`.

  Args:
    lines: iterable of raw sentence strings.
    nlp: loaded spaCy pipeline, used only for tokenisation.
    stopwords: collection of stop words to drop (NLTK's lists are lower-case).
    lang: language code; unused here, kept so every embedding getter shares
      the same signature.

  Returns:
    One entry per input line: the embeddings part of the pair that
    `BertEmbedding` returns for that line.
  """
  punctuation = {',', '.', '...', '\'', '"', '(', ')', '[', ']'}

  # Building BertEmbedding is the expensive part; do it once and reuse it.
  embedding = getattr(get_bert_embeddings, '_embedding', None)
  if embedding is None:
    embedding = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_multilingual')
    get_bert_embeddings._embedding = embedding

  new_lines = []
  # `n_threads` is no longer passed: spaCy deprecated it and later removed it.
  for doc in nlp.pipe(lines, batch_size=32):
    kept = [
        token.text for token in doc
        # Compare lower-cased so capitalised stop words are dropped too.
        if token.text.lower() not in stopwords and token.text not in punctuation
    ]
    new_lines.append(" ".join(kept))

  bert_res = embedding(new_lines)
  return [emb for (_, emb) in bert_res]

In [None]:
def get_bert_embedding_results():
  """Cross-validate the best LSTM configuration on BERT embeddings."""
  # cross_validation takes six arguments and builds a fresh model per fold
  # itself, so no model is constructed here.
  cross_validation(64, 0.1, [64,128], ['relu','relu'], 0, get_bert_embeddings)

In [None]:
# Runs the 8-fold cross-validation with BERT vectors and prints the per-fold and average metrics.
get_bert_embedding_results()

## 7. Best LSTM model training and validation

In [None]:
# Get training and validation embeddings
# Files are read inside `with` blocks so no handle stays open in the kernel.
with open('./train.ende.src') as src_file:
  lines_en = src_file.readlines()
with open('./train.ende.mt') as mt_file:
  lines_de = mt_file.readlines()

english_train_embeddings = get_embeddings(lines_en, nlp_en, stop_words_en, 'en')
german_train_embeddings = get_embeddings(lines_de, nlp_de, stop_words_de, 'de')

scores_train = get_scores('./train.ende.scores')

with open('./dev.ende.src') as src_file:
  english_val_embeddings = get_embeddings(src_file.readlines(), nlp_en, stop_words_en, 'en')
with open('./dev.ende.mt') as mt_file:
  german_val_embeddings = get_embeddings(mt_file.readlines(), nlp_de, stop_words_de, 'de')

scores_val = get_scores('./dev.ende.scores')


In [None]:
# Same hyper-parameters as the defaults of get_baseline_lstm_model, spelled out explicitly.
best_model = get_baseline_lstm_model(lstm_units=64, lstm_dropout=0.1, num_of_dense=3, dense_neurons=[64,128], dense_activations=["relu", "relu"])

In [None]:
# Zero-pad the English training sentences to a common length.
english_train_embeddings = pad_sent(english_train_embeddings)

In [None]:
# Zero-pad the German training sentences to a common length.
german_train_embeddings = pad_sent(german_train_embeddings)

In [None]:
# Zero-pad both validation sets to the same length as the training data.
english_val_embeddings = pad_sent(english_val_embeddings)
german_val_embeddings = pad_sent(german_val_embeddings)

In [None]:
# Train on the full training set; the first 500 dev samples drive early stopping.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = best_model.fit(
    [np.array(english_train_embeddings), np.array(german_train_embeddings)],
    np.array(scores_train),
    epochs=10,
    # Converted to arrays like the training inputs: a plain list of per-sentence
    # arrays is not a single input tensor.
    validation_data=(
        [np.array(english_val_embeddings[:500]), np.array(german_val_embeddings[:500])],
        np.array(scores_val[:500]),
    ),
    verbose=1,
    batch_size=1024,
    callbacks=[es])

In [None]:
# Pearson correlation on the dev samples that were not used for early stopping.
holdout_inputs = [np.array(english_val_embeddings[500:]), np.array(german_val_embeddings[500:])]
pearsonr(best_model.predict(holdout_inputs).squeeze(), np.array(scores_val[500:]))

In [None]:
# Training vs. validation loss per epoch, saved to loss.png.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
# plt.show()
fig.savefig('loss.png')

In [None]:
# Get test embeddings
english_test_embeddings = get_embeddings(open('./test.ende.src').readlines(), nlp_en, stop_words_en, 'en')
english_test_embeddings = pad_sent(english_test_embeddings)
german_test_embeddings = get_embeddings(open('./test.ende.mt').readlines(), nlp_de, stop_words_de, 'de')
german_test_embeddings = pad_sent(german_test_embeddings)

In [None]:
# `best_model` is the network trained above; the visible notebook never binds
# a top-level `model`, so the old name fails on a fresh kernel.
predictions_test = best_model.predict([np.array(english_test_embeddings), np.array(german_test_embeddings)])

In [None]:
# One prediction per line; the context manager closes the file even if a write fails.
with open("predictions.txt", "w") as f:
  for num in predictions_test:
    f.write(f"{num[0]}\n")