#**1. IMPORTING**

In [0]:
from os.path import exists
# Download and unpack the en-de dataset archive only when the zip is not already
# in the working directory, so re-running this cell does not fetch it again.
if not exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip

In [44]:
# Fetch the pinned requirements file once; the install itself runs on every
# execution of this cell (it sits outside the `if`).
if not exists('requirements.txt'):
  !wget https://raw.githubusercontent.com/Unbabel/KiwiCutter/master/requirements.txt
!pip install -r requirements.txt

--2020-02-28 18:12:11--  https://raw.githubusercontent.com/Unbabel/KiwiCutter/master/requirements.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1419 (1.4K) [text/plain]
Saving to: ‘requirements.txt’


2020-02-28 18:12:11 (220 MB/s) - ‘requirements.txt’ saved [1419/1419]

Collecting alembic==1.0.11
[?25l  Downloading https://files.pythonhosted.org/packages/7b/8b/0c98c378d93165d9809193f274c3c6e2151120d955b752419c7d43e4d857/alembic-1.0.11.tar.gz (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.7MB/s 
[?25hCollecting appnope==0.1.0
  Downloading https://files.pythonhosted.org/packages/87/a9/7985e6a53402f294c8f0e8eff3151a83f1fb901fa92909bb3ff29b4d22af/appnope-0.1.0-py2.py3-none-any.whl
Collecting attrs==19.1.0
  Downloading https://files.pythonhosted

In [0]:
# All imports

# !git clone https://github.com/facebookresearch/fastText.git
# !pip install ./fastText/.
# import fasttext
# import fasttext.util

import spacy
import numpy as np

from nltk import download
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity as sim

import pickle

from scipy.optimize import linear_sum_assignment

import tqdm

import kiwi

In [46]:
# Get OpenKiwi model
if not exists('estimator_en_de.zip'):
    !wget -o estimator_en_de.zip 'https://github.com/unbabel/KiwiCutter/releases/download/v1.0/estimator_en_de.torch.zip'
    !unzip estimator_en_de.torch
model_kiwi = kiwi.load_model('estimator_en_de.torch')

model_kiwi.predict({'source': ['I like to hike in the mountains'],
                    'target':['Ich wandere gerne in den Bergen']})

Archive:  estimator_en_de.torch.zip
  inflating: estimator_en_de.torch   


{'gap_tags': [[0.5304548740386963,
   0.0649648904800415,
   0.0730169266462326,
   0.08218620717525482,
   0.03723501041531563,
   0.15140247344970703,
   0.02526249922811985]],
 'sentence_scores': [0.6461204886436462],
 'tags': [[0.9378803372383118,
   0.9655714631080627,
   0.9754980206489563,
   0.9426648616790771,
   0.9505487680435181,
   0.9422469735145569]]}

# **2. Preprocessing and Feature Extraction**

In [0]:
# Download and load different kinds of embeddings
class Embedding:
  def __init__(self):
    self.ft = None
    self.ft_de = None
    self.nlp_de = None
    self.nlp_en = None
    self.wvecs = None
    self.german_wvecs = None
    # stopwords dictionary, run once
    download('stopwords')
    self.stop_words_en = set(stopwords.words('english'))
    self.stop_words_de = set(stopwords.words('german'))

  def download_fast_text(self):
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
    !gunzip cc.en.300.bin.gz
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz
    !gunzip cc.de.300.bin.gz
    from google.colab import drive
    drive.mount('/content/drive')

    # FastText models are too large, hence reduce the model dimentions to 100
    self.ft = fasttext.load_model('cc.en.300.bin')
    self.ft_de = fasttext.load_model('cc.de.300.bin')
    fasttext.util.reduce_model(self.ft, 100)
    self.ft.save_model('/drive/My Drive/cc.en.100.bin')

    fasttext.util.reduce_model(self.ft_de, 100)
    self.ft.save_model('drive/My Drive/cc.de.100.bin')

  def load_fast_text(self):
    self.ft_en = fasttext.load_model('drive/My Drive/cc.de.100.bin')
    self.ft_de = fasttext.load_model('drive/My Drive/cc.en.100.bin')

  def load_spacy(self):
    !spacy download en_core_web_md
    !spacy link en_core_web_md en300

    !spacy download de_core_news_md
    !spacy link de_core_news_md de300

    self.nlp_de = spacy.load('de300')
    self.nlp_en = spacy.load('en300')

  def load_muse(self):
    !wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
    !wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec

    self.wvecs = {}
    with open("./wiki.multi.en.vec", "r") as ende_src:
      for line in ende_src:
        word = line.split(" ")[0]
        vector = [float(a) for a in line.split(" ")[1:]]
        self.wvecs[word] = vector

    self.german_wvecs = {}
    with open("./wiki.multi.de.vec", "r") as ende_src:
      for line in ende_src:
        word = line.split(" ")[0]
        vector = [float(a) for a in line.split(" ")[1:]]
        self.german_wvecs[word] = vector

In [0]:
# Helper functions
class HelperMethods:
  """Stateless text helpers shared by the feature extractors.

  The stop-word sets are read from the module-level `embedding` object.
  """

  @staticmethod
  def _stop_words_for(lang):
    """Return the stop-word set for `lang` ('en' or 'de').

    Raises:
      ValueError: for any other language code (previously this surfaced later
        as an UnboundLocalError).
    """
    if lang == 'en':
      return embedding.stop_words_en
    if lang == 'de':
      return embedding.stop_words_de
    raise ValueError("Unsupported language %r; expected 'en' or 'de'" % (lang,))

  @staticmethod
  def _content_lemmas(line, nlp, lang):
    """Lower-case `line`, tokenize it with `nlp.tokenizer` and drop stop-word lemmas."""
    stop_words = HelperMethods._stop_words_for(lang)
    lemmas = [token.lemma_ for token in nlp.tokenizer(line.lower())]
    return [word for word in lemmas if word not in stop_words]

  @staticmethod
  def get_sentence_emb(line, nlp, lang):
    """Return `nlp(...).vector` for the stop-word-filtered, lemmatized sentence."""
    filtered_text = ' '.join(HelperMethods._content_lemmas(line, nlp, lang))
    sentence = nlp(filtered_text)
    return sentence.vector

  @staticmethod
  def get_sentence_emb_using_word_embs(line, nlp, lang):
    """Return one `nlp(word).vector` per stop-word-filtered lemma of the sentence."""
    return [nlp(word).vector
            for word in HelperMethods._content_lemmas(line, nlp, lang)]

  @staticmethod
  def get_tokens(line, lang, nlp):
    """Return `nlp` applied to the lower-cased, lemma-joined sentence.

    `lang` is accepted for signature symmetry with the other helpers but is not used.
    """
    lemmas = [token.lemma_ for token in nlp.tokenizer(line.lower())]
    return nlp(' '.join(lemmas))

In [0]:
# Static methods that compute the features for a sentence or a pair of sentences
class FeatureExtraction:
  """Static feature extractors for a single sentence or a source/target pair.

  Sentences are tokenized through HelperMethods.get_tokens. The `nlp` arguments
  are the spaCy pipelines held by the module-level `embedding` object.
  """

  @staticmethod
  def get_pos_tag_counts(line, lang, nlp):
    """Count tokens per POS tag; the key order below fixes the feature order.

    Raises KeyError if the tagger emits a tag that is not listed here.
    """
    pos_counts = {'ADJ': 0,
                  'ADP': 0,
                  'ADV': 0,
                  'AUX': 0,
                  'CONJ': 0,
                  'CCONJ': 0,
                  'DET': 0,
                  'INTJ': 0,
                  'NOUN': 0,
                  'NUM': 0,
                  'PART': 0,
                  'PRON': 0,
                  'PROPN': 0,
                  'PUNCT': 0,
                  'SCONJ': 0,
                  'SYM': 0,
                  'VERB': 0,
                  'X': 0,
                  'SPACE': 0}

    sen = HelperMethods.get_tokens(line, lang, nlp)
    for token in sen:
      pos_counts[token.pos_] += 1

    return pos_counts

  @staticmethod
  def get_kiwi_scores(en_sent, de_sent):
    """Return the OpenKiwi sentence-level score for one source/target pair."""
    scores = model_kiwi.predict({'source': [en_sent], 'target':[de_sent]})
    return scores['sentence_scores'][0]

  @staticmethod
  def get_cosine_sim_with_google_translations(filepath):
    """Read one float per line from `filepath` and return them as a float array."""
    # The context manager closes the handle; the original left it open.
    with open(filepath) as score_file:
      return np.array([float(line) for line in score_file], dtype=float)

  @staticmethod
  def get_named_entities(line, lang, nlp):
    """Count named entities per label; unknown labels are reported and counted under ''."""
    named_entities = {
        u'CARDINAL': 0,
        u'DATE': 0,
        u'EVENT': 0,
        u'FAC': 0,
        u'GPE': 0,
        u'LANGUAGE': 0,
        u'LAW': 0,
        u'LOC': 0,
        u'MONEY': 0,
        u'NORP': 0,
        u'ORDINAL': 0,
        u'ORG': 0,
        u'PERCENT': 0,
        u'PERSON': 0,
        u'PRODUCT': 0,
        u'QUANTITY': 0,
        u'TIME': 0,
        u'WORK_OF_ART': 0,
        u'': 0,
        u'MISC': 0,
        u'PER': 0
    }
    sen = HelperMethods.get_tokens(line, lang, nlp)
    for ent in sen.ents:
      if ent.label_ in named_entities:
        named_entities[ent.label_] += 1
      else:
        print("Encountered unknown label:", ent.label_)
        named_entities[''] += 1

    return named_entities

  @staticmethod
  def get_num_of_tokens(sent, lang, nlp):
    """Return the number of tokens in `sent`."""
    return len(HelperMethods.get_tokens(sent, lang, nlp))

  @staticmethod
  def get_avg_source_token_length(sent, nlp_en):
    """Return the mean token length of the source sentence (0.0 when it has no tokens)."""
    tokens = HelperMethods.get_tokens(sent, 'en', nlp_en)
    if len(tokens) == 0:
      return 0.0  # avoid ZeroDivisionError on an empty sentence
    return sum(len(token) for token in tokens) / len(tokens)

  @staticmethod
  def get_num_of_punctuations(sent, nlp):
    """Count tokens whose text is one of a fixed list of punctuation marks."""
    punctuation = [',','.','...','\'', '"', '(', ')', '[', ']']
    tokens = HelperMethods.get_tokens(sent, 'en', nlp)
    return sum(1 for token in tokens if token.text in punctuation)

  @staticmethod
  def get_num_of_numberic_tokens(sent, nlp):
    """Count tokens whose text consists only of digits."""
    tokens = HelperMethods.get_tokens(sent, 'en', nlp)
    return sum(1 for token in tokens if token.text.isdigit())

  @staticmethod
  def get_num_of_alpha_tokens(sent, nlp):
    """Count tokens whose text consists only of alphabetic characters."""
    tokens = HelperMethods.get_tokens(sent, 'en', nlp)
    return sum(1 for token in tokens if token.text.isalpha())

  @staticmethod
  def get_ratio_of_target_source_lengths(en_sent, de_sent, nlp_en, nlp_de):
    """Return (target tokens / source tokens) minus the reference target/source ratio.

    NOTE: earlier versions divided source by target while subtracting a
    target/source reference, so the two terms were not comparable. Features
    cached before this fix must be regenerated.
    Returns 0.0 when the source sentence has no tokens.
    """
    standard_ratio_of_target_source = 219 / 200
    num_of_tokens_en = FeatureExtraction.get_num_of_tokens(en_sent, 'en', nlp_en)
    num_of_tokens_de = FeatureExtraction.get_num_of_tokens(de_sent, 'de', nlp_de)
    if num_of_tokens_en == 0:
      return 0.0
    return (num_of_tokens_de / num_of_tokens_en) - standard_ratio_of_target_source

  @staticmethod
  def get_percentage_of_numbers_in_target(sent, nlp):
    """Return the fraction of digit-only tokens in the target sentence (0.0 when empty)."""
    num_of_tokens = FeatureExtraction.get_num_of_tokens(sent, 'de', nlp)
    if num_of_tokens == 0:
      return 0.0
    return FeatureExtraction.get_num_of_numberic_tokens(sent, nlp) / num_of_tokens

  # Helper method for get_similarities - The Hungarian matching algorithm
  @staticmethod
  def get_most_matching_words(matrix):
    """Return the (row_ind, col_ind) minimum-cost assignment for a cost matrix."""
    row_ind, col_ind = linear_sum_assignment(matrix)
    return row_ind, col_ind

  # Helper method for get_similarities
  @staticmethod
  def find_order(arr):
    """Return, for each row of the cost matrix, the column it is matched with."""
    # Static methods are not in scope as bare names; qualify with the class.
    _, col_ind = FeatureExtraction.get_most_matching_words(arr)
    return col_ind

  # Get the cosine similarities between two embeddings by matching two lists of words
  # using the Hungarian matching algorithm.
  @staticmethod
  def get_similarities(english_embs, german_embs):
    """Match German to English word vectors per sentence and return the matched costs.

    For sentence `idx`, a cost matrix of negated cosine similarities (German rows,
    English columns) is zero-padded to a square, solved as an assignment problem,
    and the cost of each row's matched cell is collected. A sentence with no
    German vectors contributes an empty list.
    """
    x_vals = []
    for idx in tqdm.tqdm(range(len(german_embs))):
      de_vecs = german_embs[idx]
      en_vecs = english_embs[idx]

      if len(de_vecs) == 0:
        x_vals.append([])
        continue

      # Negated so that minimising total cost maximises total similarity.
      cost = np.array([[-sim([de_vec, en_vec])[0][1] for en_vec in en_vecs]
                       for de_vec in de_vecs])

      size = max(len(de_vecs), len(en_vecs))
      padded = np.zeros((size, size))
      padded[:cost.shape[0], :cost.shape[1]] = cost

      order = FeatureExtraction.find_order(padded)
      x_vals.append(padded[np.arange(len(order)), order])
    return x_vals

In [0]:
# Methods that perform preprocessing and convert sentences into embeddings
class Preprocessing:
  """Turns sentence files into per-sentence embeddings."""

  # Pad the sentence embeddings with 0
  def pad_sentences(self, embeddings, pad=2900):
    """Right-pad every embedding with zeros up to length `pad`.

    Embeddings already at least `pad` long are returned unpadded (not truncated).
    """
    return [np.concatenate((emb, [0] * (pad - len(emb))), axis=0)
            for emb in embeddings]

  # Returns the fastText embeddings of all its sentences given a filepath
  def get_fast_text_embeddings(self, f, nlp, stopwords, ftm):
    """Return `ftm.get_sentence_vector` for every line of file `f`.

    Each line is lower-cased, tokenized with `nlp.tokenizer`, lemmatized and
    stripped of stop words and punctuation before being embedded.
    """
    punctuation = [',','.','...','\'', '"', '(', ')', '[', ']']
    with open(f) as file:
      lines = file.readlines()

    lines_embs = []
    for line in lines:
      lemmas = [token.lemma_ for token in nlp.tokenizer(line.lower())]
      words = [word for word in lemmas
               if word not in stopwords and word not in punctuation]
      # Join with spaces: joining with '' fused the whole sentence into one token.
      sentence = " ".join(words).rstrip()
      lines_embs.append(ftm.get_sentence_vector(sentence))
    return lines_embs

  # Returns the SpaCy embeddings of all its sentences given a filepath
  def get_spacy_embeddings(self, f, nlp, stopwords):
    """Return the mean token vector of every line of file `f`.

    Stop words and punctuation are skipped. A line with no remaining tokens
    yields a zero vector shaped like `doc.vector` instead of a scalar NaN.
    """
    punctuation = [',','.','...','\'', '"', '(', ')', '[', ']']
    with open(f) as file:
      lines = file.readlines()

    lines_embs = []
    # `n_threads` is deprecated in spaCy, so it is no longer passed.
    for doc in nlp.pipe(lines, batch_size=32):
      vectors = [token.vector for token in doc
                 if token.text not in stopwords and token.text not in punctuation]
      if vectors:
        lines_embs.append(np.mean(np.array(vectors), axis=0))
      else:
        lines_embs.append(np.zeros_like(doc.vector))
    return lines_embs

In [51]:
# Create the shared Embedding object used by every model below. The constructor
# downloads the NLTK stop words; load_spacy() downloads and loads the English and
# German spaCy pipelines into embedding.nlp_en / embedding.nlp_de.
embedding = Embedding()
embedding.load_spacy()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.0MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=a0617070563d0bd43b6867ed972c684058416bd7462b9c22d55e6697139a496e
  Stored in directory: /tmp/pip-ephem-wheel-cache-z8y4o0cq/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/py

In [0]:
# Shared Preprocessing instance used by the embedding helper functions below.
preprocessing = Preprocessing()

#**3A. Embeddings**

This section uses embeddings as input to the different models in later sections. 

In [0]:
# Get English and German FastText Embeddings by passing in arguments:
# en_filepath (the English sentences filepath) and de_filepath (the German sentences filepath)
def get_fast_text_embeddings(en_filepath, de_filepath):
  """Return (english_embs, german_embs) built from fastText sentence vectors.

  Downloads and loads the fastText models through the shared `embedding` object,
  embeds every line of both files, then cuts each returned vector into
  consecutive slices of 100 values.
  """
  def split_into_chunks(flat_embs, width=100):
    # One list of `width`-sized slices per sentence.
    return [[vec[start : (start + width)] for start in range(0, len(vec), width)]
            for vec in flat_embs]

  embedding.download_fast_text()
  embedding.load_fast_text()

  flat_english = preprocessing.get_fast_text_embeddings(
      en_filepath, embedding.nlp_en, embedding.stop_words_en, embedding.ft)
  flat_german = preprocessing.get_fast_text_embeddings(
      de_filepath, embedding.nlp_de, embedding.stop_words_de, embedding.ft_de)

  return split_into_chunks(flat_english), split_into_chunks(flat_german)

In [0]:
# Get Spacy Embedding
def get_spacy_embeddings(en_filepath, de_filepath):
  """Return (english_embs, german_embs): spaCy sentence embeddings for both files."""
  en_args = (en_filepath, embedding.nlp_en, embedding.stop_words_en)
  english_embs = preprocessing.get_spacy_embeddings(*en_args)

  de_args = (de_filepath, embedding.nlp_de, embedding.stop_words_de)
  german_embs = preprocessing.get_spacy_embeddings(*de_args)

  return english_embs, german_embs

In [0]:
# spaCy sentence embeddings for the training source (English) and MT output (German).
english_embs, german_embs = get_spacy_embeddings("./train.ende.src", "./train.ende.mt")

In [0]:
# Cache both embedding lists as pickles (note: binary content despite the .txt suffix).
for cache_path, cached_embs in (('english_embs.txt', english_embs),
                                ('german_embs.txt', german_embs)):
  with open(cache_path, 'wb') as pickle_file:
    pickle.dump(cached_embs, pickle_file)

In [0]:
def get_combined_embeddings(english_emb, german_emb):
  """Return one flat list: every value of `english_emb` followed by every value of `german_emb`."""
  return [*english_emb, *german_emb]

In [151]:
# One sample per sentence pair: the English embedding followed by the German one.
samples = [get_combined_embeddings(english_embs[i], german_embs[i])
           for i in tqdm.tqdm(range(len(english_embs)))]

100%|██████████| 7000/7000 [00:00<00:00, 11880.87it/s]


In [0]:
# Cache the embedding-based samples as a pickle.
# NOTE: section 3B writes its feature-based samples to this same file name, so
# whichever cell runs last determines what 'train_features.txt' contains.
with open('train_features.txt', 'wb') as pickle_file:
  pickle.dump(samples, pickle_file)

In [0]:
# Embedding route: no feature selection, use the combined embeddings as they are.
samples_best_features = samples

#**3B. Features**
This section extracts features from the input sentences and feeds them as input to the different models in later sections.

In [0]:
def get_all_features(line_en, line_de, cosine_sim_with_google_translations):
  """Return the flat feature list for one English/German sentence pair.

  Order: English POS counts, German POS counts, English entity counts, German
  entity counts, then the scalar features, ending with the supplied cosine
  similarity value.
  """
  extract = FeatureExtraction
  nlp_en, nlp_de = embedding.nlp_en, embedding.nlp_de
  features = []

  english_pos_counts = extract.get_pos_tag_counts(line_en, 'en', nlp_en)
  german_pos_counts = extract.get_pos_tag_counts(line_de, 'de', nlp_de)
  features.extend(english_pos_counts.values())
  features.extend(german_pos_counts.values())

  english_named_entities = extract.get_named_entities(line_en, 'en', nlp_en)
  german_named_entities = extract.get_named_entities(line_de, 'de', nlp_de)
  features.extend(english_named_entities.values())
  features.extend(german_named_entities.values())

  # Scalar features, in the same order as before.
  features.append(extract.get_kiwi_scores(line_en, line_de))
  features.append(extract.get_num_of_tokens(line_en, 'en', nlp_en))
  features.append(extract.get_num_of_tokens(line_de, 'de', nlp_de))
  features.append(extract.get_avg_source_token_length(line_en, nlp_en))
  for count_feature in (extract.get_num_of_punctuations,
                        extract.get_num_of_numberic_tokens,
                        extract.get_num_of_alpha_tokens):
    features.append(count_feature(line_en, nlp_en))
    features.append(count_feature(line_de, nlp_de))
  features.append(extract.get_ratio_of_target_source_lengths(line_en, line_de, nlp_en, nlp_de))
  features.append(extract.get_percentage_of_numbers_in_target(line_de, nlp_de))
  features.append(cosine_sim_with_google_translations)

  return features

In [60]:
# Get all the features for the English and German training sentences

# Context managers close both handles; the original left them open.
with open('./train.ende.src') as src_file, open('./train.ende.mt') as mt_file:
  lines_en = src_file.readlines()
  lines_de = mt_file.readlines()
assert len(lines_en) == len(lines_de), "source and MT files must have the same number of lines"

train_cosine_sim_with_google_translations = FeatureExtraction.get_cosine_sim_with_google_translations('./train_google_translate_api.txt')
samples = []
for i in tqdm.tqdm(range(len(lines_en))):
  samples.append(get_all_features(lines_en[i], lines_de[i], train_cosine_sim_with_google_translations[i]))

100%|██████████| 7000/7000 [42:41<00:00,  2.69it/s]


In [0]:
# Cache the extracted training features as a pickle so the slow extraction loop
# above does not have to be repeated; section 4 reloads this file.
with open('train_features.txt', 'wb') as pickle_file:
  pickle.dump(samples, pickle_file)

## **4. Feature Selection**

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from itertools import compress
from scipy.stats import pearsonr

import pickle
import numpy as np
from itertools import compress
from scipy.stats import pearsonr
import sklearn

In [0]:
# LOADING TRAIN SCORES
# Parse the scores to floats once, here, so no model is handed raw text lines
# (the original kept strings such as "0.12\n" and never closed the file).
with open("./train.ende.scores", 'r') as train_scores_file:
  train_scores = [float(line) for line in train_scores_file]

# Reload the cached training features (a pickle written by this notebook).
with open('./train_features.txt', 'rb') as file:
  samples = pickle.load(file)

#**4A. Variance Threshold**

In [64]:
# Drop every feature whose variance over the training samples is at most 0.5.
sel = VarianceThreshold(threshold=0.5)
X_train = sel.fit_transform(samples)
features_selection_boolean_vector = sel.get_support()

print("The number of features left is", len(X_train[0]))
print("The feature extraction matrix is", features_selection_boolean_vector)

# Get the best features from the whole set of features
samples_best_features = [list(compress(sample, features_selection_boolean_vector))
                         for sample in samples]

The number of features left is 29
The feature extraction matrix is [ True  True  True False False  True  True False  True  True False False
 False  True False False  True False False  True  True  True False  True
 False  True False  True  True False  True  True  True False False  True
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False  True  True False
  True  True  True  True  True  True False False False]


#**4B. RFE (Recursive Feature Elimination)**

In [96]:
# The model for RFE:
model = BayesianRidge(n_iter=200, tol=0.001, alpha_1=1e-06, 
                      alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06)
# model = LinearRegression(fit_intercept=False)

# Initializing RFE model with the number of features to get from RFE.
# Passed by keyword: newer scikit-learn releases reject it as a positional argument.
number_of_features = 27
rfe = RFE(model, n_features_to_select=number_of_features)

# Fit RFE. Only the support mask is needed, so the transformed matrix is not
# computed. The targets are cast to float so raw text lines are never fitted.
rfe.fit(samples, np.array(train_scores).astype(float))
features_selection_boolean_vector = rfe.support_

# Get the best features from the whole set of features
samples_best_features = [list(compress(sample, features_selection_boolean_vector))
                         for sample in samples]

print("The number of features left is", number_of_features)
print("The feature extraction matrix is", features_selection_boolean_vector)

The number of features left is 27
The feature extraction matrix is [False False False False False False  True  True False False  True False
 False False False False False  True False False False False  True  True
 False False False  True  True  True False False False False False False
  True False  True  True False False False False False False False  True
  True  True False  True False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False  True  True
  True  True  True  True  True  True False False  True]


#**4C. No feature selection**

In [0]:
# Keep every feature: an all-True mask, so the later validation/test cells can
# apply the same `compress` step unchanged.
features_selection_boolean_vector = [True] * (len(samples[0]))

samples_best_features = [list(compress(sample, features_selection_boolean_vector))
                         for sample in samples]

#**5. Load Validation Data**

In [68]:
# Run if using features, get all the features for the English and German validation sentences
# IMPORTANT: Skip this cell if val_samples has been saved to a text file before
# Context managers close both handles; the original left them open.
with open('./dev.ende.src') as val_src_file, open('./dev.ende.mt') as val_mt_file:
  val_lines_en = val_src_file.readlines()
  val_lines_de = val_mt_file.readlines()
assert len(val_lines_en) == len(val_lines_de), "source and MT files must have the same number of lines"

val_cosine_sim_with_google_translations = FeatureExtraction.get_cosine_sim_with_google_translations('./val_google_translate_api.txt')
val_samples = []
for i in tqdm.tqdm(range(len(val_lines_en))):
    val_samples.append(get_all_features(val_lines_en[i], val_lines_de[i], val_cosine_sim_with_google_translations[i]))

with open('val_features.txt', 'wb') as pickle_file:
  pickle.dump(val_samples, pickle_file)

100%|██████████| 1000/1000 [06:03<00:00,  2.72it/s]


In [0]:
# Run if using features, load features of validation sentences from a text file
# (a pickle written by this notebook). The context manager closes the handle
# even if unpickling fails.
with open('./val_features.txt', 'rb') as file:
  val_samples = pickle.load(file)

In [0]:
# Run if using features: apply the mask chosen in section 4 to every validation sample.
val_samples_best_features = [list(compress(val_sample, features_selection_boolean_vector))
                             for val_sample in val_samples]

In [0]:
# Run if using embeddings, get the embeddings of validation sentences
val_english_embs, val_german_embs = get_spacy_embeddings("./dev.ende.src", "./dev.ende.mt")

In [147]:
# Run if using embeddings: one validation sample per sentence pair, English
# embedding followed by German embedding.
val_samples_best_features = [
    get_combined_embeddings(val_english_embs[i], val_german_embs[i])
    for i in tqdm.tqdm(range(len(val_english_embs)))
]

100%|██████████| 1000/1000 [00:00<00:00, 11801.58it/s]


In [0]:
# Read the validation scores
# Parsed to floats up front and the handle is closed; the later
# `np.array(val_scores).astype(float)` calls keep working on floats.
with open("./dev.ende.scores", 'r') as val_scores_file:
  val_scores = [float(line) for line in val_scores_file]

#**6A. Training on Linear Regression Model and Evaluation on Validation**

In [0]:
# Fit the input model with x_train and y_train, and then predict with x_test and
# compute the pearson_correlation, mae and mse with the y_test
def get_metrics(x_train, y_train, x_test, y_test, model):
  """Fit `model` on the training data and score its predictions on the test data.

  Args:
    x_train, x_test: feature matrices accepted by `model.fit` / `model.predict`.
    y_train, y_test: target scores; any sequence of numbers or numeric strings.
    model: an estimator exposing `fit(x, y)` and `predict(x)`.

  Returns:
    (pearson_cor, mae, mse) between the flattened predictions and `y_test`.
  """
  # Coerce targets to float arrays so lists and raw text lines both work.
  y_train = np.asarray(y_train).astype(float)
  y_test = np.asarray(y_test).astype(float).ravel()

  model.fit(x_train, y_train)
  # Flatten so (n, 1)-shaped predictions line up with the 1-D targets.
  y_hat = np.asarray(model.predict(x_test)).flatten()

  errors = y_hat - y_test
  mse = np.mean(errors ** 2)
  mae = np.mean(np.abs(errors))
  pearson_cor, p_value = pearsonr(y_hat, y_test)

  return pearson_cor, mae, mse

In [154]:
# The estimators that will be executed
# Start below any valid correlation so a best model is still chosen when every
# correlation is zero or negative (starting at 0 left best_linear_model as None).
highest_pearson = float('-inf')
best_linear_model = None
estimators = [('Support Vector Machine Regressor', SVR(C=10, epsilon=0.01, gamma=0.0001)),
              ('Bayesian Ridge', BayesianRidge(n_iter=200, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06)),
              ('Random Forest Regressor', RandomForestRegressor()),
              ('Pylonomial Features with Linear Regression', make_pipeline(PolynomialFeatures(1), LinearRegression(fit_intercept=False))),
              ('Ridge Regressor', sklearn.linear_model.Ridge()),
              ('Orthogonal Matching', sklearn.linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=1)),
              ('Ridge', sklearn.linear_model.Ridge(alpha=.5))]

# Convert once, outside the loop; targets are cast to float so no estimator is
# fitted on raw text lines.
x_train = np.array(samples_best_features)
y_train = np.array(train_scores).astype(float)
x_val = np.array(val_samples_best_features)
y_val = np.array(val_scores).astype(float)

for name, model in estimators:
  pearson_cor, mae, mse = get_metrics(x_train, y_train, x_val, y_val, model)
  print('[%s] MSE: %.4f MAE %.4f with peason correlation of %0.4f' % (name, mse, mae, pearson_cor))
  if pearson_cor > highest_pearson:
    highest_pearson = pearson_cor
    best_linear_model = model

[Support Vector Machine Regressor] MSE: 0.7662 MAE 0.4876 with peason correlation of 0.1446
[Bayesian Ridge] MSE: 0.7311 MAE 0.5219 with peason correlation of 0.1572


KeyboardInterrupt: ignored

In [0]:
# Use the best estimator from 6A for the test predictions. Running the matching
# cell in 6B afterwards rebinds `clf` to the neural network instead.
clf = best_linear_model

#**6B. Training on Neural Networks and Evaluation on Validation**

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from keras import optimizers
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras import layers
from numpy.random import seed

import tensorflow as tf
from scipy.stats import pearsonr

In [157]:
# Candidate widths for the four hidden layers; every combination is trained.
layers1 = [16]
layers2 = [8]
layers3 = [4]
layers4 = [8]
# Start below any valid correlation so a model is kept even when all are <= 0.
highest_pearson = float('-inf')
best_model = None

es=EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Convert once; targets are cast to float so Keras never receives raw text lines.
x_train = np.array(samples_best_features)
y_train = np.array(train_scores).astype(float)
x_val_all = np.array(val_samples_best_features)
y_val_all = np.array(val_scores).astype(float)

# First 500 validation rows drive early stopping; the remainder is held out for scoring.
x_val_stop, y_val_stop = x_val_all[:500], y_val_all[:500]
x_val_eval, y_test = x_val_all[500:], y_val_all[500:]

# Derive the input width from the data: the hard-coded 27 only matched the RFE
# selection and broke the other feature-selection and embedding routes.
input_dim = x_train.shape[1]

for layer1 in layers1:
  for layer2 in layers2:
    for layer3 in layers3:
      for layer4 in layers4:
        model = Sequential()
        model.add(Dense(layer1, input_dim=input_dim, activation='relu'))
        # model.add(Dropout(0.2))
        model.add(Dense(layer2, activation='relu'))
        # model.add(Dropout(0.2))
        model.add(Dense(layer3, activation='relu'))
        model.add(Dense(layer4, activation='relu'))
        model.add(Dense(1))
        
        # optimizer = optimizers.RMSprop(0.001)
        
        model.compile(loss='mse',
                      optimizer='Adam',
                      metrics=['mae', 'mse'])
        
        model.fit(x_train, 
                  y_train,
                  epochs=50,
                  validation_data=(x_val_stop, y_val_stop),
                  callbacks=[es], 
                  verbose=1)

        predictions_validation = model.predict(x_val_eval)
        y_hat = np.array(predictions_validation.flatten())

        peason_cor, p_value = pearsonr(y_hat, y_test)
        mse = sklearn.metrics.mean_squared_error(y_hat, y_test)
        mae = sklearn.metrics.mean_absolute_error(y_hat, y_test)
        if peason_cor > highest_pearson:
          highest_pearson = peason_cor
          best_model = model
        print("The models are: [%f, %f, %f, %f]" % (layer1, layer2, layer3, layer4))
        print("Pearson Correlation: %f, MSE: %f, MAE: %f" % (peason_cor, mse, mae))

Train on 7000 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 00004: early stopping
The models are: [16.000000, 8.000000, 4.000000, 8.000000]
Pearson Correlation: 0.088894, MSE: 0.596441, MAE: 0.452051


In [0]:
# Use the best neural network for the test predictions (rebinds `clf`, replacing
# the linear model chosen in 6A if that cell was run).
clf = best_model

#**7. Load test dataset and predict test dataset**

In [89]:
# Get all the features for the English and German test sentences
# IMPORTANT: Skip this cell if test_samples has been saved to a text file before
# NOTE: this rebinds lines_en / lines_de, which held the training sentences.
# Context managers close both handles; the original left them open.
with open('./test.ende.src') as src_file, open('./test.ende.mt') as mt_file:
  lines_en = src_file.readlines()
  lines_de = mt_file.readlines()
assert len(lines_en) == len(lines_de), "source and MT files must have the same number of lines"

test_cosine_sim_with_google_translations = FeatureExtraction.get_cosine_sim_with_google_translations('./test_google_translate_api.txt')

test_samples = []
for i in tqdm.tqdm(range(len(lines_en))):
  test_samples.append(get_all_features(lines_en[i], lines_de[i], test_cosine_sim_with_google_translations[i]))

with open('test_features.txt', 'wb') as pickle_file:
  pickle.dump(test_samples, pickle_file)

  1%|          | 8/1000 [00:02<05:45,  2.87it/s]


KeyboardInterrupt: ignored

In [0]:
# Load features of test sentences from a text file (a pickle written by this
# notebook). The context manager closes the handle even if unpickling fails.
with open('./test_features.txt', 'rb') as file:
  test_samples = pickle.load(file)

In [0]:
# Apply the feature mask chosen in section 4 to every test sample.
test_samples_best_features = [list(compress(test_sample, features_selection_boolean_vector))
                              for test_sample in test_samples]

In [130]:
# Predict test scores with whichever model `clf` was last bound to (6A or 6B).
test_best_features_predictions = clf.predict(np.array(test_samples_best_features))

<keras.engine.sequential.Sequential object at 0x7f307cf389b0>


In [0]:
def write_test_predications(test_predictions):
  """Write one prediction per line to 'predictions.txt'.

  Accepts both prediction shapes produced in this notebook: (n, 1)-shaped
  arrays, where the first value of each row is written, and flat 1-D arrays,
  which previously failed on `num[0]`.
  """
  predictions = np.asarray(test_predictions)
  if predictions.ndim > 1:
    # Take the first entry along every trailing axis, i.e. row[0] for 2-D input.
    predictions = predictions[(slice(None),) + (0,) * (predictions.ndim - 1)]

  # The context manager closes the file even if a write fails.
  with open("predictions.txt", "w") as f:
    for num in predictions:
      f.write(f"{num}\n")

In [0]:
# Write the test predictions to predictions.txt, one value per line.
write_test_predications(test_best_features_predictions)

In [0]:
from zipfile import ZipFile

# Package predictions.txt into predictions.txt.zip. The context manager
# finalises and closes the archive even if adding the file fails.
with ZipFile('predictions.txt.zip', 'w') as zipObj:
  zipObj.write('predictions.txt')