This notebook takes approximately 30 minutes to run on a regular machine (MacBook Pro 2018). Its result is also available in the folder data/datasets and can be loaded directly with pd.read_pickle('data/datasets/dataset_with_features.pickle').

# Imports

In [1]:
import os
import io
import pickle
import multiprocessing
from multiprocessing import Pool
from itertools import repeat

import re
import spacy
import en_core_web_md
import de_core_news_md

import numpy as np
import pandas as pd

## Helper functions

In [2]:
def spacy_analysis(sentence,nlp):
    '''
    Debug helper: runs the Spacy pipeline `nlp` on `sentence` and prints, one line per token, its
    text, lemma, POS, tag, dependency label, shape, is_alpha flag, is_stop flag and vector shape
    '''
    for tok in nlp(sentence):
        attributes = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
                      tok.shape_, tok.is_alpha, tok.is_stop, tok.vector.shape)
        print(*attributes)

def entity_analysis(sentence,nlp):
    '''
    Debug helper: prints the sentence followed by an "Analysis" header, then one line per entity
    found by the Spacy pipeline `nlp` (text, start offset, end offset, label)
    '''
    header = sentence+"\n"+"\n"+"Analysis"+"\n"+"--------"
    print(header)
    for entity in nlp(sentence).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

def extract_sentences(filename,lower=False,encoding="utf-8"):
    '''
    Reads a text file and returns its non-blank lines, stripped of surrounding whitespace.
    filename: path of the file to read
    lower: if True every returned line is lower-cased
    encoding: text encoding used to decode the file (UTF-8 by default so that German characters
              are decoded the same way on every platform)
    Returns: list of strings, one per non-blank line, in file order

    NOTE(review): blank lines are dropped, so a file containing an empty sentence yields fewer
    entries than it has lines; callers pairing files line by line should confirm the files
    contain no blank lines.
    '''
    # The context manager guarantees the file handle is closed, even if decoding fails
    with open(filename, encoding=encoding) as handle:
        lines = [line for line in handle if line.strip()]
    if lower:
        return [line.lower().strip() for line in lines]
    return [line.strip() for line in lines]

def remove_unnecessary_spaces(sentence):
    '''
    Trims the sentence and collapses every run of consecutive space characters into one space
    (only the space character is collapsed; tabs and newlines inside the sentence are kept)
    '''
    trimmed = sentence.strip()
    space_run = re.compile(' +')
    return space_run.sub(' ', trimmed)

def remove_word_from_sentence(sentence,word):
    '''
    Removes every standalone occurrence of `word` from `sentence` and normalizes the spacing
    (surrounding whitespace trimmed, runs of spaces collapsed to one).
    Only whole, whitespace-delimited occurrences are removed: removing "in" from
    "the king is in india" gives "the king is india" and leaves "king" and "india" untouched
    (a plain str.replace would also delete the letters inside those words).
    An empty `word` leaves the words of the sentence unchanged.
    Returns: the cleaned sentence
    '''
    if word:
        # (?<!\S) and (?!\S): the match must be bounded by whitespace or by the sentence edges
        standalone_word = r'(?<!\S)' + re.escape(word) + r'(?!\S)'
        sentence = re.sub(standalone_word, ' ', sentence)
    # Same normalization as remove_unnecessary_spaces: trim, then collapse runs of spaces
    return re.sub(' +', ' ', sentence.strip())

def remove_words_from_list(list_1,list_2,isin=False):
    '''
    Filters list_1 according to membership in list_2.
    isin=False (default): returns the elements of list_1 which are NOT in list_2
    isin=True: returns the elements of list_1 which ARE in list_2
    The order (and duplicates) of list_1 are preserved.
    '''
    lookup = list_2
    if isinstance(list_2, (list, tuple)):
        # Build the hash set once: each membership test becomes O(1) instead of a scan of
        # list_2, which matters when list_2 is a large word list
        try:
            lookup = set(list_2)
        except TypeError:
            # unhashable elements: keep the original sequence and its linear lookups
            lookup = list_2
    if isin:
        return [word for word in list_1 if word in lookup]
    return [word for word in list_1 if word not in lookup]

def remove_multiple_words_from_sentence(sentence,words,isin=False):
    '''
    Filters the whitespace-separated words of `sentence` according to membership in `words`.
    isin=False (default): drops the words of the sentence which are in `words`
    isin=True: keeps only the words of the sentence which are in `words`
    Returns: the remaining words joined by single spaces
    '''
    lookup = words
    if isinstance(words, (list, tuple)):
        # One set construction replaces a scan of `words` for every word of the sentence
        try:
            lookup = set(words)
        except TypeError:
            # unhashable elements: keep the original sequence and its linear lookups
            lookup = words
    keep_members = bool(isin)
    kept = [word for word in sentence.split() if (word in lookup) == keep_members]
    return " ".join(kept)

# Loading

### Loading Spacy

In [3]:
# Spacy pipelines used throughout the notebook: the medium English model for the source
# sentences and the medium German model for the translations
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

### Loading Training Data

In [4]:
# Training split files (source sentences, machine translations, scores), located in
# <parent of the working directory>/data/en-de
path_train_en, path_train_ge, path_train_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', filename)
    for filename in ('train.ende.src', 'train.ende.mt', 'train.ende.scores')
)

In [5]:
# One-column frames for the training split: English sources, German translations, scores
train_sentences_en = pd.DataFrame(data=extract_sentences(path_train_en),
                                  columns=['sentences_en'])
train_sentences_ge = pd.DataFrame(data=extract_sentences(path_train_ge),
                                  columns=['sentences_ge'])
# The scores file has no header row; its single column (0) is renamed to "scores"
train_scores = pd.read_csv(path_train_scores, header=None).rename(columns={0: "scores"})

In [6]:
# Pair each English sentence with its German translation by row position (index-on-index join)
train_dataset = train_sentences_en.merge(train_sentences_ge,
                                         left_index=True,
                                         right_index=True)

### Loading Validation Data

In [7]:
# Validation split files (source sentences, machine translations, scores), located in
# <parent of the working directory>/data/en-de
path_dev_en, path_dev_ge, path_dev_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', filename)
    for filename in ('dev.ende.src', 'dev.ende.mt', 'dev.ende.scores')
)

In [8]:
# One-column frames for the validation split: English sources, German translations, scores
dev_sentences_en = pd.DataFrame(data=extract_sentences(path_dev_en),
                                columns=['sentences_en'])
dev_sentences_ge = pd.DataFrame(data=extract_sentences(path_dev_ge),
                                columns=['sentences_ge'])
# The scores file has no header row; its single column (0) is renamed to "scores"
dev_scores = pd.read_csv(path_dev_scores, header=None).rename(columns={0: "scores"})

In [9]:
# Pair each English sentence with its German translation by row position (index-on-index join)
dev_dataset = dev_sentences_en.merge(dev_sentences_ge,
                                     left_index=True,
                                     right_index=True)

### Loading Test Data

In [10]:
# Test split files, located in <parent of the working directory>/data/en-de
# (the scores path is only defined here; this notebook never reads it)
path_test_en, path_test_ge, path_test_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', filename)
    for filename in ('test.ende.src', 'test.ende.mt', 'test.ende.scores')
)

In [11]:
# One-column frames for the test split: English sources and German translations
test_sentences_en = pd.DataFrame(data=extract_sentences(path_test_en),
                                 columns=['sentences_en'])
test_sentences_ge = pd.DataFrame(data=extract_sentences(path_test_ge),
                                 columns=['sentences_ge'])

In [12]:
# Pair each English sentence with its German translation by row position (index-on-index join)
test_dataset = test_sentences_en.merge(test_sentences_ge,
                                       left_index=True,
                                       right_index=True)

### Merging the three

In [13]:
# Stack the three splits in the order train, validation, test and renumber the rows from 0
dataset = pd.concat([train_dataset, dev_dataset, test_dataset]).reset_index(drop=True)

### Loading MUSE embeddings

In [106]:
def load_vec(emb_path, nmax=50000):
    '''
    Loading function to load MUSE embeddings from a text file.
    The first line of the file is skipped (header); every following line is read as a word
    followed by its space-separated vector components. Reading stops after nmax words.
    Returns: embeddings (one row per word), id2word (row index -> word), word2id (word -> row index)
    '''
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as emb_file:
        next(emb_file)  # skip the header line
        for raw_line in emb_file:
            word, components = raw_line.rstrip().split(' ', 1)
            vector = np.fromstring(components, sep=' ')
            assert word not in word2id, 'word found twice'
            rows.append(vector)
            # ids are assigned in reading order, so they match the row index in `rows`
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = dict((idx, word) for word, idx in word2id.items())
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

# Folder containing the MUSE vector files: <parent of the working directory>/data/muse
path = os.path.join(os.path.dirname(os.getcwd()), 'data','muse')

src_path = path+"/wiki.multi.en.vec"
tgt_path = path+"/wiki.multi.de.vec"
nmax = 300000  # maximum number of word embeddings to load

# Here src_embeddings corresponds to the English language and tgt_embeddings to the German Language
# For each language load_vec returns the embedding matrix, the id -> word map and the word -> id map
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

### Loading nltk utils

In [15]:
import nltk
nltk.download("popular")

from nltk.corpus import wordnet
from nltk.stem import PorterStemmer #stem for english
from nltk.stem.cistem import Cistem
from nltk.corpus import stopwords

# Stemmers handed to create_vocab further down (Porter for English, Cistem for German)
porter = PorterStemmer()
cistem = Cistem()
# NLTK stop-word lists of both languages, stored as sets
stopwords_en = set(stopwords.words('english'))
stopwords_ge = set(stopwords.words('german'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/marcdelaferriere/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/marcdelaferriere/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/marcdelaferriere/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/marcdelaferriere/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/marcdelaferriere/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/marcdelaferriere/nltk

### Sentence length

In [16]:
def sentence_length(sentence):
    '''
    Helper function to compute the length of a sentence, i.e. its number of
    whitespace-separated words.
    An empty (or whitespace-only) sentence has length 0, and repeated spaces between two words
    do not add to the count.
    '''
    # split() without argument ignores leading/trailing/repeated whitespace, whereas
    # split(" ") would report one token for "" and extra empty tokens for double spaces
    return len(sentence.split())

In [17]:
def difference_length(a, b):
    '''
    Helper function to compute the absolute difference between the number of
    whitespace-separated words of two sentences.
    An empty string counts as 0 words, so comparing "" with a one-word string gives 1
    (with split(" ") both would count as 1 word and the difference would wrongly be 0).
    '''
    words_in_a = len(a.split())
    words_in_b = len(b.split())
    return abs(words_in_a - words_in_b)

In [18]:
# Computing the difference in sentence length between the English and the German sentence.
# The difference is signed (English length minus German length): it is negative when the
# German sentence is the longer one.

dataset['english_sentence_length'] = dataset['sentences_en'].apply(sentence_length)
dataset['german_sentence_length'] = dataset['sentences_ge'].apply(sentence_length)
dataset['sentence_length_difference'] = dataset['english_sentence_length'] - dataset['german_sentence_length']

### POS extraction

In [19]:
def extract_pos(pos,sentence,nlp):
    '''
    Helper function to find the words of the sentence whose Spacy coarse part-of-speech
    (token.pos_) equals `pos`
    nlp: corresponds to the Spacy nlp corresponding to the language of the sentence
    Returns: the matching words, in sentence order, joined by single spaces ("" if none)
    '''
    matching_words = [token.text for token in nlp(sentence) if token.pos_ == pos]
    return " ".join(matching_words).strip()

In [20]:
# Extracting verbs and counting their difference in number between the two sentences.
# Each *_verbs column holds the verbs returned by extract_pos as one space-separated string;
# verbs_diff is the value returned by difference_length for the two strings.
# NOTE(review): extract_pos runs the full Spacy pipeline on the sentence at every call.

dataset["german_verbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("VERB",x,nlp_ge))
dataset["english_verbs"] = dataset.sentences_en.apply(lambda x:extract_pos("VERB",x,nlp_en))
dataset['verbs_diff'] = dataset.apply(lambda row: difference_length(row['english_verbs'], row['german_verbs']),axis=1)

In [21]:
# Extracting adjectives and counting their difference in number between the two sentences.
# Each *_adjectives column holds the adjectives returned by extract_pos as one space-separated
# string; adjectives_diff is the value returned by difference_length for the two strings.

dataset["german_adjectives"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADJ",x,nlp_ge))
dataset["english_adjectives"] = dataset.sentences_en.apply(lambda x:extract_pos("ADJ",x,nlp_en))
dataset['adjectives_diff'] = dataset.apply(lambda row: difference_length(row['english_adjectives'], row['german_adjectives']),axis=1)

In [22]:
# Extracting adverbs and counting their difference in number between the two sentences.
# Each *_adverbs column holds the adverbs returned by extract_pos as one space-separated
# string; adverbs_diff is the value returned by difference_length for the two strings.

dataset["german_adverbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADV",x,nlp_ge))
dataset["english_adverbs"] = dataset.sentences_en.apply(lambda x:extract_pos("ADV",x,nlp_en))
dataset['adverbs_diff'] = dataset.apply(lambda row: difference_length(row['english_adverbs'], row['german_adverbs']),axis=1)

In [23]:
# Extracting nouns and counting their difference in number between the two sentences.
# Each *_nouns column holds the nouns returned by extract_pos as one space-separated string;
# nouns_diff is the value returned by difference_length for the two strings.

dataset["german_nouns"] = dataset.sentences_ge.apply(lambda x:extract_pos("NOUN",x,nlp_ge))
dataset["english_nouns"] = dataset.sentences_en.apply(lambda x:extract_pos("NOUN",x,nlp_en))
dataset['nouns_diff'] = dataset.apply(lambda row: difference_length(row['english_nouns'], row['german_nouns']),axis=1)

### Lemmatizer

In [24]:
def lemmatizer(sentence, nlp):
    '''
    Helper function to lemmatize a sentence using Spacy
    nlp: corresponds to the Spacy nlp corresponding to the language of the sentence
    Returns: the lemmas of all the tokens, in order, joined by single spaces
    '''
    lemmas = [token.lemma_ for token in nlp(sentence)]
    return " ".join(lemmas).strip()

In [25]:
# Lemmatizing sentences: each *_lemma column holds the space-joined lemmas of the original
# sentence, computed with the Spacy pipeline of the matching language

dataset['english_lemma'] = dataset['sentences_en'].apply (lambda x: lemmatizer(x, nlp_en))
dataset['german_lemma'] = dataset['sentences_ge'].apply (lambda x: lemmatizer(x, nlp_ge))

### Sentiment Analysis

In [26]:
from textblob import TextBlob as textblob_en
from textblob_de import TextBlobDE as textblob_ge

In [27]:
def sentiment(sentence, textblob):
    '''
    Function to compute the sentiment of a sentence using textblob
    textblob: the blob class to instantiate on the sentence (one per language)
    Returns: the polarity attribute of the sentiment computed by the blob
    '''
    return textblob(sentence).sentiment.polarity

In [28]:
# Computing the values coming out of the sentiment function for each sentence
# (English sentences go through TextBlob, German sentences through TextBlobDE)

dataset['english_sentence_sentiment'] = dataset['sentences_en'].apply(lambda x: sentiment(x, textblob_en))
dataset['german_sentence_sentiment'] = dataset['sentences_ge'].apply(lambda x: sentiment(x, textblob_ge))

In [29]:
# Computing the values coming out of the sentiment function for each lemmatized sentence
# (same blob classes as for the original sentences, applied to the *_lemma columns)

dataset['english_sentence_lemma_sentiment'] = dataset['english_lemma'].apply(lambda x: sentiment(x, textblob_en))
dataset['german_sentence_lemma_sentiment'] = dataset['german_lemma'].apply(lambda x: sentiment(x, textblob_ge))

In [30]:
def stdz(data):
    '''
    Function to standardize a given dataset: subtracts data.mean() and divides the result
    by data.std()
    '''
    center = data.mean()
    spread = data.std()
    return (data - center)/spread

In [31]:
# Standardize the sentiment values of the original sentences. The mean and standard deviation
# are computed over the whole merged dataset (train, validation and test rows together).

dataset['std_english_sentence_sentiment']= stdz(dataset['english_sentence_sentiment'])
dataset['std_german_sentence_sentiment']= stdz(dataset['german_sentence_sentiment'])

In [33]:
# For each language, keep whichever of the two sentiment values (lemmatized sentence vs.
# original sentence) has the strictly larger absolute value. The signed value is stored, and
# the original sentence's value is kept when the lemmatized one is not strictly larger.

dataset['max_sentiment_english'] = dataset["english_sentence_lemma_sentiment"].where(
    dataset["english_sentence_lemma_sentiment"].abs() > dataset["english_sentence_sentiment"].abs(),
    dataset["english_sentence_sentiment"],
)

dataset['max_sentiment_german'] = dataset["german_sentence_lemma_sentiment"].where(
    dataset["german_sentence_lemma_sentiment"].abs() > dataset["german_sentence_sentiment"].abs(),
    dataset["german_sentence_sentiment"],
)

In [35]:
# Standardize the max-absolute sentiment values. The mean and standard deviation are computed
# over the whole merged dataset (train, validation and test rows together).

dataset['std_max_english_sentiment']= stdz(dataset['max_sentiment_english'])
dataset['std_max_german_sentiment']= stdz(dataset['max_sentiment_german'])

## Constructing the full vocabulary

### Removing entities

In [36]:
def remove_punct(sentence, nlp):
    '''
    Function to remove the punctuation of a sentence given its Spacy nlp.
    Only the tokens flagged is_alpha by Spacy are kept (so numbers are dropped as well) and
    they are lower-cased.
    Returns: the kept tokens joined by single spaces
    '''
    alpha_tokens = [token.text.lower() for token in nlp(sentence) if token.is_alpha]
    return " ".join(alpha_tokens).strip()

def remove_entities(sentence,entity_list):
    '''
    Function to remove the entities from a sentence given an entity list.
    Every occurrence of each entity string is deleted, one entity after the other; the
    spaces which surrounded a removed entity are left in place.
    '''
    remaining = sentence
    for entity in entity_list:
        remaining = "".join(remaining.split(entity)) if entity else remaining
    return remaining

In [37]:
def get_entities(sentence_en,sentence_ge):
    '''
    Helper function to extract Entities using Spacy
    '''
    doc = nlp_en(sentence_en)
    persons_list = []
    for ent in doc.ents:
        # the entities extracted are part of these general categories (organizations,persons,county...etc)
        if (ent.label_=="PERSON" or ent.label_=="ORG" or 
            ent.label_=="LOC" or ent.label_=="GPE" 
            or ent.label == "FAC" or ent.label == "NORP"):
            if ent.text in sentence_ge:
                persons_list += [ent.text]
            else:
                ent_text_clean = ent.text.replace("the","").replace("The","").strip()
                if ent_text_clean in sentence_ge:
                    persons_list += [ent_text_clean]
    return persons_list

In [38]:
# Finding the entities: for each row, the list of entity strings detected in the English
# sentence that also appear in the German sentence (possibly an empty list)

dataset['entities']=dataset.apply(lambda row: get_entities(row["sentences_en"],
                                                         row["sentences_ge"]),axis=1)

In [39]:
# Removing entities from sentences. The same entity list is removed from both languages:
# get_entities only keeps strings that occur in the English and in the German sentence.

dataset['sentences_en_no_entities'] = dataset.apply(lambda row: remove_entities(row['sentences_en'], row["entities"]), axis=1)
dataset['sentences_ge_no_entities'] = dataset.apply(lambda row: remove_entities(row['sentences_ge'], row["entities"]), axis=1)

In [40]:
# Removing punctuation from sentences: the *_clean columns contain only the lower-cased
# alphabetic tokens of the entity-free sentences, joined by single spaces

dataset['sentences_en_clean'] = dataset.apply(lambda row: remove_punct(row['sentences_en_no_entities'], nlp_en), axis=1)
dataset['sentences_ge_clean'] = dataset.apply(lambda row: remove_punct(row['sentences_ge_no_entities'], nlp_ge), axis=1)

In [41]:
# Display 3 randomly drawn rows among those whose entity list is not empty
# (the sample is random, so the printed rows change from one run to the next)
print("Here is a list of some entities removed from sentences"+"\n")
print(dataset[dataset["entities"].apply(lambda x:x!=[])]["entities"].sample(3))

Here is a list of some entities removed from sentences

8721                    [Carly, Jack, Hal Munson]
177     [Henry, David, Emma, Mary Margaret, Neal]
8659                [Poirot, Monsieur Desjardeux]
Name: entities, dtype: object


### Create first version of vocabulary for embedding

In [42]:
def add_words_to_vocab(vocab,sentence,nlp,word_2id,stemmer):
    '''
    Function creating a mapping between the vocabulary and a MUSE embedding in the corresponding
    language. For every token of the sentence which is not in `vocab` yet, the following
    candidates are tried in order and the first one found in `word_2id` is used:
      1. the token text itself
      2. its lemma
      3. one of the first 10 lemma names of its first WordNet synset
      4. its stemmed form
    vocab: dict token text -> embedding id, updated in place
    sentence: sentence to process
    nlp: Spacy nlp corresponding to the language of the sentence
    word_2id: dict word -> embedding id of the MUSE embeddings
    stemmer: object exposing a stem(word) method
    Returns: vocab which is the mapping, out_of_vocab which corresponds to the words of this
    sentence for which none of the candidates was found

    NOTE(review): WordNet is queried with the raw token for every language; confirm that this is
    intended for the German sentences.
    '''
    out_of_vocab = []
    for token in nlp(sentence):
        word = token.text
        if word in vocab:
            continue

        # 1-2) exact form, then lemma
        if word in word_2id:
            vocab[word] = word_2id[word]
            continue
        if token.lemma_ in word_2id:
            vocab[word] = word_2id[token.lemma_]
            continue

        # 3) synonyms: at most the first 10 lemma names of the first synset
        synsets = wordnet.synsets(word)
        candidates = [lemma.name() for lemma in synsets[0].lemmas()[:10]] if synsets else []
        known_synonym = next((name for name in candidates if name in word_2id), None)
        if known_synonym is not None:
            vocab[word] = word_2id[known_synonym]
            continue

        # 4) stem; reached whenever no synonym matched, whatever the number of synonyms, so
        #    every unresolved word ends up either in vocab or in out_of_vocab
        stem = stemmer.stem(word)
        if stem in word_2id:
            vocab[word] = word_2id[stem]
        else:
            out_of_vocab.append(word)
    return vocab,out_of_vocab

In [43]:
def create_vocab(sentences,nlp,word2id,stemmer):
    '''
    Function which iterates the function add_words_to_vocab over all the sentences to compute a
    vocabulary mapping
    Returns: the mapping word -> embedding id and the list of (word, position) pairs of the
    out of vocabulary words, where position is the rank of the sentence in `sentences`
    '''
    mapping = {}
    missing = []

    for position, sentence in enumerate(sentences):
        mapping, missing_here = add_words_to_vocab(mapping, sentence, nlp, word2id, stemmer)
        missing.extend((word, position) for word in missing_here)

    return mapping, missing


In [44]:
# Vocabulary -> MUSE id mapping and out-of-vocabulary (word, sentence position) pairs of each
# language, built from the cleaned sentences (English: Porter stemmer, German: Cistem stemmer)
vocab_2_embedding_idx_en,out_of_vocab_en = create_vocab(dataset.sentences_en_clean,nlp_en,src_word2id,porter)
vocab_2_embedding_idx_ge,out_of_vocab_ge = create_vocab(dataset.sentences_ge_clean,nlp_ge,tgt_word2id,cistem)

### Removing non german words from german vocabulary

In [45]:
def multiprocessing_remove_words_from_list(list_1,list_2):
    '''
    Function using multiprocessing to find the words of list_1 which are not in list_2.
    list_1 is cut in as many consecutive chunks as there are CPUs (the last chunk takes the
    remainder) and each chunk is filtered by remove_words_from_list in a worker process.
    Returns: the words of list_1 absent from list_2, in their original order
    '''
    n_cpu = multiprocessing.cpu_count()
    chunk_size = len(list_1) // n_cpu
    chunks = [list_1[chunk_size*i:chunk_size*(i+1)]
              if i<n_cpu-1
              else list_1[chunk_size*i:]
              for i in range(n_cpu)]

    # The context manager shuts the worker processes down once the (blocking) starmap call
    # has returned, so no pool is left alive after the function exits
    with Pool(n_cpu) as pool:
        filtered_chunks = pool.starmap(remove_words_from_list,
                                       zip(chunks,repeat(list_2)))

    return [word for chunk in filtered_chunks for word in chunk]

In [47]:
# current vocab computed in the corpus
current_german_vocab_corpus = list(vocab_2_embedding_idx_ge.keys())

#large list of true german words 
#source : https://gist.github.com/MarvinJWendt/2f4f4154b8ae218600eb091a5706b5f4#file-wordlist-german-txt
# NOTE(review): this path is relative to the working directory, whereas the corpus and MUSE
# paths above are built from the parent of the working directory - confirm which one is intended
german_words = list(pd.read_csv("data/german_words/wordlist-german.txt",header=None)[0])
# lower-cased to be comparable with the (lower-cased) cleaned sentences
german_words = [str(word).lower() for word in german_words]

# words of the corpus vocabulary which are absent from the German word list
words_to_be_removed = multiprocessing_remove_words_from_list(current_german_vocab_corpus,german_words)

In [48]:
#Example of words removed from german vocabulary
# 10 positions drawn at random (np.random.choice draws with replacement by default, so the
# same word can be printed twice)
idxs = np.random.choice(np.arange(len(words_to_be_removed)),10)

print("Here is a list of some of the words which were embedded in MUSE as vectors in german language"+"\n")
print(np.array(words_to_be_removed)[idxs])

Here is a list of some of the words which were embedded in MUSE as vectors in german language

['colloquium' 'screenplay' 'tibia' 'proprios' 'unitary' 'falling' 'durga'
 'volunteers' 'nuevo' 'quit']


In [49]:
def update_vocabulary(dataset,vocab_2_idx,out_of_vocab,words_to_be_removed):
    '''
    Function which updates the values of out_of_vocab and vocab_2_idx knowing the words which
    must be removed.
    A word of words_to_be_removed is only moved out of the vocabulary when it is found in at
    most 3 cleaned German sentences; one (word, row index) pair is then recorded per sentence.
    vocab_2_idx is modified in place. The sizes before and after the update are printed.
    Returns: the updated vocab_2_idx and the new out of vocabulary list (new pairs first,
    followed by the previous out_of_vocab entries)

    NOTE(review): the test `word in sentence` is a substring test on the sentence string, so
    a word also matches sentences which merely contain it inside a longer word.
    '''
    print("Vocabulary length before:{}".format(len(vocab_2_idx)))
    print("Out of vocabulary length before:{}".format(len(out_of_vocab))+"\n")

    new_out_of_vocab = []
    for word in words_to_be_removed:
        contains_word = dataset.sentences_ge_clean.apply(lambda sent: word in sent)
        hits = list(dataset[contains_word].index)
        if len(hits) > 3:
            # frequent enough in the corpus: the word stays in the vocabulary
            continue
        new_out_of_vocab.extend((word, hit) for hit in hits)

    for removed_word, _ in new_out_of_vocab:
        vocab_2_idx.pop(removed_word, None)

    new_out_of_vocab += out_of_vocab

    print("Vocabulary length after:{}".format(len(vocab_2_idx)))
    print("Out of vocabulary length after:{}".format(len(new_out_of_vocab)))

    return vocab_2_idx,new_out_of_vocab

In [50]:
# Move the rare non-German words out of the German vocabulary; the vocabulary dict is updated
# in place and the out-of-vocabulary list is replaced by the extended one
vocab_2_embedding_idx_ge,out_of_vocab_ge = update_vocabulary(dataset,
                                                             vocab_2_embedding_idx_ge,
                                                             out_of_vocab_ge,
                                                             words_to_be_removed)

Vocabulary length before:19762
Out of vocabulary length before:4962

Vocabulary length after:18007
Out of vocabulary length after:7195


### Process out of vocab german words

In [51]:
def process_translated_out_of_vocab_words(out_of_vocab_german,word2id,dataset,out_of_vocab_english,
                                          muse_word2id,german_words):
    '''
    Function which processes every out of vocab German word, sentence by sentence.
    1. If the word is also found in the English sentence, it is removed from both sentences.
       The counter non_translated_words of the row is incremented only when the word is NOT an
       out of vocab English word (it is a known English word, so it could have been translated).
    2. Otherwise the word is scanned for a cut point, from the longest prefix to the shortest,
       such that both parts are embedded in MUSE and are German words. Depending on which
       parts are longer than 2 characters, the word is replaced in the German sentence by both
       parts, by the prefix only or by the suffix only, and the kept parts are added to word2id.

    out_of_vocab_german: list of (word, row index) pairs
    word2id: corpus vocabulary dict (word -> MUSE id), updated in place
    dataset: DataFrame with the columns sentences_en_clean and sentences_ge_clean; the columns
             non_translated_words, sentences_en_cleaner and sentences_ge_cleaner are (re)created
    out_of_vocab_english: list of (word, row index) pairs (a list of plain words is accepted too)
    muse_word2id: dict word -> id of the German MUSE embeddings
    german_words: collection of valid German words
    Returns: the updated dataset and the new corpus vocabulary
    '''

    dataset["non_translated_words"] = 0
    dataset["sentences_en_cleaner"] = dataset["sentences_en_clean"].copy()
    dataset["sentences_ge_cleaner"] = dataset["sentences_ge_clean"].copy()

    # The English out-of-vocab list holds (word, index) pairs: compare on the words themselves,
    # a raw string is never equal to one of the pairs
    oov_english_words = {entry[0] if isinstance(entry, tuple) else entry
                         for entry in out_of_vocab_english}
    # Hash set: the lexicon is queried for every candidate cut point of every word
    german_lexicon = set(german_words)

    for pair in out_of_vocab_german:
        word,idx = pair[0],pair[1]
        en_sentence = dataset.at[idx,"sentences_en_cleaner"]
        ge_sentence = dataset.at[idx,"sentences_ge_cleaner"]

        ### If the word is exactly the same as a word in the source sentence
        if word in en_sentence:
            ### in every case the word is removed from both sentences
            dataset.at[idx,"sentences_en_cleaner"] = remove_word_from_sentence(en_sentence,word)
            dataset.at[idx,"sentences_ge_cleaner"] = remove_word_from_sentence(ge_sentence,word)
            if word not in oov_english_words:
                ### the word is in the english vocab -> it could have been translated
                ### count it as a non translated word
                dataset.at[idx,"non_translated_words"] = dataset.at[idx,"non_translated_words"]+1
            continue

        ### Test for subwords, from the longest prefix to the shortest one
        for cut in range(len(word)-1,0,-1):
            subword1,subword2 = word[:cut],word[cut:]
            if not (subword1 in muse_word2id and subword2 in muse_word2id
                    and subword1 in german_lexicon and subword2 in german_lexicon):
                continue
            if len(subword1)>2 and len(subword2)>2:
                ### if the word can be splitted in 2 meaningful words do it
                dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1+" "+subword2)
                word2id[subword1] = muse_word2id[subword1]
                word2id[subword2] = muse_word2id[subword2]
                break
            if len(subword1)>2:
                ### the suffix is too short: keep only the prefix
                dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1)
                word2id[subword1] = muse_word2id[subword1]
                break
            if len(subword2)>2:
                ### the prefix is too short: keep only the suffix
                dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword2)
                word2id[subword2] = muse_word2id[subword2]
                break
            ### both parts are too short: try a shorter prefix
    return dataset,word2id

In [52]:
# Update the dataset and the corpus vocabulary
# (the German vocabulary dict passed as second argument is updated in place by the function,
# so new_vocab_2_embedding and vocab_2_embedding_idx_ge refer to the same dict afterwards)
dataset,new_vocab_2_embedding = process_translated_out_of_vocab_words(out_of_vocab_ge,vocab_2_embedding_idx_ge,
                                                                      dataset,
                                                                      out_of_vocab_en,tgt_word2id,german_words)

### Prepare sentences for embedding by removing out of vocab words

In [53]:
#Final vocabulary for the corpus in English and in German
vocab_fin_en = list(vocab_2_embedding_idx_en.keys())
vocab_fin_ge = list(vocab_2_embedding_idx_ge.keys())

#Sentences preprocessed for embedding: only the words of the final vocabulary are kept.
#Each vocabulary is frozen into a set once (bound as a default argument of the lambda), so a
#word lookup is a hash lookup instead of a scan of the whole vocabulary list for every word.
dataset["sentences_en_preprocessed"] = dataset.sentences_en_cleaner.apply(
    lambda x, allowed=frozenset(vocab_fin_en): remove_multiple_words_from_sentence(x,allowed,True))
dataset["sentences_ge_preprocessed"] = dataset.sentences_ge_cleaner.apply(
    lambda x, allowed=frozenset(vocab_fin_ge): remove_multiple_words_from_sentence(x,allowed,True))

### Compute correlation features

In [107]:
def get_correlation(src_emb,tgt_emb):
    '''
    Function implemented in MUSE Github which computes the correlation between two vectors:
    each vector is divided by its Euclidean norm and the dot product of the two normalized
    vectors is returned (i.e. their cosine similarity)
    '''
    src_unit = src_emb / np.linalg.norm(src_emb)
    tgt_unit = tgt_emb / np.linalg.norm(tgt_emb)
    return src_unit.dot(tgt_unit)

In [113]:
def get_emb(word,language):
    '''
    Helper function which fetches the embedding of a word given the chosen language in ["en","ge"]
    "en" reads the module-level English tables (src_*); any other value reads the German
    tables (tgt_*). A word missing from the table raises a KeyError.
    '''
    if language != "en":
        return tgt_embeddings[tgt_word2id[word]]
    return src_embeddings[src_word2id[word]]

In [109]:
def get_corr_matrix(words_en_list,words_ge_list):
    '''
    Function which computes the correlation matrix (discussed in the report)
    Returns: array with one row per English word and one column per German word; the cell
    (i, j) holds get_correlation of the embeddings of English word i and German word j
    '''
    corr_matrix = np.zeros((len(words_en_list), len(words_ge_list)))
    for row, english_word in enumerate(words_en_list):
        for col, german_word in enumerate(words_ge_list):
            english_vector = get_emb(english_word,"en")
            german_vector = get_emb(german_word,"ge")
            corr_matrix[row,col] = get_correlation(english_vector,german_vector)

    return corr_matrix

In [110]:
def get_word_matches(corr_matrix):
    '''
    Function which finds the best word matches.
    A (row, column) cell is a match when it is at the same time the maximum of its row and the
    maximum of its column among the rows/columns which are still free. Matches are collected
    greedily until min(number of rows, number of columns) couples are found.
    corr_matrix: 2-D array of correlations (rows: English words, columns: German words)
    Returns: dict row index -> matched column index ({} for an empty matrix)
    '''
    corr_matrix = np.asarray(corr_matrix, dtype=float)
    if corr_matrix.size == 0:
        return {}
    n, m = corr_matrix.shape
    dim = min(n,m)
    remaining = corr_matrix.copy()
    best_match_row = np.argmax(remaining,axis=0)
    best_match_col = np.argmax(remaining,axis=1)
    couples = {}
    while len(couples)<dim:
        found_new_couple = False
        # iterates over the rows of the correlation matrix
        for i in range(n):
            # If a value is the maximum of the row and the column, it corresponds to a good word couple.
            if i in couples or i != best_match_row[best_match_col[i]]:
                continue
            couples[i] = best_match_col[i]
            found_new_couple = True
            # The matched row and column are masked with -inf (not 0) so they can never win a
            # later argmax, even when the remaining correlations are negative
            remaining[i,:] = -np.inf
            remaining[:,best_match_col[i]] = -np.inf
            best_match_row = np.argmax(remaining,axis=0)
            best_match_col = np.argmax(remaining,axis=1)
        if not found_new_couple:
            # no mutual best match left (e.g. NaN correlations): stop instead of looping forever
            break
    return couples

In [117]:
def get_word_couples(words_en,words_ge):
    '''
    Function which returns the best word matches, the correlation scores and the remaining words which did not match
    words_en, words_ge: preprocessed sentences (words separated by whitespace)
    Returns a 4-tuple:
      - dict English word -> matched German word
      - mean correlation of the matched couples
      - list of the words of the longer sentence left without a match ([] for equal lengths)
      - mean, over the left words, of their best correlation with the other sentence
        (NaN for equal lengths)
    When one of the two sentences is an empty string, the four values are NaN.
    '''
    if not (len(words_en) and len(words_ge)):
        return np.nan,np.nan,np.nan,np.nan

    en_tokens = words_en.split()
    ge_tokens = words_ge.split()
    #computing the correlation matrix
    corr_mat = get_corr_matrix(en_tokens,ge_tokens)
    #computing the best word matches
    matches = get_word_matches(corr_mat)
    score = sum(corr_mat[i,j] for i,j in matches.items()) / len(matches)

    n_en, n_ge = len(en_tokens), len(ge_tokens)
    if n_en == n_ge:
        left_words = []
        non_match_corr = np.nan
    elif n_en > n_ge:
        #Dealing with non picked english words (rows without a match)
        matched_rows = np.array(list(matches.keys()))
        unmatched = np.setdiff1d(np.arange(n_en),matched_rows)
        left_words = [en_tokens[i] for i in unmatched]
        non_match_corr = np.mean(np.max(corr_mat[unmatched,:],axis=1))
    else:
        #Dealing with non picked german words (columns without a match)
        matched_cols = np.array(list(matches.values()))
        unmatched = np.setdiff1d(np.arange(n_ge),matched_cols)
        left_words = [ge_tokens[j] for j in unmatched]
        non_match_corr = np.mean(np.max(corr_mat[:,unmatched],axis=0))

    word_couples = {en_tokens[i]: ge_tokens[j] for i,j in matches.items()}

    return word_couples,score,left_words,non_match_corr

In [59]:
# Computing the correlation features.
# get_word_couples is evaluated once per row and its outputs are reused for the three columns
# (it builds a full word-by-word correlation matrix, so calling it once per column would
# triple the cost).
_word_couple_results = [
    get_word_couples(sentence_en, sentence_ge)
    for sentence_en, sentence_ge in zip(dataset["sentences_en_preprocessed"],
                                        dataset["sentences_ge_preprocessed"])
]

# element 1: mean correlation of the couples, 2: unmatched words, 3: their mean best correlation
dataset["correlation"] = pd.Series([result[1] for result in _word_couple_results],
                                   index=dataset.index, dtype=float)
dataset["left_words"] = pd.Series([result[2] for result in _word_couple_results],
                                  index=dataset.index, dtype=object)
dataset["non_match_correlation"] = pd.Series([result[3] for result in _word_couple_results],
                                             index=dataset.index, dtype=float)
# left_words is NaN (not a list) for the rows where one of the sentences is empty
dataset["n_left_words"] = dataset["left_words"].apply(lambda x:len(x) if isinstance(x,list) else 0)

In [60]:
# Ratio between the number of non translated words and the length (in words) of the longer of
# the two preprocessed sentences. Undefined ratios (0/0 -> NaN, x/0 -> inf) are replaced by 0.

dataset["non_translated_ratio"] = (
    dataset["non_translated_words"]
    / dataset.apply(lambda row:max(len(row["sentences_en_preprocessed"].split()),
                                   len(row["sentences_ge_preprocessed"].split())),axis=1)
).fillna(0).apply(lambda x:x if not np.isinf(x) else 0)

In [61]:
# Filling missing values with the mean of the features
# (the mean is computed over the rows of the whole merged dataset which do have a value)

dataset["correlation"] = dataset["correlation"].fillna(dataset["correlation"].mean())
dataset["non_match_correlation"] = dataset["non_match_correlation"].fillna(dataset["non_match_correlation"].mean())

### Laser

In [64]:
from laserembeddings import Laser

# Sentence encoder shared by get_laser_correlation and the (commented-out) embedding cells below
laser = Laser()

In [65]:
def get_laser_correlation(src_sentence,tgt_sentence):
    '''
    Function to compute the laser correlation between vectors: both sentences are embedded in
    one call (source as 'en', target as 'de'), each embedding is divided by its Euclidean norm
    and the dot product of the two normalized embeddings is returned
    '''
    sentence_pair = laser.embed_sentences([src_sentence, tgt_sentence], lang=['en', 'de'])
    src_unit = sentence_pair[0] / np.linalg.norm(sentence_pair[0])
    tgt_unit = sentence_pair[1] / np.linalg.norm(sentence_pair[1])
    return src_unit.dot(tgt_unit)

In [66]:
#The two following cells should not be run as it is very long to compute the laser embeddings (around 40 minutes for the whole
#dataset) so these features are provided and merged with the dataset just below

#dataset['sentence_correlation'] = dataset.apply(lambda row: 
#                                                get_laser_correlation(row["sentences_en"], row["sentences_ge"]),axis=1)

In [None]:
# collecting the sentence level embeddings from LASER
# set of 1024 features per sentence 

# full_laser_english = laser.embed_sentences(dataset['sentences_en'].tolist(), lang='en')
# full_laser_german = laser.embed_sentences(dataset['sentences_ge'].tolist(), lang='de')

# these are saved in the following lines, and loaded for the neural network regression model
# np.save('laser_1024_english', full_laser_english)
# np.save('laser_1024_german', full_laser_german)

In [67]:
# Precomputed LASER sentence correlations (only the "sentence_correlation" column is kept).
# NOTE: pickle files can execute code when loaded - only load files you produced yourself.
laser_train_val = pd.read_pickle('data/laser_embeddings/dataset_corrleations_laser.pickle')[["sentence_correlation"]]
laser_test = pd.read_pickle('data/laser_embeddings/test_dataset_corrleations_laser.pickle')[["sentence_correlation"]]
# Train+validation rows first, then the test rows, renumbered from 0 like `dataset`
all_laser = pd.concat([laser_train_val, laser_test]).reset_index(drop=True)

In [68]:
# Attach the LASER correlation to the feature table by row position (index-on-index join)
final_dataset = dataset.merge(all_laser,
                              left_index=True,
                              right_index=True)

### Saving the dataset

In [69]:
#final_dataset.to_pickle("data/datasets/dataset_with_features.pickle")