# Imports

In [None]:
import pandas as pd
import spacy
import re
import os
import io
import numpy as np
import en_core_web_md
import de_core_news_md
import pickle
from multiprocessing import Pool

## Helper functions

In [None]:
def spacy_analysis(sentence,nlp):
    """Print one line of token attributes for every token of `sentence`.

    `nlp` is called on `sentence`; for each resulting token the attributes
    text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop and
    vector.shape are printed, separated by spaces.
    """
    for tok in nlp(sentence):
        attributes = (
            tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
            tok.shape_, tok.is_alpha, tok.is_stop, tok.vector.shape,
        )
        print(*attributes)

def entity_analysis(sentence,nlp):
    """Print `sentence`, an "Analysis" header and one line per detected entity.

    Each entity line shows the entity text, its start and end character
    offsets and its label, taken from `nlp(sentence).ents`.
    """
    header = sentence + "\n" + "\n" + "Analysis" + "\n" + "--------"
    print(header)
    for entity in nlp(sentence).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

def extract_sentences(filename,lower=False,encoding=None):
    """Read the non-blank lines of a text file, stripped of surrounding whitespace.

    Args:
        filename: path of the file to read.
        lower: when true, every returned line is lower-cased.
        encoding: text encoding passed to `open`; `None` keeps the platform
            default, which is what the function used before this parameter
            existed.

    Returns:
        A list with one stripped string per line that contains anything other
        than whitespace.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, encoding=encoding) as handle:
        lines = [line for line in handle if line.strip()]
    if lower:
        return [line.lower().strip() for line in lines]
    return [line.strip() for line in lines]

def remove_unnecessary_spaces(sentence):
    """Strip `sentence` and collapse every run of spaces into a single space."""
    trimmed = sentence.strip()
    collapsed = re.sub(' +', ' ', trimmed)
    return collapsed

def remove_word_from_sentence(sentence,word):
    """Delete every occurrence of `word` from `sentence` and tidy the spacing.

    The removal is a plain substring replacement, so occurrences inside longer
    words are removed as well. The result is passed through
    `remove_unnecessary_spaces`.
    """
    return remove_unnecessary_spaces(sentence.replace(word, ""))

def remove_multiple_words_from_sentence(sentence,words,isin=False):
    """Filter the whitespace-separated tokens of `sentence` against `words`.

    With `isin` false (default) the tokens found in `words` are dropped; with
    `isin` true only the tokens found in `words` are kept. The surviving
    tokens are joined with single spaces.
    """
    keep_members = bool(isin)
    kept = []
    for token in sentence.split():
        if (token in words) == keep_members:
            kept.append(token)
    return " ".join(kept)

# Loading

### Loading Spacy

In [None]:
# Load the English and German spaCy pipelines once; later cells reuse
# nlp_en / nlp_ge for tokenisation, entities, POS tags and lemmas.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

### Loading Training Data

In [None]:
# Training files are looked up in <parent of cwd>/data/en-de.
# The .src file is loaded as the English side, .mt as the German side.
path_train_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.src')
path_train_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.mt')
path_train_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.scores')

In [None]:
# One row per non-blank line of each training file.
train_sentences_en = pd.DataFrame({'sentences_en': extract_sentences(path_train_en)})
train_sentences_ge = pd.DataFrame({'sentences_ge': extract_sentences(path_train_ge)})
# The scores file has no header row; its single column is named "scores".
train_scores = pd.read_csv(path_train_scores,header=None).rename(columns={0:"scores"})

In [None]:
# Pair English and German sentences by row position (index-on-index merge).
train_dataset = pd.merge(train_sentences_en,train_sentences_ge,left_index=True,right_index=True)
# The scores are deliberately not merged in (line kept for reference).
#train_dataset = pd.merge(train_dataset,train_scores,left_index=True,right_index=True)

### Loading Validation Data

In [None]:
# Validation files are looked up in <parent of cwd>/data/en-de.
path_dev_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.src')
path_dev_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.mt')
path_dev_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.scores')

In [None]:
# One row per non-blank line of each validation file.
dev_sentences_en = pd.DataFrame({'sentences_en': extract_sentences(path_dev_en)})
dev_sentences_ge = pd.DataFrame({'sentences_ge': extract_sentences(path_dev_ge)})
# The scores file has no header row; its single column is named "scores".
dev_scores = pd.read_csv(path_dev_scores,header=None).rename(columns={0:"scores"})

In [None]:
# Pair English and German validation sentences by row position.
dev_dataset = pd.merge(dev_sentences_en,dev_sentences_ge,left_index=True,right_index=True)
# The scores are deliberately not merged in (line kept for reference).
#dev_dataset = pd.merge(dev_dataset,dev_scores,left_index=True,right_index=True)

### Loading Test Data

In [None]:
# Test files are looked up in <parent of cwd>/data/en-de.
path_test_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'test.ende.src')
path_test_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'test.ende.mt')
# NOTE(review): path_test_scores is not read anywhere in this notebook.
path_test_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'test.ende.scores')

In [None]:
# One row per non-blank line of each test file.
test_sentences_en = pd.DataFrame({'sentences_en': extract_sentences(path_test_en)})
test_sentences_ge = pd.DataFrame({'sentences_ge': extract_sentences(path_test_ge)})

In [None]:
# Pair English and German test sentences by row position.
test_dataset = pd.merge(test_sentences_en,test_sentences_ge,left_index=True,right_index=True)

### Merging the three

In [None]:
# Stack train, dev and test (in that order) and renumber the rows from 0.
dataset = pd.concat([train_dataset, dev_dataset, test_dataset]).reset_index(drop=True)

### Loading embeddings

In [None]:
def load_vec(emb_path, nmax=50000):
    """Load up to `nmax` word vectors from a text embedding file.

    The first line of the file is skipped (presumably the header of a `.vec`
    file, to be confirmed against the data). Every following line is split at
    its first space into a word and a space-separated list of numbers.

    Args:
        emb_path: path of the embedding file, read as UTF-8 with undecodable
            bytes ignored.
        nmax: reading stops once this many words have been loaded.

    Returns:
        A tuple `(embeddings, id2word, word2id)`: the vectors stacked row by
        row, the row-index -> word mapping and the word -> row-index mapping.

    Raises:
        AssertionError: if the same word appears twice.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # skip the first line
        for raw_line in handle:
            word, values = raw_line.rstrip().split(' ', 1)
            vector = np.fromstring(values, sep=' ')
            assert word not in word2id, 'word found twice'
            rows.append(vector)
            # The next free row index is the current vocabulary size.
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {index: word for word, index in word2id.items()}
    return np.vstack(rows), id2word, word2id

# Embedding files are looked up in <parent of cwd>/data/muse.
path = os.path.join(os.path.dirname(os.getcwd()), 'data','muse')

# Build the file paths with os.path.join rather than "/" concatenation so the
# separator matches the one used for `path` on every platform.
src_path = os.path.join(path, "wiki.multi.en.vec")
tgt_path = os.path.join(path, "wiki.multi.de.vec")
nmax = 300000  # maximum number of word embeddings to load

# src_* hold the English vectors, tgt_* the German ones.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

### Loading nltk utils

In [None]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer #stem for english
from nltk.stem.cistem import Cistem
from nltk.corpus import stopwords

# Stemmers used as the last fallback when looking up embeddings:
# Porter for English, Cistem for German.
porter = PorterStemmer()
cistem = Cistem()
# Stop-word lists are turned into sets for fast membership tests.
stopwords_en = set(stopwords.words('english'))
stopwords_ge = set(stopwords.words('german'))

## Constructing the full vocabulary

### Removing entities

In [None]:
def remove_punct(sentence, nlp):
    """Keep only the alphabetic tokens of `sentence`, lower-cased.

    `nlp` is called on `sentence`; tokens whose `is_alpha` flag is false are
    dropped and the rest are joined with single spaces.
    """
    lowered_words = [tok.text.lower() for tok in nlp(sentence) if tok.is_alpha]
    return " ".join(lowered_words).strip()

def remove_entities(sentence,persons_list):
    """Return `sentence` with every string of `persons_list` deleted.

    Each entry is removed by plain substring replacement, in list order; the
    spacing around the removed text is left untouched.
    """
    stripped = sentence
    for entity_text in persons_list:
        stripped = stripped.replace(entity_text, "")
    return stripped

In [None]:
def get_entities(sentence_en,sentence_ge):
    doc = nlp_en(sentence_en)
    persons_list = []
    for ent in doc.ents:
        if (ent.label_=="PERSON" or ent.label_=="ORG" or 
            ent.label_=="LOC" or ent.label_=="GPE" 
            or ent.label == "FAC" or ent.label == "NORP"):
            if ent.text in sentence_ge:
                persons_list += [ent.text]
            else:
                ent_text_clean = ent.text.replace("the","").replace("The","").strip()
                if ent_text_clean in sentence_ge:
                    persons_list += [ent_text_clean]
    return persons_list

In [None]:
# For every row, collect the English entities that also occur in the German
# sentence; the resulting lists are stored in the 'person' column.
dataset['person']=dataset.apply(lambda row: get_entities(row["sentences_en"],
                                                         row["sentences_ge"]),axis=1)

In [None]:
# Delete each row's shared entities from both the English and the German sentence.
dataset['sentences_en_no_propnouns'] = [
    remove_entities(text, entities)
    for text, entities in zip(dataset['sentences_en'], dataset["person"])
]
dataset['sentences_ge_no_propnouns'] = [
    remove_entities(text, entities)
    for text, entities in zip(dataset['sentences_ge'], dataset["person"])
]

In [None]:
# Keep only lower-cased alphabetic tokens, using the pipeline of each language.
dataset['sentences_en_clean'] = dataset['sentences_en_no_propnouns'].apply(
    lambda text: remove_punct(text, nlp_en))
dataset['sentences_ge_clean'] = dataset['sentences_ge_no_propnouns'].apply(
    lambda text: remove_punct(text, nlp_ge))

### Create first version of vocab

In [None]:
def add_words_to_vocab(vocab,sentence,nlp,word_2id,stemmer):
    """Map every new token of `sentence` to an embedding index.

    For each token not yet in `vocab`, the index is looked up in `word_2id`
    using, in order: the token text, its lemma, up to ten lemma names of its
    first WordNet synset, and finally its stem. The first hit is stored in
    `vocab[token.text]`; tokens with no hit are reported as out of vocabulary.

    Args:
        vocab: dict token text -> embedding index; updated in place.
        sentence: text passed to `nlp`.
        nlp: callable returning an iterable of tokens with `text` and `lemma_`.
        word_2id: dict word -> embedding index.
        stemmer: object with a `stem(word)` method.

    Returns:
        `(vocab, out_of_vocab)` where `out_of_vocab` lists the token texts of
        this sentence for which no index was found.
    """
    out_of_vocab = []
    for token in nlp(sentence):
        text = token.text
        if text in vocab:
            continue
        index = _find_embedding_index(token, word_2id, stemmer)
        if index is None:
            out_of_vocab.append(text)
        else:
            vocab[text] = index
    return vocab,out_of_vocab


def _find_embedding_index(token, word_2id, stemmer):
    """Return the embedding index for `token`, or None when every fallback fails."""
    text = token.text
    if text in word_2id:
        return word_2id[text]
    if token.lemma_ in word_2id:
        return word_2id[token.lemma_]

    # WordNet fallback. LookupError covers the missing-corpus error raised by
    # nltk, so the lookup stays best-effort as before.
    try:
        synsets = wordnet.synsets(text)
    except LookupError:
        synsets = []
    if synsets:
        for lemma in synsets[0].lemmas()[:10]:
            synonym = lemma.name()
            if synonym in word_2id:
                return word_2id[synonym]

    # Always fall through to the stemmer. Previously a word whose synset had
    # ten or more lemmas, none with an embedding, was dropped without being
    # recorded anywhere.
    stem = stemmer.stem(text)
    if stem in word_2id:
        return word_2id[stem]
    return None

In [None]:
#English

vocab_2_embedding_idx_en = {}
out_of_vocab_en = []

sentences_en = dataset.sentences_en_clean

for i,sentence in enumerate(sentences_en):
    vocab_2_embedding_idx_en,out_of_vocab_current = add_words_to_vocab(
        vocab_2_embedding_idx_en,sentence,nlp_en,src_word2id,porter)
    # Record each word without an embedding together with the enumeration
    # index of the sentence it came from.
    out_of_vocab_en.extend((missing_word,i) for missing_word in out_of_vocab_current)


In [None]:
#German

vocab_2_embedding_idx_ge = {}
out_of_vocab_ge = []

sentences_ge = dataset.sentences_ge_clean

for i,sentence in enumerate(sentences_ge):
    vocab_2_embedding_idx_ge,out_of_vocab_current = add_words_to_vocab(
        vocab_2_embedding_idx_ge,sentence,nlp_ge,tgt_word2id,cistem)
    # Record each word without an embedding together with the enumeration
    # index of the sentence it came from.
    out_of_vocab_ge.extend((missing_word,i) for missing_word in out_of_vocab_current)

### Removing non-German words from the German vocabulary

In [None]:
def remove_words_from_list(list_1,list_2=None,isin=False):
    """Filter `list_1` by membership in `list_2`.

    Args:
        list_1: words to filter.
        list_2: reference collection. When omitted, the module-level
            `german_words` list is used. It is resolved at call time, so the
            function can be defined before that list is loaded.
        isin: when false (default) keep the words NOT in `list_2`; when true
            keep the words that are in `list_2`.

    Returns:
        The filtered words, in their original order.
    """
    if list_2 is None:
        list_2 = german_words
    # Set membership is O(1); fall back to the given collection if its items
    # cannot be hashed.
    try:
        lookup = set(list_2)
    except TypeError:
        lookup = list_2
    keep_members = bool(isin)
    return [word for word in list_1 if (word in lookup) == keep_members]

In [None]:
# Every word currently mapped to a German embedding.
current_german_vocab_corpus = list(vocab_2_embedding_idx_ge)
#large list of german words downloaded here : https://gist.github.com/MarvinJWendt/2f4f4154b8ae218600eb091a5706b5f4#file-wordlist-german-txt
# Read the first column of the word list and lower-case every entry.
german_words = [
    str(word).lower()
    for word in pd.read_csv("../data/wordlist-german.txt",header=None)[0]
]

In [None]:
n = len(current_german_vocab_corpus)
split = int(n/4)

# Four consecutive slices of the vocabulary; the last one takes the remainder.
vocab_chunks = [current_german_vocab_corpus[0:split],
                current_german_vocab_corpus[split:split*2],
                current_german_vocab_corpus[split*2:split*3],
                current_german_vocab_corpus[split*3:]]

# Each worker returns the words of its slice that are not in the German word
# list. The context manager shuts the worker processes down once map() has
# returned, instead of leaving the pool open.
with Pool(4) as pool:
    words_to_be_removed_list = pool.map(remove_words_from_list, vocab_chunks)

In [None]:
# Concatenate the four per-worker result lists, in worker order.
words_to_be_removed = sum((words_to_be_removed_list[k] for k in range(4)), [])

### Update out_of_vocab list and vocab_2_embedding_idx_ge

In [None]:
new_out_of_vocab_ge = []
for word in words_to_be_removed:
    # Rows whose cleaned German sentence contains `word` (substring test).
    contains_word = dataset.sentences_ge_clean.apply(lambda x: word in x)
    idxs = list(dataset[contains_word].index)
    # Only words found in at most three rows are treated as out of vocabulary.
    if len(idxs)<=3:
        new_out_of_vocab_ge.extend((word,idx) for idx in idxs)

out_of_vocab_ge += new_out_of_vocab_ge

In [None]:
# Drop the newly flagged words from the German vocabulary. dict.pop with a
# default never raises for a missing key, so the former bare try/except was
# dead code that could only hide unrelated errors.
for word in [pair[0] for pair in new_out_of_vocab_ge]:
    vocab_2_embedding_idx_ge.pop(word,None)

### Save and load vocab dicts

In [None]:
#Save
#with open("../data/vocab_en_tvl.pkl","wb") as f:
#    pickle.dump(vocab_2_embedding_idx_en, f, pickle.HIGHEST_PROTOCOL)
#with open("../data/vocab_ge_tvl.pkl","wb") as f:
#    pickle.dump(vocab_2_embedding_idx_ge, f, pickle.HIGHEST_PROTOCOL)
    
#Load    
#with open("../data/vocab_en.pkl", 'rb') as f:
#    vocab_2_embedding_idx_en = pickle.load(f)
#with open("../data/vocab_ge.pkl", 'rb') as f:
#    vocab_2_embedding_idx_ge = pickle.load(f)

### Process out of vocab german words

In [None]:
# Counter of words copied untranslated into the German sentence, per row.
dataset["non_translated_words"] = 0
# Working copies of the cleaned sentences; the following cells edit these.
dataset["sentences_en_cleaner"] = dataset["sentences_en_clean"].copy()
dataset["sentences_ge_cleaner"] = dataset["sentences_ge_clean"].copy()

In [None]:
def process_translated_out_of_vocab_words(pair,word2id,dataset,out_of_vocab_english):
    """Handle one out-of-vocabulary German word in the row it came from.

    Args:
        pair: `(word, idx)`; `idx` is the row label in `dataset`.
        word2id: mapping whose keys are the known German words.
        dataset: DataFrame with the columns "sentences_en_cleaner",
            "sentences_ge_cleaner" and "non_translated_words"; it is modified
            in place through `.at`.
        out_of_vocab_english: collection tested with `word not in ...`.
            NOTE(review): the test uses the bare word, so a collection of
            `(word, idx)` tuples never matches - pass plain words.

    Returns:
        The same `dataset` object.

    Behaviour:
        * If `word` is a substring of the English sentence, it is removed from
          both sentences; the row's counter is incremented only when the word
          is not in `out_of_vocab_english`.
        * Otherwise ever shorter prefixes of `word` are tried until one is a
          key of `word2id`. If the remaining suffix is also a key, the word is
          replaced by "prefix suffix" (suffix longer than 2 characters) or by
          the prefix alone; if the suffix is not a key, nothing is changed.
    """
    word,idx = pair[0],pair[1]
    en_sentence = dataset.loc[idx]["sentences_en_cleaner"]
    ge_sentence = dataset.loc[idx]["sentences_ge_cleaner"]
    count_non_translated = dataset.loc[idx]["non_translated_words"]

    ### If the word is exactly the same as a word in the source sentence
    # (implemented as a substring test on the English sentence)
    if word in en_sentence:
        if word not in out_of_vocab_english:
            ### if the word is in the english vocab -> it could have been translated
            ### count it as a non translated word and remove it from both sentences
            dataset.at[idx,"sentences_en_cleaner"] = remove_word_from_sentence(en_sentence,word)
            dataset.at[idx,"sentences_ge_cleaner"] = remove_word_from_sentence(ge_sentence,word)
            dataset.at[idx,"non_translated_words"] = count_non_translated+1
            return dataset
        else:
            ### if the word is not in the english vocab -> it could not have been translated
            ### don't count it as a non translated word and remove it from both sentences
            dataset.at[idx,"sentences_en_cleaner"] = remove_word_from_sentence(en_sentence,word)
            dataset.at[idx,"sentences_ge_cleaner"] = remove_word_from_sentence(ge_sentence,word)
            return dataset

    ### Test for subwords
    # Start with the word minus its last character and keep shortening.
    subword = word[:-1]
    while len(subword)>0:
        if subword in word2id.keys():
            subword1 = subword
            # The rest of the word after the known prefix.
            subword2 = word[len(subword1):]
            if subword2 in word2id.keys():
                if len(subword2)>2:
                    ### if the word can be splitted in 2 meaningful words do it
                    dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1+" "+subword2)
                    return dataset
                else:
                    ### if the word can be found by using a shorter version which can be found in vocab
                    dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1)
                    return dataset
            else:
                # Known prefix but unknown suffix: stop without trying shorter prefixes.
                return dataset

        else:
            subword = subword[:-1]
    return dataset

In [None]:
# out_of_vocab_en holds (word, sentence_index) tuples, but the function tests
# membership with the bare word. Passing the tuples made that test always
# succeed, so every shared word was counted as "not translated". Build the
# set of plain words once and pass that instead.
out_of_vocab_words_en = {pair[0] for pair in out_of_vocab_en}
for pair in out_of_vocab_ge:
    dataset = process_translated_out_of_vocab_words(pair,vocab_2_embedding_idx_ge,dataset,out_of_vocab_words_en)

### Remove stop words

In [None]:
# Drop English stop words from the English sentences ...
dataset["sentences_en_cleaner"] = dataset.sentences_en_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_en))
# ... and both German and English stop words from the German sentences.
dataset["sentences_ge_cleaner"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_ge))
dataset["sentences_ge_cleaner"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_en))

In [None]:
# Final vocabularies: the words that have an embedding index in each language.
vocab_fin_en = list(vocab_2_embedding_idx_en.keys())
vocab_fin_ge = list(vocab_2_embedding_idx_ge.keys())

### Prepare sentences for embedding

In [None]:
# Keep only the words that have an embedding. The vocabularies are turned
# into sets once so each membership test is O(1) instead of a list scan.
vocab_lookup_en = set(vocab_fin_en)
vocab_lookup_ge = set(vocab_fin_ge)
dataset["sentences_en_final"] = dataset.sentences_en_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,vocab_lookup_en,True))
dataset["sentences_ge_final"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,vocab_lookup_ge,True))

### Compute correlation features

In [None]:
def get_correlation(src_emb,tgt_emb):
    """Return the dot product of the two vectors after scaling each to unit norm."""
    unit_src = src_emb / np.linalg.norm(src_emb)
    unit_tgt = tgt_emb / np.linalg.norm(tgt_emb)
    return unit_src.dot(unit_tgt)

In [None]:
def get_emb(word,language):
    """Return the embedding row of `word`.

    `language == "en"` reads the English tables (`src_embeddings`,
    `vocab_2_embedding_idx_en`); any other value reads the German ones.
    """
    if language != "en":
        return tgt_embeddings[vocab_2_embedding_idx_ge[word]]
    return src_embeddings[vocab_2_embedding_idx_en[word]]

In [None]:
def get_corr_matrix(words_en_list,words_ge_list):
    """Build the English x German similarity matrix.

    Entry `[i, j]` is `get_correlation` applied to the embeddings of the
    i-th English word and the j-th German word.
    """
    corr_matrix = np.zeros((len(words_en_list), len(words_ge_list)))
    # np.ndindex walks the cells row by row, like two nested loops.
    for row, col in np.ndindex(corr_matrix.shape):
        en_vector = get_emb(words_en_list[row],"en")
        ge_vector = get_emb(words_ge_list[col],"ge")
        corr_matrix[row, col] = get_correlation(en_vector, ge_vector)
    return corr_matrix

In [None]:
def get_word_matches(corr_matrix):
    """Greedily pair rows with columns by mutual best correlation.

    A row `i` and a column `j` are paired when `j` is the best remaining
    column for `i` and `i` is the best remaining row for `j`. Paired rows and
    columns are then excluded and the search repeats until
    `min(n_rows, n_cols)` pairs exist.

    Args:
        corr_matrix: 2-D array-like of correlations (rows x columns).

    Returns:
        A dict row index -> column index. Empty when the matrix has no cells.
    """
    scores = np.array(corr_matrix, dtype=float)  # private copy, safe to mask
    if scores.size == 0:
        return {}
    n, m = scores.shape
    dim = min(n, m)
    couples = {}
    best_match_row = np.argmax(scores, axis=0)
    best_match_col = np.argmax(scores, axis=1)
    while len(couples) < dim:
        progressed = False
        for i in range(n):
            if i in couples:
                continue
            j = best_match_col[i]
            if best_match_row[j] != i:
                continue
            couples[i] = j
            # Mask with -inf rather than 0. A 0 beats every remaining negative
            # correlation, which used to make argmax point at cells that were
            # already taken and left the loop spinning forever.
            scores[i, :] = -np.inf
            scores[:, j] = -np.inf
            best_match_row = np.argmax(scores, axis=0)
            best_match_col = np.argmax(scores, axis=1)
            progressed = True
        if not progressed:
            # Defensive: should not happen for finite input, but never hang.
            break
    return couples

In [None]:
def get_word_couples(words_en,words_ge):
    """Pair the words of an English and a German sentence and score the pairing.

    Args:
        words_en: space-separated English words that all have an embedding.
        words_ge: space-separated German words that all have an embedding.

    Returns:
        `(word_couples, score, left_words)`:
        * `word_couples`: dict English word -> matched German word;
        * `score`: mean correlation over the matched pairs;
        * `left_words`: words of the longer sentence that stayed unmatched
          (empty when both sentences have the same number of words).
        When either sentence contains no word, `(nan, nan, nan)` is returned.
    """
    words_en_list = words_en.split()
    words_ge_list = words_ge.split()
    # Check the split lists so whitespace-only input is treated as empty too.
    if not words_en_list or not words_ge_list:
        return np.nan,np.nan,np.nan

    corr_mat = get_corr_matrix(words_en_list,words_ge_list)
    word_couples_idx = get_word_matches(corr_mat)

    if word_couples_idx:
        total = sum(corr_mat[i, j] for i, j in word_couples_idx.items())
        score = total/len(word_couples_idx)
    else:
        score = np.nan  # nothing matched: avoid dividing by zero

    if len(words_en_list)>len(words_ge_list):
        matched = set(word_couples_idx.keys())
        left_words = [w for i, w in enumerate(words_en_list) if i not in matched]
    elif len(words_en_list)<len(words_ge_list):
        # Matched German positions are the dict VALUES and must be compared
        # against German indices; the English length was used here before.
        matched = set(word_couples_idx.values())
        left_words = [w for j, w in enumerate(words_ge_list) if j not in matched]
    else:
        left_words = []

    word_couples = {words_en_list[key]: words_ge_list[val]
                    for key, val in word_couples_idx.items()}

    return word_couples,score,left_words

In [None]:
# Keep only the score (element 1) of the word-pairing result for every row.
dataset["correlation"] = dataset.apply(lambda row:get_word_couples(row["sentences_en_final"],row["sentences_ge_final"])[1],axis=1)

In [None]:
# Rows without a score (NaN) receive the mean of the available scores.
dataset["correlation"] = dataset["correlation"].fillna(dataset["correlation"].mean())

### Sentence length

In [None]:
def sentence_length(sentence):
    """Return the number of pieces obtained by splitting `sentence` on single spaces.

    Consecutive spaces produce empty pieces that are counted, and an empty
    string counts as one piece.
    """
    return len(sentence.split(" "))

In [None]:
def difference_length(a, b):
    """Return the absolute difference between the word counts of `a` and `b`.

    Words are separated by whitespace. An empty string counts as zero words;
    splitting on a literal " " used to count it as one word, which hid the
    difference between "no word" and "one word".
    """
    return abs(len(a.split()) - len(b.split()))

In [None]:
# Length of the raw sentences, counted by splitting on single spaces.
dataset['english_sentence_length'] = dataset['sentences_en'].apply(sentence_length)
dataset['german_sentence_length'] = dataset['sentences_ge'].apply(sentence_length)
# Signed difference: positive when the English sentence is longer.
dataset['sentence_length_difference'] = dataset['english_sentence_length'] - dataset['german_sentence_length']

### POS extraction

In [None]:
def extract_pos(pos,sentence,nlp):
    """Return the tokens of `sentence` whose `pos_` equals `pos`, space-joined.

    An empty string is returned when no token has that part of speech.
    """
    matching = [tok.text for tok in nlp(sentence) if tok.pos_ == pos]
    return " ".join(matching).strip()

In [None]:
# Verbs of each sentence (space-joined) and the absolute difference of their counts.
dataset["german_verbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("VERB",x,nlp_ge))
dataset["english_verbs"] = dataset.sentences_en.apply(lambda x:extract_pos("VERB",x,nlp_en))
dataset['verbs_diff'] = dataset.apply(lambda row: difference_length(row['english_verbs'], row['german_verbs']),axis=1)

In [None]:
# Adjectives of each sentence (space-joined) and the absolute difference of their counts.
dataset["german_adjectives"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADJ",x,nlp_ge))
dataset["english_adjectives"] = dataset.sentences_en.apply(lambda x:extract_pos("ADJ",x,nlp_en))
dataset['adjectives_diff'] = dataset.apply(lambda row: difference_length(row['english_adjectives'], row['german_adjectives']),axis=1)

In [None]:
# Adverbs of each sentence (space-joined) and the absolute difference of their counts.
dataset["german_adverbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADV",x,nlp_ge))
dataset["english_adverbs"] = dataset.sentences_en.apply(lambda x:extract_pos("ADV",x,nlp_en))
dataset['adverbs_diff'] = dataset.apply(lambda row: difference_length(row['english_adverbs'], row['german_adverbs']),axis=1)

In [None]:
# Nouns of each sentence (space-joined) and the absolute difference of their counts.
dataset["german_nouns"] = dataset.sentences_ge.apply(lambda x:extract_pos("NOUN",x,nlp_ge))
dataset["english_nouns"] = dataset.sentences_en.apply(lambda x:extract_pos("NOUN",x,nlp_en))
dataset['nouns_diff'] = dataset.apply(lambda row: difference_length(row['english_nouns'], row['german_nouns']),axis=1)

### Lemmatizer

In [None]:
def lemmatizer(sentence, nlp):
    """Return the lemmas of all tokens of `sentence`, joined with single spaces."""
    lemmas = [tok.lemma_ for tok in nlp(sentence)]
    return " ".join(lemmas).strip()

In [None]:
# Lemmatised version of every raw sentence, using the pipeline of its language.
dataset['english_lemma'] = dataset['sentences_en'].apply (lambda x: lemmatizer(x, nlp_en))
dataset['german_lemma'] = dataset['sentences_ge'].apply (lambda x: lemmatizer(x, nlp_ge))

### Sentiment Analysis

In [None]:
from textblob import TextBlob as textblob_en
from textblob_de import TextBlobDE as textblob_ge

In [None]:
def sentiment(sentence, textblob):
    """Return `textblob(sentence).sentiment.polarity`.

    `textblob` is the blob class (or any callable) to instantiate with the
    sentence.
    """
    return textblob(sentence).sentiment.polarity

In [None]:
# Polarity of the raw sentences, computed with the blob class of each language.
dataset['english_sentence_sentiment'] = dataset['sentences_en'].apply(lambda x: sentiment(x, textblob_en))
dataset['german_sentence_sentiment'] = dataset['sentences_ge'].apply(lambda x: sentiment(x, textblob_ge))

In [None]:
# Polarity of the lemmatised sentences.
dataset['english_sentence_lemma_sentiment'] = dataset['english_lemma'].apply(lambda x: sentiment(x, textblob_en))
dataset['german_sentence_lemma_sentiment'] = dataset['german_lemma'].apply(lambda x: sentiment(x, textblob_ge))

In [None]:
def stdz(data):
    """Standardise `data`: subtract its mean and divide by its standard deviation."""
    centered = data - data.mean()
    return centered/data.std()

In [None]:
# Standardised polarity of the raw sentences.
dataset['std_english_sentence_sentiment']= stdz(dataset['english_sentence_sentiment'])
dataset['std_german_sentence_sentiment']= stdz(dataset['german_sentence_sentiment'])

In [None]:
# True where the lemmatised sentence has the larger absolute polarity.
dataset["max_sentiment_english_bool"] = dataset.apply(lambda row:abs(row['english_sentence_lemma_sentiment'])>abs(row['english_sentence_sentiment']),axis=1)
dataset["max_sentiment_german_bool"] = dataset.apply(lambda row:abs(row['german_sentence_lemma_sentiment'])>abs(row['german_sentence_sentiment']),axis=1)

In [None]:
# Pick, per row, whichever polarity (lemmatised or raw) has the larger
# absolute value; the sign is kept.
dataset['max_sentiment_english'] = dataset.apply(lambda row: row["english_sentence_lemma_sentiment"] 
                                                 if row["max_sentiment_english_bool"]
                                                 else row["english_sentence_sentiment"],axis=1)

# The helper flag is only needed for the selection above.
dataset = dataset.drop(columns=["max_sentiment_english_bool"])

dataset['max_sentiment_german'] = dataset.apply(lambda row: row["german_sentence_lemma_sentiment"] 
                                                 if row["max_sentiment_german_bool"]
                                                 else row["german_sentence_sentiment"],axis=1)

dataset = dataset.drop(columns=["max_sentiment_german_bool"])

In [None]:
def stdz(data):
    """Return `data` centred on its mean and scaled by its standard deviation.

    This cell repeats a definition that already appears earlier in the notebook.
    """
    mean_value, std_value = data.mean(), data.std()
    return (data - mean_value)/std_value

In [None]:
# Standardised version of the selected (max absolute) polarity.
dataset['std_max_english_sentiment']= stdz(dataset['max_sentiment_english'])
dataset['std_max_german_sentiment']= stdz(dataset['max_sentiment_german'])

In [None]:
# Persist the full feature table (train, dev and test rows) for later notebooks.
dataset.to_pickle("../data/dataset_all.pickle")