# Ideas

- Generate a translation and compare it to the translation to give a score
- Number of words
- Distance between vectors in the embedding space
- Use punctuation to split the phrase into sub-segments and try to evaluate the proximity between these
- Fraction of simple words
- Evaluate the syntactic complexity of the English sentence -> for a simple sentence, the translation should be of good quality
- Rare words -> check whether the word has been translated or not

# Code

In [None]:
from collections import Counter, defaultdict
import math
import copy
import random
import operator
import pandas as pd

def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# some helper functions
def prepare_data(filename):
    """Read a whitespace-tokenised corpus with one sentence per line.

    Blank lines are skipped. Every remaining line is stripped, split on
    whitespace and terminated with the end-of-sentence marker ``'</s>'``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(vocab, data)`` tuple. ``data`` is the list of token lists in
        file order; ``vocab`` is the set of distinct tokens found in
        ``data`` (so it contains ``'</s>'`` whenever ``data`` is non-empty).
    """
    # The context manager closes the file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(filename) as handle:
        data = [line.strip().split() + ['</s>'] for line in handle if line.strip()]
    vocab = {token for sentence in data for token in sentence}
    return vocab, data

In [None]:
def extract_sentences(filename,lower=False):
    """Return the non-blank lines of a text file, stripped of surrounding whitespace.

    Args:
        filename: Path of the text file to read.
        lower: When true, every returned line is lower-cased.

    Returns:
        list[str]: The stripped lines in file order; lines that are empty
        after stripping are dropped.
    """
    # Read once inside a context manager so the handle is always closed;
    # the two branches only differ in the per-line transformation.
    with open(filename) as handle:
        kept = [line for line in handle if line.strip()]
    if lower:
        return [line.lower().strip() for line in kept]
    return [line.strip() for line in kept]

In [None]:
# Load the ".src" and ".mt" sentence files into single-column DataFrames.
# NOTE(review): judging by the file names these are the English sources and
# the German machine translations of the en-de training split — confirm.
sentences_en = pd.DataFrame(extract_sentences('../data/en_de/train.ende.src'),columns = ['sentences_en'])
sentences_ge = pd.DataFrame(extract_sentences('../data/en_de/train.ende.mt'),columns = ['sentences_ge'])
# The scores file is read without a header row, so its single column is
# labelled 0; rename it to something meaningful.
scores = pd.read_csv('../data/en_de/train.ende.scores',header=None)
scores = scores.rename(columns={0:"scores"})

In [None]:
# Put sentences and scores side by side by joining the frames on their row index.
# NOTE(review): this assumes the three files are line-aligned — confirm.
dataset = pd.merge(sentences_en,sentences_ge,left_index=True,right_index=True)
dataset = pd.merge(dataset,scores,left_index=True,right_index=True)

In [None]:
# Sort by score once and reuse the ranking for all three slices, instead of
# re-sorting the whole dataset for each of them.
ranked = dataset.sort_values('scores').reset_index(drop=True)
bottom_10 = ranked.iloc[0:10]   # ten lowest scores
top_10 = ranked.iloc[-10:]      # ten highest scores
# Rows 4000-4009 of the ranking; empty if the dataset has 4000 rows or fewer.
middle = ranked.iloc[4000:4010]

In [None]:
# Draw 30 random rows for manual inspection (no random_state is given, so the rows differ between runs).
sample = dataset.sample(30)

In [None]:
# Display the random sample with a 300px width set on the 'sentences_en' column.
sample.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
# Display the highest-scoring rows with a 300px width set on the 'sentences_en' column.
top_10.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
# Display the lowest-scoring rows with a 300px width set on the 'sentences_en' column.
bottom_10.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
# Display the mid-ranking rows with a 300px width set on the 'sentences_en' column.
middle.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

# Data Processing pipeline

- index of sentences
- full english sentence (without preprocessing)
- full german sentence  (without preprocessing)
- english sentence (no stop words, no punctuation)
- german sentence (no stop words, no punctuation)
- score
- verbs in english (separated by a space and lemmatized)
- verbs in german (separated by a space and lemmatized)
- adjectives in english (separated by a space and lemmatized)
- adjectives in german (separated by a space and lemmatized)
- common nouns in english (separated by a space and lemmatized)
- common nouns in german (separated by a space and lemmatized)
- Names of persons
- Entities or organizations


In [None]:
import spacy
# Load the spaCy pipelines used below: en_core_web_md for English text and
# de_core_news_md for German text.
nlp_en = spacy.load("en_core_web_md")
nlp_ge = spacy.load("de_core_news_md")

In [None]:
def spacy_analysis(sentence,nlp):
    """Print one line of attributes for every token of ``sentence``.

    Args:
        sentence: Text handed to ``nlp``.
        nlp: Callable returning an iterable of tokens exposing ``text``,
            ``lemma_``, ``pos_``, ``tag_``, ``dep_``, ``shape_``,
            ``is_alpha``, ``is_stop`` and ``vector``.

    Returns:
        None. The attributes are printed space-separated, one token per
        line, ending with the shape of the token vector.
    """
    for tok in nlp(sentence):
        attributes = (
            tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
            tok.shape_, tok.is_alpha, tok.is_stop, tok.vector.shape,
        )
        print(*attributes)

In [None]:
def extract_pos(pos,sentence,nlp):
    """Collect the surface forms of all tokens carrying a given part of speech.

    Args:
        pos: Value compared against each token's ``pos_`` attribute.
        sentence: Text handed to ``nlp``.
        nlp: Callable returning an iterable of tokens with ``text`` and ``pos_``.

    Returns:
        str: The matching token texts joined by single spaces, in sentence
        order, with surrounding whitespace removed ("" when nothing matches).
    """
    matching = [tok.text for tok in nlp(sentence) if tok.pos_ == pos]
    return " ".join(matching).strip()

### Match words

In [None]:
import io
import numpy as np

def load_vec(emb_path, nmax=50000):
    """Load word vectors from a text file with one ``word v1 v2 ...`` entry per line.

    The first line of the file is skipped (presumably a header line —
    confirm against the embedding files used).

    Args:
        emb_path: Path of the embedding text file (read as UTF-8, undecodable
            bytes are ignored).
        nmax: Reading stops once this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)``: the vectors stacked
        row-wise into one array, and the row-index <-> word lookups.

    Raises:
        AssertionError: If the same word appears twice in the file.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # skip the first line
        for raw in handle:
            # Split off the word only; the remainder is the vector as text.
            token, values = raw.rstrip().split(' ', 1)
            parsed = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(parsed)
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    embeddings = np.vstack(rows)
    id2word = {index: token for token, index in word2id.items()}
    return embeddings, id2word, word2id

# NOTE(review): absolute paths into one user's home directory — they have to
# be edited before this cell can run on any other machine.
src_path = '/Users/marcdelaferriere/Documents/Imperial/NLP/MUSE/data/vectors/wiki.multi.en.vec'
tgt_path = '/Users/marcdelaferriere/Documents/Imperial/NLP/MUSE/data/vectors/wiki.multi.de.vec'
nmax = 300000  # maximum number of word embeddings to load

# Load up to nmax vectors per file together with their word <-> row-index lookups.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [None]:
# Rebind `dataset` to a previously pickled object.
# NOTE(review): the cells below read columns such as `english_verbs` and
# `german_verbs` that no visible cell creates, so they presumably come from
# this pickle — confirm.
# Caution: unpickling can execute arbitrary code; only load pickles from a trusted source.
dataset = pd.read_pickle("../data/dataset_v1.pickle")

In [None]:
def lemmatizer(sentence, nlp):
    """Replace every token of ``sentence`` by its lemma.

    Args:
        sentence: Text handed to ``nlp``.
        nlp: Callable returning an iterable of tokens with a ``lemma_`` attribute.

    Returns:
        str: The lemmas joined by single spaces, with surrounding
        whitespace removed.
    """
    lemmas = [token.lemma_ for token in nlp(sentence)]
    return " ".join(lemmas).strip()

In [None]:
def find_word_index(sentence,word2id):
    """Encode the words of ``sentence`` as vocabulary indices in one summary string.

    Args:
        sentence: Whitespace-separated words.
        word2id: Mapping from word to index.

    Returns:
        str: Space-separated fields: one per word (its index, or ``None``
        when the word is not in ``word2id``), then ``not_trslted:`` followed
        by every missing word each terminated by a comma, then
        ``not_found=<number of missing words>``.
    """
    fields = []
    missing = []
    for token in sentence.split():
        if token in word2id:
            fields.append(str(word2id[token]))
        else:
            fields.append("None")
            missing.append(token)

    # Every missing word keeps its trailing comma, including the last one.
    fields.append("not_trslted:" + "".join(token + "," for token in missing))
    fields.append("not_found={}".format(len(missing)))
    return " ".join(fields).strip()

In [None]:
# Lemmatise the verb strings with the spaCy pipeline of their language and lower-case the result.
dataset["lemmatized_ge_verbs"] = dataset.german_verbs.apply(lambda x:lemmatizer(x,nlp_ge).lower())
dataset["lemmatized_en_verbs"] = dataset.english_verbs.apply(lambda x:lemmatizer(x,nlp_en).lower())

In [None]:
# Encode the lemmatised verbs with find_word_index, using the vocabulary of
# the source (English) and target (German) embedding files respectively.
dataset["idx_verbs_english"] = dataset.lemmatized_en_verbs.apply(lambda x:find_word_index(x,src_word2id))
dataset["idx_verbs_german"] = dataset.lemmatized_ge_verbs.apply(lambda x:find_word_index(x,tgt_word2id))

In [None]:
# Same encoding on the lower-cased but non-lemmatised verb strings, for comparison.
dataset["idx_verbs_english_no_lemma"] = dataset.english_verbs.apply(lambda x:find_word_index(x.lower(),src_word2id))
dataset["idx_verbs_german_no_lemma"] = dataset.german_verbs.apply(lambda x:find_word_index(x.lower(),tgt_word2id))

In [None]:
# The encoded string ends with "not_found=<count>". Parse everything after the
# last "=" so multi-digit counts are read in full; taking only the final
# character would turn a count of 12 into 2.
dataset["verbs_not_found_en_count"] = dataset.idx_verbs_english.apply(lambda x: int(x.rsplit("=", 1)[-1]))
dataset["verbs_not_found_ge_count"] = dataset.idx_verbs_german.apply(lambda x: int(x.rsplit("=", 1)[-1]))

In [None]:
# The second-to-last whitespace-separated field is "not_trslted:<w1>,<w2>,";
# dropping its 12-character "not_trslted:" prefix leaves the comma-terminated
# list of words that were not found.
dataset["verbs_not_found_en_words"] = dataset.idx_verbs_english.apply(lambda x: x.split()[-2][12:])
dataset["verbs_not_found_ge_words"] = dataset.idx_verbs_german.apply(lambda x: x.split()[-2][12:])

In [None]:
# The encoded string ends with "not_found=<count>". Parse everything after the
# last "=" so multi-digit counts are read in full; taking only the final
# character would turn a count of 12 into 2.
dataset["verbs_not_found_en_count_no_lemma"] = dataset.idx_verbs_english_no_lemma.apply(lambda x: int(x.rsplit("=", 1)[-1]))
dataset["verbs_not_found_ge_count_no_lemma"] = dataset.idx_verbs_german_no_lemma.apply(lambda x: int(x.rsplit("=", 1)[-1]))
# The second-to-last field is "not_trslted:<w1>,<w2>,"; strip its 12-character prefix.
dataset["verbs_not_found_en_words_no_lemma"] = dataset.idx_verbs_english_no_lemma.apply(lambda x: x.split()[-2][12:])
dataset["verbs_not_found_ge_words_no_lemma"] = dataset.idx_verbs_german_no_lemma.apply(lambda x: x.split()[-2][12:])

In [None]:
def get_correlation(src_emb,tgt_emb):
    """Return the dot product of the two inputs after scaling each to unit norm.

    For 1-D vectors this is their cosine similarity. No guard is applied
    for a zero norm.

    Args:
        src_emb: First embedding (numpy array).
        tgt_emb: Second embedding (numpy array).
    """
    unit_src = src_emb / np.linalg.norm(src_emb)
    unit_tgt = tgt_emb / np.linalg.norm(tgt_emb)
    return unit_src.dot(unit_tgt)

In [None]:
# Lower-case the English verb strings in place, then count the
# whitespace-separated verbs on each side.
# NOTE(review): german_verbs is not lower-cased here — confirm that is intended.
dataset["english_verbs"] = dataset["english_verbs"].apply(lambda x:x.lower())
dataset["count_english_verbs"] = dataset["english_verbs"].apply(lambda x:len(x.split()))
dataset["count_german_verbs"] = dataset["german_verbs"].apply(lambda x:len(x.split()))

In [None]:
# Keep only the rows where both sides have the same number of verbs, the same
# number of out-of-vocabulary verbs, and that number is zero on the English side.
same_verb_count = dataset["count_english_verbs"] == dataset["count_german_verbs"]
same_missing_count = dataset["verbs_not_found_en_count"] == dataset["verbs_not_found_ge_count"]
no_missing_english = dataset["verbs_not_found_en_count"] == 0
sub_dataset = dataset[same_verb_count & same_missing_count & no_missing_english]

In [None]:
## irrelevant
def get_dict_corr(verbs_en,verbs_ge):
    """Find, for every English verb, the German verb with the highest similarity.

    Similarities come from ``get_correlation`` applied to the source and
    target embeddings of the two words; every verb must therefore be
    present in ``src_word2id`` / ``tgt_word2id``.

    Args:
        verbs_en: Whitespace-separated English verbs.
        verbs_ge: Whitespace-separated German verbs.

    Returns:
        str: ``str(matches) + str(correlations)``. ``matches`` maps each
        English verb to its best German verb (``None`` when no German verb
        has a similarity above 0); ``correlations`` maps each matched German
        verb to the similarity of that match.
    """
    list_verbs_en = verbs_en.split()
    list_verbs_ge = verbs_ge.split()
    matches_en_ge = {}
    correlations = {}
    for verb_en in list_verbs_en:
        emb_en = src_embeddings[src_word2id[verb_en]]
        best_match = 0
        verb_ge_match = None
        for verb_ge in list_verbs_ge:
            corr = get_correlation(emb_en,tgt_embeddings[tgt_word2id[verb_ge]])
            if corr>best_match:
                best_match = corr
                verb_ge_match = verb_ge

        # Record the best candidate found above, not the loop variable
        # `verb_ge`, which is simply the last German verb visited (and is
        # undefined when there are no German verbs at all).
        matches_en_ge[verb_en] = verb_ge_match
        if verb_ge_match is not None:
            correlations[verb_ge_match] = best_match

    return str(matches_en_ge)+str(correlations)
            
                

In [None]:
def get_mean_corr(verbs_en,verbs_ge):
    """Build the pairwise similarity matrix between English and German verbs.

    NOTE(review): despite its name this returns the whole matrix, not a
    mean, and a later definition of ``get_mean_corr`` in this file rebinds
    the name — this version looks superseded.

    Args:
        verbs_en: Whitespace-separated English verbs (rows).
        verbs_ge: Whitespace-separated German verbs (columns).

    Returns:
        numpy.ndarray: ``n x n`` matrix, ``n`` being the number of English
        verbs, whose entry ``[i, j]`` is the similarity between the i-th
        English and the j-th German verb.

    Raises:
        IndexError: If there are more German than English verbs.
        KeyError: If a verb is missing from the embedding vocabulary.
    """
    list_verbs_en = verbs_en.split()
    list_verbs_ge = verbs_ge.split()
    n = len(list_verbs_en)
    corr_matrix = np.zeros((n,n))

    # Index by position rather than through a word -> index dict: with a dict
    # a repeated verb maps to a single index, leaving the rows/columns of its
    # other occurrences at zero.
    for i,verb_en in enumerate(list_verbs_en):
        for j,verb_ge in enumerate(list_verbs_ge):
            corr = get_correlation(src_embeddings[src_word2id[verb_en]],tgt_embeddings[tgt_word2id[verb_ge]])
            corr_matrix[i,j] = corr

    return corr_matrix

In [None]:
def get_corr_matrix(words_en_list,words_ge_list):
    """Fill a square matrix with the similarity of every English/German word pair.

    Args:
        words_en_list: English words (one row each).
        words_ge_list: German words (one column each).

    Returns:
        numpy.ndarray: Matrix of side ``len(words_en_list)``; entry
        ``[row, col]`` is ``get_correlation`` of the two word embeddings.
        Columns without a German word stay at 0; more German than English
        words raises ``IndexError``.
    """
    size = len(words_en_list)
    matrix = np.zeros((size, size))
    for row, english in enumerate(words_en_list):
        for col, german in enumerate(words_ge_list):
            similarity = get_correlation(get_emb(english, "en"), get_emb(german, "ge"))
            matrix[row, col] = similarity

    return matrix

In [None]:
def get_word_matches(corr_matrix):
    """Greedily pair rows with columns of a similarity matrix.

    A row ``i`` and a column ``j`` are paired when each is the other's best
    remaining match (``j`` is the arg-max of row ``i`` and ``i`` the arg-max
    of column ``j``). Paired rows and columns are then excluded and the
    search repeats until every row is paired.

    Args:
        corr_matrix: 2-D array of similarities. The callers in this file
            pass a square matrix.

    Returns:
        dict: Row index -> matched column index. Empty for an empty matrix.
        May hold fewer entries than rows if no further mutual best match
        exists (e.g. more rows than columns, or NaN entries).
    """
    if len(corr_matrix)==0:
        return {}
    n = corr_matrix.shape[0]
    # Work on a float copy: the caller's matrix stays untouched and masking
    # with -inf also works for integer input.
    tmp_corr_matrix = np.array(corr_matrix, dtype=float)
    best_match_row = np.argmax(tmp_corr_matrix,axis=0)
    best_match_col = np.argmax(tmp_corr_matrix,axis=1)
    couples = {}
    while len(couples)<n:
        progressed = False
        for i in range(n):
            if i in couples:
                continue
            j = best_match_col[i]
            if best_match_row[j] != i:
                continue
            couples[i] = j
            progressed = True
            # Exclude the paired row and column with -inf rather than 0:
            # similarities can be zero or negative, in which case a 0 mask
            # would win the arg-max and either re-use a column or stall
            # the loop forever.
            tmp_corr_matrix[i,:] = -np.inf
            tmp_corr_matrix[:,j] = -np.inf
            best_match_row = np.argmax(tmp_corr_matrix,axis=0)
            best_match_col = np.argmax(tmp_corr_matrix,axis=1)
        if not progressed:
            # No mutual best match left: stop instead of spinning.
            break
    return couples

In [None]:
def get_emb(word,language):
    """Look up the embedding vector of ``word``.

    Args:
        word: Word to look up; must be present in the chosen vocabulary.
        language: ``"en"`` selects the source embeddings; any other value
            selects the target embeddings.

    Returns:
        The row of the selected embedding matrix for ``word``.
    """
    if language != "en":
        return tgt_embeddings[tgt_word2id[word]]
    return src_embeddings[src_word2id[word]]

In [None]:
def get_mean_corr(words_en,words_ge):
    """Average similarity over the matched English/German word pairs.

    Args:
        words_en: Whitespace-separated English words.
        words_ge: Whitespace-separated German words.

    Returns:
        The mean of the similarity-matrix entries selected by
        ``get_word_matches``, or 0 when no pair was matched.
    """
    corr_mat = get_corr_matrix(words_en.split(), words_ge.split())
    word_couples = get_word_matches(corr_mat)
    if not word_couples:
        return 0
    total = sum(corr_mat[row, col] for row, col in word_couples.items())
    return total / len(word_couples)

In [None]:
def get_word_couples(words_en,words_ge):
    """Return the matched word pairs as an English -> German dictionary.

    Args:
        words_en: Whitespace-separated English words.
        words_ge: Whitespace-separated German words.

    Returns:
        dict: Each matched English word mapped to its German counterpart,
        as selected by ``get_word_matches`` on the similarity matrix.
    """
    english = words_en.split()
    german = words_ge.split()
    matched_indices = get_word_matches(get_corr_matrix(english, german))
    return {english[row]: german[col] for row, col in matched_indices.items()}

In [None]:
# Pick one random row of the filtered dataset and show its lemmatised verbs.
sample = sub_dataset.sample()
en = sample.iloc[0]["lemmatized_en_verbs"]
ge = sample.iloc[0]["lemmatized_ge_verbs"]
print(en)
print(ge)

In [None]:
# Mean similarity of the matched verb pairs for the sampled row.
get_mean_corr(en,ge)

### Proper nouns

In [None]:
def entity_analysis(sentence,nlp):
    """Print ``sentence`` followed by the entities found in it.

    Args:
        sentence: Text to print and hand to ``nlp``.
        nlp: Callable returning an object whose ``ents`` are iterable and
            expose ``text``, ``start_char``, ``end_char`` and ``label_``.

    Returns:
        None. After an "Analysis" header, one line is printed per entity:
        text, start offset, end offset and label.
    """
    header = sentence + "\n\nAnalysis\n--------"
    print(header)
    for entity in nlp(sentence).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [None]:
# Pick one random row of the full dataset and keep its English sentence.
sample =dataset.sample()
sentence = sample.iloc[0]["sentences_en"]

In [None]:
# 2269 — NOTE(review): presumably the index of a row noted during exploration; confirm.
# Display the sampled row.
sample

In [None]:
# Print the entities the English pipeline finds in the sampled sentence.
entity_analysis(sentence,nlp_en)

In [None]:
# NOTE: get_entities is defined in a later cell of this file; that cell has to be run first.
get_entities(sentence,nlp_en)

In [None]:
# Ask spaCy for its description of the "ORG" label.
spacy.explain("ORG")

In [None]:
def return_list(x):
    """Return a fixed two-element placeholder list; the argument ``x`` is ignored."""
    placeholder = "fdf"
    return [placeholder, placeholder]

In [None]:
# Ask spaCy for its description of the "GPE" label.
spacy.explain("GPE")

In [None]:
def get_entities(sentence,nlp):
    """List the texts of all entities labelled ``PERSON`` in ``sentence``.

    Args:
        sentence: Text handed to ``nlp``.
        nlp: Callable returning an object whose ``ents`` are iterable and
            expose ``text`` and ``label_``.

    Returns:
        list[str]: Entity texts in order of appearance (empty if none).
    """
    return [entity.text for entity in nlp(sentence).ents if entity.label_ == "PERSON"]

In [None]:
# Display the current value of `ge` (assigned above from the lemmatized_ge_verbs column).
ge

In [None]:
# Show every dataset row whose English sentence equals the sampled sentence.
dataset[dataset["sentences_en"]==sentence]

In [None]:
# Ask spaCy for its description of the "FAC" label.
spacy.explain("FAC")

In [None]:
# Display the sampled English sentence.
sentence

In [None]:
# Preview the first rows of the dataset with all the columns added so far.
dataset.head()