# Ideas

- Generate our own translation and compare it to the provided translation to produce a score
- Number of words
- Distance between the sentence vectors in embedding space
- Use punctuation to delimit sub-segments of the phrase and evaluate the proximity between them
- Fraction of simple words
- Évaluer la complexité syntaxique de la phrase en anglais -> si la phrase est simple, la traduction devrait être de bonne qualité
- Mots rares -> chercher si le mot a été traduit ou non

# Code

In [None]:
from collections import Counter, defaultdict
import math
import copy
import random
import operator
import pandas as pd

def flatten(l):  # noqa: E741 - parameter name kept from the original lambda
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# some helper functions
def prepare_data(filename):
    """Tokenise a text file on whitespace, one sentence per non-blank line.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``(vocab, data)`` tuple where ``data`` is a list of token lists,
        each terminated by the end-of-sentence marker ``'</s>'``, and
        ``vocab`` is the set of all distinct tokens (marker included).
    """
    # The context manager closes the handle; the original left it open.
    with open(filename, encoding="utf-8") as handle:
        # str.split() with no argument already ignores surrounding whitespace.
        data = [line.split() + ['</s>'] for line in handle if line.strip()]
    vocab = set(flatten(data))
    return vocab, data

In [None]:
def extract_sentences(filename,lower=False):
    """Read the non-blank lines of a text file as stripped sentences.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).
        lower: When true, every sentence is lower-cased.

    Returns:
        A list of strings, one per non-blank line, without surrounding
        whitespace. Blank lines are dropped, so positions in the result do
        not necessarily match line numbers in the file.
    """
    # The context manager closes the handle; the original left it open.
    with open(filename, encoding="utf-8") as handle:
        return [
            (line.lower() if lower else line).strip()
            for line in handle
            if line.strip()
        ]

In [None]:
# Show the lemma of the first (only) token of a single English word.
# NOTE: nlp_en is loaded in a cell further down, which must be run first.
nlp_en('oppressively')[0].lemma_

In [None]:
# Load the English sources, the German machine translations and the scores
# into three one-column frames.
# NOTE(review): the frames are later joined on row position, which assumes the
# three files are row-aligned; extract_sentences drops blank lines, so a blank
# line in either text file would shift the rows - confirm.
sentences_en = pd.DataFrame(
    {'sentences_en': extract_sentences('../data/en_de/train.ende.src')}
)
sentences_ge = pd.DataFrame(
    {'sentences_ge': extract_sentences('../data/en_de/train.ende.mt')}
)
scores = pd.read_csv(
    '../data/en_de/train.ende.scores', header=None
).rename(columns={0: "scores"})

In [None]:
# Join the three frames on their row index (inner join) into one table with
# the columns sentences_en, sentences_ge and scores.
dataset = sentences_en.merge(
    sentences_ge, left_index=True, right_index=True
).merge(
    scores, left_index=True, right_index=True
)

In [None]:
# Rank the rows by ascending score once, then slice the extremes.
ranked = dataset.sort_values('scores').reset_index(drop=True)
bottom_10 = ranked.iloc[:10]
top_10 = ranked.iloc[-10:]
# NOTE(review): rows 4000-4009 are the middle only if the dataset has about
# 8000 rows - confirm against the actual size.
middle = ranked.iloc[4000:4010]

In [None]:
# Draw 30 random rows for manual inspection; no random_state is passed, so the
# selection differs from run to run.
sample = dataset.sample(30)

In [None]:
sample.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
top_10.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
bottom_10.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

In [None]:
middle.style.set_properties(subset=['sentences_en'], **{'width': '300px'})

### Exploration 

## Test spacy

In [None]:
dataset.iloc[20]["sentences_ge"]

In [None]:
# Print spaCy's per-token annotations (lemma, POS, tag, dependency, shape,
# flags, vector shape) for one German sentence.
# The German pipeline is bound to `nlp_ge`; `nlp_german` is never defined in
# this notebook and raised a NameError.
doc_ge = nlp_ge("Er regierte unterdrückerisch und fast bankrott Mali mit seinen verschwenderischen Ausgaben.")
for token in doc_ge:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop,token.vector.shape)

In [None]:
# Proper nouns detected in the English sentence are reliable -> it does not matter if they are left untranslated

In [None]:
import spacy
import numpy as np
import re

In [None]:
# Load the spaCy pipelines used throughout the notebook: en_core_web_md for
# the English side and de_core_news_md for the German side. Both packages must
# be installed beforehand.
nlp_en = spacy.load("en_core_web_md")
nlp_ge = spacy.load("de_core_news_md")

In [None]:
nlp_en("obama").vector

In [None]:
phrase_ge = "Danach unterwarf er Tughlaqpur Fort und die Stadt Salwan, bevor Loni Fort belagern und schließlich marschieren auf Delhi."
phrase_en = "Afterward, he subdued Tughlaqpur's fort and the town of Salwan before besieging Loni's fort and ultimately marching on Delhi."

In [None]:
def count_non_translated_words(en_phrase,ge_phrase,nlp_en=nlp_en,nlp_ge=nlp_ge,verbose=True):
    """Count German tokens that have an all-zero word vector.

    Tokens tagged ``PROPN`` or ``NUM`` in the English sentence are removed
    from the German sentence first (case-insensitive text match), since they
    may legitimately appear unchanged in the translation. Punctuation is then
    stripped and the remaining text is re-parsed; every token whose vector is
    entirely zero is counted.

    Args:
        en_phrase: English source sentence.
        ge_phrase: German translation to inspect.
        nlp_en: spaCy pipeline used to tag the English sentence. The default
            is bound when the function is defined.
        nlp_ge: spaCy pipeline used to tokenise and embed the German sentence.
            The default is bound when the function is defined.
        verbose: When true (default, the original behaviour), print the
            ignored English tokens and every counted German token.

    Returns:
        The number of remaining German tokens with an all-zero vector.
    """
    # Local import: the notebook never imports ``string`` at top level.
    import string

    prop_nouns = [
        token.text.lower()
        for token in nlp_en(en_phrase)
        if token.pos_ in ("PROPN", "NUM")
    ]
    if verbose:
        print(prop_nouns)
    ignored = set(prop_nouns)

    # Use the pipeline passed as ``nlp_ge``; the original called an undefined
    # ``nlp_german`` here.
    phrase_ge_without_np = " ".join(
        token.text
        for token in nlp_ge(ge_phrase)
        if token.text.lower() not in ignored
    )
    phrase_ge_without_np = phrase_ge_without_np.translate(
        str.maketrans('', '', string.punctuation)
    ).strip()
    # Removing tokens and punctuation can leave runs of spaces behind.
    phrase_ge_without_np = re.sub(' +', ' ', phrase_ge_without_np)

    count = 0
    for token in nlp_ge(phrase_ge_without_np):
        # ``any()`` tests for an all-zero vector directly; a plain sum could
        # also be zero when non-zero components cancel out.
        if not token.vector.any():
            if verbose:
                print(token.text)
            count += 1

    return count
    

In [None]:
def f(x):
    """Apply ``count_non_translated_words`` to one row-like object.

    ``x`` must support lookup by the keys ``"sentences_en"`` and
    ``"sentences_ge"``; the count for that sentence pair is returned.
    """
    source, translation = x["sentences_en"], x["sentences_ge"]
    return count_non_translated_words(source, translation)

In [None]:
dataset.head()

In [None]:
# Lemma of a single German word. The lemma is a token-level attribute, so
# index the first token of the parsed Doc, as the English probe earlier in
# the notebook does.
nlp_ge("Kreuzfire")[0].lemma_

In [None]:
# Print the fine-grained tag of every token of the German example.
# NOTE: ex_ge is assigned in a cell further down, which must be run first.
for token in nlp_ge(ex_ge):
    print(token.text,token.tag_)

In [None]:
# Print the fine-grained tag of every token of the English example.
# NOTE: ex_en is assigned in a cell further down, which must be run first.
for token in nlp_en(ex_en):
    print(token.text,token.tag_)

In [None]:
# Same English tag dump as the previous cell (duplicate cell).
# NOTE: ex_en is assigned in a cell further down, which must be run first.
for token in nlp_en(ex_en):
    print(token.text,token.tag_)

In [None]:
# Same German tag dump as the earlier cell (duplicate cell).
# NOTE: ex_ge is assigned in the next cell, which must be run first.
for token in nlp_ge(ex_ge):
    print(token.text,token.tag_)

In [None]:
# Pick one row by position and keep its source sentence, translation and
# score for manual inspection in the following cells.
val = 20
ex_en, ex_ge, rating = (
    dataset[column].iat[val]
    for column in ("sentences_en", "sentences_ge", "scores")
)

In [None]:
print(ex_en+"\n"+ex_ge+"\n"+str(rating))

In [None]:
count_non_translated_words(ex_en,ex_ge)

In [None]:
nlp_ge("Hauptantagonisten").vector

In [None]:
nlp_en("crossfire").vector

In [None]:
nlp_ge("bestätigung").vector

In [None]:
# Print the coarse POS and fine-grained tag of every token of the English
# phrase. The English pipeline is bound to `nlp_en`; `nlp` is never defined in
# this notebook and raised a NameError.
for token in nlp_en(phrase_en):
    print(token.text,token.pos_,token.tag_)

In [None]:
phrase_en = "confirmation of president obama's first nominee, andrew traver, stalled in 2011 after the nra expressed strong opposition."
phrase_ge = "die bestätigung des ersten kandidaten von präsident obama, andrew traver, kam 2011 ins stocken, nachdem die nra starke opposition zum ausdruck gebracht hatte."

In [None]:
# Second hand-picked example pair.
# NOTE(review): phrase_ge_2 is character-for-character the same string as
# phrase_ge above, so it is not a translation of phrase_en_2 - presumably a
# copy-paste placeholder; replace with the real translation.
phrase_en_2 = "Ben Schwartz talks about Season 3 of House of Lies with Red Crab, Haardvark, and Paul rating Ben's impersonations."
phrase_ge_2 = "die bestätigung des ersten kandidaten von präsident obama, andrew traver, kam 2011 ins stocken, nachdem die nra starke opposition zum ausdruck gebracht hatte."

In [None]:
# Inspect the vector of a rare English word. The English pipeline is bound to
# `nlp_en`; `nlp` is never defined in this notebook.
nlp_en("doldrums").vector

In [None]:
# Inspect the vector of a nonsense string. The German pipeline is bound to
# `nlp_ge`; `nlp_german` is never defined in this notebook.
nlp_ge("fghjk").vector

In [None]:
# Parse the second English example, print the per-token annotations and keep
# a (text, POS) pair for every token as a 2-column string array.
# The English pipeline is bound to `nlp_en`; `nlp` is never defined in this
# notebook and raised a NameError.
doc = nlp_en(phrase_en_2)
decomposition = []
for token in doc:
    decomposition.append([token.text, token.pos_])
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
decomposition = np.array(decomposition)

In [None]:
"die bestätigung des ersten kandidaten von präsident obama, andrew traver, kam 2011 ins stocken, nachdem die nra starke opposition zum ausdruck gebracht hatte.".split()

In [None]:
doc_ge.vector.shape

In [None]:
# Parse the German example, print the per-token annotations and keep a
# (text, POS) pair for every token as a 2-column string array.
# The German pipeline is bound to `nlp_ge`; `nlp_german` is never defined in
# this notebook and raised a NameError.
doc_ge = nlp_ge("die bestätigung des ersten kandidaten von präsident obama, andrew traver, kam 2011 ins stocken, nachdem die nra starke opposition zum ausdruck gebracht hatte.")
decomposition_ge = []
for token in doc_ge:
    decomposition_ge.append([token.text, token.pos_])
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop,token.vector.shape)
decomposition_ge = np.array(decomposition_ge)

In [None]:
# Build the English and German stopword sets.
# NOTE(review): `stopwords` is not imported anywhere in the visible notebook -
# presumably `from nltk.corpus import stopwords` (with the NLTK stopwords
# corpus downloaded) is expected; confirm and add the import.
stopwords_en = set(stopwords.words('english'))
stopwords_ge = set(stopwords.words('german'))

In [None]:
# Drop the (text, POS) rows whose token text is a stopword. The result is a
# plain list of rows, no longer a NumPy array.
# NOTE(review): the comparison uses the exact token text, so capitalised
# tokens only match if the stopword lists contain that exact casing - confirm
# whether the text should be lower-cased first.
decomposition = [elt for elt in decomposition if elt[0] not in stopwords_en]
decomposition_ge = [elt for elt in decomposition_ge if elt[0] not in stopwords_ge]

In [None]:
decomposition_ge

In [None]:
decomposition

### English-German dictionary

In [None]:
import pandas as pd

In [None]:
# Read the space-separated bilingual word list (no header row) and name its
# first column 'ge' and its second column 'en'.
dic = pd.read_csv(
    '../data/de-en.txt', sep=' ', header=None
).rename(columns={0: 'ge', 1: 'en'})

In [None]:
def remove_first_elt(array):
    """Strip one leading ``#`` from every string in ``array``.

    Args:
        array: Iterable of arbitrary values.

    Returns:
        A new list in which each string starting with ``#`` has lost that
        first character; every other value (non-strings, empty strings,
        strings without the prefix) is passed through unchanged.
    """
    # startswith() is safe on empty strings, unlike indexing x[0].
    return [
        x[1:] if isinstance(x, str) and x.startswith("#") else x
        for x in array
    ]

In [None]:
# Remove a leading '#' from the string entries of both dictionary columns;
# non-string cells are left as they are.
for column in ('ge', 'en'):
    dic[column] = dic[column].apply(
        lambda cell: cell[1:] if (isinstance(cell, str) and cell[0] == "#") else cell
    )

In [None]:
def add(x):
    """Collect the items of the iterable ``x`` into a new list."""
    return [*x]

In [None]:
# For every English word, gather all of its German counterparts into a list;
# the English word ends up as a regular 'en' column next to 'ge'.
by_english = dic.groupby('en')
res = by_english.agg({'ge': add}).reset_index()

In [None]:
# Lookup table: English word -> list of German translations.
correspondance_dict = res.set_index("en")["ge"].to_dict()

In [None]:
def count_translated_words(phrase_en,phrase_ge,dictionary=None):
    """Count English words whose dictionary translation occurs in the German phrase.

    Both phrases are split on single spaces. An English word counts once if
    at least one of its translations listed in the dictionary is one of the
    German words (exact, case-sensitive match).

    Args:
        phrase_en: English sentence.
        phrase_ge: German sentence.
        dictionary: Mapping from an English word to a collection of German
            translations. Defaults to the module-level
            ``correspondance_dict``.

    Returns:
        A ``(count, n_english_words)`` tuple: the number of matched English
        words and the total number of English words. The second value has
        always been the English word count, although the original code
        stored it in a variable called ``n_german_words``.
    """
    if dictionary is None:
        dictionary = correspondance_dict

    english_words = phrase_en.split(" ")
    # A set makes each membership test O(1) instead of rescanning the phrase.
    german_words = set(phrase_ge.split(" "))

    count = 0
    for word in english_words:
        translations = dictionary.get(word)
        if translations is not None and not german_words.isdisjoint(translations):
            count += 1

    return (count, len(english_words))

In [None]:
count_translated_words(phrase_en,phrase_ge)

In [None]:
def tr(x):
    """Apply ``count_translated_words`` to one row-like object.

    ``x`` must support lookup by the keys ``"sentences_en"`` and
    ``"sentences_ge"``; the resulting ``(count, total)`` tuple is returned.
    """
    english, german = x["sentences_en"], x["sentences_ge"]
    return count_translated_words(english, german)

In [None]:
# Run tr on every row; each cell of the new 'corresp' column holds the
# (count, total) tuple returned by count_translated_words.
dataset["corresp"] = dataset.apply(tr,axis=1)

In [None]:
dataset

In [None]:
# 'corresp' holds (count, total) tuples, which Series.corr cannot correlate
# with a numeric column. Correlate the scores with the fraction of matched
# words instead. The total is never zero because str.split(" ") always
# returns at least one element.
matched_ratio = dataset["corresp"].apply(lambda pair: pair[0] / pair[1])
dataset["scores"].corr(matched_ratio)