In [1]:
import pandas as pd
import spacy
import re

import en_core_web_md
import de_core_news_md
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence,nlp):
    doc = nlp(sentence)
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop,token.vector.shape)

In [2]:
import os

In [3]:
def extract_sentences(filename,lower=False):
    if lower:
        data = [l.lower().strip() for l in open(filename) if l.strip()]
    else:
        data = [l.strip() for l in open(filename) if l.strip()]
    return data

### Training Data

In [None]:
path_train_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.src')
path_train_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.mt')
path_train_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.scores')

In [None]:
train_sentences_en = pd.DataFrame(extract_sentences(path_train_en),columns = ['sentences_en'])
train_sentences_ge = pd.DataFrame(extract_sentences(path_train_ge),columns = ['sentences_ge'])
train_scores = pd.read_csv(path_train_scores,header=None)
train_scores = train_scores.rename(columns={0:"scores"})

In [None]:
train_dataset = pd.merge(train_sentences_en,train_sentences_ge,left_index=True,right_index=True)
train_dataset = pd.merge(train_dataset,train_scores,left_index=True,right_index=True)

### Validation Data

In [None]:
path_dev_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.src')
path_dev_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.mt')
path_dev_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.scores')

In [None]:
dev_sentences_en = pd.DataFrame(extract_sentences(path_dev_en),columns = ['sentences_en'])
dev_sentences_ge = pd.DataFrame(extract_sentences(path_dev_ge),columns = ['sentences_ge'])
dev_scores = pd.read_csv(path_dev_scores,header=None)
dev_scores = dev_scores.rename(columns={0:"scores"})

In [None]:
dev_dataset = pd.merge(dev_sentences_en,dev_sentences_ge,left_index=True,right_index=True)
dev_dataset = pd.merge(dev_dataset,dev_scores,left_index=True,right_index=True)

### Merging the two 

In [None]:
dataset = pd.concat([train_dataset, dev_dataset])
dataset = dataset.reset_index(drop=True)

### Constructing the full vocabluary

In [4]:
def entity_analysis(sentence,nlp):
    print(sentence+"\n"+"\n"+"Analysis"+"\n"+"--------")
    doc = nlp(sentence)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [5]:
def remove_punct(sentence, nlp):
    sentence=nlp(sentence)
    clean_sentence = ""
    for token in sentence:
        if (token.is_alpha):
            clean_sentence+= (token.text.lower() + " ")
    clean_sentence=clean_sentence.strip()
    return clean_sentence

def remove_names(sentence,persons_list):
    sentence_without_persons = sentence
    for person in persons_list:
        sentence_without_persons = sentence_without_persons.replace(person,"")
    return sentence_without_persons

In [6]:
def get_persons(sentence_en,sentence_ge):
    doc = nlp_en(sentence_en)
    persons_list = []
    for ent in doc.ents:
        if (ent.label_=="PERSON" or ent.label_=="ORG" or 
            ent.label_=="LOC" or ent.label_=="GPE" 
            or ent.label == "FAC" or ent.label == "NORP"):
            if ent.text in sentence_ge:
                persons_list += [ent.text]
            else:
                ent_text_clean = ent.text.replace("the","").replace("The","").strip()
                if ent_text_clean in sentence_ge:
                    persons_list += [ent_text_clean]
    return persons_list

In [7]:
#dataset['person']=dataset.apply(lambda row: get_persons(row["sentences_en"],
#                                                         row["sentences_ge"]),axis=1)

In [8]:
#dataset['sentences_en_no_propnouns'] = dataset.apply(lambda row: remove_names(row['sentences_en'], row["person"]), axis=1)
#dataset['sentences_ge_no_propnouns'] = dataset.apply(lambda row: remove_names(row['sentences_ge'], row["person"]), axis=1)

In [9]:
#dataset['sentences_en_clean'] = dataset.apply(lambda row: remove_punct(row['sentences_en_no_propnouns'], nlp_en), axis=1)
#dataset['sentences_ge_clean'] = dataset.apply(lambda row: remove_punct(row['sentences_ge_no_propnouns'], nlp_ge), axis=1)

In [10]:
#dataset.to_pickle('../data/dataset_train_val.pickle')
dataset = pd.read_pickle("../data/dataset_train_val.pickle")

### Vocabulary with embedding

#### Load embeddings

In [11]:
import io
import numpy as np

def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

path = os.path.join(os.path.dirname(os.getcwd()), 'data','muse')

src_path = path+"/wiki.multi.en.vec"
tgt_path = path+"/wiki.multi.de.vec"
nmax = 300000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

#### Load stemmers

In [12]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer #stem for english
from nltk.stem.cistem import Cistem
porter = PorterStemmer()
cistem = Cistem()

## Making vocabularies

In [13]:
def add_words_to_vocab(vocab,sentence,nlp,word_2id,stemmer,language="en"):
    mini_sentence = nlp(sentence)
    out_of_vocab = []
    for token in mini_sentence:
        if token.text not in vocab.keys():
            try:
                vocab[token.text] = word_2id[token.text]
            except:
                try:
                    vocab[token.text] = word_2id[token.lemma_]
                except:
                    try:
                        synonyms = wordnet.synsets(token.text)[0].lemmas()
                        for i in range(10):
                            synonym = synonyms[i].name()
                            try:
                                vocab[token.text] = word_2id[synonym]
                                break
                            except:
                                continue
                    except:
                        try:
                            vocab[token.text] = word_2id[stemmer.stem(token.text)]
                        except:
                            out_of_vocab.append(token.text)
    return vocab,out_of_vocab

#### English

In [14]:
vocab_2_embedding_idx_en = {}
out_of_vocab_en = []

sentences_en = dataset.sentences_en_clean

for i,sentence in enumerate(sentences_en):
    vocab_2_embedding_idx_en,out_of_vocab_current = add_words_to_vocab(vocab_2_embedding_idx_en,sentence,
                                                                       nlp_en,src_word2id,porter)
    
    for out_of_vocab_word in out_of_vocab_current:
        out_of_vocab_en += [(out_of_vocab_word,i)]


#### German

In [15]:
vocab_2_embedding_idx_ge = {}
out_of_vocab_ge = []

sentences_ge = dataset.sentences_ge_clean

for i,sentence in enumerate(sentences_ge):
    vocab_2_embedding_idx_ge,out_of_vocab_current = add_words_to_vocab(vocab_2_embedding_idx_ge,sentence,
                                                                nlp_ge,tgt_word2id,cistem)
    for out_of_vocab_word in out_of_vocab_current:
        out_of_vocab_ge += [(out_of_vocab_word,i)]

### Removing non german words from german vocabulary

#### Load list of true german words

In [21]:
current_german_vocab_corpus = list(vocab_2_embedding_idx_ge.keys())
#large list of german words downloaded here : https://gist.github.com/MarvinJWendt/2f4f4154b8ae218600eb091a5706b5f4#file-wordlist-german-txt
german_words = list(pd.read_csv("../data/wordlist-german.txt",header=None)[0])
german_words = [str(word).lower() for word in german_words]

In [22]:
#### Helpers methods ####

def remove_unnecessary_spaces(sentence):
    return re.sub(' +',' ',sentence.strip())

def remove_word_from_sentence(sentence,word):
    new_sentence = sentence.replace(word,"")
    return remove_unnecessary_spaces(new_sentence)

def remove_words_from_list(list_1,list_2 = german_words,isin=False):
    if isin:
        return [word for word in list_1 if word in list_2]
    else:
        return [word for word in list_1 if word not in list_2]

#### Finding words in original vocabulary which are not german (using multiprocessing)

In [25]:
from multiprocessing import Pool

pool = Pool(4)
n = len(current_german_vocab_corpus)
split = int(n/4)

words_to_be_removed_list = pool.map(remove_words_from_list,[current_german_vocab_corpus[0:split],
                                                 current_german_vocab_corpus[split:split*2],
                                                 current_german_vocab_corpus[split*2:split*3],
                                                 current_german_vocab_corpus[split*3:]])

In [26]:
words_to_be_removed = words_to_be_removed_list[0] + words_to_be_removed_list[1] + \
                      words_to_be_removed_list[2] + words_to_be_removed_list[3]

#### Add these words to the out_of_vocab_list

In [27]:
new_out_of_vocab_ge = []
for word in words_to_be_removed:
    idxs =  list(dataset[dataset.sentences_ge_clean.apply(lambda x: word in x)].index)
    if len(idxs)<=3:
        for idx in idxs:
            new_out_of_vocab_ge += [(word,idx)]

In [28]:
print("Number of old out of vocab words: {}".format(len(out_of_vocab_ge)))
print("Number of new out of vocab words: {}".format(len(new_out_of_vocab_ge)))

Number of old out of vocab words: 4418
Number of new out of vocab words: 2019


In [29]:
out_of_vocab_ge += new_out_of_vocab_ge

#### Removing these words from the vocab dictionnary

In [30]:
print("Number of old out vocab words: {}".format(len(vocab_2_embedding_idx_ge)))

Number of old out vocab words: 18457


In [31]:
for word in [pair[0] for pair in new_out_of_vocab_ge]:
    try:
        vocab_2_embedding_idx_ge.pop(word,None)
    except:
        continue

In [32]:
print("Number of new out vocab words: {}".format(len(vocab_2_embedding_idx_ge)))

Number of new out vocab words: 16847


#### Save vocab dicts

In [33]:
import pickle
#with open("../data/vocab_en.pkl","wb") as f:
#    pickle.dump(vocab_2_embedding_idx_en, f, pickle.HIGHEST_PROTOCOL)
#with open("../data/vocab_ge.pkl","wb") as f:
#    pickle.dump(vocab_2_embedding_idx_ge, f, pickle.HIGHEST_PROTOCOL)

#### Load vocab dicts

In [51]:
with open("../data/vocab_en.pkl", 'rb') as f:
    vocab_2_embedding_idx_en = pickle.load(f)
with open("../data/vocab_ge.pkl", 'rb') as f:
    vocab_2_embedding_idx_ge = pickle.load(f)

### Find non translated english words and split german words which can be

In [52]:
dataset = pd.read_pickle("../data/dataset_train_val.pickle")

In [53]:
dataset["non_translated_words"] = 0
dataset["sentences_en_cleaner"] = dataset["sentences_en_clean"].copy()
dataset["sentences_ge_cleaner"] = dataset["sentences_ge_clean"].copy()

In [54]:
def process_translated_out_of_vocab_words(pair,word2id,dataset,out_of_vocab_english):
    word,idx = pair[0],pair[1]
    en_sentence = dataset.loc[idx]["sentences_en_cleaner"]
    ge_sentence = dataset.loc[idx]["sentences_ge_cleaner"]
    count_non_translated = dataset.loc[idx]["non_translated_words"]
    
    ### If the word is exactly the same as a word in the source sentence
    if word in en_sentence:
        if word not in out_of_vocab_english:
            ### if the word is in the english vocab -> it could have been translated
            ### count it as a non translated word and remove it from both sentences
            dataset.at[idx,"sentences_en_cleaner"] = remove_word_from_sentence(en_sentence,word)
            dataset.at[idx,"sentences_ge_cleaner"] = remove_word_from_sentence(ge_sentence,word)
            dataset.at[idx,"non_translated_words"] = count_non_translated+1
            return dataset
        else:
            ### if the word is not in the english vocab -> it could not have been translated
            ### don't count it as a non translated word and remove it from both sentences
            dataset.at[idx,"sentences_en_cleaner"] = remove_word_from_sentence(en_sentence,word)
            dataset.at[idx,"sentences_ge_cleaner"] = remove_word_from_sentence(ge_sentence,word)
            return dataset
        
    ### Test for subwords
    subword = word[:-1]
    while len(subword)>0:
        if subword in word2id.keys():
            subword1 = subword
            subword2 = word[len(subword1):]
            if subword2 in word2id.keys():
                if len(subword2)>2:
                    ### if the word can be splitted in 2 meaningful words do it
                    dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1+" "+subword2)
                    return dataset
                else:
                    ### if the word can be found by using a shorter version which can be found in vocab
                    dataset.at[idx,"sentences_ge_cleaner"] = ge_sentence.replace(word,subword1)
                    return dataset
            else:
                return dataset
            
        else:
            subword = subword[:-1]
    return dataset

#### Process dataset

In [55]:
for pair in out_of_vocab_ge:
    dataset = process_translated_out_of_vocab_words(pair,vocab_2_embedding_idx_ge,dataset,out_of_vocab_en)

In [56]:
#dataset.to_pickle('../data/dataset_train_val_plus.pickle')
dataset = pd.read_pickle("../data/dataset_train_val_plus.pickle")

### Remove stop words

In [59]:
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words('english'))
stopwords_ge = set(stopwords.words('german'))

In [64]:
def remove_multiple_words_from_sentence(sentence,words,isin=False):
    if isin:
        splited = [word for word in sentence.split() if word in words]
    else:
        splited = [word for word in sentence.split() if word not in words]
    return " ".join(splited)

In [65]:
dataset["sentences_en_cleaner"] = dataset.sentences_en_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_en))
dataset["sentences_ge_cleaner"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_ge))
dataset["sentences_ge_cleaner"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,stopwords_en))

In [66]:
vocab_fin_en = list(vocab_2_embedding_idx_en.keys())
vocab_fin_ge = list(vocab_2_embedding_idx_ge.keys())

In [67]:
dataset["sentences_en_final"] = dataset.sentences_en_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,vocab_fin_en,True))
dataset["sentences_ge_final"] = dataset.sentences_ge_cleaner.apply(lambda x:remove_multiple_words_from_sentence(x,vocab_fin_ge,True))

In [68]:
dataset["length_ge"] = dataset["sentences_ge_final"].apply(lambda x:len(x.split()))
dataset["length_en"] = dataset["sentences_en_final"].apply(lambda x:len(x.split()))
dataset["distance"] = dataset["length_en"] - dataset["length_ge"]

In [71]:
#dataset.to_pickle('../data/dataset_train_val_final.pickle')
dataset = pd.read_pickle("../data/dataset_train_val_final.pickle")