In [None]:
import pandas as pd
import spacy

import en_core_web_md
import de_core_news_md
# Load the English and German spaCy pipelines once, up front; the cells below
# reuse these two objects (nlp_en / nlp_ge) instead of reloading the models.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence, nlp):
    """Print one line of annotations for every token of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to analyse.
    nlp : callable
        Pipeline that turns ``sentence`` into an iterable of tokens
        (here: one of the loaded spaCy models).

    Each printed line holds, separated by spaces: text, lemma, coarse POS,
    fine-grained tag, dependency label, shape, the ``is_alpha`` and
    ``is_stop`` flags, and the shape of the token's vector. Nothing is
    returned.
    """
    for tok in nlp(sentence):
        fields = (
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
            tok.vector.shape,
        )
        print(*fields)

In [None]:
import os

In [None]:
def extract_sentences(filename, lower=False, encoding="utf-8"):
    """Read a text file and return its non-blank lines, stripped of whitespace.

    Parameters
    ----------
    filename : str or path-like
        File holding one sentence per line.
    lower : bool, default False
        When true, every returned sentence is lower-cased.
    encoding : str, default "utf-8"
        Text encoding used to decode the file. Set explicitly so that
        non-ASCII text (e.g. German umlauts) is decoded the same way on
        every platform instead of depending on the locale default.

    Returns
    -------
    list of str
        The stripped lines in file order; lines that are empty or contain
        only whitespace are skipped.
    """
    sentences = []
    # The context manager closes the handle deterministically; a bare
    # open() inside a comprehension leaves that to the garbage collector.
    with open(filename, encoding=encoding) as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue
            sentences.append(line.lower().strip() if lower else text)
    return sentences

### Training Data

In [None]:
# Locations of the training split: English side, German side and scores.
# Every path is <parent of the working directory>/data/en-de/<file name>.
path_train_en, path_train_ge, path_train_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', file_name)
    for file_name in ('train.ende.src', 'train.ende.mt', 'train.ende.scores')
)

In [None]:
# One single-column frame per training file: English sentences, German sentences.
train_sentences_en = pd.DataFrame(
    data=extract_sentences(path_train_en),
    columns=['sentences_en'],
)
train_sentences_ge = pd.DataFrame(
    data=extract_sentences(path_train_ge),
    columns=['sentences_ge'],
)
# The score file is read without a header row, so its first column arrives
# labelled 0 and is renamed to "scores".
train_scores = (
    pd.read_csv(path_train_scores, header=None)
    .rename(columns={0: "scores"})
)

In [None]:
# Line the three training frames up row by row: an inner join on the index,
# applied first to the two sentence frames and then to the scores.
train_dataset = (
    train_sentences_en
    .join(train_sentences_ge, how='inner')
    .join(train_scores, how='inner')
)

### Validation Data

In [None]:
# Locations of the validation split: English side, German side and scores.
# Every path is <parent of the working directory>/data/en-de/<file name>.
path_dev_en, path_dev_ge, path_dev_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', file_name)
    for file_name in ('dev.ende.src', 'dev.ende.mt', 'dev.ende.scores')
)

In [None]:
# One single-column frame per validation file: English sentences, German sentences.
dev_sentences_en = pd.DataFrame(
    data=extract_sentences(path_dev_en),
    columns=['sentences_en'],
)
dev_sentences_ge = pd.DataFrame(
    data=extract_sentences(path_dev_ge),
    columns=['sentences_ge'],
)
# The score file is read without a header row, so its first column arrives
# labelled 0 and is renamed to "scores".
dev_scores = (
    pd.read_csv(path_dev_scores, header=None)
    .rename(columns={0: "scores"})
)

In [None]:
# Line the three validation frames up row by row: an inner join on the index,
# applied first to the two sentence frames and then to the scores.
dev_dataset = (
    dev_sentences_en
    .join(dev_sentences_ge, how='inner')
    .join(dev_scores, how='inner')
)

### Merging the training and validation data

In [None]:
# Stack the training and validation rows into a single frame.
# ignore_index=True relabels the rows 0..n-1. Without it both halves keep
# their own 0-based labels, so a label lookup such as dataset.loc[10] would
# match one row from each half instead of a single row.
dataset = pd.concat([train_dataset, dev_dataset], ignore_index=True)

### Constructing the full vocabulary

In [None]:
def entity_analysis(sentence, nlp):
    """Print ``sentence`` followed by the named entities found in it.

    Parameters
    ----------
    sentence : str
        Text to analyse.
    nlp : callable
        Pipeline whose result exposes the detected entities as ``.ents``.

    The sentence and an "Analysis" banner are printed first; then one line
    per entity with its text, start and end character offsets, and label.
    Nothing is returned.
    """
    print(sentence + "\n\nAnalysis\n--------")
    for entity in nlp(sentence).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [None]:
def remove_punct(sentence, nlp):
    """Return ``sentence`` reduced to its alphabetic tokens.

    Parameters
    ----------
    sentence : str
        Text to clean.
    nlp : callable
        Pipeline that tokenises ``sentence``.

    Returns
    -------
    str
        The texts of all tokens whose ``is_alpha`` flag is true, joined by
        single spaces. Every other token is dropped, which covers
        punctuation and also anything else that is not purely alphabetic.
    """
    alpha_words = [tok.text for tok in nlp(sentence) if tok.is_alpha]
    return " ".join(alpha_words).strip()

def remove_names(sentence, persons_list):
    """Delete every occurrence of each listed name from ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to clean.
    persons_list : iterable of str
        Strings to remove; each is matched as a plain substring.

    Returns
    -------
    str
        ``sentence`` with all matches replaced by the empty string. The
        surrounding whitespace is left as it was.
    """
    remaining = sentence
    for name in persons_list:
        # Plain substring replacement, applied cumulatively name by name.
        remaining = remaining.replace(name, "")
    return remaining

In [None]:
# Run every sentence through the spaCy pipeline of its language and keep only
# the alphabetic tokens (remove_punct drops each token whose is_alpha flag is
# false). The results are stored in two new columns next to the raw text.
dataset['sentences_en_no_punct'] = dataset.apply(lambda row: remove_punct(row['sentences_en'], nlp_en), axis=1)
dataset['sentences_ge_no_punct'] = dataset.apply(lambda row: remove_punct(row['sentences_ge'], nlp_ge), axis=1)

In [None]:
def get_persons(sentence_en, sentence_ge, nlp=None, labels=("PERSON", "ORG")):
    """Collect entities of the English sentence that reappear in the German one.

    Parameters
    ----------
    sentence_en : str
        English sentence; entities are detected on this text.
    sentence_ge : str
        German sentence; an entity is kept only if its text occurs in it
        as a substring.
    nlp : callable, optional
        Pipeline used for entity detection. Defaults to the notebook-level
        English model ``nlp_en``, which is what this function always used
        before the parameter existed.
    labels : collection of str, default ("PERSON", "ORG")
        Entity labels that qualify.

    Returns
    -------
    list of str
        Entity texts in order of appearance; duplicates are kept.
    """
    if nlp is None:
        # Resolved at call time so the default model is only needed when
        # the caller does not supply a pipeline.
        nlp = nlp_en
    return [
        ent.text
        for ent in nlp(sentence_en).ents
        if ent.label_ in labels and ent.text in sentence_ge
    ]

In [None]:
# For every row, store the list returned by get_persons: the PERSON/ORG
# entities detected in the cleaned English sentence whose text also occurs
# in the cleaned German sentence.
dataset['person']=dataset.apply(lambda row: get_persons(row["sentences_en_no_punct"],
                                                         row["sentences_ge_no_punct"]),axis=1)

In [None]:
# Remove each row's collected entity strings from both of its cleaned
# sentences (remove_names replaces every occurrence with the empty string).
dataset['sentences_en_clean'] = dataset.apply(lambda row: remove_names(row['sentences_en_no_punct'], row["person"]), axis=1)
dataset['sentences_ge_clean'] = dataset.apply(lambda row: remove_names(row['sentences_ge_no_punct'], row["person"]), axis=1)


In [None]:
def create_vocab(dataset):
    """Flatten a collection of sentences into one list of tokens.

    Parameters
    ----------
    dataset : iterable
        Sentences. A sentence given as a plain string is split on
        whitespace; any other iterable is taken to be a token sequence
        already and is used as it is.

    Returns
    -------
    list
        All tokens in order of appearance. Repeated tokens are kept, so
        wrap the result in ``set()`` when only distinct entries are needed.
    """
    vocabulary = []
    for sentence in dataset:
        # Iterating a str directly would yield single characters, not words.
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        vocabulary.extend(tokens)
    # The list used to be built and then discarded; return it to the caller.
    return vocabulary

In [None]:
# Spot check: is 'David' among the entities collected for the row at position 3839?
'David' in dataset.iloc[3839]['person']

In [None]:
# Compare the raw English sentence, its alphabetic-only version and the
# collected entities for index label 261.
dataset.loc[261][['sentences_en', 'sentences_en_no_punct','person']]

In [None]:
# Raw English text stored under index label 10 (.loc selects by label, not by position).
dataset.loc[10]['sentences_en']

In [None]:
# Alphabetic-only English sentence of the row at position 7000.
dataset.iloc[7000]['sentences_en_no_punct']