In [1]:
import pandas as pd
import spacy

import en_core_web_md
import de_core_news_md
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence,nlp):
    doc = nlp(sentence)
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop,token.vector.shape)

In [7]:
import os

In [3]:
def extract_sentences(filename,lower=False):
    if lower:
        data = [l.lower().strip() for l in open(filename) if l.strip()]
    else:
        data = [l.strip() for l in open(filename) if l.strip()]
    return data

### Training Data

In [18]:
path_train_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.src')
path_train_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.mt')
path_train_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'train.ende.scores')

In [20]:
train_sentences_en = pd.DataFrame(extract_sentences(path_train_en),columns = ['sentences_en'])
train_sentences_ge = pd.DataFrame(extract_sentences(path_train_ge),columns = ['sentences_ge'])
train_scores = pd.read_csv(path_train_scores,header=None)
train_scores = train_scores.rename(columns={0:"scores"})

In [24]:
train_dataset = pd.merge(train_sentences_en,train_sentences_ge,left_index=True,right_index=True)
train_dataset = pd.merge(train_dataset,train_scores,left_index=True,right_index=True)

### Validation Data

In [25]:
path_dev_en = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.src')
path_dev_ge = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.mt')
path_dev_scores = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', 'dev.ende.scores')

In [26]:
dev_sentences_en = pd.DataFrame(extract_sentences(path_dev_en),columns = ['sentences_en'])
dev_sentences_ge = pd.DataFrame(extract_sentences(path_dev_ge),columns = ['sentences_ge'])
dev_scores = pd.read_csv(path_dev_scores,header=None)
dev_scores = dev_scores.rename(columns={0:"scores"})

In [27]:
dev_dataset = pd.merge(dev_sentences_en,dev_sentences_ge,left_index=True,right_index=True)
dev_dataset = pd.merge(dev_dataset,dev_scores,left_index=True,right_index=True)

### Merging the two 

In [28]:
dataset = pd.concat([train_dataset, dev_dataset])

### Constructing the full vocabluary

In [105]:
def get_entities(sentence):
    doc = nlp_en(sentence)
    persons_list = []
    for ent in doc.ents:
        if (ent.label_=="PERSON"):
            persons_list += [word for word in ent.text.split()]
    return persons_list

In [106]:
dataset['person']=dataset['sentences_en'].apply(get_entities)

In [110]:
def remove_punct(sentence, nlp, persons_list):
    sentence=nlp(sentence)
    clean_sentence = ""
    for token in sentence:
        if token.text not in persons_list:
            if (token.is_alpha):
                 clean_sentence+= (token.text + " ")
    clean_sentence=clean_sentence.strip()
    return clean_sentence

In [111]:
dataset['sentences_en_no_punct'] = dataset.apply(lambda row: remove_punct(row['sentences_en'], nlp_en, row['person']), axis=1)
dataset['sentences_ge_no_punct'] = dataset.apply(lambda row: remove_punct(row['sentences_ge'], nlp_ge, row['person']), axis=1)

In [None]:
def create_vocab(dataset):
    vocabulary=[]
    for sentence in dataset:
        for token in sentence:
            vocabulary.append(token)

In [112]:
'David' in dataset.iloc[3839]['person']

True

In [140]:
dataset.loc[261][['sentences_en', 'sentences_en_no_punct','person']]

Unnamed: 0,sentences_en,sentences_en_no_punct,person
261,"These included British, Macedonians, Portugues...",These included British Macedonians Portuguese ...,[]
261,"Abler, Thomas S. ""Joseph Brant"" in John A. Gar...",Abler Joseph in and eds American National Biog...,"[Thomas, S., ""Joseph, Brant, John, A., Garraty..."


In [129]:
dataset.loc[10]['sentences_en']

10    Sienna Miller portrays The Baroness as one of ...
10    The PVV contested the 2017 general election wi...
Name: sentences_en, dtype: object

In [125]:
dataset.iloc[7000]['sentences_en_no_punct']

'Simultaneously the Legion took part to the pacification of Algeria plagued by various tribal rebellions and'