In [1]:
import pandas as pd
import spacy

import en_core_web_md
import de_core_news_md
# Load the two spaCy pipelines once; every helper below receives one of them
# (or uses nlp_en directly) instead of reloading a model per call.
# nlp_en comes from the en_core_web_md package, nlp_ge from de_core_news_md.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence, nlp):
    """Print a per-token breakdown of ``sentence``.

    Args:
        sentence: Text to analyse.
        nlp: Callable that turns the text into an iterable of tokens
            (a loaded spaCy pipeline in this notebook).

    For each token one line is printed with, in order: text, lemma,
    coarse POS, fine-grained tag, dependency label, shape, ``is_alpha``,
    ``is_stop`` and the shape of the token vector. Returns nothing.
    """
    for tok in nlp(sentence):
        fields = (
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
            tok.vector.shape,
        )
        print(*fields)

In [2]:
import os

In [3]:
def extract_sentences(filename, lower=False, encoding="utf-8"):
    """Read a text file and return its non-blank lines, stripped.

    Args:
        filename: Path of the file to read (one sentence per line).
        lower: When true, each returned line is lower-cased.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so that non-ASCII characters decode the same way on every
            platform instead of depending on the locale.

    Returns:
        A list of strings, one per line that still has content after
        stripping surrounding whitespace. Blank lines are skipped, so the
        list can be shorter than the number of lines in the file.
    """
    # The context manager guarantees the handle is closed; the previous
    # version left the file open until garbage collection.
    with open(filename, encoding=encoding) as handle:
        if lower:
            return [line.lower().strip() for line in handle if line.strip()]
        return [line.strip() for line in handle if line.strip()]

### Training Data

In [4]:
# Training files live in <parent of the working directory>/data/en-de/.
# The three paths are built in one pass: source text, translation, scores.
path_train_en, path_train_ge, path_train_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', file_name)
    for file_name in ('train.ende.src', 'train.ende.mt', 'train.ende.scores')
)

In [5]:
# One row per non-blank line of the .src / .mt files.
train_sentences_en = pd.DataFrame(
    extract_sentences(path_train_en),
    columns=['sentences_en'],
)
train_sentences_ge = pd.DataFrame(
    extract_sentences(path_train_ge),
    columns=['sentences_ge'],
)
# The scores file is read without a header row; column 0 is named "scores".
train_scores = pd.read_csv(path_train_scores, header=None).rename(columns={0: "scores"})

In [6]:
# Line up English sentence, German sentence and score by row index
# (default inner join, so only index labels present in all three survive).
train_dataset = (
    train_sentences_en
    .merge(train_sentences_ge, left_index=True, right_index=True)
    .merge(train_scores, left_index=True, right_index=True)
)

### Validation Data

In [7]:
# Validation files sit next to the training files in data/en-de/.
# The three paths are built in one pass: source text, translation, scores.
path_dev_en, path_dev_ge, path_dev_scores = (
    os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de', file_name)
    for file_name in ('dev.ende.src', 'dev.ende.mt', 'dev.ende.scores')
)

In [8]:
# One row per non-blank line of the .src / .mt files.
dev_sentences_en = pd.DataFrame(
    extract_sentences(path_dev_en),
    columns=['sentences_en'],
)
dev_sentences_ge = pd.DataFrame(
    extract_sentences(path_dev_ge),
    columns=['sentences_ge'],
)
# The scores file is read without a header row; column 0 is named "scores".
dev_scores = pd.read_csv(path_dev_scores, header=None).rename(columns={0: "scores"})

In [9]:
# Line up English sentence, German sentence and score by row index
# (default inner join, so only index labels present in all three survive).
dev_dataset = (
    dev_sentences_en
    .merge(dev_sentences_ge, left_index=True, right_index=True)
    .merge(dev_scores, left_index=True, right_index=True)
)

### Merging the training and validation sets

In [10]:
# Stack training rows first, then validation rows.
# ignore_index=True renumbers the rows 0..N-1. Both inputs carry their own
# index starting at 0, so without it every validation label would duplicate
# a training label and label-based lookups such as dataset.loc[10] would
# return two rows instead of one. Positional access (.iloc) is unaffected.
dataset = pd.concat([train_dataset, dev_dataset], ignore_index=True)

In [13]:
# Save the merged train+validation frame, then immediately reload it from the
# same file. The path is relative to the current working directory.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this notebook.
dataset.to_pickle('../data/dataset_train_val.pickle')
dataset = pd.read_pickle("../data/dataset_train_val.pickle")

### Constructing the full vocabulary

In [14]:
def entity_analysis(sentence, nlp):
    """Print ``sentence`` followed by the named entities found in it.

    Args:
        sentence: Text to analyse (must be a string).
        nlp: Callable returning an object with an ``ents`` iterable
            (a loaded spaCy pipeline in this notebook).

    Output is a header (the sentence, a blank line, "Analysis" and a rule)
    and then one line per entity: text, start offset, end offset, label.
    Returns nothing.
    """
    header = "\n".join([sentence, "", "Analysis", "--------"])
    print(header)
    for entity in nlp(sentence).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [15]:
def remove_punct(sentence, nlp):
    """Return ``sentence`` reduced to its purely alphabetic tokens.

    Args:
        sentence: Text to clean.
        nlp: Callable that tokenises the text (a spaCy pipeline here).

    Returns:
        The texts of all tokens whose ``is_alpha`` flag is set, joined by
        single spaces. Anything non-alphabetic is dropped, not only
        punctuation.
    """
    alphabetic = (tok.text for tok in nlp(sentence) if tok.is_alpha)
    return " ".join(alphabetic).strip()

def remove_names(sentence, persons_list):
    """Delete every occurrence of each listed string from ``sentence``.

    Args:
        sentence: Text to process.
        persons_list: Strings to remove, applied in list order.

    Returns:
        The text with all occurrences removed. Matching is plain substring
        replacement and the surrounding whitespace is left as it was.
    """
    remaining = sentence
    for name in persons_list:
        remaining = remaining.replace(name, "")
    return remaining

In [None]:
def get_persons(sentence_en, sentence_ge, nlp=None):
    """Collect named entities of the English sentence that also appear
    verbatim in the German sentence.

    Args:
        sentence_en: English source sentence; entities are detected here.
        sentence_ge: German sentence; a detected entity is kept only if its
            text occurs in this string.
        nlp: Optional pipeline used on ``sentence_en``. Defaults to the
            notebook-level ``nlp_en``.

    Returns:
        A list of entity strings in detection order. An entity whose full
        text is not found in the German sentence is retried without the
        words "the"/"The"; if that shorter form is found, it is the string
        that gets returned.
    """
    if nlp is None:
        nlp = nlp_en

    # Compare against the string attribute `label_` for every type. The
    # integer attribute `label` never equals a string, which previously made
    # the FAC and NORP checks dead code.
    kept_labels = {"PERSON", "ORG", "LOC", "GPE", "FAC", "NORP"}
    articles = {"the", "The"}

    persons_list = []
    for ent in nlp(sentence_en).ents:
        if ent.label_ not in kept_labels:
            continue
        if ent.text in sentence_ge:
            persons_list.append(ent.text)
            continue
        # Drop the article as a whole word only, so that names merely
        # containing the letters "the" are not corrupted.
        ent_text_clean = " ".join(
            word for word in ent.text.split() if word not in articles
        )
        # An empty string is a substring of everything; never report it.
        if ent_text_clean and ent_text_clean in sentence_ge:
            persons_list.append(ent_text_clean)
    return persons_list

In [None]:
# For every sentence pair, store the list of entity strings that occur in
# both the English source and the German translation.
dataset['person'] = dataset.apply(
    lambda pair: get_persons(pair["sentences_en"], pair["sentences_ge"]),
    axis=1,
)

In [None]:
# Strip the shared entity strings out of both sides of each pair.
# Note the asymmetric column names: the German column has no "no_" infix.
dataset['sentences_en_no_propnouns'] = dataset.apply(
    lambda record: remove_names(record['sentences_en'], record["person"]),
    axis=1,
)
dataset['sentences_ge_propnouns'] = dataset.apply(
    lambda record: remove_names(record['sentences_ge'], record["person"]),
    axis=1,
)

In [None]:
# Keep only alphabetic tokens of the entity-stripped sentences.
# The inputs are the columns produced by the previous cell. The earlier
# version read 'sentences_*_clean', i.e. the very columns being created
# here, which raises KeyError on a fresh run.
# NOTE(review): assumes cleaning is meant to run on the entity-stripped text
# rather than on the raw sentences — confirm.
dataset['sentences_en_clean'] = dataset.apply(
    lambda row: remove_punct(row['sentences_en_no_propnouns'], nlp_en),
    axis=1,
)
dataset['sentences_ge_clean'] = dataset.apply(
    lambda row: remove_punct(row['sentences_ge_propnouns'], nlp_ge),
    axis=1,
)

In [None]:
# Overwrite the pickle written earlier in the notebook (same path) with the
# frame as it stands now, including the columns added since, then reload it.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this notebook.
dataset.to_pickle('../data/dataset_train_val.pickle')
dataset = pd.read_pickle("../data/dataset_train_val.pickle")

### Vocabulary with embedding

In [33]:
import io
import numpy as np

def load_vec(emb_path, nmax=50000):
    """Load word vectors from a text embedding file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is expected to be ``word v1 v2 ... vd``,
    separated by single spaces.

    Args:
        emb_path: Path of the embedding file (decoded as UTF-8, undecodable
            bytes are ignored).
        nmax: Stop after this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)``: a 2-D array with one row
        per loaded word, and the two dictionaries mapping row index to word
        and word to row index.

    Raises:
        ValueError: If the file contains no embedding lines at all.
        AssertionError: If the same word appears twice.
    """
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the header; the default keeps an empty file from surfacing
        # as a bare StopIteration.
        next(f, None)
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    if not vectors:
        raise ValueError('no embeddings found in %r' % (emb_path,))
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

# Directory holding the wiki.multi.*.vec embedding files. Set the
# MUSE_VECTORS_DIR environment variable to run on another machine; the
# default is the original author's local checkout.
MUSE_VECTORS_DIR = os.environ.get(
    'MUSE_VECTORS_DIR',
    '/Users/theophile/Documents/Masters_ML/NLP/coursework/MUSE/data/vectors',
)
src_path = os.path.join(MUSE_VECTORS_DIR, 'wiki.multi.en.vec')
tgt_path = os.path.join(MUSE_VECTORS_DIR, 'wiki.multi.de.vec')
nmax = 300000  # maximum number of word embeddings to load

# src_* holds the English vectors, tgt_* the German ones.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [140]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer #stem for english
# Single shared stemmer; its stem is the last lookup key tried when a token
# has no embedding under its own text or lemma.
porter = PorterStemmer()

In [234]:
# Small probe text for the embedding lookup below. The odd spellings are
# presumably deliberate, to produce tokens with no embedding — TODO confirm.
mini_sentence = nlp_en("Hi encompasse John destine dismaste misperceive chocolate loving recurving hope joheg jealousy betroth casemated")

In [235]:
embeddings = []    # one vector per entry of `vocabulary`, in the same order
vocabulary = []    # token texts for which an embedding was found
out_of_vocab = []  # token texts for which no lookup key matched

for token in mini_sentence:
    surface = token.text
    # Handle each distinct token text once; previously a repeated
    # out-of-vocabulary token was appended to out_of_vocab every time.
    if surface in vocabulary or surface in out_of_vocab:
        continue

    # Lookup keys are tried in order: the token text, its spaCy lemma, then
    # its Porter stem. Explicit membership tests replace the nested bare
    # `except:` blocks, which hid any error rather than only a missing word.
    # The WordNet-synonym fallback was unreachable (guarded by a flag
    # hard-coded to False) and has been dropped.
    key = surface
    if key not in src_word2id:
        key = token.lemma_
    if key not in src_word2id:
        key = porter.stem(surface)

    if key in src_word2id:
        embeddings.append(src_embeddings[src_word2id[key]])
        # The vocabulary always records the original token text, even when
        # the vector was found under the lemma or the stem.
        vocabulary.append(surface)
    else:
        out_of_vocab.append(surface)

In [236]:
# Token texts that received an embedding (via text, lemma or stem).
vocabulary

['Hi',
 'encompasse',
 'John',
 'destine',
 'chocolate',
 'loving',
 'hope',
 'jealousy']

In [237]:
# Token texts for which no embedding could be found.
out_of_vocab

['dismaste', 'misperceive', 'recurving', 'joheg', 'betroth', 'casemated']

In [None]:
# Spot check: is 'David' among the entities kept for the row at position 3839?
'David' in dataset['person'].iloc[3839]

In [None]:
# Compare the raw English sentence, its cleaned form and the kept entities.
# The cleaned text lives in 'sentences_en_clean'; the notebook never creates
# the 'sentences_en_no_punct' column that was referenced here before.
dataset.loc[261][['sentences_en', 'sentences_en_clean', 'person']]

In [None]:
# Raw English text for index label 10.
dataset.loc[10, 'sentences_en']

In [None]:
# Cleaned English text of the row at position 7000.
# Reads 'sentences_en_clean'; the notebook never creates the
# 'sentences_en_no_punct' column that was referenced here before.
dataset.iloc[7000]['sentences_en_clean']