### Loading

In [1]:
import pandas as pd
import spacy
import re
import pickle
import os

import en_core_web_md
import de_core_news_md
# Instantiate one spaCy pipeline per language from the model packages imported above.
# These objects are what `spacy_analysis` expects as its `nlp` argument.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence,nlp):
    """Print one line of linguistic attributes for every token of `sentence`.

    Parameters
    ----------
    sentence : text handed to `nlp`.
    nlp : callable returning an iterable of tokens (e.g. a loaded spaCy pipeline).

    Each printed line holds, separated by spaces: text, lemma, coarse POS, fine-grained
    tag, dependency label, shape, is_alpha flag, is_stop flag and the shape of the
    token vector. Nothing is returned.
    """
    for tok in nlp(sentence):
        fields = (
            tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
            tok.shape_, tok.is_alpha, tok.is_stop, tok.vector.shape,
        )
        print(*fields)

In [2]:
# Load the pickled dataset and the two pickled vocabulary objects (paths are relative
# to the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
dataset = pd.read_pickle("../data/dataset_train_val_final.pickle")
with open("../data/vocab_en.pkl", 'rb') as f:
    # Used by get_emb as a word -> row-index lookup for the English embeddings.
    vocab_2_embedding_idx_en = pickle.load(f)
with open("../data/vocab_ge.pkl", 'rb') as f:
    # Used by get_emb as a word -> row-index lookup for the German embeddings.
    vocab_2_embedding_idx_ge = pickle.load(f)

In [4]:
import io
import numpy as np

def load_vec(emb_path, nmax=50000):
    """Read word vectors from a text embedding file.

    The first line of the file is skipped. Every following line is split at its first
    space into a word and a space-separated list of numbers. Reading stops once `nmax`
    distinct words have been collected (or at end of file).

    Parameters
    ----------
    emb_path : path of the embedding file (opened as UTF-8, undecodable bytes ignored).
    nmax : maximum number of words to load.

    Returns
    -------
    (embeddings, id2word, word2id) where `embeddings` is the row-wise stack of the
    vectors, `word2id` maps each word to its row and `id2word` is the inverse map.

    Raises
    ------
    AssertionError if the same word appears twice in the file.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line of the file
        for raw_line in handle:
            token, numbers = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(numbers, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            # Row index == number of words seen so far.
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = dict((idx, token) for token, idx in word2id.items())
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

# Embedding directory: <parent of the current working directory>/data/muse.
path = os.path.join(os.path.dirname(os.getcwd()), 'data','muse')

# Source (".en") and target (".de") embedding files inside that directory.
src_path = path+"/wiki.multi.en.vec"
tgt_path = path+"/wiki.multi.de.vec"
nmax = 300000  # maximum number of word embeddings to load

# load_vec returns (stacked vectors, id -> word map, word -> id map) for each file.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

### Helpers

In [5]:
def get_correlation(src_emb,tgt_emb):
    """Return the dot product of the two inputs after scaling each by its norm.

    For two 1-D vectors this is their cosine similarity. There is no guard against
    an input whose norm is zero.
    """
    unit_src = src_emb / np.linalg.norm(src_emb)
    unit_tgt = tgt_emb / np.linalg.norm(tgt_emb)
    return unit_src.dot(unit_tgt)

In [6]:
def get_emb(word,language):
    """Return the embedding row stored for `word`.

    `language == "en"` looks the word up in `vocab_2_embedding_idx_en` and indexes
    `src_embeddings`; any other `language` value uses `vocab_2_embedding_idx_ge` and
    `tgt_embeddings`. A word missing from the lookup propagates the lookup's error.

    NOTE(review): assumes the indices stored in the vocab lookups are row positions of
    the matrices produced by `load_vec` — the `*_word2id` maps returned by `load_vec`
    are not consulted here; confirm the two indexings agree.
    """
    if language != "en":
        return tgt_embeddings[vocab_2_embedding_idx_ge[word]]
    return src_embeddings[vocab_2_embedding_idx_en[word]]

In [7]:
def get_corr_matrix(words_en_list,words_ge_list):
    """Build the English-by-German similarity matrix for two word lists.

    Entry [i, j] is `get_correlation` applied to the embedding of the i-th English word
    (`get_emb(word, "en")`) and the embedding of the j-th German word
    (`get_emb(word, "ge")`). The result has shape
    (len(words_en_list), len(words_ge_list)).
    """
    similarities = np.zeros((len(words_en_list), len(words_ge_list)))
    for row, english_word in enumerate(words_en_list):
        # Fill one full row at a time; columns follow the order of words_ge_list.
        similarities[row, :] = [
            get_correlation(get_emb(english_word, "en"), get_emb(german_word, "ge"))
            for german_word in words_ge_list
        ]
    return similarities

In [8]:
def get_word_matches(corr_matrix):
    """Pair rows with columns of a similarity matrix by repeated mutual-best matching.

    A row i and a column j are paired when j is the best remaining column for row i
    and i is the best remaining row for column j. After each pairing the row and the
    column are removed from consideration and the best candidates are recomputed.
    The process stops once min(n_rows, n_cols) pairs exist.

    Parameters
    ----------
    corr_matrix : 2-D array-like of similarities (rows x columns). It is not modified.

    Returns
    -------
    dict mapping a row index to its matched column index (plain Python ints).
    An empty matrix (no rows or no columns) yields {}.
    """
    # Work on a float copy so that -inf can mark "already matched", even for integer
    # input. Masking with 0 (as done previously) is wrong as soon as a remaining
    # similarity is <= 0: the masked cells then win the argmax and the loop never ends.
    scores = np.array(corr_matrix, dtype=float)
    if scores.size == 0:
        # Covers both (0, m) and (n, 0); argmax would raise on the latter.
        return {}

    n, m = scores.shape
    dim = min(n, m)
    couples = {}
    best_row_for_col = np.argmax(scores, axis=0)
    best_col_for_row = np.argmax(scores, axis=1)

    while len(couples) < dim:
        matched_before = len(couples)
        for i in range(n):
            if len(couples) >= dim:
                break
            j = best_col_for_row[i]
            if i in couples or best_row_for_col[j] != i:
                continue
            couples[i] = int(j)
            scores[i, :] = -np.inf
            scores[:, j] = -np.inf
            best_row_for_col = np.argmax(scores, axis=0)
            best_col_for_row = np.argmax(scores, axis=1)
        if len(couples) == matched_before:
            # No mutual best pair found in a full pass (only possible with non-finite
            # input); return what was matched instead of looping forever.
            break
    return couples

In [9]:
# Small 4x3 integer matrix holding 1..12 in row-major order (scratch input for trying
# out the matching helpers by hand).
mat = np.arange(1, 13).reshape(4, 3)

In [18]:
def get_word_couples(words_en,words_ge):
    """Match the words of an English sentence with those of a German sentence.

    Both sentences are split on whitespace, the similarity matrix is built with
    `get_corr_matrix`, and rows/columns are paired with `get_word_matches`.

    Parameters
    ----------
    words_en : whitespace-separated English words.
    words_ge : whitespace-separated German words.

    Returns
    -------
    (word_couples, score, left_words)
        word_couples : dict mapping each matched English word to its German partner
            (a repeated English word keeps only its last pairing).
        score : mean similarity over the matched pairs.
        left_words : words of the longer sentence that found no partner, in sentence
            order; [] when both sentences have the same number of words.
    If either sentence is empty, None or contains only whitespace, the triple
    (nan, nan, nan) is returned.
    """
    if not words_en or not words_ge:
        return np.nan,np.nan,np.nan

    words_en_list = words_en.split()
    words_ge_list = words_ge.split()
    # Whitespace-only text splits to []; without this guard the score division below
    # would raise ZeroDivisionError.
    if not words_en_list or not words_ge_list:
        return np.nan,np.nan,np.nan

    corr_mat = get_corr_matrix(words_en_list,words_ge_list)
    word_couples_idx = get_word_matches(corr_mat)

    score = 0
    for en_idx, ge_idx in word_couples_idx.items():
        score += corr_mat[en_idx, ge_idx]
    score = score / len(word_couples_idx) if word_couples_idx else np.nan

    if len(words_en_list) > len(words_ge_list):
        matched = {int(idx) for idx in word_couples_idx.keys()}
        left_words = [word for idx, word in enumerate(words_en_list) if idx not in matched]
    elif len(words_en_list) < len(words_ge_list):
        # Leftovers must be taken over the *German* index range; using the English
        # length here silently dropped unmatched German words at higher positions.
        matched = {int(idx) for idx in word_couples_idx.values()}
        left_words = [word for idx, word in enumerate(words_ge_list) if idx not in matched]
    else:
        left_words = []

    word_couples = {
        words_en_list[en_idx]: words_ge_list[ge_idx]
        for en_idx, ge_idx in word_couples_idx.items()
    }

    return word_couples,score,left_words

In [14]:
# Row by row, run get_word_couples on the two "*_final" sentence columns and keep only
# the second element of its result (the mean matched-pair score, or NaN when the
# function returns its NaN triple).
dataset["correlation"] = dataset.apply(lambda row:get_word_couples(row["sentences_en_final"],row["sentences_ge_final"])[1],axis=1)


In [15]:
# Impute missing scores with the mean of the column.
dataset["correlation"] = dataset["correlation"].fillna(dataset["correlation"].mean())

In [16]:
# Correlation table (pandas default: Pearson) between the new feature and "scores".
dataset[["correlation","scores"]].corr()

Unnamed: 0,correlation,scores
correlation,1.0,0.089792
scores,0.089792,1.0


In [19]:
# Display the frame, now including the "correlation" column.
dataset

Unnamed: 0,sentences_en,sentences_ge,scores,person,sentences_en_no_propnouns,sentences_ge_no_propnouns,sentences_en_clean,sentences_ge_clean,non_translated_words,sentences_en_cleaner,sentences_ge_cleaner,sentences_en_final,sentences_ge_final,length_ge,length_en,distance,correlation
0,José Ortega y Gasset visited Husserl at Freibu...,1934 besuchte José Ortega y Gasset Husserl in ...,1.101697,"[José Ortega y Gasset, Husserl, Freiburg]",visited at in 1934.,1934 besuchte in .,visited at in,besuchte in,0,visited,besuchte,visited,besuchte,1,1,0,0.518761
1,"However, a disappointing ninth in China meant ...",Eine enttäuschende Neunte in China bedeutete j...,-0.516656,[China],"However, a disappointing ninth in meant that ...",Eine enttäuschende Neunte in bedeutete jedoch...,however a disappointing ninth in meant that he...,eine enttäuschende neunte in bedeutete jedoch ...,0,however disappointing ninth meant dropped back...,enttäuschende neunte bedeutete jedoch gesamtwe...,however disappointing ninth meant dropped back...,enttäuschende neunte bedeutete jedoch gesamtwe...,8,8,0,0.619618
2,"In his diary, Chase wrote that the release of ...","In seinem Tagebuch, Chase schrieb, dass die Ve...",-2.226388,"[Chase, Mason, Slidell]","In his diary, wrote that the release of and ...","In seinem Tagebuch, schrieb, dass die Veröffe...",in his diary wrote that the release of and was...,in seinem tagebuch schrieb dass die veröffentl...,0,diary wrote release like gall wormwood,tagebuch schrieb veröffentlichung galle wermut,diary wrote release like gall wormwood,tagebuch schrieb veröffentlichung galle wermut,5,6,1,0.633080
3,Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,-0.827379,[],Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,heavy arquebuses mounted on wagons were called...,schwere arquebuses auf waggons montiert wurden...,4,heavy mounted wagons called,schwere waggons montiert wurden genannt,heavy mounted wagons called,schwere waggons montiert wurden genannt,5,4,-1,0.626568
4,Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,0.364695,[],Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,once north pacific salmon die off after spawni...,sobald der nordpazifische lachs nach dem laich...,0,north pacific salmon die spawning usually loca...,sobald nordpazifische lachs laichen abstirbt f...,north pacific salmon die spawning usually loca...,sobald lachs laichen abstirbt fressen regel lo...,11,14,3,0.583080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,"The gang absconded with $2,000 cash in the sec...",Die Bande flüchtete mit $2.000 Bargeld in den ...,0.164712,"[St. Joseph, Louisiana]","The gang absconded with $2,000 cash in the sec...",Die Bande flüchtete mit $2.000 Bargeld in den ...,the gang absconded with cash in the second rob...,die bande flüchtete mit bargeld in den zweiten...,2,gang absconded cash second robbery took shelte...,bande flüchtete bargeld zweiten raub nahm schu...,gang absconded cash second robbery took shelte...,bande flüchtete bargeld zweiten raub nahm schu...,10,10,0,0.582558
7996,The Irish settlers arrives from Northern Irela...,Die irischen Siedler kommen kurz nach den Loya...,0.394755,[],The Irish settlers arrives from Northern Irela...,Die irischen Siedler kommen kurz nach den Loya...,the irish settlers arrives from northern irela...,die irischen siedler kommen kurz nach den loya...,0,irish settlers arrives northern ireland shortl...,irischen siedler kommen kurz loyalisten nordir...,irish settlers arrives northern ireland shortl...,irischen siedler kommen kurz loyalisten nordir...,6,7,1,0.684928
7997,Volcanics include dacite breccia and small rem...,Zu den Vulkanen gehören Dacite Breccia und kle...,0.241944,[],Volcanics include dacite breccia and small rem...,Zu den Vulkanen gehören Dacite Breccia und kle...,volcanics include dacite breccia and small rem...,zu den vulkanen gehören dacite breccia und kle...,3,volcanics include small remnants hornblende la...,vulkanen gehören kleine reste hornblende lavas...,volcanics include small remnants hornblende la...,vulkanen gehören kleine reste hornblende lavas...,9,11,2,0.585751
7998,The Fort Concho Museum acquired the Schoolhous...,Das Fort Concho Museum erwarb 1946 das Schulha...,0.360707,[Fort Concho Museum],The acquired the Schoolhouse in 1946 and rest...,Das erwarb 1946 das Schulhaus und restauriert...,the acquired the schoolhouse in and restored it,das erwarb das schulhaus und restaurierte es,0,acquired schoolhouse restored,erwarb schulhaus restaurierte,acquired schoolhouse restored,erwarb schulhaus restaurierte,3,3,0,0.659042


In [21]:
# Persist the frame, including the "correlation" column, as a pickle file.
dataset.to_pickle('../data/dataset_correlations_v1.pickle')