### Loading

In [None]:
import pandas as pd
import spacy
import re
import pickle
import os

import en_core_web_md
import de_core_news_md
# Instantiate the English and German spaCy pipelines from the imported model
# packages. Each object is a callable that can be passed as the ``nlp``
# argument of spacy_analysis below.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence,nlp):
    """Print a one-line summary of every token in ``sentence``.

    Args:
        sentence: Text to analyse.
        nlp: Callable that turns ``sentence`` into an iterable of tokens
            (e.g. one of the loaded spaCy pipelines).

    Each printed line holds, space-separated: text, lemma, POS, tag,
    dependency label, shape, ``is_alpha``, ``is_stop`` and the shape of the
    token vector. Nothing is returned.
    """
    for tok in nlp(sentence):
        fields = (
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
            tok.vector.shape,
        )
        print(*fields)

In [None]:
# Load the pickled training/validation dataset; later cells read its
# "sentences_en_final", "sentences_ge_final" and "scores" columns.
dataset = pd.read_pickle("../data/dataset_train_val_final.pickle")
# Word -> embedding-row-index lookups used by get_emb (English, then German).
# NOTE(review): unpickling can execute arbitrary code -- only load these files
# from a trusted source.
with open("../data/vocab_en.pkl", 'rb') as f:
    vocab_2_embedding_idx_en = pickle.load(f)
with open("../data/vocab_ge.pkl", 'rb') as f:
    vocab_2_embedding_idx_ge = pickle.load(f)

In [None]:
import io
import numpy as np

def load_vec(emb_path, nmax=50000):
    """Read word vectors from a text embedding file.

    The first line of the file is skipped. Every following line is split at
    its first space into a word and a space-separated list of numbers.

    Args:
        emb_path: Path of the embedding file (read as UTF-8, undecodable
            bytes are ignored).
        nmax: Stop reading once this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)`` where ``embeddings`` is the
        row-wise stack of all vectors, ``word2id`` maps a word to its row and
        ``id2word`` is the inverse mapping.

    Raises:
        AssertionError: If the same word occurs twice in the file.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # skip the first (header) line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            parsed = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(parsed)
            # Row index equals the number of words seen before this one.
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = dict(zip(word2id.values(), word2id.keys()))
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

# Directory "<parent of the current working directory>/data/muse" that holds
# the embedding files.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'muse')

# Build the file paths with os.path.join as well, instead of concatenating a
# hard-coded "/" separator onto the joined directory.
src_path = os.path.join(path, "wiki.multi.en.vec")
tgt_path = os.path.join(path, "wiki.multi.de.vec")
nmax = 300000  # maximum number of word embeddings to load

# Source = English vectors, target = German vectors (see the file names).
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

### Helpers

In [None]:
def get_correlation(src_emb,tgt_emb):
    """Return the dot product of the two inputs after norm-scaling each one.

    Each input is divided by its ``np.linalg.norm`` before the dot product is
    taken; for 1-D vectors this is their cosine similarity. A zero-norm input
    is not special-cased.
    """
    unit_src = src_emb / np.linalg.norm(src_emb)
    unit_tgt = tgt_emb / np.linalg.norm(tgt_emb)
    return unit_src.dot(unit_tgt)

In [None]:
def get_emb(word,language):
    """Look up the embedding row of ``word``.

    ``language == "en"`` reads from the English tables
    (``vocab_2_embedding_idx_en`` / ``src_embeddings``); every other value
    reads from the German ones (``vocab_2_embedding_idx_ge`` /
    ``tgt_embeddings``). A word missing from the vocabulary mapping raises
    whatever that mapping raises on a failed lookup.
    """
    if language != "en":
        row = vocab_2_embedding_idx_ge[word]
        return tgt_embeddings[row]
    row = vocab_2_embedding_idx_en[word]
    return src_embeddings[row]

In [None]:
def get_corr_matrix(words_en_list,words_ge_list):
    """Build the English-by-German similarity matrix.

    Args:
        words_en_list: English words (one matrix row each).
        words_ge_list: German words (one matrix column each).

    Returns:
        Array of shape ``(len(words_en_list), len(words_ge_list))`` whose
        entry ``[i, j]`` is ``get_correlation`` of the embeddings of English
        word ``i`` and German word ``j``.
    """
    shape = (len(words_en_list), len(words_ge_list))
    corr_matrix = np.zeros(shape)
    # np.ndindex walks the cells row by row, i.e. in the same order as a
    # nested "for row / for column" loop, and yields nothing for empty shapes.
    for row, col in np.ndindex(shape):
        emb_en = get_emb(words_en_list[row], "en")
        emb_ge = get_emb(words_ge_list[col], "ge")
        corr_matrix[row, col] = get_correlation(emb_en, emb_ge)

    return corr_matrix

In [None]:
def get_word_matches(corr_matrix):
    """Greedily pair rows with columns through mutual best matches.

    A row ``i`` and a column ``j`` are paired when ``j`` is the best remaining
    column for ``i`` and ``i`` is the best remaining row for ``j``. Paired
    rows and columns are then removed from consideration and the search
    repeats until ``min(n_rows, n_cols)`` pairs exist.

    Args:
        corr_matrix: 2-D array-like of similarity scores (rows x columns).
            It is not modified.

    Returns:
        Dict mapping a row index to its matched column index. Empty when the
        matrix has no rows or no columns.
    """
    # Float copy: leaves the caller's data untouched and lets integer input
    # be masked with -inf below.
    scores = np.array(corr_matrix, dtype=float)
    if scores.size == 0:
        # Covers both "no rows" and "no columns"; argmax would raise on the
        # latter.
        return {}

    n_rows, n_cols = scores.shape
    target = min(n_rows, n_cols)
    couples = {}
    taken_cols = set()
    best_row_for_col = np.argmax(scores, axis=0)
    best_col_for_row = np.argmax(scores, axis=1)

    while len(couples) < target:
        matched_before = len(couples)
        for row in range(n_rows):
            if row in couples:
                continue
            col = int(best_col_for_row[row])
            if col in taken_cols or best_row_for_col[col] != row:
                continue
            couples[row] = col
            taken_cols.add(col)
            # Mask with -inf rather than 0: similarities may be negative, and
            # a 0 mask would then beat every remaining candidate in argmax,
            # which made the search stall forever or reuse a column.
            scores[row, :] = -np.inf
            scores[:, col] = -np.inf
            best_row_for_col = np.argmax(scores, axis=0)
            best_col_for_row = np.argmax(scores, axis=1)
        if len(couples) == matched_before:
            # Safety net: a pass without progress can never make progress
            # later, so stop instead of looping forever.
            break
    return couples

In [None]:
# 4x3 toy matrix holding 1..12 row by row, e.g. for trying out
# get_word_matches by hand.
mat = np.arange(1, 13).reshape(4, 3)

In [None]:
def get_word_couples(words_en,words_ge):
    """Align the words of an English and a German sentence.

    Both sentences are split on whitespace, the pairwise similarity matrix is
    built with ``get_corr_matrix`` and word pairs are chosen with
    ``get_word_matches``.

    Args:
        words_en: English sentence as a whitespace-separated string.
        words_ge: German sentence as a whitespace-separated string.

    Returns:
        A tuple ``(word_couples, score, left_words)``:

        * ``word_couples`` -- dict mapping each matched English word to its
          German partner (a repeated English word keeps its last pairing).
        * ``score`` -- mean similarity over the matched pairs.
        * ``left_words`` -- words of the longer sentence that found no
          partner, in sentence order; empty when both sentences have the
          same number of words.

        ``(nan, nan, nan)`` is returned when either sentence contains no
        words or no pair could be formed.
    """
    if len(words_en)==0 or len(words_ge)==0:
        return np.nan,np.nan,np.nan

    words_en_list = words_en.split()
    words_ge_list = words_ge.split()
    # Whitespace-only sentences pass the length check above but tokenise to
    # nothing; without this guard the score below divides by zero.
    if not words_en_list or not words_ge_list:
        return np.nan,np.nan,np.nan

    corr_mat = get_corr_matrix(words_en_list,words_ge_list)
    word_couples_idx = get_word_matches(corr_mat)
    if not word_couples_idx:
        return np.nan,np.nan,np.nan

    # Mean similarity of the selected (english index, german index) pairs.
    score = sum(corr_mat[i, j] for i, j in word_couples_idx.items())
    score /= len(word_couples_idx)

    if len(words_en_list)>len(words_ge_list):
        # English is longer: the unmatched rows are the left-over words.
        matched_idx = set(word_couples_idx)
        left_words = [w for idx, w in enumerate(words_en_list) if idx not in matched_idx]
    elif len(words_en_list)<len(words_ge_list):
        # German is longer: the candidates must range over the *German*
        # indices (the English length was used here before, which dropped or
        # mislabelled unmatched German words).
        matched_idx = set(word_couples_idx.values())
        left_words = [w for idx, w in enumerate(words_ge_list) if idx not in matched_idx]
    else:
        left_words = []

    word_couples = {
        words_en_list[en_idx]: words_ge_list[ge_idx]
        for en_idx, ge_idx in word_couples_idx.items()
    }

    return word_couples,score,left_words

In [None]:
# Row-wise alignment score: element [1] of get_word_couples' result is the
# mean similarity of the matched word pairs (NaN for an empty sentence).
dataset["correlation"] = dataset.apply(lambda row:get_word_couples(row["sentences_en_final"],row["sentences_ge_final"])[1],axis=1)


In [None]:
# Replace missing scores (NaN) with the mean of the "correlation" column.
dataset["correlation"] = dataset["correlation"].fillna(dataset["correlation"].mean())

In [None]:
# Pairwise correlation table (pandas default method) between the computed
# "correlation" column and the dataset's "scores" column.
dataset[["correlation","scores"]].corr()