In [1]:
import numpy as np
import pandas as pd

In [2]:
from laserembeddings import Laser

# Build the LASER encoder once; the cells below reuse this instance.
laser = Laser()

# Single-language batch: one language code covers every sentence.
# `lang` is only used for tokenization.
embeddings = laser.embed_sentences(
    [
        'let your neural network be polyglot',
        'use multilingual embeddings!',
    ],
    lang='en',
)

# `embeddings` is an N x 1024 NumPy array, N being the number of sentences.

In [3]:
# Sanity check on the array size; the recorded output is (2, 1024) for the
# two sentences embedded above.
embeddings.shape

(2, 1024)

In [4]:
# Mixed-language batch: `lang` is a list with one language code per sentence,
# given in the same order as the sentences. This rebinds `embeddings`.
# The French sentence previously contained mojibake ("p√¢tes"); it is
# restored to the intended "pâtes" so the encoder sees real French text.
embeddings = laser.embed_sentences(
    ['I love pasta.',
     "J'adore les pâtes.",
     'Ich liebe Pasta.'],
    lang=['en', 'fr', 'de'])

In [5]:
def get_correlation(src_sentence, tgt_sentence, src_lang='en', tgt_lang='de'):
    """Return the cosine similarity of the LASER embeddings of two sentences.

    Despite the name, this is not a statistical correlation: each embedding
    is scaled to unit L2 norm and the two unit vectors are dotted.

    Relies on the module-level ``laser`` encoder being defined.

    Parameters
    ----------
    src_sentence : str
        Source-side sentence.
    tgt_sentence : str
        Target-side sentence.
    src_lang : str, optional
        Language code passed to the encoder for ``src_sentence``.
        Defaults to ``'en'`` (the previously hard-coded value).
    tgt_lang : str, optional
        Language code passed to the encoder for ``tgt_sentence``.
        Defaults to ``'de'`` (the previously hard-coded value).

    Returns
    -------
    scalar
        Dot product of the two L2-normalised embeddings. If either
        embedding has zero norm the division yields NaN.
    """
    # Embed both sentences in one call; row 0 is the source, row 1 the target.
    embeddings = laser.embed_sentences(
        [src_sentence, tgt_sentence], lang=[src_lang, tgt_lang])
    src_emb, tgt_emb = embeddings[0], embeddings[1]

    src_unit = src_emb / np.linalg.norm(src_emb)
    tgt_unit = tgt_emb / np.linalg.norm(tgt_emb)
    return src_unit.dot(tgt_unit)

In [8]:
# Smoke test on a near-translation pair (note "pasta" vs. "pizza"); the
# recorded result is about 0.86.
get_correlation('I love pasta.', 'Ich liebe pizza.')

0.85943264

In [6]:
# Load the frame that later cells read `sentences_en`, `sentences_ge` and
# `scores` from.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
dataset = pd.read_pickle('../data/dataset_correlations_v2.pickle')

In [41]:
# Add one LASER similarity per row, computed from that row's English and
# German sentences. The encoder is called once per row, so this cell is slow.
dataset['sentence_correlation'] = dataset.apply(
    lambda row: get_correlation(row["sentences_en"], row["sentences_ge"]),
    axis=1,
)



In [43]:
# Correlation between the LASER similarity feature and the `scores` column
# (Series.corr with its default method). The recorded value is about 0.057,
# i.e. close to zero.
dataset['sentence_correlation'].corr(dataset['scores'])

0.0568468439418905

In [47]:
# dataset.to_pickle('../data/dataset_corrleations_laser.pickle')

### Computing the LASER embeddings for the test set

In [9]:
import pandas as pd
import os

In [10]:
def extract_sentences(filename, lower=False, encoding='utf-8'):
    """Read a text file and return its non-blank lines, stripped.

    Parameters
    ----------
    filename : str or path-like
        File to read, one sentence per line.
    lower : bool, optional
        When true, each returned line is lower-cased. Defaults to ``False``.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so
        the result does not depend on the platform's locale.

    Returns
    -------
    list of str
        Lines in file order with surrounding whitespace removed. Lines that
        are empty after stripping are dropped, so positions in the result do
        not necessarily match line numbers in the file.
    """
    sentences = []
    # The context manager closes the handle even if decoding fails part-way.
    with open(filename, encoding=encoding) as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue
            sentences.append(text.lower() if lower else text)
    return sentences

In [17]:
# Load the `.src` and `.mt` files of the en-de test split into two
# single-column frames.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
sentences_en = pd.DataFrame(
    extract_sentences(
        '/Users/theophile/Documents/Masters_ML/NLP/coursework/sentence_level_qe_ffl/data/en-de/test.ende.src'
    ),
    columns=['sentences_en'],
)
sentences_ge = pd.DataFrame(
    extract_sentences(
        '/Users/theophile/Documents/Masters_ML/NLP/coursework/sentence_level_qe_ffl/data/en-de/test.ende.mt'
    ),
    columns=['sentences_ge'],
)

In [18]:
# Pair the two sides row by row by joining on the positional index.
# NOTE(review): with the default join type only index values present in both
# frames survive, so a line-count mismatch between the files would be
# truncated silently - confirm both files have the same number of sentences.
dataset_test_laser = pd.merge(
    sentences_en,
    sentences_ge,
    left_index=True,
    right_index=True,
)

In [19]:
# Score every test pair with the LASER similarity.
# The rows must come from `dataset_test_laser` itself: applying over
# `dataset` (as this cell previously did) computes similarities for the
# other frame's sentences and assigns them to the test rows by index.
dataset_test_laser['sentence_correlation'] = dataset_test_laser.apply(
    lambda row: get_correlation(row["sentences_en"], row["sentences_ge"]),
    axis=1,
)



In [20]:
# dataset_test_laser.to_pickle('../data/test_dataset_corrleatiojhjklns_laser.pickle')

In [42]:
# Keep only the sentence pair and score columns. Despite its name, this frame
# holds the sentences to embed, not the embeddings themselves.
laser_embeddings = dataset[
    ['sentences_en', 'sentences_ge', 'scores']
]

In [49]:
# Stack the scored rows from `dataset` on top of the test rows.
# This is built from `dataset` rather than from `laser_embeddings` itself so
# that re-running the cell does not append the test rows a second time.
# The original indices are kept, so index values repeat across the two parts.
laser_embeddings = pd.concat(
    [dataset[['sentences_en', 'sentences_ge', 'scores']], dataset_test_laser]
)

In [52]:
# Row and column count of the combined frame; the recorded output is (9000, 3).
laser_embeddings.shape

(9000, 3)

In [53]:
# Column names of the combined frame.
# NOTE(review): the recorded output lists no `scores` column even though the
# concat cell selects it - likely stale output from out-of-order execution;
# confirm after Restart & Run All.
laser_embeddings.columns

Index(['sentences_en', 'sentences_ge', 'sentence_correlation'], dtype='object')

In [55]:
# Embed every English sentence of the combined frame in a single call.
full_laser_english = laser.embed_sentences(
    laser_embeddings['sentences_en'].tolist(),
    lang='en',
)

In [56]:
# Embed every German sentence of the combined frame in a single call.
full_laser_german = laser.embed_sentences(
    laser_embeddings['sentences_ge'].tolist(),
    lang='de',
)



In [58]:
# Persist both embedding matrices, written relative to the current working
# directory.
np.save(file='laser_1024_english', arr=full_laser_english)
np.save(file='laser_1024_german', arr=full_laser_german)

In [61]:
# One 1024-dimensional row per sentence; the recorded output is (9000, 1024).
full_laser_english.shape

(9000, 1024)