In [1]:
import random
import numpy as np
from langdetect import detect
from sacremoses import MosesTokenizer, MosesDetokenizer
from laserembeddings import Laser
from scipy import spatial

In [2]:
# Corpus selection: which dataset to analyse and the language pair it covers.
dataset = "paracrawl"
source = "en"
target = "de"

# Common path prefix of the processed corpus; each language side is stored
# under this prefix with the language code as its file extension.
location = "../data/{}-{}/processed_data/{}".format(source, target, dataset)

source_file, target_file = (
    location + ".{}".format(lang_code) for lang_code in (source, target)
)

In [3]:
num_lines = 10**5   # expected (not exact) number of sentence pairs to keep
sample_seed = 0     # fixed seed so the sample is reproducible on a fresh kernel

# Count the source lines once to derive the per-line keep probability.
# The target file is assumed to be line-aligned with the source file and thus
# to have the same number of lines; that assumption is not verified here.
with open(source_file, encoding="utf-8") as f:
    total_source = total_target = sum(1 for _ in f)

# Guard the division so an empty corpus yields an empty sample instead of a
# ZeroDivisionError.
sample_rate = num_lines / total_source if total_source else 0.0

# Dedicated generator: seeding it does not disturb the global `random` state.
rng = random.Random(sample_seed)

source_lines = []
target_lines = []

# Stream both files in lockstep and keep each aligned pair independently with
# probability `sample_rate`. `zip` stops at the shorter file, so a truncated
# target file can no longer contribute empty-string "translations". The `with`
# block closes both handles even if reading fails part-way.
with open(source_file, encoding="utf-8") as f, open(target_file, encoding="utf-8") as g:
    for line, line_2 in zip(f, g):
        if rng.random() < sample_rate:
            source_lines.append(line.strip())
            target_lines.append(line_2.strip())

In [4]:
# Tokenize every sampled source-language sentence with a Moses tokenizer
# configured for that language; each sentence becomes a list of tokens.
mt = MosesTokenizer(lang=source)
source_lines_tokenized = list(map(mt.tokenize, source_lines))

In [5]:
# Load the devtest sentences for the source language, one sentence per line.
# Iterating over the file (rather than read().split("\n")) avoids the spurious
# empty final entry that appears when the file ends with a newline, and the
# `with` block closes the handle deterministically.
with open("../data/test/flores/devtest/devtest.{}".format(source), encoding="utf-8") as test_file:
    test_lines_source = [line.rstrip("\n") for line in test_file]

# Build the tokenizer explicitly so this cell does not depend on which language
# the shared `mt` variable happens to hold when the cell is run.
source_tokenizer = MosesTokenizer(lang=source)
test_lines_source_tokenized = [source_tokenizer.tokenize(line.strip()) for line in test_lines_source]

In [6]:
# Rebind `mt` to a Moses tokenizer for the target language and tokenize every
# sampled target-language sentence; each sentence becomes a list of tokens.
mt = MosesTokenizer(lang=target)
target_lines_tokenized = list(map(mt.tokenize, target_lines))

In [7]:
# Load the devtest sentences for the target language, one sentence per line.
# Iterating over the file (rather than read().split("\n")) avoids the spurious
# empty final entry that appears when the file ends with a newline, and the
# `with` block closes the handle deterministically.
with open("../data/test/flores/devtest/devtest.{}".format(target), encoding="utf-8") as test_file:
    test_lines_target = [line.rstrip("\n") for line in test_file]

# Build the tokenizer explicitly so this cell does not depend on which language
# the shared `mt` variable happens to hold when the cell is run.
target_tokenizer = MosesTokenizer(lang=target)
test_lines_target_tokenized = [target_tokenizer.tokenize(line.strip()) for line in test_lines_target]

In [8]:
# Peek at the first sentence of each collection: the sampled corpus pair
# followed by the test-set pair.
(
    source_lines[0],
    target_lines[0],
    test_lines_source[0],
    test_lines_target[0],
)

('schwatzgelb: thanks for a revealing and extensive interview, mr. saftig!',
 'schwatzgelb: danke für ein aufschlussreiches und umfangreiches interview, herr saftig!',
 '"we now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
 '„wir haben jetzt 4 monate alte mäuse, die diabetes hatten und jetzt keinen mehr haben“, fügte er hinzu.')

In [9]:
# Number of source sentences kept by the random sampling step.
len(source_lines)

99418

In [10]:
# Number of target sentences kept by the random sampling step.
len(target_lines)

99418

In [11]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list.

    Exactly one level of nesting is removed; the items themselves are left
    untouched and keep their original order.

    Args:
        t: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding the items of each element of ``t``, in order.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [12]:
# Vocabulary size = number of distinct tokens across all tokenized sentences.

# Sampled corpus, per language.
source_words = len({token for sentence in source_lines_tokenized for token in sentence})
target_words = len({token for sentence in target_lines_tokenized for token in sentence})

# Test set, per language.
test_source_words = len({token for sentence in test_lines_source_tokenized for token in sentence})
test_target_words = len({token for sentence in test_lines_target_tokenized for token in sentence})

In [13]:
# Distinct-token counts of the sampled corpus, as (source, target).
source_words, target_words

(113636, 192296)

In [14]:
# Distinct-token counts of the test set, as (source, target).
test_source_words, test_target_words

(5784, 7192)

In [15]:
# Count the distinct tokens that the sampled corpus shares with the test set,
# separately for each language.
source_vocab = set(flatten(source_lines_tokenized))
test_source_vocab = set(flatten(test_lines_source_tokenized))
words_in_test_source = len(source_vocab & test_source_vocab)

target_vocab = set(flatten(target_lines_tokenized))
test_target_vocab = set(flatten(test_lines_target_tokenized))
words_in_test_target = len(target_vocab & test_target_vocab)

In [16]:
# Share of the test-set vocabulary that also occurs in the sampled corpus,
# as (source, target), rounded to two decimals.
tuple(
    round(shared / test_vocab_size, 2)
    for shared, test_vocab_size in (
        (words_in_test_source, test_source_words),
        (words_in_test_target, test_target_words),
    )
)

(0.89, 0.8)

In [17]:
# Instantiate the LASER sentence encoder with its default arguments.
laser = Laser()

In [18]:
# Embed only the first 1000 sampled source sentences (a slice of the sample, not all of it).
embeddings_source = laser.embed_sentences(source_lines[:1000],lang=source)

In [19]:
# Embed only the first 1000 sampled target sentences (a slice of the sample, not all of it).
embeddings_target = laser.embed_sentences(target_lines[:1000],lang=target)

In [20]:
# Cosine similarity between the embeddings of each aligned sentence pair.
# scipy returns a cosine *distance*, so similarity is 1 minus that value.
similarities = []
for pair_index in range(len(embeddings_source)):
    cosine_distance = spatial.distance.cosine(
        embeddings_source[pair_index], embeddings_target[pair_index]
    )
    similarities.append(1 - cosine_distance)

In [21]:
# Average cosine similarity over the embedded sentence pairs.
np.mean(similarities)

0.8738329346776008

In [22]:
# Median sentence length, in tokens, of the sampled corpus for each language.
source_sentence_lengths = list(map(len, source_lines_tokenized))
target_sentence_lengths = list(map(len, target_lines_tokenized))

source_words_per_sentence = np.median(source_sentence_lengths)
target_words_per_sentence = np.median(target_sentence_lengths)

(source_words_per_sentence, target_words_per_sentence)

(15.0, 15.0)

In [23]:
# Median sentence length, in tokens, of the source-language test set.
np.median(list(map(len, test_lines_source_tokenized)))

23.0