In [3]:
import time
import random
import os
import requests
import numpy as np
from os import listdir, remove, mkdir
from os.path import join as join_path

# Preprocessing Europarl data

In [2]:
# Root of the aligned Europarl data; the "Join Files" step lists its sub-directories.
euro_data_dir = "../europarl/aligned/"
# Directory where the joined text files are written and from which the corpus is loaded.
output_data_dir = "./data/"

## Join Files

In [3]:
# For every <pair>/<language> directory below euro_data_dir, concatenate all of
# its files into a single "<pair>_<language>.txt" file in output_data_dir.
for l_pair in listdir(euro_data_dir):
    pair_path = join_path(euro_data_dir, l_pair)
    for l_dir in listdir(pair_path):
        l_path = join_path(pair_path, l_dir)
        # Context managers make sure each handle is flushed and closed, so the
        # joined file is complete on disk before any later cell reads it.
        with open(output_data_dir + l_pair + "_" + l_dir + ".txt", "w") as out_file:
            # Parts are appended in sorted file-name order.
            for file_name in sorted(listdir(l_path)):
                with open(join_path(l_path, file_name), "r") as in_file:
                    # Copy the content verbatim into the output file
                    out_file.write(in_file.read())


## Load corpus in memory

In [4]:
def load_corpus(data_dir, as_np_array=False):
    """Load every regular file of ``data_dir`` into memory, then delete it.

    Args:
        data_dir: Directory to scan. Sub-directories are skipped.
        as_np_array: If True, each file is stored as a NumPy object array of
            its lines, keyed by the full file name. If False, it is stored as
            a plain list keyed by the first three characters of the file name.

    Returns:
        A dict mapping the first five characters of each file name
        (presumably the language pair -- TODO confirm against the file naming)
        to a dict of line sequences. Lines are obtained with ``split('\\n')``,
        so a file ending in a newline yields a trailing empty string.

    Side effects:
        Every loaded file is removed from disk once it has been read.
    """
    corpus = {}
    for file_name in listdir(data_dir):
        path = join_path(data_dir, file_name)
        if not os.path.isfile(path):
            continue

        # The context manager closes the handle even if reading fails, and
        # always before the file is removed below.
        with open(path, "r+") as file:
            lines = file.read().split('\n')

        pair_data = corpus.setdefault(file_name[:5], {})
        if as_np_array:
            pair_data[file_name] = np.array(lines, dtype=object)
        else:
            # NOTE(review): this branch keys by file_name[:3] while the array
            # branch keys by the full file name -- confirm this is intended.
            pair_data[file_name[:3]] = lines

        remove(path)
    return corpus

In [None]:
# Load the joined files as NumPy object arrays (this also removes them from disk).
corpus = load_corpus(output_data_dir, as_np_array=True)

## Delete empty lines and lines with xml tags

In [5]:
# Element-wise predicate, built once instead of once per file: True for empty
# lines and for lines starting with "<". `otypes` is set explicitly so the
# predicate can also be applied to an empty array.
is_empty = np.vectorize(lambda x: len(x) == 0 or x[0] == "<", otypes=[bool])

for pair, l_data in corpus.items():

    # Find special lines: gather the indices flagged in ANY file of the pair,
    # so that exactly the same rows are dropped from every file below.
    empty_lines = np.array([], dtype=int)
    for value in l_data.values():
        indices = np.nonzero(is_empty(value))[0]
        empty_lines = np.concatenate((empty_lines, indices))
    empty_lines = np.unique(empty_lines)

    # Remove lines with an xml tag or an empty char; the cleaned data is
    # stored back as a plain list and its length is printed.
    for key, value in l_data.items():
        new_corpus = np.delete(value, empty_lines)
        corpus[pair][key] = new_corpus.tolist()
        print(len(new_corpus))
        

2048549
2048549
1957802
1957802
2007131
2007131


## Write new file(s)

In [8]:
# Write each cleaned line sequence to output_data_dir under its dictionary key,
# one line per row.
for lang_pair, pair_dict in corpus.items():
    for file_name, data in pair_dict.items():
        # The context manager flushes and closes the file; the original left
        # every handle open.
        with open(join_path(output_data_dir, file_name), 'w') as file:
            file.write("\n".join(data))


### The following steps must be run after executing the multialign script and the BPE tokenization script

## Join language files and add translation token

In [3]:
tokenized_dir = './data/tokenized/'
output_dir = './data/'

# For each sub-set directory, combine the per-language files into one source
# file and one target file covering every ordered (src, tgt) language pair.
for sub_set in listdir(tokenized_dir):
    sub_set_path = join_path(tokenized_dir, sub_set)

    corpus = {}
    src_data = []
    tgt_data = []

    # Key each file by its name without the last four characters
    # (presumably the ".txt" extension -- TODO confirm).
    for file_name in listdir(sub_set_path):
        file_path = join_path(sub_set_path, file_name)
        with open(file_path, 'r') as file:
            corpus[file_name[:-4]] = file.read().split('\n')

    # Every ordered pair of distinct languages, computed once per sub-set
    # instead of once per sentence.
    lang_pairs = [(src, tgt) for src in corpus for tgt in corpus if tgt != src]

    # An extra line will be created when loading the data to memory,
    # that's why the -1 is there (this assumes the files end with a newline).
    for i in range(0, len(corpus['en']) - 1):
        for src, tgt in lang_pairs:
            # Prefix the source sentence with the translation-direction token.
            src_sentence = '_src_{}_tgt_{} '.format(src, tgt) + corpus[src][i]
            src_data.append(src_sentence)
            tgt_data.append(corpus[tgt][i])

    # Context managers flush and close both outputs; the original never closed them.
    with open(join_path(output_dir, 'europarl_{}_src.txt'.format(sub_set)), 'w') as out_src:
        out_src.write("\n".join(src_data))
    with open(join_path(output_dir, 'europarl_{}_tgt.txt'.format(sub_set)), 'w') as out_tgt:
        out_tgt.write("\n".join(tgt_data))



# Preprocessing classifier data

## Generate Back-Translations

In [4]:
# Directory scanned for translation files; results are written to sub-directories of it.
location = './data/translated/'
# Number of translate / back-translate round trips run for each target language.
back_translation_num = 3
# Candidate target languages; the language of the input text itself is skipped.
lang_pool = ['en', 'es', 'fr', 'de']
# File names (without their last four characters) that the generation loop skips.
ignore = ['de_en', 'es_fr', 'fr_de', 'fr_en', 'fr_es']

In [10]:
def translate_text(text, src, tgt, file_name, dir_path):
    """Translate ``text`` with the local translation service and save the result.

    Args:
        text: Text to translate, sent as the ``text`` form field.
        src: Source language, sent as the ``src`` form field.
        tgt: Target language, sent as the ``tgt`` form field.
        file_name: Name of the output file without extension; ``.txt`` is appended.
        dir_path: Directory in which the output file is written.

    Returns:
        The body of the service response as text.

    Raises:
        requests.HTTPError: If the service answers with an error status, so
            that an error page is never saved or fed into a later translation.
    """
    start = time.time()
    response = requests.post('http://localhost:8080/translate', data = {'text': text, 'src': src, 'tgt': tgt})
    end = time.time()
    print('Elapsed time to generate {} for {} to {}: {}s'.format(file_name, src, tgt, end - start))

    # Fail loudly on 4xx/5xx instead of treating the error body as a translation.
    response.raise_for_status()
    translated = response.text

    # The context manager flushes and closes the result file.
    with open(join_path(dir_path, '{}.txt'.format(file_name)), 'w') as res_file:
        res_file.write(translated)
    return translated

# For every non-ignored file in `location`, run `back_translation_num`
# translate / back-translate round trips against each other language of
# `lang_pool`, followed by one final forward translation. Results are written
# to "<file stem>/<target language>/".
for file_name in listdir(location):
    path = join_path(location, file_name)
    # Skip directories (including result directories from earlier runs) and
    # explicitly ignored files.
    if not os.path.isfile(path) or file_name[:-4] in ignore:
        continue

    print("Starting loops for:", file_name)
    # Language codes are sliced from fixed positions of the file name: the
    # first two characters and the two characters before the last four.
    original_lang = file_name[:2]  # not used further in this cell
    translated_lang = file_name[-6:-4]

    # Result directory: the file path without its last four characters.
    dir_path = path[:-4]
    os.makedirs(dir_path, exist_ok=True)

    # Read the input through a context manager so the handle is closed.
    with open(path, 'r') as in_file:
        content = in_file.read()

    for tgt in lang_pool:
        if tgt == translated_lang:
            continue

        tgt_dir = join_path(dir_path, tgt)
        os.makedirs(tgt_dir, exist_ok=True)

        # Each round translates to `tgt` (T<i>) and back again (Back<i>);
        # the back-translation is the input of the next round.
        last_back = content
        for i in range(0, back_translation_num):
            translation = translate_text(last_back, translated_lang, tgt, 'T{}'.format(i), tgt_dir)
            last_back = translate_text(translation, tgt, translated_lang, 'Back{}'.format(i), tgt_dir)
        # Final forward translation of the last back-translation.
        translate_text(last_back, translated_lang, tgt, 'T{}'.format(back_translation_num), tgt_dir)




        

Starting loops for: de_es.txt
Elapsed time to generate T0 for es to en: 84.76807451248169s
Elapsed time to generate Back0 for en to es: 82.66909098625183s
Elapsed time to generate T1 for es to en: 77.02899098396301s
Elapsed time to generate Back1 for en to es: 80.14960050582886s
Elapsed time to generate T2 for es to en: 75.55847644805908s
Elapsed time to generate Back2 for en to es: 79.01752805709839s
Elapsed time to generate T3 for es to en: 74.84669542312622s
Elapsed time to generate T0 for es to fr: 98.60221600532532s
Elapsed time to generate Back0 for fr to es: 82.73007440567017s
Elapsed time to generate T1 for es to fr: 87.15043616294861s
Elapsed time to generate Back1 for fr to es: 79.43580174446106s
Elapsed time to generate T2 for es to fr: 85.46473169326782s
Elapsed time to generate Back2 for fr to es: 78.61906862258911s
Elapsed time to generate T3 for es to fr: 85.17634749412537s
Elapsed time to generate T0 for es to de: 88.93029475212097s
Elapsed time to generate Back0 for de

## Generate BLEU-Score Table