In [1]:
import time
import random
import os
import requests
import numpy as np
import pandas as pd
from os import listdir, remove, mkdir
from os.path import join as join_path
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from pyonmttok import Tokenizer

# Preprocessing europarl data

In [5]:
# Root of the aligned Europarl data; the join step below expects one
# sub-directory per language pair, each holding one sub-directory per language.
euro_data_dir = "../europarl/aligned/"
# Destination for the concatenated "<pair>_<lang>.txt" files.
output_data_dir = "./data/original/"

## Join Files

In [6]:
# Concatenate every file of each (language pair, language) directory into a
# single "<pair>_<lang>.txt" file inside output_data_dir.
for l_pair in listdir(euro_data_dir):
    pair_path = join_path(euro_data_dir, l_pair)
    for l_dir in listdir(pair_path):
        l_path = join_path(pair_path, l_dir)
        out_path = output_data_dir + l_pair + "_" + l_dir + ".txt"
        # `with` guarantees the output is flushed and closed even if a read fails.
        with open(out_path, "w") as out_file:
            # Files are appended in sorted name order (presumably so that both
            # languages of a pair stay line-aligned -- verify against the corpus).
            for file_name in sorted(listdir(l_path)):
                with open(join_path(l_path, file_name), "r") as in_file:
                    # Copy the content verbatim into the output file
                    out_file.write(in_file.read())


## Load corpus in memory

In [8]:
def load_corpus(data_dir, as_np_array=False):
    """Load every regular file of ``data_dir`` into memory, then delete it.

    Files are grouped by the first five characters of their name (e.g. the
    ``"de-en"`` prefix of ``"de-en_de.txt"``).

    Args:
        data_dir: Directory to scan. Sub-directories are skipped.
        as_np_array: When True, each file is stored under its full file name
            as a NumPy object array of lines. When False, it is stored under
            the first three characters of its file name as a plain list.

    Returns:
        dict mapping the five-character prefix to a dict of
        ``{file key: lines}``. Lines come from splitting the content on
        newline characters, so a file ending with a newline yields a final
        empty string.

    Warning:
        This function is destructive: each file is removed from disk right
        after it has been read.
    """
    corpus = {}
    for file_name in listdir(data_dir):
        path = join_path(data_dir, file_name)
        if not os.path.isfile(path):
            continue

        # Read-only access is enough; the context manager closes the handle
        # even if reading fails, so the file is never removed half-read.
        with open(path, "r") as file:
            lines = file.read().split('\n')

        pair_files = corpus.setdefault(file_name[:5], {})
        if as_np_array:
            pair_files[file_name] = np.array(lines, dtype=object)
        else:
            pair_files[file_name[:3]] = lines

        remove(path)
    return corpus

In [9]:
# Load every joined file as a NumPy object array of lines. Note that
# load_corpus deletes each file from output_data_dir once it has been read.
corpus = load_corpus(output_data_dir, True)

## Delete short lines and lines starting with an XML tag or a parenthesis

In [10]:
# Lines of at most this many characters are treated as "special" and dropped.
SHORT_LINE_MAX_LEN = 7


def _is_special_line(line):
    """Return True for a line that is too short or starts with "<" or "("."""
    # The length test runs first, so an empty string never reaches startswith.
    return len(line) <= SHORT_LINE_MAX_LEN or line.startswith(("<", "("))


# Built once instead of on every loop iteration; the explicit output type lets
# the vectorized function accept empty inputs as well.
is_empty = np.vectorize(_is_special_line, otypes=[bool])

for pair, l_data in corpus.items():

    # Find special lines: an index is dropped from every language of the pair
    # as soon as the line is special in any one of them, which keeps the
    # remaining lines aligned across languages.
    empty_lines = np.array([], dtype=int)
    for value in l_data.values():
        indices = np.nonzero(is_empty(value))[0]
        empty_lines = np.concatenate((empty_lines, indices))
    empty_lines = np.unique(empty_lines)

    # Remove the special lines and store the result back as a plain list
    for key, value in l_data.items():
        new_corpus = np.delete(value, empty_lines)
        corpus[pair][key] = new_corpus.tolist()
        print(len(new_corpus))
        

2000491
2000491
1910858
1910858
1958593
1958593


## Write new file(s)

In [11]:
# Write each filtered line list to output_data_dir under the key it is stored
# with in `corpus`, one line per row.
for lang_pair, pair_dict in corpus.items():
    for file_name, data in pair_dict.items():
        # The context manager flushes and closes each file before the next one.
        with open(join_path(output_data_dir, file_name), 'w') as file:
            file.write("\n".join(data))


### The following steps must be run after executing the multialign script and the BPE tokenization script

## Join language files and add translation token

In [2]:
tokenized_dir = './data/tokenized/'
output_dir = './data/'

# For every sub-set directory, pair each language with every other language
# and emit two parallel files: sources prefixed with a "_src_X_tgt_Y " token
# and the matching targets.
for sub_set in listdir(tokenized_dir):
    sub_set_path = join_path(tokenized_dir, sub_set)

    corpus = {}
    src_data = []
    tgt_data = []

    for file_name in listdir(sub_set_path):
        with open(join_path(sub_set_path, file_name), 'r') as file:
            # Key by the file name minus its last four characters (the extension).
            corpus[file_name[:-4]] = file.read().split('\n')

    # An extra line will be created when loading the data to memory,
    # that's why the -1 is there
    for i in range(0, len(corpus['en']) - 1):
        for src in corpus:
            for tgt in corpus:
                if tgt == src:
                    continue
                src_data.append('_src_{}_tgt_{} '.format(src, tgt) + corpus[src][i])
                tgt_data.append(corpus[tgt][i])

    # Each output gets its own handle and is closed as soon as it is written.
    with open(join_path(output_dir, 'europarl_{}_src.txt'.format(sub_set)), 'w') as out_src:
        out_src.write("\n".join(src_data))
    with open(join_path(output_dir, 'europarl_{}_tgt.txt'.format(sub_set)), 'w') as out_tgt:
        out_tgt.write("\n".join(tgt_data))



# Preprocessing classifier data

In [20]:
# Directory with the translated input files and the per-file result
# directories created by the back-translation step.
location = './data/translated/'
# Directory the final score table is written to.
out_location = './data/'
# Languages used as back-translation pivots and as labels in the score table.
lang_pool = ['en', 'es', 'fr', 'de']
# Number of translate/back-translate round trips per pivot language.
back_translation_num = 3
# Row count allocated per input file in the score table
# (presumably the number of sentences per translated file -- TODO confirm).
file_size = 5000

## Generate Back-Translations

In [16]:
# File names (without their last four characters) that the back-translation
# loop should skip.
ignore = []

def translate_text(text, src, tgt, file_name, dir_path):
    """Translate ``text`` through the local translation service and save it.

    Args:
        text: Text to translate.
        src: Source language code sent to the service.
        tgt: Target language code sent to the service.
        file_name: Name (without extension) of the result file.
        dir_path: Directory in which ``<file_name>.txt`` is written.

    Returns:
        The response body returned by the service.

    Raises:
        requests.HTTPError: If the service answers with an error status, so
            that an error page is never stored or fed into the next round.
    """
    start = time.time()
    response = requests.post('http://localhost:8080/translate', data = {'text': text, 'src': src, 'tgt': tgt})
    end = time.time()
    print('Elapsed time to generate {} for {} to {}: {}s'.format(file_name, src, tgt, end - start))
    # Checked after the timing log so the elapsed time is still reported.
    response.raise_for_status()
    with open(join_path(dir_path, '{}.txt'.format(file_name)), 'w') as res_file:
        res_file.write(response.text)
    return response.text

# For every translated file, bounce its content between the file's language
# and each other language of the pool, saving every intermediate result.
for file_name in listdir(location):
    path = join_path(location, file_name)
    # Results live in a directory named like the file minus its extension.
    dir_path = path[:-4]
    if os.path.isfile(path) and file_name[:-4] not in ignore:
        print("Starting loops for:", file_name)
        # File names follow "<origin>_<translated>.txt".
        original_lang = file_name[:2]
        translated_lang = file_name[-6:-4]

        os.makedirs(dir_path, exist_ok=True)

        with open(path, 'r') as src_file:
            content = src_file.read()

        for tgt in lang_pool:
            if tgt == translated_lang:
                continue

            tgt_dir = join_path(dir_path, tgt)
            os.makedirs(tgt_dir, exist_ok=True)

            # T<i> is the i-th translation into `tgt`; Back<i> is that
            # translation brought back into the file's language.
            last_back = content
            for i in range(0, back_translation_num):
                translation = translate_text(last_back, translated_lang, tgt, 'T{}'.format(i), tgt_dir)
                last_back = translate_text(translation, tgt, translated_lang, 'Back{}'.format(i), tgt_dir)
            # One final forward translation so that the last round has a T<i+1> to compare against.
            translate_text(last_back, translated_lang, tgt, 'T{}'.format(back_translation_num), tgt_dir)




        

Starting loops for: fr_es.txt
Elapsed time to generate T0 for es to en: 72.79625606536865s
Elapsed time to generate Back0 for en to es: 71.71730089187622s
Elapsed time to generate T1 for es to en: 67.70391917228699s
Elapsed time to generate Back1 for en to es: 69.71693444252014s
Elapsed time to generate T2 for es to en: 66.12536144256592s
Elapsed time to generate Back2 for en to es: 69.00499105453491s
Elapsed time to generate T3 for es to en: 66.20717287063599s
Elapsed time to generate T0 for es to fr: 83.52035069465637s
Elapsed time to generate Back0 for fr to es: 72.34266018867493s
Elapsed time to generate T1 for es to fr: 75.88401317596436s
Elapsed time to generate Back1 for fr to es: 68.62711930274963s
Elapsed time to generate T2 for es to fr: 74.03134655952454s
Elapsed time to generate Back2 for fr to es: 68.49460077285767s
Elapsed time to generate T3 for es to fr: 73.70856547355652s
Elapsed time to generate T0 for es to de: 75.48436570167542s
Elapsed time to generate Back0 for de

## Generate Score Table (METEOR; the BLEU variant is commented out)

In [21]:
# One score column per (language, round) pair, in language-major order,
# followed by the three metadata columns.
columns = ['T{}-T{}_{}'.format(i, i+1, lang) for lang in lang_pool for i in range(
    0, back_translation_num)] + ['src', 'origin', 'len']
# Derived from the pool size so the table follows lang_pool if it changes.
score_cols = len(lang_pool) * back_translation_num
tokenizer = Tokenizer('conservative')
smoothing = SmoothingFunction()
# BLEU alternative:
# score_func = lambda ref, hyp: sentence_bleu([ref], hyp, smoothing_function=smoothing.method4)
score_func = lambda ref, hyp: meteor_score([ref], hyp)


def _read_lines(file_path):
    """Return the content of ``file_path`` split on newlines, closing the file."""
    with open(file_path, 'r') as handle:
        return handle.read().split('\n')


# The first row is an all-zero placeholder; the following cell removes row 0.
tables = [np.zeros((score_cols + 3))]

dirs = os.listdir(location)
for directory in dirs:
    path = join_path(location, directory)
    if not os.path.isdir(path):
        continue

    bleu_data = np.zeros((file_size, score_cols))

    # Score each sentence against its next back-translation round
    for lang_idx, lang in enumerate(lang_pool):
        lang_path = join_path(path, lang)
        if not os.path.isdir(lang_path):
            # No results for this language: its columns stay at zero.
            continue

        sentences = [
            _read_lines(join_path(lang_path, 'T{}.txt'.format(i)))
            for i in range(0, back_translation_num + 1)
        ]

        for i in range(0, len(sentences[0])):
            for j in range(0, back_translation_num):
                ref_sentence = sentences[j][i]
                hypothesis = sentences[j+1][i]
                bleu_data[i, lang_idx * back_translation_num + j] = score_func(ref_sentence, hypothesis)

    # Append src and origin data, encoded as indices into lang_pool
    # (src comes from the last two characters of the directory name,
    # origin from the first two).
    bleu_data = np.hstack(
        (bleu_data, np.full((file_size, 1), lang_pool.index(directory[-2:]))))
    bleu_data = np.hstack(
        (bleu_data, np.full((file_size, 1), lang_pool.index(directory[:2]))))

    # Append sentence length data from the src file
    src_sentences = _read_lines(join_path(location, '{}.txt'.format(directory)))
    tok_sentences = [tokenizer.tokenize(s)[0] for s in src_sentences]
    lengths = [len(ts) for ts in tok_sentences]
    bleu_data = np.hstack((bleu_data, np.array([lengths]).T))

    tables.append(bleu_data)

# Stack once at the end instead of re-copying the whole table per directory.
data = np.vstack(tables)


In [22]:
# Drop the all-zero placeholder row that sits at the top of `data`.
# NOTE: re-running this cell without rebuilding `data` removes a real row.
data = np.delete(data, (0), axis=0)
# Index by the actual row count instead of assuming a fixed number of directories.
dataFrame = pd.DataFrame(data, index=range(0, len(data)), columns=columns)
# Turn the numeric language indices back into language codes.
dataFrame['src'] = dataFrame['src'].apply(lambda x: lang_pool[int(x)])
dataFrame['origin'] = dataFrame['origin'].apply(lambda x: lang_pool[int(x)])
dataFrame.to_csv(join_path(out_location, 'meteor_table_200k.csv'))


In [23]:
# Preview the first rows of the score table.
dataFrame.head()

Unnamed: 0,T0-T1_en,T1-T2_en,T2-T3_en,T0-T1_es,T1-T2_es,T2-T3_es,T0-T1_fr,T1-T2_fr,T2-T3_fr,T0-T1_de,T1-T2_de,T2-T3_de,src,origin,len
0,0.997685,0.997685,0.997685,0.996,0.996,0.996,0.0,0.0,0.0,0.996,0.996,0.996,fr,en,8.0
1,0.780859,0.999624,0.999624,0.766154,0.92094,0.999772,0.0,0.0,0.0,0.921592,0.999624,0.999624,fr,en,17.0
2,0.998542,0.998542,0.998542,0.999023,0.999023,0.999023,0.0,0.0,0.0,0.999023,0.999023,0.999023,fr,en,9.0
3,0.916972,0.9995,0.9995,0.979938,0.999914,0.999914,0.0,0.0,0.0,0.928079,0.999711,0.999711,fr,en,15.0
4,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.5,0.5,0.5,fr,en,1.0
