In [26]:
import pandas as pd
import os 
import sys
import collections
from mosestokenizer import MosesTokenizer
from multiprocessing import Pool
import tempfile
import numpy as np

In [27]:
# Directory containing the parallel corpus files; the train/valid/test path
# templates in this notebook are built by joining file names onto it.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
DATASET_PATH = "/mnt/dl/Translation/WMT_15/en-fr"

In [28]:
# Language codes of the translation direction. They are used both as the
# file-name suffix of the corpus files and as the language passed to the tokenizer.
src_lang, tgt_lang = 'en', 'fr'

In [29]:
512 * 8096

4145152

In [30]:
# Upper bound on the number of lines loaded from each training file
# (512 * 8096 = 4,145,152); passed to read_file as `chunksize`.
# NOTE(review): 8096 may be a typo for 8192 — confirm the intended size.
MAX_DATASIZE = 512 * 8096
MAX_DATASIZE

4145152

In [31]:
# Path templates with two `str.format` slots: the first is the processing-stage
# tag (e.g. 'tok.clean.dl'), the second is the language code (e.g. 'en').
train_file = os.path.join(DATASET_PATH,  'train.{}.{}')
valid_file = os.path.join(DATASET_PATH, 'valid.{}.{}')
test_file = os.path.join(DATASET_PATH,  'test.{}.{}')

In [32]:
def read_file(fname, chunksize=int(4e6)):
    """Read at most ``chunksize`` lines of a UTF-8 text file into a Series.

    Args:
        fname: Path of the text file to read.
        chunksize: Maximum number of lines to load; any further lines in the
            file are ignored.

    Returns:
        pandas.Series (object dtype, default 0..n-1 index) holding the lines
        in file order. Line terminators are kept. An empty file yields an
        empty Series.
    """
    print("Reading filename: ", fname)
    # Grow the list as lines arrive instead of pre-filling `chunksize`
    # placeholders: small files no longer cost a multi-million element
    # allocation, and no placeholder has to be dropped afterwards.
    lines = []
    with open(fname, 'r', encoding="utf-8") as f:
        for line in f:
            if len(lines) >= chunksize:
                break
            lines.append(line)
    # Report the number of lines actually kept. Deriving it from the list
    # (rather than from the loop index) stays correct for empty files and
    # when the chunksize cap cut the read short.
    print(f'Total lines for {fname}: {len(lines)}')
    return pd.Series(lines, dtype=object)
        

In [33]:
def save_file(lines, fname):
    """Write each item of ``lines`` to ``fname`` followed by a newline.

    The file is written as UTF-8 so that it round-trips with ``read_file``,
    which decodes UTF-8, regardless of the platform's default encoding.

    Args:
        lines: Iterable of strings; each one becomes one line of the file
            (an empty string produces a blank line).
        fname: Destination path; an existing file is overwritten.

    Returns:
        True once every line has been written.
    """
    with open(fname, "w", encoding="utf-8") as f:
        for line in lines:
            f.write(line + "\n")
    return True

In [34]:
# Load up to MAX_DATASIZE lines from each side of the training corpus; the two
# Series share the default 0..n-1 index, so row i pairs source line i with
# target line i.
# NOTE(review): valid_df and test_df are only created by the commented-out code
# below, yet later cells reference them — those cells fail on a fresh kernel
# unless this code is re-enabled.
train_df = pd.DataFrame({'src': read_file(train_file.format('tok.clean.dl', src_lang), MAX_DATASIZE), 
                        'tgt': read_file(train_file.format('tok.clean.dl', tgt_lang), MAX_DATASIZE), 
                        })
# valid_df = pd.DataFrame({'src': read_file(valid_file.format('tok.clean', src_lang), 10000), 
#                         'tgt': read_file(valid_file.format('tok.clean', tgt_lang), 10000), 
#                         })
# test_df = pd.DataFrame({'src': read_file(test_file.format('tok.clean', src_lang), MAX_DATASIZE), 
#                         'tgt': read_file(test_file.format('tok.clean', tgt_lang), MAX_DATASIZE), 
#                         })


Reading filename:  /mnt/dl/Translation/WMT_15/en-fr/train.tok.clean.dl.en
List created
Total lines for /mnt/dl/Translation/WMT_15/en-fr/train.tok.clean.dl.en: 4145152
Reading filename:  /mnt/dl/Translation/WMT_15/en-fr/train.tok.clean.dl.fr
List created
Total lines for /mnt/dl/Translation/WMT_15/en-fr/train.tok.clean.dl.fr: 4145152


In [35]:
train_df

Unnamed: 0,src,tgt
0,"In his briefing on economic development , Al H...",Dans sa présentation sur le développement écon...
1,( b ) Positive aspects\n,b ) Aspects positifs\n
2,Activities of the second type will be a major ...,"Pour la mise en oeuvre , la deuxième catégorie..."
3,Emergency supplemental requirements with respe...,Les ressources supplémentaires à prélever d&ap...
4,"Order No. 1991-R-140 dated May 17 , 1991 - Cit...",L&apos; arrêté no 1991-R-140 du 17 mai 1991 - ...
...,...,...
4145147,Documentation :\n,Documentation\n
4145148,"Unfortunately , we all know that this amendmen...",Nous savons tous que cet amendement n&apos; a ...
4145149,"Furthermore , the resumption of the Peace Jirg...","En outre , la reprise du processus de paix de ..."
4145150,"For product vs. product comparisons , the actu...","Pour les comparaisons de produit à produit , l..."


In [18]:
train_df

Unnamed: 0,src,tgt
0,"In his briefing on economic development , Al H...",Dans sa présentation sur le développement écon...
1,( b ) Positive aspects\n,b ) Aspects positifs\n
2,Crop insurance payments include only governmen...,Les indemnités d ’ assurance-récolte comprenne...
3,Activities of the second type will be a major ...,"Pour la mise en oeuvre , la deuxième catégorie..."
4,Emergency supplemental requirements with respe...,Les ressources supplémentaires à prélever d&ap...
...,...,...
40729915,Provide the &quot; trade date &quot; not the &...,Indiquer la date de l ’ opération et non celle...
40729916,Heather says a friend had a BabyFirst home vis...,Heather dit qu ’ une de ses amies avait eu une...
40729917,&quot; Software &quot; specially designed or m...,"Cuves de réacteurs Cuves métalliques , ou élém..."
40729918,That would definitely be interesting .\n,Ce serait certainement intéressant .\n


In [36]:
train_df.iloc[-1].array

<PandasArray>
['In early January 18 officers and 196 NCOs went to Britain to take up training duties , with more to follow in subsequent weeks .\n', 'Il réclame également 200 sous-officiers aguerris , qui doivent tous être détachés pour une période de trois mois et qui pourront être rappelés si leur présence devient nécessaire aux opérations .\n']
Length: 2, dtype: object

In [48]:
STEP = 250005

In [49]:
train_df.iloc[STEP].array[0]

'We &apos;re always talking about our image on the outside , but I &apos;d be interested in hearing what suggestions you have to improve our image on the inside of the military , because there &apos;s a big problem of our image right there .\n'

In [50]:
train_df.iloc[STEP].array[1]

'Nous parlons constamment de notre image à l&apos; étranger , et j&apos; aimerais donc savoir ce que vous pourriez proposer pour améliorer notre image au sein même de l&apos; armée , parce qu&apos; il semble y avoir un grand problème de ce côté-là .\n'

In [None]:
train_df

In [None]:
valid_df

In [None]:
test_df

In [None]:
# Tokenizers are cached per (process id, language). Keying on the pid makes a
# forked worker build its own instance instead of reusing one inherited from
# the parent process.
_TOKENIZER_CACHE = {}


def tokenize_filter(text, lang):
    """Tokenize ``text`` for language ``lang`` and re-join tokens with spaces.

    The tokenizer for each language is created once per process and reused.
    This function is called twice for every sentence pair, and building a
    fresh MosesTokenizer on each call dominated the run time.

    Args:
        text: Sentence to tokenize.
        lang: Language code handed to ``MosesTokenizer`` (e.g. 'en', 'fr').

    Returns:
        The tokens joined by single spaces.
    """
    cache_key = (os.getpid(), lang)
    tokenizer = _TOKENIZER_CACHE.get(cache_key)
    if tokenizer is None:
        tokenizer = MosesTokenizer(lang)
        _TOKENIZER_CACHE[cache_key] = tokenizer
    return " ".join(tokenizer(text))
    
def latin_filter(src_line, tgt_line):
    """Tell whether both lines can be encoded as Latin-1.

    Args:
        src_line: Source-side sentence (str).
        tgt_line: Target-side sentence (str).

    Returns:
        True when every character of both strings is encodable as 'latin1',
        False as soon as either string contains a character that is not.
    """
    try:
        src_line.encode("latin1")
        tgt_line.encode("latin1")
    except UnicodeError:
        # Only an encoding failure means "not Latin-1". Anything else
        # (KeyboardInterrupt, programming errors) must propagate instead of
        # being silently reported as a filtered line.
        return False
    return True

def sentence_length_ratio_filter(src_line, tgt_line, max_len=250, min_len=1, ratio=None):
    """Accept a sentence pair based on token counts and their ratio.

    Lengths are measured in whitespace-separated tokens. Counting space
    characters instead would undercount by one, so a one-word sentence would
    look empty and be rejected by the default ``min_len=1``.

    Args:
        src_line: Tokenized source sentence (tokens separated by whitespace).
        tgt_line: Tokenized target sentence.
        max_len: Largest token count allowed on either side (inclusive).
        min_len: Smallest token count allowed on either side (inclusive).
        ratio: If given, the longer side may have at most ``ratio`` times as
            many tokens as the shorter side. ``None`` disables this check.

    Returns:
        True if the pair passes every enabled check, otherwise False.
    """
    src_len = len(src_line.split())
    tgt_len = len(tgt_line.split())

    if not (min_len <= src_len <= max_len and min_len <= tgt_len <= max_len):
        return False
    if ratio is None:
        return True
    shorter, longer = sorted((src_len, tgt_len))
    return longer <= ratio * shorter
    

def process_train(src_lines, tgt_lines, src_fname, tgt_fname,):
    """Tokenize and filter training pairs, then write both sides to disk.

    A pair is kept only if it passes the Latin-1 filter and the length/ratio
    filter (ratio 2.0). A rejected pair leaves an empty string at its
    position in both output lists, so the two lists stay the same length as
    the input.

    Args:
        src_lines: Indexable sequence of source sentences.
        tgt_lines: Indexable sequence of target sentences, same length.
        src_fname: Output path for the processed source side.
        tgt_fname: Output path for the processed target side.
    """
    print("processing train....")
    kept_src = [''] * len(src_lines)
    kept_tgt = [''] * len(tgt_lines)
    assert len(src_lines) == len(tgt_lines)

    for idx in range(len(src_lines)):
        # Progress heartbeat every 1000 pairs (not at index 0).
        if idx > 0 and idx % 1000 == 0:
            print("processed ", idx)

        src_tok = tokenize_filter(src_lines[idx], src_lang)
        tgt_tok = tokenize_filter(tgt_lines[idx], tgt_lang)

        keep = latin_filter(src_tok, tgt_tok) and sentence_length_ratio_filter(
            src_tok, tgt_tok, ratio=2.0
        )
        if keep:
            kept_src[idx] = src_tok
            kept_tgt[idx] = tgt_tok

    print("writing to files")
    save_file(kept_src, src_fname)
    save_file(kept_tgt, tgt_fname)
    print("done writing to files")
    

def process_test(src_lines, tgt_lines, src_fname, tgt_fname,):
    """Tokenize and length-filter evaluation pairs, then write both sides.

    Unlike the training variant, no Latin-1 filter is applied; only the
    length/ratio filter (ratio 2.0) decides whether a pair is kept. A
    rejected pair leaves an empty string at its position in both outputs.

    Args:
        src_lines: Indexable sequence of source sentences.
        tgt_lines: Indexable sequence of target sentences, same length.
        src_fname: Output path for the processed source side.
        tgt_fname: Output path for the processed target side.
    """
    kept_src = [''] * len(src_lines)
    kept_tgt = [''] * len(tgt_lines)
    assert len(src_lines) == len(tgt_lines)

    for idx in range(len(src_lines)):
        src_tok = tokenize_filter(src_lines[idx], src_lang)
        tgt_tok = tokenize_filter(tgt_lines[idx], tgt_lang)
        if sentence_length_ratio_filter(src_tok, tgt_tok, ratio=2.0):
            kept_src[idx] = src_tok
            kept_tgt[idx] = tgt_tok

    save_file(kept_src, src_fname)
    save_file(kept_tgt, tgt_fname)
    
    
def _make_tmp_path():
    """Create an empty temporary file and return its path (caller deletes it)."""
    # Close our own handle right away: the worker re-opens the file by name.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        return tmp.name


def process(src_lines, tgt_lines, process_fn, nproc=14):
    """Run ``process_fn`` over the corpus in parallel and collect the result.

    The input is cut into up to ``nproc`` contiguous chunks. Each chunk is
    handed to ``process_fn(src_chunk, tgt_chunk, src_path, tgt_path)`` in a
    worker process, which is expected to write one output line per input pair
    to each of the two paths (a blank line for a rejected pair). The files
    are then read back, concatenated in chunk order, and rows whose source
    side is a blank line are dropped.

    Args:
        src_lines: Sliceable sequence of source sentences.
        tgt_lines: Sliceable sequence of target sentences, same length.
        process_fn: Picklable callable with the signature described above.
        nproc: Maximum number of chunks / worker processes.

    Returns:
        pandas.DataFrame with columns 'src' and 'tgt'. The index is the
        position in the concatenated output, with gaps where blank rows were
        removed.

    Raises:
        ValueError: If the two inputs differ in length.
        Exception: Any exception raised inside ``process_fn`` is re-raised
            here instead of being silently lost in the worker.
    """
    size = len(src_lines)
    if len(tgt_lines) != size:
        raise ValueError(
            f"src_lines and tgt_lines differ in length: {size} vs {len(tgt_lines)}"
        )
    if size == 0:
        # Nothing to do; avoid starting a pool and concatenating nothing.
        return pd.DataFrame({"src": pd.Series([], dtype=object),
                             "tgt": pd.Series([], dtype=object)})

    # Equal-width bins; the last bin absorbs the remainder. Empty bins (which
    # occur when size < nproc) are skipped so no worker gets an empty chunk.
    step = size // nproc
    bins = [i * step for i in range(nproc)] + [size]
    chunks = [(src_lines[lo:hi], tgt_lines[lo:hi])
              for lo, hi in zip(bins, bins[1:]) if hi > lo]
    print([len(tgt_chunk) for _, tgt_chunk in chunks])

    tmp_paths = []  # every temp file created, for cleanup
    src_parts, tgt_parts = [], []
    try:
        jobs = []
        with Pool(len(chunks)) as pool:
            for src_chunk, tgt_chunk in chunks:
                print(len(src_chunk))
                src_path = _make_tmp_path()
                tmp_paths.append(src_path)
                tgt_path = _make_tmp_path()
                tmp_paths.append(tgt_path)
                result = pool.apply_async(
                    process_fn, (src_chunk, tgt_chunk, src_path, tgt_path)
                )
                jobs.append((result, src_path, tgt_path, len(src_chunk)))
            # .get() blocks until the worker finishes and re-raises whatever
            # it raised, so a failed chunk can no longer pass unnoticed.
            for result, _, _, _ in jobs:
                result.get()
        print("Done processing")

        for _, src_path, tgt_path, n_lines in jobs:
            # Cap the read at the chunk's own size rather than read_file's
            # default, so large chunks are never silently truncated.
            src_parts.append(read_file(src_path, n_lines))
            tgt_parts.append(read_file(tgt_path, n_lines))
    finally:
        # The files were created with delete=False, so remove them ourselves.
        for path in tmp_paths:
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup; never mask the original error

    outputs = {
        "src": pd.concat(src_parts).reset_index(drop=True),
        "tgt": pd.concat(tgt_parts).reset_index(drop=True),
    }
    df = pd.DataFrame(outputs)
    df = df[df['src'] != '\n']
    return df



In [None]:
# train_outs = process(train_df['src'].values, train_df['tgt'].values, process_train)
# train_outs

In [None]:
# valid_outs = process(valid_df['src'].values, valid_df['tgt'].values, process_train)
# valid_outs

In [None]:
# test_outs = process(test_df['src'].values, test_df['tgt'].values, process_test)
# test_outs