Most of the code is adapted from GNMT v2 for PyTorch. This notebook is for learning purposes.


In [1]:
import os
import sys
import time

import pandas as pd
import numpy as np
import nltk
from mosestokenizer import MosesTokenizer

from multiprocessing import Process, Lock, Pool
import itertools
from collections import OrderedDict
import functools
import tempfile
import pickle
import torch

In [2]:
# Root directories come from the environment; each is None when its variable is unset.
DL_PATH, DL_DATASET = (os.environ.get(var) for var in ("DL_PATH", "DL_DATASET"))

In [3]:
# Display the value read from the DL_PATH environment variable.
DL_PATH

'/media/mtb/1268324a-8d38-4c4f-9b71-2a4ddc231fe6/dl'

<h1> Data Preparation </h1>

In [4]:
# Language code for each side of the translation pair.
LANG = dict(src="en", tgt="fr")
# Dataset splits; each one is a sub-directory of the corpus directory.
STAGE_PREFIXES = ["train", "valid", "test"]
# Split name -> directory holding that split's files.
LANG_DATASET = dict(
    (stage, os.path.join(DL_DATASET, "nlp/wmt15_translate_fr_en", stage))
    for stage in STAGE_PREFIXES
)

In [5]:
# Split name -> side ("src"/"tgt") -> sorted list of files for that side's language.
# Files are selected on the ".<lang>" extension, so a name that merely ends in the
# same letters (without the dot) is not picked up. The lists are sorted because
# load_data pairs src and tgt files by position.
LANG_FILES = {
    stage: {
        fromto: [
            os.path.join(LANG_DATASET[stage], f)
            for f in sorted(os.listdir(LANG_DATASET[stage]))
            if f.endswith("." + lang)
        ]
        for fromto, lang in LANG.items()
    }
    for stage in STAGE_PREFIXES
}


In [6]:
# Display the discovered files for every split and side.
LANG_FILES

{'train': {'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.en'],
  'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.fr']},
 'valid': {'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.en',
   '/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.en'],
  'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.fr',
   '/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.fr']},
 'test': {'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.en'],
  'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.fr']}}

In [7]:
# Display the file lists of the training split.
LANG_FILES['train']

{'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.en'],
 'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.fr']}

In [8]:
# Display the file lists of the validation split.
LANG_FILES['valid']

{'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.en',
  '/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.en'],
 'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.fr',
  '/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.fr']}

In [9]:
# Display the file lists of the test split.
LANG_FILES['test']

{'src': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.en'],
 'tgt': ['/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.fr']}

In [10]:
def _tokenize_span(filename, lang, process_num, begin, end, savefile=None):
    """Tokenize the lines of ``filename`` that start in the byte range [begin, end).

    The file is read in binary mode so that ``begin``/``end`` are plain byte
    offsets; each line is decoded as UTF-8 on its own. Only ``\\n`` terminates
    a line (a trailing ``\\r\\n`` is normalised to ``\\n``).

    Args:
        filename: path of the text file to read.
        lang: language code handed to ``MosesTokenizer``.
        process_num: worker number, used only in the progress message.
        begin: byte offset of the first line to process.
        end: byte offset at which to stop; a negative value means "until EOF".
        savefile: optional path; when given, the result dict is pickled there.

    Returns:
        dict with two parallel lists: ``raw`` (stripped lines) and
        ``tokenize`` (space-joined tokens).
    """
    outputs = dict(raw=[], tokenize=[])
    print(f"Processing file from {begin} - {end} with process {process_num}")
    with MosesTokenizer(lang) as mos_tokenize:
        with open(filename, "rb") as f:
            f.seek(begin)
            # A line is processed when it *starts* before ``end``.
            while end < 0 or f.tell() < end:
                raw_line = f.readline()
                if not raw_line:
                    break
                line = raw_line.decode("utf-8")
                if line.endswith("\r\n"):
                    # Mirror text-mode newline translation for CRLF files.
                    line = line[:-2] + "\n"
                outputs["tokenize"].append(" ".join(mos_tokenize(line)))
                outputs["raw"].append(line.strip())
    if savefile:
        with open(savefile, "wb") as f:
            pickle.dump(outputs, f)

    return outputs


def tokenize(filename, lang,  num_wokers=-1, MAX_FILESIZE=5e8,):
    """Tokenize every line of ``filename`` with MosesTokenizer.

    When the file is larger than ``MAX_FILESIZE`` bytes and ``num_wokers`` is
    greater than 1, the file is cut into ``num_wokers`` line-aligned byte
    ranges and each range is tokenized in its own process; the partial results
    are exchanged through temporary pickle files and concatenated in file
    order. Otherwise the whole file is tokenized in the current process.

    Args:
        filename: path of a UTF-8 text file with one sentence per line.
        lang: language code handed to ``MosesTokenizer``.
        num_wokers: number of worker processes (parameter spelling kept for
            backward compatibility). Values <= 1 force single-process mode.
        MAX_FILESIZE: size threshold in bytes above which workers are used.

    Returns:
        pandas.DataFrame with columns ``raw`` and ``tokenize`` (in that order
        for both code paths), one row per line.

    Raises:
        RuntimeError: if any worker process exits with a non-zero exit code.
    """
    print("Processing ", filename, lang, num_wokers)
    filesize = os.path.getsize(filename)
    if not (filesize > MAX_FILESIZE and num_wokers > 1):
        return pd.DataFrame(_tokenize_span(filename, lang, 0, 0, -1))

    # Find line-aligned split points. Binary mode is required here: seeking to
    # an arbitrary offset is only defined for binary files, and an offset may
    # fall in the middle of a multi-byte UTF-8 character.
    chunk = filesize // num_wokers
    cursors = [0]
    with open(filename, "rb") as f:
        for i in range(1, num_wokers):
            f.seek(i * chunk)
            f.readline()  # skip the remainder of the line we landed in
            cursors.append(f.tell())
    cursors.append(-1)  # the last worker reads until EOF

    tmp_files = []
    processes = []
    try:
        for i in range(num_wokers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            tmp_files.append(tmp.name)
            processes.append(
                Process(
                    target=_tokenize_span,
                    args=(filename, lang, i, cursors[i], cursors[i + 1], tmp.name),
                )
            )
        for proc in processes:
            proc.start()
        for proc in processes:
            proc.join()

        failed = [i for i, proc in enumerate(processes) if proc.exitcode != 0]
        if failed:
            raise RuntimeError(
                f"Tokenizer worker(s) {failed} failed while processing {filename}"
            )

        outputs = dict(raw=[], tokenize=[])
        for tmp_file in tmp_files:
            # These pickles were written by our own workers just above.
            with open(tmp_file, "rb") as tmp_f:
                part = pickle.load(tmp_f)
            outputs["raw"] += part["raw"]
            outputs["tokenize"] += part["tokenize"]
    finally:
        # Always remove the temporary files, including after a failure.
        for tmp_file in tmp_files:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    return pd.DataFrame(outputs)


def filter_latin(x):
    """Return True when both tokenized sentences of row ``x`` encode as Latin-1.

    Args:
        x: a row exposing ``x.src.tokenize`` and ``x.tgt.tokenize``.

    Returns:
        False if either sentence contains a character outside Latin-1, or if
        a value has no ``encode`` method (e.g. a missing, non-string cell);
        True otherwise.
    """
    try:
        x.src.tokenize.encode("latin1")
        x.tgt.tokenize.encode("latin1")
    except (UnicodeEncodeError, AttributeError):
        # Only the expected failures reject the row; anything else
        # (KeyboardInterrupt, programming errors) propagates.
        return False
    return True


def ratio_transform(x, ratio=4.):
    """Return True when the longer sentence is less than ``ratio`` times the shorter.

    Lengths are counted by splitting ``x.src.tokenize`` and ``x.tgt.tokenize``
    on single spaces.
    """
    counts = [len(side.tokenize.split(" ")) for side in (x.src, x.tgt)]
    longest, shortest = max(counts), min(counts)
    return longest / shortest < ratio
 
def sentence_length_range(x, min_length=1, max_length=80):
    """Return True when both sentences have a length strictly between the bounds.

    Lengths are counted by splitting ``x.src.tokenize`` and ``x.tgt.tokenize``
    on single spaces; both ``min_length`` and ``max_length`` are exclusive.
    """
    n_src = len(x.src.tokenize.split(" "))
    n_tgt = len(x.tgt.tokenize.split(" "))
    if not min_length < n_src < max_length:
        return False
    return min_length < n_tgt < max_length
   
# def run_cleaning(df):
#     print("Po ", df)
    
def clean_data(df, transforms=(), num_workers=-1):
    """Return the rows of ``df`` accepted by every predicate in ``transforms``.

    The input frame is left untouched: the row mask is kept in a local Series
    rather than written into ``df`` as a temporary column.

    Args:
        df: DataFrame whose rows are passed one by one to each predicate.
        transforms: iterable of callables ``row -> bool``; they are evaluated
            in order and evaluation stops at the first one that rejects a row.
        num_workers: accepted for backward compatibility; not used.

    Returns:
        A new DataFrame containing only the accepted rows, with a fresh
        0..n-1 index and the same columns as ``df``.
    """
    print("Cleaning")

    def keep_row(row):
        return all(transform_fn(row) for transform_fn in transforms)

    if df.empty:
        # Nothing to filter; row-wise apply on an empty frame does not
        # produce a usable boolean mask.
        return df.reset_index(drop=True)

    mask = df.apply(keep_row, axis=1).astype(bool)
    return df[mask].reset_index(drop=True)
  
        
    # if len(df) > 1e6:
    #     chunks = [0] + [ i * (len(df) // num_workers) for i in range(1, num_workers)] + [len(df)]
    #     pool = Pool()
    #     print("pass ")
    #     for i in range(1, num_workers + 1):
    #         beg, end = chunks[i-1:i+1]
    #         print(beg, end)
            
    #         pool.apply_async(run_cleaning, (df.iloc[beg:end], ))
    #     print("Applutn ")
    #     time.sleep(2.0)
    #     pool.close()
        
    #     pool.join()        
        
# clean_data(train_df, (filter_latin, sentence_length_range, ratio_transform), num_workers=10) 

In [12]:
def load_data(stage):
    """Tokenize all source/target files of ``stage`` into one parallel DataFrame.

    Source and target files are paired by position in ``LANG_FILES[stage]``;
    the language passed to ``tokenize`` is the file-name suffix after the
    last dot. The "train" stage requests 12 tokenizer workers, every other
    stage runs single-process.

    Args:
        stage: key of ``LANG_FILES`` (e.g. "train", "valid" or "test").

    Returns:
        DataFrame with two-level columns ("src"|"tgt", "raw"|"tokenize") and
        one row per sentence pair.

    Raises:
        ValueError: if the numbers of source and target files differ, or if a
            source/target file pair does not contain the same number of lines
            (the corpus would be misaligned).
    """
    num_workers = 12 if stage == "train" else -1
    inner_columns = ["raw", "tokenize"]
    src_files = LANG_FILES[stage]['src']
    tgt_files = LANG_FILES[stage]['tgt']
    if len(src_files) != len(tgt_files):
        raise ValueError(
            f"Stage {stage!r} has {len(src_files)} source file(s) "
            f"but {len(tgt_files)} target file(s)"
        )

    src_parts, tgt_parts = [], []
    for filesrc, filetgt in zip(src_files, tgt_files):
        print(filesrc, filetgt)
        src_part = tokenize(filesrc, filesrc.split(".")[-1], num_wokers=num_workers)
        tgt_part = tokenize(filetgt, filetgt.split(".")[-1], num_wokers=num_workers)
        if len(src_part) != len(tgt_part):
            # Assigning frames of different length would silently truncate or
            # NaN-pad one side and misalign every following sentence pair.
            raise ValueError(
                f"Line count mismatch: {filesrc} has {len(src_part)} lines, "
                f"{filetgt} has {len(tgt_part)}"
            )
        src_parts.append(src_part)
        tgt_parts.append(tgt_part)

    if not src_parts:
        return pd.DataFrame(
            columns=pd.MultiIndex.from_product((["src", "tgt"], inner_columns))
        )

    src_res = pd.concat(src_parts, ignore_index=True)
    tgt_res = pd.concat(tgt_parts, ignore_index=True)
    # The dict keys become the outer column level.
    return pd.concat(
        {"src": src_res[inner_columns], "tgt": tgt_res[inner_columns]}, axis=1
    )

In [13]:
# Tokenize the training split (load_data requests 12 tokenizer workers for "train").
train_df = load_data("train")

/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.en /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.fr
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/train/giga-fren.release2.fixed.en en 12
Processing file from 0 - 315822765 with process 0
Processing file from 315822765 - 631645634 with process 1
Processing file from 631645634 - 947468279 with process 2
Processing file from 947468279 - 1263291030 with process 3
Processing file from 1263291030 - 1579113847 with process 4
Processing file from 1579113847 - 1894937639 with process 5
Processing file from 1894937639 - 2210759410 with process 6
Processing file from 2210759410 - 2526582145 with process 7
Processing file from 2526582145 - 2842404835 with process 8
Processing file from 2842404835 - 3158227609 with process 9
Processing file from 3158227609 - 3474050274 with process 10
Processing file from 3474050274 - -1 with process 11
Processing  /media/mtb/nas/data

In [14]:
# Tokenize the validation split; its files are concatenated into one frame.
valid_df = load_data("valid")

/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.en /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.fr
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.en en -1
Processing file from 0 - -1 with process 0
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newsdiscussdev2015.fr fr -1
Processing file from 0 - -1 with process 0
/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.en /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.fr
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.en en -1
Processing file from 0 - -1 with process 0
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/valid/newstest2014.fr fr -1
Processing file from 0 - -1 with process 0


In [15]:
# Tokenize the test split.
test_df = load_data("test")

/media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.en /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.fr
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.en en -1
Processing file from 0 - -1 with process 0
Processing  /media/mtb/nas/datasets/nlp/wmt15_translate_fr_en/test/newsdiscusstest2015.fr fr -1
Processing file from 0 - -1 with process 0


In [16]:
# Preview the tokenized validation frame.
valid_df

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,Sounds like a typical rugby club to me.,Sounds like a typical rugby club to me .,Ça m'a l'air d'être un club de rugby typique.,Ça m&apos; a l&apos; air d&apos; être un club ...
1,"At an English university, perhaps...","At an English university , perhaps ...","Dans une université anglaise, peut-être...","Dans une université anglaise , peut @-@ être ..."
2,Not like any rugby club I know about in NZ.,Not like any rugby club I know about in NZ .,Rien à voir avec les clubs de rugby que je con...,Rien à voir avec les clubs de rugby que je con...
3,"It doesn't make it all right though, does it?","It doesn &apos;t make it all right though , do...","Mais ça ne justifie rien, si ?","Mais ça ne justifie rien , si ?"
4,"Of course it's not right, but the original pre...","Of course it &apos;s not right , but the origi...","Bien sûr que non, mais la prémisse qui dit que...","Bien sûr que non , mais la prémisse qui dit qu..."
...,...,...,...,...
4498,The Marguerite-Bourgeoys School Board has crea...,The Marguerite @-@ Bourgeoys School Board has ...,La commission scolaire Marguerite-Bourgeoys a ...,La commission scolaire Marguerite @-@ Bourgeoy...
4499,Rachida Azdouz from the University of Montreal...,Rachida Azdouz from the University of Montreal...,"Rachida Azdouz, de l'Université de Montréal, e...","Rachida Azdouz , de l&apos; Université de Mont..."
4500,Preparation to manage a class in a North-Ameri...,Preparation to manage a class in a North @-@ A...,La préparation à gérer une classe dans un cont...,La préparation à gérer une classe dans un cont...
4501,"""The real need is for different educational st...",&quot; The real need is for different educatio...,"""Des stratégies pédagogiques différentes, c'es...",&quot; Des stratégies pédagogiques différentes...


In [17]:
# Preview the tokenized training frame.
train_df

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,Changing Lives | Changing Society | How It Wor...,Changing Lives &#124; Changing Society &#124; ...,Il a transformé notre vie | Il a transformé la...,Il a transformé notre vie &#124; Il a transfor...
1,Site map,Site map,Plan du site,Plan du site
2,Feedback,Feedback,Rétroaction,Rétroaction
3,Credits,Credits,Crédits,Crédits
4,Français,Français,English,English
...,...,...,...,...
22520371,Only with a highly overcompensatory stock–recr...,Only with a highly overcompensatory stock – re...,C'est seulement en cas de courbe stock–recrute...,C&apos; est seulement en cas de courbe stock –...
22520372,The model predicts that the assumption made ab...,The model predicts that the assumption made ab...,Le modèle prévoit que l'hypothèse émise au suj...,Le modèle prévoit que l&apos; hypothèse émise ...
22520373,Overall the results confirm the unsatisfactory...,Overall the results confirm the unsatisfactory...,"Dans l'ensemble, les résultats confirment le p...","Dans l&apos; ensemble , les résultats confirme..."
22520374,"Error 404 — file not found Sorry, but the file...","Error 404 — file not found Sorry , but the fil...",Erreur 404 — fichier introuvable Nous sommes d...,Erreur 404 — fichier introuvable Nous sommes d...


In [18]:
# Preview the tokenized test frame.
test_df

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,This is perfectly illustrated by the UKIP numb...,This is perfectly illustrated by the UKIP numb...,Les demeurés de UKIP qui refusent ceux qui viv...,Les demeurés de UKIP qui refusent ceux qui viv...
1,You mean Nigel Farage saying the NHS should no...,You mean Nigel Farage saying the NHS should no...,Vous parlez de quand Nigel Farage dit que le N...,Vous parlez de quand Nigel Farage dit que le N...
2,You raise a straw man and then knock it down w...,You raise a straw man and then knock it down w...,"D'abord vous utilisez des arguments spécieux, ...",D&apos; abord vous utilisez des arguments spéc...
3,Every time I or my family need to use the NHS ...,Every time I or my family need to use the NHS ...,Chaque fois que moi ou ma famille avons besoin...,Chaque fois que moi ou ma famille avons besoin...
4,I think the straw man is yours.,I think the straw man is yours .,Je crois que c'est vous qui utilisez des argum...,Je crois que c&apos; est vous qui utilisez des...
...,...,...,...,...
1495,"And if the rule is the same as here, the state...","And if the rule is the same as here , the stat...",Et si la règle est la même qu'ici l'état est s...,Et si la règle est la même qu&apos; ici l&apos...
1496,They are going to perhaps pay for the satellit...,They are going to perhaps pay for the satellit...,Ils vont peut-etre payer les satellites en fré...,Ils vont peut @-@ etre payer les satellites en...
1497,That is all the more regrettable since a lot o...,That is all the more regrettable since a lot o...,Cela est d'autant plus fâcheux que de nombreux...,Cela est d&apos; autant plus fâcheux que de no...
1498,All in all that creates an erroneous impressio...,All in all that creates an erroneous impressio...,Au total cela crée une impression erronée cont...,Au total cela crée une impression erronée cont...


In [32]:
# Output location for the text files that are later fed to BPE.
# NOTE: the two codes are taken in alphabetical order, so src_lang/tgt_lang
# match LANG["src"]/LANG["tgt"] only when those are already sorted.
src_lang, tgt_lang = sorted(LANG.values())
SAVEPATH = os.path.join(DL_PATH, "nmt", "-".join((src_lang, tgt_lang)), "data")
def save_data(df, filename):
    """Write every item of ``df`` to ``filename``, one per line, as UTF-8.

    Args:
        df: iterable of strings (e.g. a DataFrame column).
        filename: destination path; missing parent directories are created.

    Returns:
        True once all lines have been written.
    """
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding: the platform default may not be able to represent
    # the accented characters in the corpus.
    with open(filename, "w", encoding="utf-8") as f:
        for line in df:
            f.write(line)
            f.write("\n")
    return True

In [25]:
# Dump every column of every split, one sentence per line, to
# "<stage>.<first three letters of the column name>.<lang>" under SAVEPATH.
for stage, df in zip(("train", "valid", "test"), (train_df, valid_df, test_df)):
    for lang_key, lang in LANG.items():
        columns = df[lang_key]
        for data_type in columns:
            target = os.path.join(SAVEPATH, f"{stage}.{data_type[:3]}.{lang}")
            save_data(columns[data_type], target)

In [19]:
# Keep independent copies of the three frames before they are passed to clean_data below.
_train_df, _valid_df, _test_df = train_df.copy(), valid_df.copy(), test_df.copy()

In [20]:
# Keep training pairs that pass the Latin-1, length-range and length-ratio filters.
# NOTE(review): clean_data accepts num_workers but its body does not use it.
train_df_clean = clean_data(train_df, (filter_latin, sentence_length_range, ratio_transform), num_workers=12) 

Cleaning


In [27]:
# Apply the same three filters to the validation split.
valid_df_clean = clean_data(valid_df, ( filter_latin, sentence_length_range, ratio_transform)) 

Cleaning


In [28]:
# The test split is filtered on length range and length ratio only (filter_latin is not applied here).
test_df_clean = clean_data(test_df, (sentence_length_range, ratio_transform)) 

Cleaning


In [29]:
# Preview the cleaned training frame.
train_df_clean

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,Changing Lives | Changing Society | How It Wor...,Changing Lives &#124; Changing Society &#124; ...,Il a transformé notre vie | Il a transformé la...,Il a transformé notre vie &#124; Il a transfor...
1,Site map,Site map,Plan du site,Plan du site
2,Astronomers Introduction Introduction video Wh...,Astronomers Introduction Introduction video Wh...,Astronomes Introduction Vidéo d'introduction Q...,Astronomes Introduction Vidéo d&apos; introduc...
3,The name is derived from the Greek root astron...,The name is derived from the Greek root astron...,"Son nom vient du grec astron, qui veut dire ét...","Son nom vient du grec astron , qui veut dire é..."
4,"More specifically, astronomy is the study of t...","More specifically , astronomy is the study of ...","Plus spécifiquement, elle étudie la formation ...","Plus spécifiquement , elle étudie la formation..."
...,...,...,...,...
12122414,COUNTRIES TO BE INVITED TO SEND OBSERVER DELEG...,COUNTRIES TO BE INVITED TO SEND OBSERVER DELEG...,II. PAYS à INVITER à ENVOYER UNE DéLéGATION OB...,II . PAYS à INVITER à ENVOYER UNE DéLéGATION O...
12122415,Stock assessment of the European lobster (Homa...,Stock assessment of the European lobster ( Hom...,L'évaluation des stocks de homards d'Europe (H...,L&apos; évaluation des stocks de homards d&apo...
12122416,734) which assumes that recruitment to the fis...,734 ) which assumes that recruitment to the fi...,734) selon laquelle le recrutement pour la pêc...,734 ) selon laquelle le recrutement pour la pê...
12122417,Yield curves show a clear maximum with a marke...,Yield curves show a clear maximum with a marke...,Les courbes de la production montrent un net m...,Les courbes de la production montrent un net m...


In [30]:
# Preview the cleaned validation frame.
valid_df_clean

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,Sounds like a typical rugby club to me.,Sounds like a typical rugby club to me .,Ça m'a l'air d'être un club de rugby typique.,Ça m&apos; a l&apos; air d&apos; être un club ...
1,"At an English university, perhaps...","At an English university , perhaps ...","Dans une université anglaise, peut-être...","Dans une université anglaise , peut @-@ être ..."
2,Not like any rugby club I know about in NZ.,Not like any rugby club I know about in NZ .,Rien à voir avec les clubs de rugby que je con...,Rien à voir avec les clubs de rugby que je con...
3,"It doesn't make it all right though, does it?","It doesn &apos;t make it all right though , do...","Mais ça ne justifie rien, si ?","Mais ça ne justifie rien , si ?"
4,"Of course it's not right, but the original pre...","Of course it &apos;s not right , but the origi...","Bien sûr que non, mais la prémisse qui dit que...","Bien sûr que non , mais la prémisse qui dit qu..."
...,...,...,...,...
4154,The Marguerite-Bourgeoys School Board has crea...,The Marguerite @-@ Bourgeoys School Board has ...,La commission scolaire Marguerite-Bourgeoys a ...,La commission scolaire Marguerite @-@ Bourgeoy...
4155,Rachida Azdouz from the University of Montreal...,Rachida Azdouz from the University of Montreal...,"Rachida Azdouz, de l'Université de Montréal, e...","Rachida Azdouz , de l&apos; Université de Mont..."
4156,Preparation to manage a class in a North-Ameri...,Preparation to manage a class in a North @-@ A...,La préparation à gérer une classe dans un cont...,La préparation à gérer une classe dans un cont...
4157,"""The real need is for different educational st...",&quot; The real need is for different educatio...,"""Des stratégies pédagogiques différentes, c'es...",&quot; Des stratégies pédagogiques différentes...


In [31]:
# Preview the cleaned test frame.
test_df_clean

Unnamed: 0_level_0,src,src,tgt,tgt
Unnamed: 0_level_1,raw,tokenize,raw,tokenize
0,This is perfectly illustrated by the UKIP numb...,This is perfectly illustrated by the UKIP numb...,Les demeurés de UKIP qui refusent ceux qui viv...,Les demeurés de UKIP qui refusent ceux qui viv...
1,You mean Nigel Farage saying the NHS should no...,You mean Nigel Farage saying the NHS should no...,Vous parlez de quand Nigel Farage dit que le N...,Vous parlez de quand Nigel Farage dit que le N...
2,You raise a straw man and then knock it down w...,You raise a straw man and then knock it down w...,"D'abord vous utilisez des arguments spécieux, ...",D&apos; abord vous utilisez des arguments spéc...
3,Every time I or my family need to use the NHS ...,Every time I or my family need to use the NHS ...,Chaque fois que moi ou ma famille avons besoin...,Chaque fois que moi ou ma famille avons besoin...
4,I think the straw man is yours.,I think the straw man is yours .,Je crois que c'est vous qui utilisez des argum...,Je crois que c&apos; est vous qui utilisez des...
...,...,...,...,...
1489,"And if the rule is the same as here, the state...","And if the rule is the same as here , the stat...",Et si la règle est la même qu'ici l'état est s...,Et si la règle est la même qu&apos; ici l&apos...
1490,They are going to perhaps pay for the satellit...,They are going to perhaps pay for the satellit...,Ils vont peut-etre payer les satellites en fré...,Ils vont peut @-@ etre payer les satellites en...
1491,That is all the more regrettable since a lot o...,That is all the more regrettable since a lot o...,Cela est d'autant plus fâcheux que de nombreux...,Cela est d&apos; autant plus fâcheux que de no...
1492,All in all that creates an erroneous impressio...,All in all that creates an erroneous impressio...,Au total cela crée une impression erronée cont...,Au total cela crée une impression erronée cont...


In [34]:
# Write the tokenized text of each cleaned split to "<stage>.tok.clean.<lang>" under SAVEPATH.
for stage, df in zip(("train", "valid", "test"), (train_df_clean, valid_df_clean, test_df_clean)):
    for lang_key, lang in LANG.items():
        target = os.path.join(SAVEPATH, f"{stage}.tok.clean.{lang}")
        save_data(df[lang_key]["tokenize"], target)

In [35]:
# Save the raw (untokenized) target sentences of the cleaned test split.
# The language code is read from LANG explicitly instead of relying on a loop
# variable left over from an earlier cell.
save_data(test_df_clean["tgt"]["raw"], os.path.join(SAVEPATH, f"test.tok_clean.clean.{LANG['tgt']}"))

True