In [None]:
# default_exp data.seq2seq.translation


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.seq2seq.translation

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for translation tasks

In [None]:
# export
from typing import Optional

import numpy as np
import pandas as pd

from datasets import Dataset
from fastai.data.block import DataBlock
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerBase, logging

from blurr.utils import BLURR
from blurr.data.seq2seq.core import Seq2SeqBatchTokenizeTransform, Seq2SeqPreprocessor, Seq2SeqTextBlock

logging.set_verbosity_error()

In [None]:
# hide_input
import os, ast, pdb
from functools import reduce

from datasets import load_dataset
from fastai.data.transforms import *
from fastai.torch_core import *
from fastai.torch_imports import *
from fastcore.all import *
from fastcore.test import *
from nbdev.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
# NOTE(review): the device index is hard-coded to 1, so this cell presumably fails on hosts that
# expose fewer than two CUDA devices; adjust the index to match the local machine.
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `wmt16` to demonstrate how to configure BLURR for translation tasks.

In [None]:
raw_dataset = load_dataset("wmt16", "de-en", split="train[:1%]")
raw_dataset


Reusing dataset wmt16 (/home/wgilliam/.cache/huggingface/datasets/wmt16/de-en/1.0.0/af3c5d746b307726d0de73ebe7f10545361b9cb6f75c83a1734c000e48b6264f)


Dataset({
    features: ['translation'],
    num_rows: 45489
})

In [None]:
print(raw_dataset[0].keys())
print(raw_dataset[0])

dict_keys(['translation'])
{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [None]:
wmt_df = pd.DataFrame(raw_dataset["translation"], columns=["de", "en"])

print(len(wmt_df))
wmt_df.head(2)


45489


Unnamed: 0,de,en
0,Wiederaufnahme der Sitzungsperiode,Resumption of the session
1,"Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period."


In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)


('bart',
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

### `TranslationPreprocessor`

This class can be used for preprocessing translation tasks. It adds `proc_{your_text_attr}` and `proc_{target_text_attr}` attributes containing your input and target texts as modified by tokenization (e.g., if you specify a `max_length`, `proc_{your_text_attr}` may contain truncated text).

In [None]:
# export
class TranslationPreprocessor(Seq2SeqPreprocessor):
    """Preprocess translation data into a `DataFrame` (or Hugging Face `Dataset`).

    On top of what the base preprocessor returns, each example gains `{attr}_start_char_idx` and
    `{attr}_end_char_idx` columns plus a `proc_{attr}` column (for both the input and target text
    attributes) holding the slice of the raw text delimited by the tokenizer's character offsets.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text to translate
        text_attr: str = "original_text",
        # The maximum length (# of tokens) allowed for inputs. Will default to the max length allowed
        # by the model if not provided
        max_input_tok_length: Optional[int] = None,
        # The attribute holding the translation
        target_text_attr: str = "translated_text",
        # The maximum length (# of tokens) allowed for targets
        max_target_tok_length: Optional[int] = None,
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # we need to use the offset mappings to get back at the raw text from its tokenized representation
        # (a new dict is built here, so the shared default `{}` is never mutated)
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}

        super().__init__(
            hf_tokenizer, batch_size, text_attr, max_input_tok_length, target_text_attr, max_target_tok_length, is_valid_attr, tok_kwargs
        )

        self.id_attr = id_attr

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """Run the base preprocessing, then add the offset/`proc_*` columns in chunks of `batch_size` rows.

        Returns a single `DataFrame` with a fresh 0..n-1 index (an empty `DataFrame` if there are no rows).
        """
        df = super().process_df(training_df, validation_df)

        # process df in mini-batches; the pieces are collected and concatenated once because
        # `DataFrame.append` is deprecated (removed in pandas 2.0) and re-copies the frame on every call
        batch_ids = np.arange(len(df)) // self.batch_size
        processed_batches = [self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_ids)]

        # `pd.concat` rejects an empty list, so keep returning an empty frame when there is nothing to process
        if not processed_batches:
            return pd.DataFrame()

        return pd.concat(processed_batches).reset_index(drop=True)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Same as `process_df`, but takes and returns Hugging Face `Dataset` objects."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """Tokenize one mini-batch and attach the character offsets and `proc_*` texts to it."""
        # re-index into a new frame rather than in place, so the caller's object is left untouched
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs and targets batch encoding objects
        inputs, targets = self._tokenize_function(batch_df.to_dict(orient="list"))

        # add our processed text and target texts to the batched DataFrame
        for txt_attr, batch_enc in zip([self.text_attr, self.target_text_attr], [inputs, targets]):
            if txt_attr is None:
                break

            char_idxs = []
            for idx, offset_mapping in enumerate(batch_enc["offset_mapping"]):
                # NOTE(review): every position is used, special tokens included (no filter on the sequence id);
                # presumably those report a (0, 0) offset, which pins the start index to 0 - TODO confirm intent
                text_offsets = [offset_mapping[i] for i, _ in enumerate(batch_enc.sequence_ids(idx))]
                char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])

            start_col, end_col = f"{txt_attr}_start_char_idx", f"{txt_attr}_end_char_idx"
            batch_df = pd.concat([batch_df, pd.DataFrame(char_idxs, columns=[start_col, end_col])], axis=1)

            # the end offset is already exclusive (it equals `len(text)` for untruncated text), so it is used
            # as the slice stop directly; adding 1 would leak one character past the last kept token
            batch_df.insert(
                0,
                f"proc_{txt_attr}",
                batch_df.apply(lambda r: r[txt_attr][r[start_col] : r[end_col]], axis=1),
            )

        return batch_df


#### Using a `DataFrame`

In [None]:
# Cap both the German inputs and the English targets at 128 tokens
preprocessor = TranslationPreprocessor(
    hf_tokenizer, text_attr="de", target_text_attr="en", max_input_tok_length=128, max_target_tok_length=128
)
proc_df = preprocessor.process_df(wmt_df)
# NOTE(review): only a cell's final expression is rendered, so this tuple is evaluated and then discarded
proc_df.columns, len(proc_df)
proc_df.head(2)


Unnamed: 0,proc_en,proc_de,de,en,de_start_char_idx,de_end_char_idx,en_start_char_idx,en_end_char_idx
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode,Wiederaufnahme der Sitzungsperiode,Resumption of the session,0,34,0,25
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.","Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",0,218,0,207


## Examples

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

#####  Step 2: Create your `DataBlock`

Two lines!  Notice we pass in `noop` for our targets (e.g. our translations) because the batch transform will take care of both our inputs and targets.

In [None]:
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader("de"), get_y=ColReader("en"), splitter=RandomSplitter())


In [None]:
# dblock.summary(wmt_df)


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(wmt_df, bs=4)


In [None]:
b = dls.one_batch()


In [None]:
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape


(2, torch.Size([4, 458]), torch.Size([4, 287]), torch.Size([4, 287]))

In [None]:
b[0]["labels"][0], b[1][0]

(tensor([    0,  2872,    36,   250,   245,    12,   612,  5046,    73, 17472,
            43,    30,   427,   248,  2768,   298,   459,     6,    15,  4137,
             9,     5,  1674,    15,  8587,  1766,  6007,     6,    15, 22674,
         30252,     9,     5,   568,  8082, 15462,     7,     5,  1463,    11,
          2098,     9,     5,   613,  1052,     9,     5,  2958,     6,  3821,
             8,  4413,   796,  2717, 17114,    13,     5,  6708,   613,    76,
            36, 10370,  1640, 37446,    43, 30398,   111,   230,   245,  5514,
           151,   246,    73, 37446,   111,  6193,    73, 34972,  1640, 44579,
         48749,    15, 18379, 15462,     7,     5,   796,  2475,    13,     5,
         26657,     9, 10427,     8, 11214, 28614,     6,  7378,     6,    13,
             5,  6708,   613,    76,    36,   347,   245,    12,  2663,  1096,
            73, 17472,   111,  3788,    73,   844,  6405,  1640, 44579, 48749,
            15, 18379, 15462,     7,     5,   796,  

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=250, target_trunc_at=250)


Unnamed: 0,text,target
0,"<s> Bericht (A5-0089/2000) von Frau Rühle im Namen des Ausschusses für Haushaltskontrolle über den Aufschub des Beschlusses zur Entlastung der Kommission für die Haushaltsführung des sechsten, siebten und achten Europäischen Entwicklungsfonds für das","Report (A5-0089/2000) by Mr Rühle, on behalf of the Committee on Budgetary Control, on postponement of the decision concerning discharge to the Commission in respect of the financial management of the sixth, seventh and eighth European Development F"
1,"<s> Dabei wird mit der Entziehung oder Einbehaltung von Mitteln gedroht, um eine bessere Umsetzung von Rechtsvorschriften in Bereichen durchzusetzen, die mit dem Bereich, für den die Mittel bestimmt sind, mitunter recht wenig zu tun haben, wobei die","Linkage, in other words, using the threat of the withdrawal or withholding of funds to try to get better implementation of legislation in what may sometimes be a rather unconnected sector but in this case is quite a closely connected sector, is one"


#### Using a preprocessed dataset

##### Step 1a: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

##### Step 1b. Preprocess dataset

In [None]:
preprocessor = TranslationPreprocessor(
    hf_tokenizer,
    text_attr="de",
    target_text_attr="en",
    max_input_tok_length=128,
    max_target_tok_length=128
)
proc_df = preprocessor.process_df(wmt_df)

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader("proc_de"), get_y=ColReader("proc_en"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
b = dls.one_batch()


In [None]:
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(2, torch.Size([4, 129]), torch.Size([4, 91]), torch.Size([4, 91]))

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=250, target_trunc_at=250)

Unnamed: 0,text,target
0,"<s> Daher geht es nicht nur um eine Reform der Welthandelsorganisation und der wirtschaftlichen Architektur - und der Kommissionspräsident hat bezeichnenderweise vergessen, von der Reform der Finanzinstitutionen der internationalen Finanzarchitektur","What we need are not just reforms of the World Trade Organisation and the economic architecture - and, significantly, the President of the Commission forgot to mention the reform of the financial institutions and international financial architecture"
1,"<s> Die ""Vereinfachung "" der Richtlinie von 1973, die 1992 durch den Rat von Edinburgh eingeleitet wurde, hat erst 1996 zu einem Vorschlag der Kommission geführt, der 1997 in erster Lesung von unserem Parlament geprüft wurde; und nun hat es noch einm","Introduced at the Edinburgh Council in 1992, the'simplification' of the 1973 directive became a Commission proposal only in 1996 and was considered at first reading by this House in 1997. It has taken another two and a half years for the Council to"


## Tests

The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained **translation models** below.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained translation models you are working with ... and if any of your pretrained translation models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

In [None]:
[model_type for model_type in BLURR.get_models(task="ConditionalGeneration") if (not model_type.startswith("TF"))]


['BartForConditionalGeneration',
 'BigBirdPegasusForConditionalGeneration',
 'BlenderbotForConditionalGeneration',
 'BlenderbotSmallForConditionalGeneration',
 'FSMTForConditionalGeneration',
 'LEDForConditionalGeneration',
 'M2M100ForConditionalGeneration',
 'MBartForConditionalGeneration',
 'MT5ForConditionalGeneration',
 'PegasusForConditionalGeneration',
 'ProphetNetForConditionalGeneration',
 'Speech2TextForConditionalGeneration',
 'T5ForConditionalGeneration',
 'XLMProphetNetForConditionalGeneration']

In [None]:
pretrained_model_names = [
    "facebook/bart-base",
    "facebook/wmt19-de-en",  # FSMT
    "Helsinki-NLP/opus-mt-de-en",  # MarianMT
    "sshleifer/tiny-mbart",
    "google/mt5-small",
    "t5-small",
]


In [None]:
# NOTE(review): `path` is not referenced by any of the cells shown in this notebook
path = Path("./")
# Rebuild the German/English DataFrame from the raw dataset for the tests below
wmt_df = pd.DataFrame(raw_dataset["translation"], columns=["de", "en"])


In [None]:
# slow
# hide_output
# Build a `DataLoaders` for every pretrained model listed above and record whether one batch comes out
# with the expected shapes; each model yields one (arch, tokenizer, model_name, result, error) row.
model_cls = AutoModelForSeq2SeqLM
bsz = 2
seq_sz = 128
trg_seq_sz = 128

test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # only the mbart checkpoint is given explicit source/target language codes
    hf_tok_kwargs = {}
    if model_name == "sshleifer/tiny-mbart":
        hf_tok_kwargs["src_lang"], hf_tok_kwargs["tgt_lang"] = "de_DE", "en_XX"

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_kwargs=hf_tok_kwargs)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    def add_t5_prefix(inp):
        return f"translate German to English: {inp}" if (hf_arch == "t5") else inp

    try:
        # building the pipeline and drawing a batch happen inside the `try` so that a failure here is
        # recorded as FAILED for this model instead of aborting the sweep over the remaining models
        batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
            hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz, max_target_length=trg_seq_sz
        )

        blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
        dblock = DataBlock(blocks=blocks, get_x=Pipeline([ColReader("de"), add_t5_prefix]), get_y=ColReader("en"), splitter=RandomSplitter())

        dls = dblock.dataloaders(wmt_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        if hasattr(hf_tokenizer, "add_prefix_space"):
            test_eq(hf_tokenizer.add_prefix_space, True)

        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))
        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))


=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s> Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maß</s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"<s> Den Häfen von Mitgliedsländern am Mittelmeer wie beispielsweise meiner Heimat, Griechenland, werden ganz erhebliche Probleme im Wettbewerb mit den Häfen benachbarter, nicht der Gemeinschaft angehörender Länder erwachsen, denn natürlich können die Kapitäne der das Mittelmeer befahrenden Schiffe nicht daran gehindert werden,</s>","The other side to the coin is that ports in Member States in the Mediterranean area, such as my own country, Greece, will face particularly stiff competition from ports in neighbouring non-EU countries as, of course, nothing will prevent masters of ships sailing through the Mediterranean from docking at Turkish or North-African ports so as to avoid paying the objectively high fees which are to apply in Community ports, irrespective of use of facilities."


=== facebook/wmt19-de-en ===

architecture:	fsmt
tokenizer:	FSMTTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Bericht (A5-0089 / 2000) von Frau Rühle im Namen des Ausschusses für Haushaltskontrolle über den Aufschub des Beschlusses zur Entlastung der Kommission für die Haushaltsführung des sechsten, siebten und achten Europäischen Entwicklungsfonds für das Haushaltsjahr 1998 (KOM (1999) 227 - C5-0003 / 1999 - 1999 / 2004 (DEC)), über die Entlastung der Europäischen Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen (Dublin) für das Haushaltsjahr 1998 (C5-0150 / 2000 - 2000 / 2094 (DEC)), über die Entlastung des Europäischen Z</s>","Report (A5-0089 / 2000) by Mr Rühle, on behalf of the Committee on Budgetary Control, on postponement of the decision concerning discharge to the Commission in respect of the financial management of the sixth, seventh and eighth European Development Funds for the 1998 financial year (COM (1999) 227 - C50003 / 1999 - 1999 / 2004 (DEC)); on granting discharge to the European Foundation for the Improvement of Living and Working Conditions, Dublin, for the 1998 financial year (C5-0150 / 2000"
1,"Zusammen mit meinem Kollegen Dominique Souchet, dem dieses Thema sehr am Herzen liegt, habe ich fünf Änderungsanträge eingebracht, die die Rolle der KMU und des Handwerks im Rahmen von INTERREG, die Bedeutung der Kooperation zwischen den Unternehmen und die notwendige Einbindung der Wirtschafts- und Sozialpartner in die Ausarbeitung und Umsetzung der Programme stärker herausstellen sollen. Diese Änderungsanträge wurden zu meiner großen Freude einstimmig angenommen. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>","I and my colleague Dominique Souchet, who is well versed in this matter, have tabled five amendments highlighting the role of SMEs and craft trades within the framework of INTERREG, the importance of cooperation between undertakings and the need to involve economic and social partners in the design and implementation of the programmes and I welcome the fact that these amendments were adopted unanimously."


=== Helsinki-NLP/opus-mt-de-en ===

architecture:	marian
tokenizer:	MarianTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die▁Ergebnisse der▁Verhandlungen über die▁Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens▁bezüglich▁Warenhandel,▁öffentlicher▁Aufträge,▁Wettbewerb,▁Konsultationsmechanismen▁bei▁Fragen des▁geistigen▁Eigentums und▁Beilegung von▁Streitigkeiten▁betrifft, so▁gilt▁festzuhalten,▁daß wir das▁vorgesehene▁Maßnahmenpaket für ein▁gutes▁Abkommen▁halten,▁auch▁wenn es für▁bestimmte▁Sektoren mit▁Nachteilen▁verbunden▁ist -▁ich▁denke im▁einzelnen an die▁Textilindustrie, die sich▁insbesondere▁wegen der▁Ursprungsregeln▁Sorgen▁macht -, ein▁Abkommen, das▁sowohl der▁Europäischen Union▁als▁auch▁Mexiko▁Vorteile▁bringen▁wird: Für die▁Europäische Union,▁weil die▁Schaffung▁einer▁Freihandelszone es den▁Mitgliedstaaten▁ermöglichen▁wird, auf","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Seine▁Befürworter▁mögen das▁Paket der▁Schlußfolgerungen▁noch so▁sehr mit▁rosafarbenen▁Bändern schmücken,▁indem▁sie▁entweder über die▁sogenannte▁Modernisierung des▁europäischen Gesellschaftsmodells oder über die▁Notwendigkeit▁reden, die▁Europäische Union zum wettbewerbsfähigsten▁Raum der Welt zu▁machen, es▁gelingt▁ihnen▁trotzdem nicht,▁ihre▁Ziele zu verheimlichen: Sie▁wollen▁einen▁Arbeitsmarkt▁ohne die▁Schutzbestimmungen der▁Arbeitnehmerrechte▁erreichen, die▁Sozialversicherung▁vollständig den▁Finanzmärkten▁unterwerfen und den▁Liberalisierungsprozeß in▁zentralen▁Sektoren▁wie Gas,▁Elektrizität, Postdienste,▁Verkehr und Telekommunikation forcieren.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>","Its supporters may try to make its conclusions look rosy, whether they are talking about the so-called modernisation of the European social model or about the need to make the European Union the most competitive area in the world, but they cannot hide their real aims, which are to produce a labour market free from the conditions imposed by legislation designed to protect workers' rights, to put social security entirely in the hands of the financial markets and to speed up the process of liberalisation in fundamental sectors such as gas, electricity, postal services, transport and telecommunications."


=== sshleifer/tiny-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie, die sich insbesondere wegen der Ursprungsregeln Sorgen macht -, ein Abkommen, das sowohl der Europäischen</s>de_DE","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Ein derartiges Signal würde, freiwillig oder unfreiwillig, einem ungeheuren Aufruf zur Einwanderung gleichkommen, bei der allein der Antrag auf Anerkennung als Flüchtling, ob sie nun gewährt wird oder nicht, ausreichen würde, um eine ganze Palette von Hilfsmaßnahmen zu rechtfertigen, die zur dauerhaften Integration des Asylbewerbers in die Arbeitswelt beitragen soll, in ein allgemeines Umfeld, das von Mangel geprägt ist und in dem die reguläre Bevölkerung, Ausländer und Einheimische, auf grausame Weise mit dem Problem der Arbeitslosigkeit konfrontiert ist</s>de_DE","Whether we like it or not, a signal of this sort would be a fearful invitation to immigrants who, merely by applying for refugee status, irrespective of whether or not it is granted, would be able to qualify for a range of aid measures designed to help them integrate permanently into the labour market, in a general climate of shortage, in which the legal population, both foreign and indigenous, is cruelly aware of the problem of unemployment."


=== google/mt5-small ===

architecture:	mt5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie, die sich insbesonder</s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Theoretisch könnten wir dann auch die Forderung der Kommission nach einer Managementstelle für die Strukturförderung in den Mitgliedstaaten, welche die Durchführung und Verwaltung der Strukturinterventionen vor Ort koordiniert, unterstützen, vorausgesetzt, daß sich eine solche Stelle nicht als ein Instrument der Zentralisierung auf supranationaler Ebene erweist und die effektive Aufmerksamkeit gegenüber jenen Gebieten gewährleistet - ich wiederhole es nochmals -, die durch das Zusammenwirken mehrerer negativer Faktoren noch keine adäquate</s>","We could then, theoretically, also support the Commission' s call for a management unit for structural assistance in the Member States, with the task of coordinating the implementation and administration of structural assistance there,provided that this unit does not become a centralising instrument at supranational level, but guarantees real help for those zones - and I stress this once again - which, owing to a whole string of converging negative factors, have not yet harnessed Structural Funds to sufficient levels,"


=== t5-small ===

architecture:	t5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"translate German to English: Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie,</s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"translate German to English: Sie wird aber auf Seite 5 dieser Leitlinien ganz eindeutig genannt, und ich möchte darauf verweisen - weil sie mich dazu aufgefordert haben -, daß diese Partnerschaft für mich - und ich habe lange genug eine Region betreut, um dies beurteilen zu können - ein sehr wirkungsvolles Instrument zur Mobilisierung der geistigen Ressourcen auf lokaler Ebene ist - sowohl derer im öffentlichen Sektor - die Stadt- und Gemeinderäte, den schulischen und gesellschaftlichen Bereich, die Vereine</s>","However, I do wish to mention - since you have asked me to do so - that, as far as I am concerned, this partnership - and I spent long enough as a regional administrator within my own country to be able to say this most sincerely - is a tool, one used to involve local brainpower, be it in the public sector, in the form of elected representatives, the social and educational sectors, associations, or in the private sector; a decentralised partnership, and let me mention in this connection, in response to Mrs Angelilli, the territorial pact"


In [None]:
# slow
# hide_input
test_results_df = pd.DataFrame(test_results, columns=["arch", "tokenizer", "model_name", "result", "error"])
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,bart,BartTokenizerFast,facebook/bart-base,PASSED,
1,fsmt,FSMTTokenizer,facebook/wmt19-de-en,PASSED,
2,marian,MarianTokenizer,Helsinki-NLP/opus-mt-de-en,PASSED,
3,mbart,MBartTokenizerFast,sshleifer/tiny-mbart,PASSED,
4,mt5,T5TokenizerFast,google/mt5-small,PASSED,
5,t5,T5TokenizerFast,t5-small,PASSED,


## Summary

This module includes the fundamental data preprocessing bits to use Blurr for translation.

In [None]:
# hide
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
