In [None]:
# |default_exp text.data.seq2seq.translation
# |default_cls_lvl 3

In [None]:
# | nbflags skip_exec

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# Data

> The `text.data.seq2seq.translation` module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for translation tasks

In [None]:
# |export
import warnings
from typing import Optional

import numpy as np
import pandas as pd

from datasets import Dataset
from fastai.data.block import DataBlock
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerBase
from transformers.utils import logging as hf_logging

from blurr.text.data.seq2seq.core import (
    Seq2SeqBatchTokenizeTransform,
    Seq2SeqPreprocessor,
    Seq2SeqTextBlock,
)
from blurr.text.utils import get_hf_objects

In [None]:
# | hide
import os, ast, pdb
from functools import reduce

from datasets import load_dataset
from fastai.data.transforms import *
from fastai.torch_core import *
from fastai.torch_imports import *
from fastcore.all import *
from fastcore.test import *
from nbdev import nbdev_export
from nbdev.showdoc import show_doc

from blurr.utils import print_versions
from blurr.text.utils import BlurrText

What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.6
transformers: 4.16.2


In [None]:
# |export
# silence warnings: globally ignore all Python warnings (not only those coming from
# Hugging Face) and restrict the `transformers` logger to errors only
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
# | echo: false
# helper used further down to list the available Hugging Face model classes
NLP = BlurrText()

# NOTE(review): presumably set to avoid tokenizer parallelism warnings/deadlocks
# when DataLoader workers fork — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# record the library versions these docs were generated with
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

In [None]:
# |hide
# |cuda
# Prefer the second GPU (index 1) when the machine has more than one; otherwise fall
# back to GPU 0 so this cell also runs on single-GPU machines instead of raising.
gpu_idx = 1 if torch.cuda.device_count() > 1 else 0
torch.cuda.set_device(gpu_idx)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `wmt16` to demonstrate how to configure BLURR for translation tasks.

In [None]:
# load only the first 1% of the German-English ("de-en") training split to keep the demo small
raw_dataset = load_dataset("wmt16", "de-en", split="train[:1%]")
raw_dataset

Reusing dataset wmt16 (/home/wgilliam/.cache/huggingface/datasets/wmt16/de-en/1.0.0/af3c5d746b307726d0de73ebe7f10545361b9cb6f75c83a1734c000e48b6264f)


Dataset({
    features: ['translation'],
    num_rows: 45489
})

In [None]:
# inspect the first record: its top-level keys, then the full example
print(raw_dataset[0].keys())
print(raw_dataset[0])

dict_keys(['translation'])
{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [None]:
# flatten the "translation" records into a DataFrame with one column per language
wmt_df = pd.DataFrame(raw_dataset["translation"], columns=["de", "en"])

# number of examples, followed by a peek at the first two rows
print(len(wmt_df))
wmt_df.head(2)

45489


Unnamed: 0,de,en
0,Wiederaufnahme der Sitzungsperiode,Resumption of the session
1,"Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period."


In [None]:
# checkpoint to load and the Hugging Face auto-class used to build the model
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

# fetch the architecture name, config, tokenizer, and model in one call
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)
# show the architecture name and the concrete types that were resolved
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)

('bart',
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

### `TranslationPreprocessor` -

In [None]:
# |export
class TranslationPreprocessor(Seq2SeqPreprocessor):
    """Build a preprocessed translation dataset from pandas DataFrames or Hugging Face Datasets.

    After delegating the initial processing to `Seq2SeqPreprocessor`, every example gets
    `{attr}_start_char_idx` / `{attr}_end_char_idx` columns and a `proc_{attr}` column holding
    the slice of the raw text spanned by its tokens, for both the input (`text_attr`) and the
    target (`target_text_attr`) texts.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text to translate
        text_attr: str = "original_text",
        # The maximum length (# of tokens) allowed for inputs. Will default to the max length allowed
        # by the model if not provided
        max_input_tok_length: Optional[int] = None,
        # The attribute holding the translation
        target_text_attr: str = "translated_text",
        # The maximum length (# of tokens) allowed for targets
        max_target_tok_length: Optional[int] = None,
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        # (the shared default dict is never mutated: a copy is built below)
        tok_kwargs: dict = {},
    ):
        # we need to use the offset mappings to get back at the raw text from its tokenized representation
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}

        super().__init__(
            hf_tokenizer,
            batch_size,
            text_attr,
            max_input_tok_length,
            target_text_attr,
            max_target_tok_length,
            is_valid_attr,
            tok_kwargs,
        )

        self.id_attr = id_attr

    def process_df(
        self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None
    ) -> pd.DataFrame:
        """Return a single processed DataFrame built from the training (and optional validation) data."""
        df = super().process_df(training_df, validation_df)

        # process df in mini-batches of `batch_size` consecutive rows; the pieces are collected in
        # a list and concatenated once (`DataFrame.append` in a loop is quadratic and no longer
        # exists in pandas >= 2.0)
        batch_keys = np.arange(len(df)) // self.batch_size
        processed_batches = [
            self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_keys)
        ]

        # nothing to process (empty input): keep returning an empty DataFrame
        if not processed_batches:
            return pd.DataFrame()

        return pd.concat(processed_batches, ignore_index=True)

    def process_hf_dataset(
        self, training_ds: Dataset, validation_ds: Optional[Dataset] = None
    ) -> Dataset:
        """Same as `process_df`, but for Hugging Face `Dataset` objects (round-trips through pandas)."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize one mini-batch and add the character-span and `proc_*` text columns."""
        # work on a re-indexed copy (0..n-1) so the columns concatenated below line up row by row,
        # without mutating the group handed to us by `groupby`
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs and targets batch encoding objects
        inputs, targets = self._tokenize_function(batch_df.to_dict(orient="list"))

        # add our processed text and target texts to the batched DataFrame
        for txt_attr, batch_enc in zip(
            [self.text_attr, self.target_text_attr], [inputs, targets]
        ):
            if txt_attr is None:
                break

            start_col = f"{txt_attr}_start_char_idx"
            end_col = f"{txt_attr}_end_char_idx"

            # character span covered by each example's tokens: the smallest (start, end) offset
            # gives the start index, the largest one gives the end index.
            # NOTE(review): offsets of every token are considered (the sequence ids were never
            # used for filtering); special/padding tokens presumably carry (0, 0) offsets, so the
            # start typically resolves to 0 — confirm if left-side truncation must be supported.
            char_idxs = []
            for offset_mapping in batch_enc["offset_mapping"]:
                text_offsets = list(offset_mapping)
                char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])

            batch_df = pd.concat(
                [batch_df, pd.DataFrame(char_idxs, columns=[start_col, end_col])],
                axis=1,
            )

            # the end offset is already exclusive (it equals len(text) for untruncated text), so it
            # is used directly as the slice end; adding 1 would leak one character past the last
            # kept token whenever the text was truncated
            proc_texts = [
                text[start:end]
                for text, (start, end) in zip(batch_df[txt_attr], char_idxs)
            ]
            batch_df.insert(0, f"proc_{txt_attr}", proc_texts)

        return batch_df

This class can be used for preprocessing translation tasks. It adds `proc_{your_text_attr}` and `proc_{target_text_attr}` attributes containing your input and target texts as modified by tokenization (e.g., if you specify a `max_length`, the `proc_{your_text_attr}` may contain truncated text).

#### Using a `DataFrame`

In [None]:
# German ("de") is the source text and English ("en") the target; both are capped at 128 tokens
preprocessor = TranslationPreprocessor(
    hf_tokenizer,
    text_attr="de",
    target_text_attr="en",
    max_input_tok_length=128,
    max_target_tok_length=128,
)
proc_df = preprocessor.process_df(wmt_df)
# only the last expression of a cell is displayed, so the former bare
# `proc_df.columns, len(proc_df)` expression (evaluated and discarded) was dropped
proc_df.head(2)

Unnamed: 0,proc_en,proc_de,de,en,de_start_char_idx,de_end_char_idx,en_start_char_idx,en_end_char_idx
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode,Wiederaufnahme der Sitzungsperiode,Resumption of the session,0,34,0,25
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.","Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.","I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",0,218,0,207


## Examples

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

In [None]:
# checkpoint to load and the Hugging Face auto-class used to build the model
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

# fetch the architecture name, config, tokenizer, and model in one call
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

#####  Step 2: Create your `DataBlock`

Two lines!  Notice we pass in `noop` for our targets (e.g. our translations) because the batch transform will take care of both our inputs and targets.

In [None]:
# the text block handles both inputs and targets at batch time, hence `noop` for the target block
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
# read the German column as input and the English column as target; split the rows randomly
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader("de"),
    get_y=ColReader("en"),
    splitter=RandomSplitter(),
)

In [None]:
# dblock.summary(wmt_df)

##### Step 3: Build your `DataLoaders`

In [None]:
# build the DataLoaders from the raw DataFrame with a batch size of 4
dls = dblock.dataloaders(wmt_df, bs=4)

In [None]:
# pull a single batch to inspect
b = dls.one_batch()

In [None]:
# number of batch elements, then the shapes of the input ids, the labels, and the targets
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(2, torch.Size([4, 483]), torch.Size([4, 86]), torch.Size([4, 86]))

In [None]:
# compare the labels stored with the inputs against the targets for the first example
b[0]["labels"][0], b[1][0]

(tensor([    0,  2223,    13,  1402,  4723,    89,   189,    28, 16855,   111,
            38,   524,  2053,  4010,     9,     5,  2788,  4755,  1293,     6,
           147,     5,  1492,     9,  9813,   696,  4685,   372,  2212,   111,
             5,  3038,    40,    28, 10142,    13,   258,     5,   796,  1332,
             8,  1625,     4,   286,     5,   796,  1332,     6,   142,     5,
          7147,     9,    10,   481,   721,   443,    40,  3155,    24,     7,
          9648,     5,  2621, 10153,   532,    56,    11,  4938,  1048,   137,
             5, 13783,  1288,   376,    88,  1370,     6,  3329,    92,  2919,
          1616,    13,   796,   451,     4,     2], device='cuda:1'),
 tensor([    0,  2223,    13,  1402,  4723,    89,   189,    28, 16855,   111,
            38,   524,  2053,  4010,     9,     5,  2788,  4755,  1293,     6,
           147,     5,  1492,     9,  9813,   696,  4685,   372,  2212,   111,
             5,  3038,    40,    28, 10142,    13,   258,    

In [None]:
# display two examples, truncating the shown inputs and targets at 250
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=250, target_trunc_at=250)

Unnamed: 0,text,target
0,"<s> Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung vo","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union"
1,"<s> Die allgemeine Ausrichtung der umgesetzten Wirtschaftspolitik, der Stabilitätspakt sowie die strengen Konvergenzprogramme, die der Beschäftigung empfindlich schaden und Beschäftigungsfähigkeit sowie Flexibilität der Arbeitsverhältnisse und Arbeit","The general lines of the implemented economic policy, the Stability Pact and the strict convergence programmes, which are a constant menace to employment and which promote employability and the flexibilisation of labour relations, and the organisati"


#### Using a preprocessed dataset

##### Step 1a: Get your Hugging Face objects.

In [None]:
# checkpoint to load and the Hugging Face auto-class used to build the model
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

# fetch the architecture name, config, tokenizer, and model in one call
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

##### Step 1b. Preprocess dataset

In [None]:
# German ("de") is the source text and English ("en") the target; both are capped at 128 tokens
preprocessor = TranslationPreprocessor(
    hf_tokenizer,
    text_attr="de",
    target_text_attr="en",
    max_input_tok_length=128,
    max_target_tok_length=128,
)
# adds the `proc_de` / `proc_en` columns consumed by the DataBlock in the next step
proc_df = preprocessor.process_df(wmt_df)

##### Step 2: Create your `DataBlock`

In [None]:
# the text block handles both inputs and targets at batch time, hence `noop` for the target block
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
# read the preprocessed (`proc_*`) columns instead of the raw text columns
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader("proc_de"),
    get_y=ColReader("proc_en"),
    splitter=RandomSplitter(),
)

##### Step 3: Build your `DataLoaders`

In [None]:
# build the DataLoaders from the preprocessed DataFrame with a batch size of 4
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
# pull a single batch to inspect
b = dls.one_batch()

In [None]:
# number of batch elements, then the shapes of the input ids, the labels, and the targets
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(2, torch.Size([4, 129]), torch.Size([4, 83]), torch.Size([4, 83]))

In [None]:
# display two examples, truncating the shown inputs and targets at 250
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=250, target_trunc_at=250)

Unnamed: 0,text,target
0,"<s> Hierbei denke ich an systematische Dokumentation und Informationsbeschaffung, professionelle Formen der Beobachtung, die Entwicklung von Aufklärungsaktionen, die Verwendung von Geldern zur Unterstützung der demokratischen Kräfte in dem betreffend","The following spring to mind in this respect: the systematic collation of documentation and information, professional forms of observation, the development of information campaigns, the use of cash to support democratic forces in the country concern"
1,"<s> Eine letzte Bemerkung: Die durch die Struktur des Internet bedingte permanente Verfügbarkeit von Sexuellem im ausschließlich anonymisierten privaten Bereich und die Tatsache, daß der sexuelle Mißbrauch der öffentlichen und damit der sozialen Kont","One final comment: the permanent availability of sexual material in the exclusively anonymous private sphere, which is conditioned by the structure of the Internet, and the attendant fact that sexual abuse is removed from public and, hence, social c"


## Tests

The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained **translation models** below.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained translation models you are working with ... and if any of your pretrained translation models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# list the "ConditionalGeneration" model classes, leaving out the "TF"-prefixed ones
[
    name
    for name in NLP.get_models(task="ConditionalGeneration")
    if not name.startswith("TF")
]

['BartForConditionalGeneration',
 'BigBirdPegasusForConditionalGeneration',
 'BlenderbotForConditionalGeneration',
 'BlenderbotSmallForConditionalGeneration',
 'FSMTForConditionalGeneration',
 'LEDForConditionalGeneration',
 'M2M100ForConditionalGeneration',
 'MBartForConditionalGeneration',
 'MT5ForConditionalGeneration',
 'PegasusForConditionalGeneration',
 'ProphetNetForConditionalGeneration',
 'Speech2TextForConditionalGeneration',
 'T5ForConditionalGeneration',
 'XLMProphetNetForConditionalGeneration']

In [None]:
# checkpoints exercised by the test loop that iterates over this list
pretrained_model_names = [
    "facebook/bart-base",
    "facebook/wmt19-de-en",  # FSMT
    "Helsinki-NLP/opus-mt-de-en",  # MarianMT
    "sshleifer/tiny-mbart",
    "google/mt5-small",
    "t5-small",
]

In [None]:
# NOTE(review): `path` is not referenced by any of the visible cells — confirm it is still needed
path = Path("./")
# rebuild the raw German/English DataFrame used by the tests
wmt_df = pd.DataFrame(raw_dataset["translation"], columns=["de", "en"])

In [None]:
# |slow
# | output: false
model_cls = AutoModelForSeq2SeqLM
bsz = 2
seq_sz = 128
trg_seq_sz = 128

# one (arch, tokenizer, model_name, result, error) tuple per tested checkpoint
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # mbart needs to be told the source and target languages
    hf_tok_kwargs = {}
    if model_name == "sshleifer/tiny-mbart":
        hf_tok_kwargs["src_lang"], hf_tok_kwargs["tgt_lang"] = "de_DE", "en_XX"

    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
        model_name, model_cls=model_cls, tokenizer_kwargs=hf_tok_kwargs
    )

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    def add_t5_prefix(inp):
        # t5 expects a task prefix; every other architecture gets the text unchanged
        return f"translate German to English: {inp}" if (hf_arch == "t5") else inp

    # Everything from building the transform to rendering a batch runs inside the `try`, so a
    # failure for one checkpoint is recorded as FAILED instead of aborting the whole loop.
    try:
        print("*** TESTING DataLoaders ***\n")

        # pad every input/target to a fixed length so the batch shapes can be asserted exactly
        batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            padding="max_length",
            max_length=seq_sz,
            max_target_length=trg_seq_sz,
        )

        blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
        dblock = DataBlock(
            blocks=blocks,
            get_x=Pipeline([ColReader("de"), add_t5_prefix]),
            get_y=ColReader("en"),
            splitter=RandomSplitter(),
        )

        dls = dblock.dataloaders(wmt_df, bs=bsz)
        b = dls.one_batch()

        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        if hasattr(hf_tokenizer, "add_prefix_space"):
            test_eq(hf_tokenizer.add_prefix_space, True)

        # render before recording the result so a display failure cannot leave both a PASSED
        # and a FAILED row for the same checkpoint
        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)
        test_results.append(
            (hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", "")
        )

    except Exception as err:
        test_results.append(
            (hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err)
        )

=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s> Bericht (A5-0089/2000) von Frau Rühle im Namen des Ausschusses für Haushaltskontrolle über den Aufschub des Beschlusses zur Entlastung der Kommission für die Haushaltsführung des sechsten, siebten und achten Europäischen Entwicklungsfonds für das Haushaltsjahr 1998 (KOM(1999) 227 - C5-0003/1999 - 1999/2004(</s>","Report (A5-0089/2000) by Mr Rühle, on behalf of the Committee on Budgetary Control, on postponement of the decision concerning discharge to the Commission in respect of the financial management of the sixth, seventh and eighth European Development Funds for the 1998 financial year (COM(1999) 227 - C5­0003/1999 - 1999/2004(DEC)); on granting discharge to the European Foundation for the Improvement of Living and Working Conditions, Dublin, for the 1998 financial year (C5-0150/2000 - 2000/2094(DEC)); on granting discharge to the European Centre"
1,"<s> Wesentliche Pfeiler, die ein Fundament dafür schaffen könnten, sind unserer Meinung nach die Aufnahme eines materiell abgesicherten Kapitels über den Fremdenverkehr im Rahmen der Reform, das auch die Rechtsgrundlage für den Sektor darstellt, die Herausbildung einer EU-weiten Fremdenverkehrspolitik, die den Sektor fördert, ohne das Funktion</s>","In our view, the axes which could create a basis for its support are: the addition of material capital for tourism in the new review which will also constitute a legal base for the sector, the framing of a Community tourism policy to support the sector without disturbing the operation of the tourism market, coordination of the tourism policy with other, parallel Community policies which affect it directly or indirectly, a study of the sector, and the adoption of medium and long-term measures to enhance the competitiveness of tourism as a product."


=== facebook/wmt19-de-en ===

architecture:	fsmt
tokenizer:	FSMTTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie, die sich insbesondere wegen der Ursprungsregeln Sorgen macht -, ein Abkommen, das sowohl der Europäischen Union als auch Mexiko </s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Herr Präsident, ich habe auch für die Entschließung zu der Mitteilung der Kommission ""Strategie für den europäischen Binnenmarkt"" gestimmt, obwohl in ihrem Text die Stellungnahme, die Herr Medina Ortega im Namen des Ausschusses für Beschäftigung und soziale Angelegenheiten erarbeitet hat, nicht enthalten ist. Dort heißt es auf Seite 20 wörtlich: ""Unter dem Aspekt der sozialen Sicherheit erfordert das Ideal der Integration die Einführung einer europäischen Sozialversicherung,"" - ich wiederhole, einer europäischen Sozialversicherung - ""die das gegenwärtige System der Harmonisierung der nationalen Systeme ablöst"". </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>","Mr President, I also voted for the Communication: ""The strategy for Europe's internal market"", despite the fact that the text did not include the opinion drafted by Mr Medina Ortega on behalf of the Committee on Employment and Social Affairs, page 20 of which states that from the point of view of social security, the ideal of supplementary pensions requires the creation of a genuine European social security system - and I repeat, European social security system - to replace the current approach, which is based on the harmonisation of national systems."


=== Helsinki-NLP/opus-mt-de-en ===

architecture:	marian
tokenizer:	MarianTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die▁Ergebnisse der▁Verhandlungen über die▁Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens▁bezüglich▁Warenhandel,▁öffentlicher▁Aufträge,▁Wettbewerb,▁Konsultationsmechanismen▁bei▁Fragen des▁geistigen▁Eigentums und▁Beilegung von▁Streitigkeiten▁betrifft, so▁gilt▁festzuhalten,▁daß wir das▁vorgesehene▁Maßnahmenpaket für ein▁gutes▁Abkommen▁halten,▁auch▁wenn es für▁bestimmte▁Sektoren mit▁Nachteilen▁verbunden▁ist -▁ich▁denke im▁einzelnen an die▁Textilindustrie, die sich▁insbesondere▁wegen der▁Ursprungsregeln▁Sorgen▁macht -, ein▁Abkommen, das▁sowohl der▁Europäischen Union▁als▁auch▁Mexiko▁Vorteile▁bringen▁wird: Für die▁Europäische Union,▁weil die▁Schaffung▁einer▁Freihandelszone es den▁Mitgliedstaaten▁ermöglichen▁wird, auf","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"▁Unserer▁Meinung nach▁kommt▁oberste▁Priorität der▁wirtschafts- und▁sozialpolitische Agenda zu, die das einschließt, was in▁Ihren▁Prioritäten▁auch▁Lebensqualität▁genannt▁wird, das▁heißt, die▁Rechte der▁Bürger▁als▁Verbraucher und▁auch▁als▁Personen.▁Dabei▁geht es um▁Fragen, die wir▁immer▁wieder▁als▁gegeben▁voraussetzen,▁aber nicht▁genügend▁herausstellen, zum▁Beispiel um das▁europäische▁Sozialmodell und▁seine▁Anpassung an▁neue▁Gegebenheiten, um die▁Verbraucherrechte und den▁Schutz der Umwelt▁sowie um eine▁nachhaltige▁Entwicklung.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>","We understand that the first priority is the economic and social agenda, which also includes what is known as 'quality of life' priorities, that is to say, the rights of citizens as consumers and as people, in relation to those questions which we always talk about, but never give sufficient priority to, that is, the European social model and its adaptation to the new circumstances, consumer rights, respect for the environment and sustainable development."


=== sshleifer/tiny-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie, die sich insbesondere wegen der Ursprungsregeln Sorgen macht -, ein Abkommen, das sowohl der Europäischen</s>de_DE","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Dasselbe gilt für das europäische Patent, dessen Verkauf uns ebenfalls nicht gelingen wird; ferner für die Ausschreibungsnormen, für den Kapitalmarkt; auch für die Früchte der Koordinierung der Sozialpolitik, denn es ist sinnlos, große Mobilitätsmöglichkeiten ins Auge zu fassen, erfolgreiche Programme wie ERASMUS oder SOKRATES zur Erhöhung der Mobilität durchzuführen, wenn wir dann Regelungen für die soziale Sicherheit haben, die dieser Mobilität nicht förderlich sind.</s>de_DE<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>","The same applies to the Community patent - for we will not be able to sell that either - to procurement rules and the capital market and to the beginnings of the coordination of social security, because there is no point in us dreaming up great expectations of mobility or developing such successful programmes as ERASMUS or SOCRATES to make people more mobile, if our social security rules do not then support this mobility."


=== google/mt5-small ===

architecture:	mt5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie, die sich insbesonder</s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"Wie Frau Abgeordnete Anna Terrón in ihrer Wortmeldung gesagt hat, bezeichnet das Jahr 1999 im Bereich der Justiz und der inneren Angelegenheiten ein Jahr großer Erwartungen. Erwartungen, die sich mit dem Inkrafttreten des Vertrags von Amsterdam konkretisieren, Erwartungen, die sich mit den Schlußfolgerungen des Rates von Tampere konkretisieren, aber auch Erwartungen, bei denen es darauf ankommt, daß sie sich ab diesem Jahr in konkreten Handlungen und Leistungen niederschlagen, und in diesem Sinne sind wir uns alle</s>","As Mrs Terrón i Cusí said in her speech, 1999 was a year of great expectations in the field of justice and home affairs - an expectation realised with the coming into force of the Treaty of Amsterdam, an expectation realised with the conclusions of the Tampere Council, and a fundamental expectation, as from this year, that will be realised in the form of specific actions."


=== t5-small ===

architecture:	t5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"translate German to English: Was nun die Ergebnisse der Verhandlungen über die Anwendung der Artikel 3, 4, 5, 6 und 12 des Interimsabkommens bezüglich Warenhandel, öffentlicher Aufträge, Wettbewerb, Konsultationsmechanismen bei Fragen des geistigen Eigentums und Beilegung von Streitigkeiten betrifft, so gilt festzuhalten, daß wir das vorgesehene Maßnahmenpaket für ein gutes Abkommen halten, auch wenn es für bestimmte Sektoren mit Nachteilen verbunden ist - ich denke im einzelnen an die Textilindustrie,</s>","Although for certain sectors there may be flaws - I am thinking specifically of the textiles sector, where the rules of origin issue causes great concern - the effects will be beneficial for both the European Union and Mexico. For the European Union, because the establishment of a free trade area will enable it to rebuild the presence Member States had in Mexican markets before the NAFTA agreement came into force, involving new expansion opportunities for European companies."
1,"translate German to English: Das beweist unter anderem, daß der Tourismus gar nicht das Erdöl der Armen ist, weil er enorme Investitionen in Strukturen, Infrastrukturen, Unternehmertum, Berufswesen, soziale Leistungen und Ausbildung erfordert; daß die Tourismusindustrie von grundlegender Bedeutung für das Gleichgewicht in vielen Regionen ist, weil sie direkte Quelle von Beschäftigung ist und einen Markt für viele lokale Produktionsaktivitäten bietet, nämlich Landwirtschaft, Kleinindustrie und Handwerk, Verkehr, Dienstleistungen, Handel, Bildung; </s>","This reminds us that tourism cannot be regarded as the poor countries' petroleum industry, because it requires huge investment in structures, infrastructures, entrepreneurship, professional expertise, social services and training; that the tourist industry is essential for the equilibrium of many regions as it provides a direct source of employment and a market for all local productive activities: agriculture, small firms in the industrial and craft sector, transport, services, retail sector, training, etc; that the development of the Community' s tourist industry is closely linked to its sustainability with regard to the Community objectives; that the varied geography and historical, cultural and environmental"


In [None]:
# |slow
# | echo: false
# tabulate the per-checkpoint outcomes collected in `test_results`
test_results_df = pd.DataFrame(
    test_results, columns=["arch", "tokenizer", "model_name", "result", "error"]
)
display_df(test_results_df)

Unnamed: 0,arch,tokenizer,model_name,result,error
0,bart,BartTokenizerFast,facebook/bart-base,PASSED,
1,fsmt,FSMTTokenizer,facebook/wmt19-de-en,PASSED,
2,marian,MarianTokenizer,Helsinki-NLP/opus-mt-de-en,PASSED,
3,mbart,MBartTokenizerFast,sshleifer/tiny-mbart,PASSED,
4,mt5,T5TokenizerFast,google/mt5-small,PASSED,
5,t5,T5TokenizerFast,t5-small,PASSED,


## Export -

In [None]:
# |hide
# export the cells marked `# |export` from the project's notebooks to the library modules
nbdev_export()

Converted 00_callbacks.ipynb.
Converted 00_utils.ipynb.
Converted 01_text-callbacks.ipynb.
Converted 01_text-utils.ipynb.
Converted 11_text-data-core.ipynb.
Converted 11_text-modeling-core.ipynb.
Converted 12_text-data-language-modeling.ipynb.
Converted 12_text-modeling-language-modeling.ipynb.
Converted 13_text-data-token-classification.ipynb.
Converted 13_text-modeling-token-classification.ipynb.
Converted 14_text-data-question-answering.ipynb.
Converted 14_text-modeling-question-answering.ipynb.
Converted 20_text-data-seq2seq-core.ipynb.
Converted 20_text-modeling-seq2seq-core.ipynb.
Converted 21_text-data-seq2seq-summarization.ipynb.
Converted 21_text-modeling-seq2seq-summarization.ipynb.
Converted 22_text-data-seq2seq-translation.ipynb.
Converted 22_text-modeling-seq2seq-translation.ipynb.
Converted 99a_text-examples-high-level-api.ipynb.
Converted 99b_text-examples-glue.ipynb.
Converted 99c_text-examples-glue-plain-pytorch.ipynb.
Converted 99d_text-examples-multilabel.ipynb.
Conv