In [1]:
from datasets import load_dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import elotl.corpus
import elotl.nahuatl.orthography

corpus_na = elotl.corpus.load('axolotl')  
n = elotl.nahuatl.orthography.Normalizer("sep")

translation = []
idx = []
for i, line in enumerate(corpus_na):
    idx.append(i)
    translation.append({'es': line[0], 'na': n.normalize(line[1])})

In [3]:
translation[0], idx[0]

({'es': 'Vino a iluminar el sol y allí fue a ver a su',
  'na': 'tlaminako tonati uan noponi kiitato'},
 0)

In [4]:
my_dict = { "id": idx, "translation": translation}

books = Dataset.from_dict(my_dict)
books = books.train_test_split(test_size=0.1)

In [5]:
books

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 14505
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1612
    })
})

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [8]:
source_lang = "es"

target_lang = "na"

prefix = "translate Spanish to Nahuatl: "


def preprocess_function(examples):

    inputs = [prefix + example[source_lang] for example in examples["translation"]]

    targets = [example[target_lang] for example in examples["translation"]]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():

        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [9]:
tokenized_books = books.map(preprocess_function, batched=True)

100%|███████████████████████████████████████████| 15/15 [00:01<00:00, 11.80ba/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 16.38ba/s]


In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
training_args = Seq2SeqTrainingArguments(

    output_dir="./results",

    evaluation_strategy="steps",
    eval_steps=100,

    learning_rate=2e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    weight_decay=0.01,

    save_total_limit=3,

    num_train_epochs=3,

    fp16=True,

)

trainer = Seq2SeqTrainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_books["train"],

    eval_dataset=tokenized_books["test"],
    
    tokenizer=tokenizer,

    data_collator=data_collator,

)

trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14505
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2721


Step,Training Loss,Validation Loss
100,No log,4.138365
200,No log,3.872991
300,No log,3.726564
400,No log,3.621414
500,4.117900,3.534052
600,4.117900,3.468971
700,4.117900,3.415693
800,4.117900,3.370355
900,4.117900,3.332528
1000,3.638800,3.299966


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  

TrainOutput(global_step=2721, training_loss=3.5787208386091396, metrics={'train_runtime': 420.6641, 'train_samples_per_second': 103.444, 'train_steps_per_second': 6.468, 'total_flos': 1455046226313216.0, 'train_loss': 3.5787208386091396, 'epoch': 3.0})

In [12]:
model.to("cpu")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [19]:
books['test'][1]

{'id': 4660,
 'translation': {'es': 'En este año se desbordó el Acuecuéxatl, y se inundaron los mexicas.',
  'na': 'ipan in nikan kis in akuekuexatl inik apachiuke mexika.'}}

In [20]:
input_ids = tokenizer("translate Spanish to Nahuatl: En este año", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['tej ye ye tlako']