# Install all the necessary libraries

In [1]:
! pip install transformers datasets evaluate
! pip install tokenizers
! pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


# Load the tokenizer

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "t5-base" checkpoint name. It is reused
# further down to build the dataset, to configure the data collator and to
# decode predictions.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# Load data from file

In [4]:
# Word list the synthetic dataset samples from: one entry per line of the file.
paths = ["./web2.txt"]

# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(paths[0], encoding="utf-8") as file:
    # rstrip() drops the trailing newline (and any other trailing whitespace).
    lines = [line.rstrip() for line in file]

# Set maximum sequence length to 32

In [5]:
# Bound on the number of words per generated example: the dataset draws
# random.randint(0, max_seq_len - 1) words for each example.
max_seq_len = 32

# Build a PyTorch-compatible dataset

In [6]:
import random
from torch.utils.data import Dataset
import torch
class CustomDataset(Dataset):
    """Synthetic "put the words back in order" dataset for seq2seq training.

    Every example is built from a random subset of a word list. The model
    input is the subset in shuffled order, joined with spaces; the label is
    the same subset in word-list order (the list is presumably already
    sorted — confirm for the file that is loaded).

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True)`` and
            ``tokenizer(text_target=text, max_length=..., truncation=True)``.
            It must return a mapping that has an ``"input_ids"`` entry and
            supports item assignment.
        num_examples: Number of examples to generate.
        words: Sequence of words to sample from. ``None`` falls back to the
            notebook-level ``lines`` list.
        max_words: Exclusive upper bound on the number of words per example.
            ``None`` falls back to the notebook-level ``max_seq_len``.
        max_length: Truncation limit handed to the tokenizer for both the
            input and the label.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the dataset is reproducible; when
            ``None`` the module-level ``random`` functions are used.

    Raises:
        ValueError: If the word list is empty.
    """

    def __init__(self, tokenizer, num_examples=30000, words=None,
                 max_words=None, max_length=32, seed=None):
        vocabulary = lines if words is None else words
        word_limit = max_seq_len if max_words is None else max_words
        if len(vocabulary) == 0:
            raise ValueError("CustomDataset needs a non-empty word list")

        # A private generator keeps a seeded dataset reproducible without
        # touching the global random state.
        rng = random if seed is None else random.Random(seed)
        # sample() cannot return more distinct words than the list contains.
        longest = min(word_limit - 1, len(vocabulary))

        self.examples = []
        for example_idx in range(num_examples):
            # Randomise the number of words; zero is possible and produces an
            # example whose input and label are both the empty string.
            seq_length = rng.randint(0, longest)
            indices = rng.sample(range(len(vocabulary)), seq_length)
            # Sorting the sampled indices restores word-list order.
            tokens = [vocabulary[j] for j in sorted(indices)]
            ordered = " ".join(tokens)
            rng.shuffle(tokens)
            unordered = " ".join(tokens)
            # Print one sample pair so the generated data can be eyeballed.
            if example_idx == 3:
                print("input = ", unordered)
                print("output = ", ordered)

            # NOTE(review): max_length counts tokenizer tokens, not words. If
            # words split into several tokens, the shuffled input and the
            # sorted label can be cut at different words, so a label may name
            # words that are missing from its input. Consider a larger
            # max_length if that matters for training.
            model_inputs = tokenizer(unordered, max_length=max_length, truncation=True)

            labels = tokenizer(text_target=ordered, max_length=max_length, truncation=True)

            model_inputs["labels"] = labels["input_ids"]
            self.examples.append(model_inputs)

    def __len__(self):
        """Return the number of generated examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the tokenized example stored at position ``i``."""
        return self.examples[i]
      
# Create the train and evaluation datasets. Each call generates its own random
# examples, so the evaluation set is a fresh sample from the same procedure
# rather than a held-out split of the training set.
train_dataset = CustomDataset(tokenizer)
eval_dataset = CustomDataset(tokenizer)



input =  plastotype tricarballylic versemongery unguessable caprone Adolph pook mycogastritis parbake unconspiringly Cerasus tricuspidal outshame propatriotic stuprum glomerulose goalkeeper nervation Kadmi corrivation philology isophene polychord simpletonianism euphoniously furriery Moattalite
output =  Adolph caprone Cerasus corrivation euphoniously furriery glomerulose goalkeeper isophene Kadmi Moattalite mycogastritis nervation outshame parbake philology plastotype polychord pook propatriotic simpletonianism stuprum tricarballylic tricuspidal unconspiringly unguessable versemongery
input =  mannerliness digredient escheatage preabundantly cidaris Gelidium frotton undissuadably callet fakement kroon exasperating uncastrated eutechnic transshipment Beth iconodulist bedamp polynome stallage photosculptural
output =  bedamp Beth callet cidaris digredient escheatage eutechnic exasperating fakement frotton Gelidium iconodulist kroon mannerliness photosculptural polynome preabundantly sta

# Load the pre-trained model (t5-base)
 

In [None]:
# already tried with simple with no good results.

In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Start from the pre-trained "t5-base" checkpoint; the trainer below fine-tunes it.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [8]:
from transformers import DataCollatorForSeq2Seq
# The examples are tokenized without padding, so batching them is left to this
# collator. The model is passed as well (presumably so decoder inputs can be
# derived from the labels — confirm in the DataCollatorForSeq2Seq docs).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define evaluation function

In [9]:
import numpy as np
import evaluate

# SacreBLEU metric loaded through the `evaluate` library; compute_metrics below
# calls metric.compute() on the decoded predictions and references.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for metric.compute().

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped prediction strings and
        a list in which every stripped reference is wrapped in its own
        single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation run.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU ``score``) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), both rounded to four
        decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded. Labels use it for
    # ignored positions, and depending on the transformers version the gathered
    # predictions can contain it as padding too, so both arrays are mapped to
    # the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Counted after the -100 replacement so padding never inflates the length.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Training 

In [None]:
# Define training hyperparameters 

In [15]:
# Hyperparameters for the fine-tuning run; the trainer's outputs are written
# under ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Run evaluation at the end of every epoch.
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # Cap on the number of checkpoints kept on disk.
    save_total_limit=3,
    num_train_epochs=5,
    # Half-precision training.
    fp16=True,
    # Evaluate by generating sequences, so compute_metrics receives token ids
    # that can be decoded to text.
    predict_with_generate=True,
)

# Wire together the model, the datasets, the collator and the metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start fine-tuning; the returned TrainOutput is shown as the cell output.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 30000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2345
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,2.597294,5.9083,17.2807
2,2.793000,2.568232,6.6161,17.2797
3,2.734000,2.548333,7.0103,17.2763
4,2.708700,2.541528,7.2107,17.2747
5,2.688400,2.537495,7.3301,17.275


***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_

TrainOutput(global_step=2345, training_loss=2.7240923070195895, metrics={'train_runtime': 1213.7023, 'train_samples_per_second': 123.589, 'train_steps_per_second': 1.932, 'total_flos': 5708980224000000.0, 'train_loss': 2.7240923070195895, 'epoch': 5.0})

In [17]:
# Save the tokenizer and the fine-tuned model for inference. They are written
# to the local "tokenizer" and "model" directories respectively.
tokenizer.save_pretrained('tokenizer')
trainer.model.save_pretrained('model')

tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json
Configuration saved in model/config.json
Configuration saved in model/generation_config.json
Model weights saved in model/pytorch_model.bin


## Inference

In [18]:
from transformers import AutoModelForSeq2SeqLM

# Reload the fine-tuned model and tokenizer from the local "model" and
# "tokenizer" directories. This rebinds the notebook-level `model` and
# `tokenizer` names used by the earlier cells.
model = AutoModelForSeq2SeqLM.from_pretrained('model')
tokenizer = AutoTokenizer.from_pretrained('tokenizer')

loading configuration file model/config.json
Model config T5Config {
  "_name_or_path": "model",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
 

In [None]:
# Inference

In [23]:
# Tiny smoke test: ask the reloaded model to order a two-word input.
text = "c a"
# Only the input ids are kept; return_tensors="pt" yields PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt").input_ids
# Deterministic decoding (no sampling), capped at 32 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=32, do_sample=False)
# Last expression of the cell, so the decoded string is shown as its output.
tokenizer.decode(outputs[0], skip_special_tokens=True)

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



'a c'