In [None]:
%%bash
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub

In [1]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
MODEL_NAME = "dumitrescustefan/t5-v1_1-base-romanian"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.generation_config.min_new_tokens = 0
model.generation_config.max_new_tokens = 256
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are 

In [3]:
model.resize_token_embeddings(len(tokenizer))

Embedding(64101, 768)

In [6]:
# model.config.pad_token_id = 64100
# model.generation_config.pad_token_id = 64100
# model.generation_config.eos_token_id = 2
# model.generation_config.decoder_start_token_id = 64100

In [4]:
model.config

T5Config {
  "_name_or_path": "dumitrescustefan/t5-v1_1-base-romanian",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 64101
}

In [5]:
model.generation_config

GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "max_new_tokens": 256,
  "min_new_tokens": 0,
  "pad_token_id": 0
}

In [6]:
tokenizer

T5Tokenizer(name_or_path='dumitrescustefan/t5-v1_1-base-romanian', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<ext

In [7]:
print(tokenizer.get_vocab()['<pad>'])
tokenizer.get_vocab()['</s>']

64100


2

In [11]:
# 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
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
dataset = load_dataset("mateiaassAI/MEID")#, split=['train[:75%]', 'train[80%:81%]'])
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['wrong', 'right'],
        num_rows: 161046
    })
    test: Dataset({
        features: ['wrong', 'right'],
        num_rows: 40262
    })
})

In [5]:
# We prefix our tasks with "answer the question"
prefix = "Please translate this sentence: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["wrong"]]
   model_inputs = tokenizer(inputs, max_length=256, padding = True, truncation=True)

   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["right"],
                      max_length=256,
                      padding = True,
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [14]:
# Map the preprocessing function across our dataset
tokenized_dataset = dataset["train"].select(range(100)).map(preprocess_function, batched=True)
tokenized_dataset_test = dataset["train"].select(range(100)).map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
tokenized_dataset[0]

{'wrong': 'În cazulcandidatului ndipenednt se înscrie și funcția pentru care acesta a candidat  primar și local',
 'right': 'În cazul candidatului independent se înscrie și funcția pentru care acesta a candidat: primar și consilier local',
 'input_ids': [12828,
  5,
  29,
  10,
  52413,
  10,
  21799,
  24,
  23770,
  50,
  10,
  44,
  58,
  206,
  56001,
  67,
  3,
  39,
  66,
  9,
  18271,
  66,
  39,
  37,
  24,
  4938,
  11,
  1704,
  26,
  17,
  226,
  3,
  5,
  3125,
  1745,
  11,
  826,
  2,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
  64100,
 

In [11]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [12]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

In [13]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 2

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   logging_steps=100,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False,
   save_strategy="steps",
   save_steps=500,
)

In [15]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset,
   eval_dataset=tokenized_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.364053,0.053295,0.001297,0.047772,0.047801
2,No log,2.265996,0.003565,0.0,0.003733,0.003779


TrainOutput(global_step=50, training_loss=2.861773681640625, metrics={'train_runtime': 329.0507, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.152, 'total_flos': 76019112345600.0, 'train_loss': 2.861773681640625, 'epoch': 2.0})

In [None]:
last_checkpoint = "/content/results/checkpoint-500"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [None]:
my_question = "sa dus al mass."
inputs = "Please translate this sentence: " + my_question

In [None]:
inputs = tokenizer(inputs, return_tensors="pt")
outputs = finetuned_model.generate(**inputs, max_new_tokens = 256)
answer = tokenizer.decode(outputs[0])
print(answer)