In [1]:
!pip install evaluate
!pip install rouge_score



In [2]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import evaluate
import numpy as np

In [3]:
# Hub identifier of the pretrained checkpoint; the same name is used to load
# both the tokenizer and the model weights.
MODEL_NAME = "dumitrescustefan/t5-v1_1-base-romanian"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Optional generation-length overrides, currently disabled:
# model.generation_config.min_new_tokens = 0
# model.generation_config.max_new_tokens = 256
# Batch collator for seq2seq training; it is handed to Seq2SeqTrainer further down.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are 

In [4]:
# Resize the model's token-embedding matrix to the tokenizer's vocabulary size
# (loading the tokenizer reported that special tokens were added to the vocabulary).
model.resize_token_embeddings(len(tokenizer))

Embedding(64101, 768)

In [5]:
# Authenticate with the Hugging Face Hub without embedding a credential in the notebook.
# SECURITY: an access token used to be hardcoded in this cell. Any token that was ever
# committed or shared this way must be treated as leaked and revoked in the Hub settings.
import os
from getpass import getpass

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; fall back to an interactive prompt
# whose input is not echoed and is never stored in the notebook file.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [6]:
from datasets import load_dataset

# Load MEID_v2 and carve its single `train` split into two slices: the first 97%
# and the remaining 3%. With a list passed as `split`, the result is a list holding
# one Dataset per requested slice (each with 'wrong' and 'right' columns).
dataset = load_dataset("mateiaassAI/MEID_v2", split=['train[:97%]', 'train[97%:100%]'])
dataset

[Dataset({
     features: ['wrong', 'right'],
     num_rows: 1967695
 }),
 Dataset({
     features: ['wrong', 'right'],
     num_rows: 60857
 })]

In [7]:
# Unpack the two slices: the 97% training portion first, the held-out 3% second.
ds_train, ds_test = dataset
ds_test

Dataset({
    features: ['wrong', 'right'],
    num_rows: 60857
})

In [8]:
# Endings a target sentence must have in order to be kept.
punctuation_marks = ['.', '?', '!', ';', '...']

def filter_sentences(sentences):
    """Decide whether a single dataset row is kept.

    A row passes when its ``'right'`` text ends with one of
    ``punctuation_marks`` and contains at least 10 whitespace-separated words.

    Args:
        sentences: One example (mapping) that has a ``'right'`` string field.

    Returns:
        bool: ``True`` to keep the row, ``False`` to drop it.
    """
    target = sentences['right']
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    if not target.endswith(tuple(punctuation_marks)):
        return False
    return len(target.split()) >= 10

In [9]:
# Keep only the training rows accepted by filter_sentences (applied row by row).
fds_train = ds_train.filter(filter_sentences, batched=False)
# Cap the evaluation set at its first 50,000 rows. It is NOT filtered:
# the corresponding filter call below is commented out.
ds_test = ds_test.take(50000)
# fds_test = ds_test.filter(filter_sentences, batched=False)

In [10]:
# Row counts: filtered training set first, evaluation set second (one per line).
print(len(fds_train), len(ds_test), sep="\n")

1007018
50000


In [11]:
# Optional task prefix prepended to every source sentence (empty: no prefix).
prefix = ""

def preprocess_function(examples, max_length=256):
    """Tokenize a batch of (wrong, right) sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a ``"wrong"`` list (source sentences) and a
            ``"right"`` list (target sentences).
        max_length: Maximum number of tokens kept for sources and targets alike;
            longer sequences are truncated. Defaults to 256.

    Returns:
        The tokenizer output for the sources, with an added ``"labels"`` entry
        holding the token ids of the targets.
    """
    inputs = [prefix + doc for doc in examples["wrong"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Passing `text_target=` already selects target-side tokenization, so the
    # deprecated `as_target_tokenizer()` context manager is not needed.
    labels = tokenizer(text_target=examples["right"],
                       max_length=max_length,
                       truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# tokenized_dataset = fds_train.take(1000000).map(preprocess_function, batched=True)
# NOTE(review): only the first 100 filtered rows are tokenized for training here; the
# 1,000,000-row variant above is commented out — confirm which one is intended.
tokenized_dataset = fds_train.take(100).map(preprocess_function, batched=True)
# The evaluation set is tokenized in full with the same preprocessing function.
tokenized_dataset_test = ds_test.map(preprocess_function, batched=True)
print(len(tokenized_dataset_test))

50000


In [13]:
# Sanity-check the first tokenized training example: sequence lengths first,
# then the raw label ids, attention mask and input ids.
first_example = tokenized_dataset[0]
print(len(first_example['input_ids']))
print(len(first_example['labels']))
print(first_example['labels'])
print(first_example['attention_mask'])
print(first_example['input_ids'])

102
107
[183, 988, 24, 485, 7568, 8241, 6, 3, 3848, 763, 3, 5, 2048, 306, 7, 2069, 19, 784, 245, 7, 3255, 3, 191, 15328, 234, 37453, 5, 26, 1119, 6, 15117, 945, 4, 12, 8213, 313, 6, 3375, 245, 7, 5856, 19, 3, 262, 674, 4550, 3, 5, 137, 50, 13340, 1214, 72, 23394, 9, 20654, 6, 9474, 11, 4806, 4, 4374, 3231, 4, 19, 9988, 4, 6, 1250, 6, 3235, 2853, 8, 125, 10, 50, 80, 5, 2771, 31135, 4, 51610, 1307, 3, 191, 3, 365, 4, 12, 9759, 245, 7, 3051, 8877, 262, 3, 191, 1199, 169, 3, 191, 3780, 48, 367, 13580, 7, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[183, 988, 24, 5, 32763, 7568, 8241, 6, 3, 3848, 763, 3, 5, 2048, 306, 7, 2069, 19, 8901, 245, 7, 3255, 3, 191, 15328, 234, 37453, 5, 26, 1119, 6, 15117, 945, 12, 8213, 313, 6, 3375, 2

In [14]:
!pip install sacrebleu



In [15]:
import numpy as np
import nltk
from datasets import load_dataset
import sacrebleu
import evaluate

# Ensure the Punkt tokenizer data used by nltk.word_tokenize is available.
# Recent NLTK releases look up "punkt_tab" while older ones use "punkt", so
# request both; with quiet=True an unavailable resource does not raise.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource, quiet=True)

# Load the ROUGE metric
# rouge_metric = evaluate.load("rouge")

# Function to compute both ROUGE and BLEU metrics
def compute_metrics(eval_preds):
    """Compute corpus-level BLEU of generated predictions against the references.

    Only BLEU is reported; the ROUGE computation that used to be sketched here
    is disabled.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element holds the
            generated token ids.

    Returns:
        dict: ``{"bleu": <sacrebleu corpus score>}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it is
    # replaced by the pad token in BOTH arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Word-tokenize and re-join with single spaces before scoring.
    decoded_preds_for_bleu = [" ".join(nltk.word_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels_for_bleu = [" ".join(nltk.word_tokenize(label.strip())) for label in decoded_labels]

    # sacrebleu expects a list of reference streams, hence the extra list level.
    bleu = sacrebleu.corpus_bleu(decoded_preds_for_bleu, [decoded_labels_for_bleu])
    return {"bleu": bleu.score}

# Example usage (adjust accordingly)
# preds = ... (Your model predictions here)
# labels = ... (Your true labels here)
# eval_preds = (preds, labels)
# metrics = compute_metrics(eval_preds)
# print(metrics)


In [16]:
# nltk.download("punkt", quiet=True)
# metric = evaluate.load("rouge")

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds

#    # decode preds and labels
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#    # rougeLSum expects newline after each sentence
#     decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
#     decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
#     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
#     return result

In [17]:
!pip install accelerate -U



In [18]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 1

# Move the model to the GPU only when one is present; an unconditional
# model.to("cuda") raises on hosts without CUDA.
import torch

if torch.cuda.is_available():
    model.to("cuda")

# Set up training arguments.
# NOTE(review): output_dir is a path relative to the working directory
# ("./kaggle/..."), not the absolute "/kaggle/working" — confirm this is intended.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; check against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./kaggle/working/results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    push_to_hub=False,
    # Checkpoint every 10,000 steps, keeping at most SAVE_TOTAL_LIM checkpoints.
    save_strategy="steps",
    save_steps=10000,
    report_to='none',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell; evaluation is
# scheduled once per epoch (evaluation_strategy="epoch").
trainer.train()

Epoch,Training Loss,Validation Loss


