In [1]:
from datasets import load_dataset
import torch
import evaluate
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from transformers.utils.notebook import NotebookProgressCallback
from pipeline import CustomTrainer

%load_ext autoreload
%autoreload 2

# Setup

In [7]:
# Hugging Face Hub checkpoint identifiers.
# NOTE(review): this list is not referenced by the other visible cells; the
# loading cell names its checkpoint directly — confirm whether it is still needed.
models = [
    "google/flan-t5-small",
    "TheBloke/Llama-2-7B-GGUF",
    "bigscience/bloom",
]

In [8]:
# Single source of truth for the checkpoint so the model and the tokenizer
# can never drift apart.
MODEL_NAME = "google/flan-t5-small"
DATA_FILE = "dataset_complete.csv"
TEST_FRACTION = 0.05
# Fixed seed so the train/test partition is identical on every
# Restart Kernel -> Run All; without it the held-out set changes per run and
# evaluation numbers are not comparable between runs.
SPLIT_SEED = 42

# 8-bit quantisation settings; modules that do not fit on the accelerator may
# be kept in fp32 on the CPU.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# `trust_remote_code=True` was dropped: it allows code shipped in the Hub
# repository to be executed locally, and this checkpoint is served by a
# tokenizer class that ships with transformers, so the flag only added risk.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The CSV is loaded as a single "train" split and then partitioned into
# train/test with a reproducible shuffle.
dataset = load_dataset("csv", split="train", data_files=DATA_FILE)
dataset = dataset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)

metric = evaluate.load("sacrebleu")

In [9]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: ``preds`` is a list of stripped strings
        and ``labels`` is a list of one-element lists, each holding the
        stripped reference for the prediction at the same position.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        # Each prediction gets its own list of references (here exactly one).
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs
    
def eval_metric(eval_preds):
    """Compute BLEU and mean generated length for an evaluation pass.

    Relies on the notebook-level ``tokenizer`` and ``metric`` objects.

    Args:
        eval_preds: a ``(preds, labels)`` pair of token-id arrays. ``preds``
            may itself be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"`` field) and
        ``"gen_len"`` (mean count of non-pad tokens per prediction), both
        rounded to four decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a sentinel, not a vocabulary id, so it cannot be decoded.
    # The labels were already sanitised below; predictions can carry the same
    # sentinel (e.g. when batches are padded before being gathered), so apply
    # the identical replacement here. This also keeps those positions from
    # being counted as generated tokens in ``gen_len``.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of each prediction, ignoring padding positions.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [10]:
# Bundle the tokenizer, model, dataset splits and metric callback into the
# project's trainer factory.
# NOTE(review): the meaning of tpe="flint" is defined in pipeline.CustomTrainer,
# which is not visible here — confirm against that module.
trainer_factory = CustomTrainer(
    tokenizer=tokenizer,
    tpe="flint",
    model=model,
    dataset=dataset,
    eval=eval_metric,
)

In [11]:
# Build the trainer from the factory configured earlier in the notebook.
trainer = trainer_factory.get_trainer()
# Remove the last registered callback and attach the notebook progress
# callback instead.
# NOTE(review): pop() drops whichever callback happens to be last in the list;
# presumably that is the default progress/printer callback — confirm against
# CustomTrainer.get_trainer(), since a different registration order would
# silently remove something else.
trainer.callback_handler.callbacks.pop()
trainer.add_callback(NotebookProgressCallback)

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1046 > 512). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# Run training and keep the value returned by train() for later inspection.
results = trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
0,0.201,0.029959
2,0.0572,0.018191
4,0.0448,0.017402
6,0.0396,0.016483
8,0.0366,0.015986
9,0.0358,0.01597
