In [39]:
# Installing the necessary libraries
!pip install -q transformers datasets
# install peft from github
!pip install -q git+https://github.com/huggingface/peft

  Installing build dependencies ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 76, in resolve
    collected = self.factory.collect_root_requirements(root_reqs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/resolution

In [64]:
import pandas as pd
import torch
import datasets
from datasets import load_dataset
from datasets import Dataset, DatasetDict

In [55]:
dataset = pd.read_csv('/content/en-annotated.tsv', sep='\t', header=None)

In [63]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.1)

train = train.rename(columns={0: 'dialogue', 1: 'summary'})
test = test.rename(columns={0: 'dialogue', 1: 'summary'})

In [67]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(test)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds

In [68]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', '__index_level_0__'],
        num_rows: 15775
    })
    test: Dataset({
        features: ['dialogue', 'summary', '__index_level_0__'],
        num_rows: 1753
    })
})


In [69]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="bigscience/mt0-small"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [70]:
from datasets import concatenate_datasets
import numpy as np

In [18]:
train[1]

Unnamed: 0,1
896,2
9534,2
2762,5
8814,2
4082,1
...,...
2237,5
946,2
1982,"4, 6"
17276,8


In [72]:
# Here we tokenize the dialogues, which is the input of our model
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
# take 85 percentile of max length for better utilization
max_source_length = int(np.percentile(input_lenghts, 100))
print(f"Max source length: {max_source_length}")

# Here we tokenize the summary, which should be the output of our model
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
# take 90 percentile of max length for better utilization
max_target_length = int(np.percentile(target_lenghts, 100))
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/17528 [00:00<?, ? examples/s]

Max source length: 89


Map:   0%|          | 0/17528 [00:00<?, ? examples/s]

Max target length: 8


In [80]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["classify emotions: " + item for item in sample["dialogue"]]

    # tokenize inputs which was the dialogue
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument, which was the summary
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train")
tokenized_dataset["test"].save_to_disk("data/eval")

Map:   0%|          | 0/15775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/15775 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1753 [00:00<?, ? examples/s]

In [75]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# when you're using a big model, you can quantisize the model  to save memory by using its
# bit configuration in the parameter setting, that is, 'load_in_4bit=True' or 'load_in_8bit=True'
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [76]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training when you use a quatizied model
# model = repare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 688,128 || all params: 300,864,896 || trainable%: 0.2287


In [77]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [78]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="tutorial"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    # save_strategy
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [82]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Save our LoRA model & tokenizer results
peft_model_id= output_dir
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# if you want to save the base model to call
# trainer.model.base_model.save_pretrained(peft_model_id)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc.
peft_model_id = output_dir
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

In [None]:
# use the first sample of the test set
sample = dataset['test'][0]

input_ids = tokenizer(sample["dialogue"], return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = model.generate(input_ids=input_ids, max_new_tokens=40, do_sample=True, top_p=0.9)
print(f"input sentence: {sample['dialogue']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")