337 changes: 32 additions & 305 deletions llm/peft/lora/lora_seq2seq.ipynb

Large diffs are not rendered by default.

254 changes: 254 additions & 0 deletions llm/peft/lora/lora_seq2seq.py
@@ -0,0 +1,254 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import mindspore
from mindnlp.transformers import AutoModelForSeq2SeqLM
from mindnlp.peft import get_peft_model, LoraConfig, TaskType
from mindnlp.core import ops

from mindnlp.transformers import AutoTokenizer
from mindnlp.transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm
from mindnlp.dataset import load_dataset

model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

checkpoint_name = "financial_sentiment_analysis_lora_v1.ckpt"
max_length = 128
lr = 1e-3
num_epochs = 3
batch_size = 8


# In[ ]:


# creating model
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
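

# In[ ]:


# Not in the original notebook: recompute the parameter counts by hand to confirm that LoRA
# leaves the mt0-large base weights frozen. This assumes the model exposes MindSpore
# Cell-style parameter accessors (trainable_params is already used below; get_parameters is
# the matching accessor for all parameters).
from math import prod

n_trainable = sum(prod(p.shape) for p in model.trainable_params())
n_total = sum(prod(p.shape) for p in model.get_parameters())
print(f"trainable: {n_trainable} || total: {n_total} || trainable%: {100 * n_trainable / n_total:.4f}")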


# In[ ]:


# loading dataset
# NOTE: load_dataset here is mindnlp's wrapper (imported above); it exposes the underlying
# Hugging Face dataset via dataset.source.ds and returns a MindSpore dataset. The
# train/validation split and the text_label column are created below with the MindSpore
# dataset API.
dataset = load_dataset("financial_phrasebank", "sentences_allagree")


# In[ ]:


print(dataset.source.ds)
classes = dataset.source.ds.features["label"].names
classes


# In[ ]:


train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])


# In[ ]:


def add_text_label(sentence, label):
    """Map the integer label to its class-name string so it can be used as the target text."""
    return sentence, label, classes[label.item()]

train_dataset = train_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])
validation_dataset = validation_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])


# In[ ]:


next(train_dataset.create_dict_iterator())


# In[ ]:


tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


# In[ ]:


import numpy as np
from mindnlp.dataset import BaseMapFunction
from threading import Lock
lock = Lock()

class MapFunc(BaseMapFunction):
    def __call__(self, sentence, label, text_label):
        # The tokenizer is shared across dataset workers, so serialize access with a lock.
        lock.acquire()
        model_inputs = tokenizer(sentence, max_length=max_length, padding="max_length", truncation=True)
        labels = tokenizer(text_label, max_length=3, padding="max_length", truncation=True)
        lock.release()
        labels = labels['input_ids']
        # Replace padding token ids in the labels with -100 so they are ignored by the loss.
        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels


def get_dataset(dataset, tokenizer, shuffle=True):
    input_columns = ['sentence', 'label', 'text_label']
    output_columns = ['input_ids', 'attention_mask', 'labels']
    dataset = dataset.map(MapFunc(input_columns, output_columns),
                          input_columns, output_columns)
    if shuffle:
        dataset = dataset.shuffle(64)
    dataset = dataset.batch(batch_size)
    return dataset

train_dataset = get_dataset(train_dataset, tokenizer)
eval_dataset = get_dataset(validation_dataset, tokenizer, shuffle=False)
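

# In[ ]:


# Not in the original notebook: report how many batches each split yields after batching
# with batch_size = 8 (uses the same get_dataset_size API as the training loop below).
print("train batches:", train_dataset.get_dataset_size())
print("eval batches:", eval_dataset.get_dataset_size())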


# In[ ]:


next(train_dataset.create_dict_iterator())


# In[ ]:


from mindnlp.core import optim
# optimizer and lr scheduler
optimizer = optim.AdamW(model.trainable_params(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(train_dataset.get_dataset_size() * num_epochs),
)


# In[ ]:


from mindnlp.core import value_and_grad
# training and evaluation
def forward_fn(**batch):
    outputs = model(**batch)
    loss = outputs.loss
    return loss

grad_fn = value_and_grad(forward_fn, model.trainable_params())

for epoch in range(num_epochs):
    model.set_train()
    total_loss = 0
    train_total_size = train_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):
        optimizer.zero_grad()
        loss = grad_fn(**batch)
        optimizer.step()
        total_loss += loss.float()
        lr_scheduler.step()

    model.set_train(False)
    eval_loss = 0
    eval_preds = []
    eval_total_size = eval_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):
        with mindspore._no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.float()
        eval_preds.extend(
            tokenizer.batch_decode(ops.argmax(outputs.logits, -1).asnumpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / eval_total_size
    eval_ppl = ops.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / train_total_size
    train_ppl = ops.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")


# In[ ]:


# print accuracy
correct = 0
total = 0

ground_truth = []

for pred, data in zip(eval_preds, validation_dataset.create_dict_iterator(output_numpy=True)):
    true = str(data['text_label'])
    ground_truth.append(true)
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{ground_truth[:10]=}")


# In[ ]:


next(eval_dataset.create_tuple_iterator())


# In[ ]:


# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)


# In[ ]:


ckpt = f"{peft_model_id}/adapter_model.ckpt"
get_ipython().system('du -h $ckpt')


# In[ ]:


from mindnlp.peft import PeftModel, PeftConfig

peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)


# In[ ]:


model.set_train(False)
example = next(validation_dataset.create_dict_iterator(output_numpy=True))

# Generate from the input sentence; the text label is printed only as the expected answer.
print(str(example['sentence']))
print(str(example['text_label']))
inputs = tokenizer(str(example['sentence']), return_tensors="ms")
print(inputs)

with mindspore._no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.asnumpy(), skip_special_tokens=True))
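

# In[ ]:


# Not in the original script: compare the generated text with the reference label for this
# single example (a quick spot check, not a full evaluation).
pred_text = tokenizer.batch_decode(outputs.asnumpy(), skip_special_tokens=True)[0]
print(f"prediction matches label: {pred_text.strip() == str(example['text_label']).strip()}")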

18 changes: 0 additions & 18 deletions llm/peft/lora/roberta_sequence_classification.ipynb
@@ -1,23 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7228a58b-4f81-4f5d-ac6c-d9439b3f4447",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: HF_ENDPOINT=https://hf-mirror.com\n"
]
}
],
"source": [
"%env HF_ENDPOINT=https://hf-mirror.com"
]
},
{
"cell_type": "code",
"execution_count": 2,
78 changes: 78 additions & 0 deletions llm/peft/lora/roberta_sequence_classification.py
@@ -0,0 +1,78 @@
import argparse
import os

import mindtorch
from mindtorch.optim import AdamW
from mindtorch.utils.data import DataLoader
# import mindnlp

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

batch_size = 32
model_name_or_path = "roberta-large"
task = "mrpc"
peft_type = PeftType.LORA
device = "npu" # "cuda"
num_epochs = 20

peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
lr = 3e-4

if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
    return outputs


tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

# We also rename the 'label' column to 'labels', which is the column name the
# transformers models expect.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)
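

# Not part of the original script: peek at one collated batch to check the padded shapes
# (assumes mindtorch's DataLoader is iterable like its PyTorch counterpart).
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})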

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model
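

# Not part of this PR: the upstream PEFT example for roberta-large on MRPC continues with the
# optimizer, scheduler, and train/eval loop sketched below. This assumes mindtorch mirrors the
# PyTorch API it is named after (no_grad, backward, .to(device)); treat it as an illustrative
# sketch rather than code from this repository.
optimizer = AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

model.to(device)
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with mindtorch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)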