In [3]:
# Global variables shared by the cells of this notebook.

# --- model checkpoint and data sources ---
model_name_or_path = "THUDM/chatglm3-6b"
train_data_path = "HasturOfficial/adgen"
eval_data_path = None
resume_from_checkpoint = None

# --- reproducibility ---
seed = 8

# --- sequence budgets (in tokens) and the prefix prepended to every question ---
max_input_length = 512
max_output_length = 1536
prompt_text = ""

# --- LoRA hyperparameters ---
lora_rank = 4
lora_alpha = 32
lora_dropout = 0.05

# --- precision name ("fp32", "fp16" or "bf16") ---
compute_dtype = "fp32"

## Data preparation


In [None]:
from datasets import load_dataset, ClassLabel, Sequence
from transformers import AutoTokenizer

# Load the training corpus named by `train_data_path`.
dataset = load_dataset(path=train_data_path)

# Load the tokenizer that belongs to the base checkpoint.
# trust_remote_code=True lets the tokenizer code shipped with the checkpoint run.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)


def tokenize_func(example, tokenizer, ignore_label_id=-100):
    """Convert one raw sample into ``input_ids`` and ``labels`` for fine-tuning.

    Args:
        example: mapping holding the question text under "content", the answer
            text under "summary", and optionally extra text under "input".
        tokenizer: object exposing ``encode`` and
            ``build_inputs_with_special_tokens``.
        ignore_label_id: value written into the label positions that cover the
            question part, so only the answer part carries real labels.

    Returns:
        dict with "input_ids" (the full token sequence) and "labels".
    """
    # Question text: global prefix + content, plus the optional extra input
    # on its own line when it is present and not just whitespace.
    prompt = prompt_text + example["content"]
    extra = example.get("input", None)
    if extra and extra.strip():
        prompt += f'\n{example["input"]}'

    # Answer text.
    target = example["summary"]

    # Encode both parts without special tokens; those are added further down.
    prompt_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
    target_ids = tokenizer.encode(text=target, add_special_tokens=False)

    # Clip over-long parts. Two question slots are reserved for gmask and bos;
    # one answer slot is reserved as well (presumably for the closing token
    # added by the tokenizer -- TODO confirm).
    prompt_budget = max_input_length - 2
    if len(prompt_ids) > prompt_budget:
        prompt_ids = prompt_ids[:prompt_budget]
    target_budget = max_output_length - 1
    if len(target_ids) > target_budget:
        target_ids = target_ids[:target_budget]

    # Let the tokenizer assemble the final model input.
    sequence = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)
    masked_len = len(prompt_ids) + 2  # question tokens + gmask + bos

    # Question positions get ignore_label_id; the rest mirrors the inputs.
    labels = [ignore_label_id] * masked_len
    labels.extend(sequence[masked_len:])
    return {"input_ids": sequence, "labels": labels}


# Raw columns of the train split; they are dropped once tokenization has
# produced "input_ids" and "labels".
column_names = dataset["train"].column_names

# Tokenize sample by sample (batched=False), then shuffle with the global seed
# and flatten the resulting indices mapping.
tokenized_dataset = (
    dataset["train"]
    .map(
        lambda example: tokenize_func(example, tokenizer),
        batched=False,
        remove_columns=column_names,
    )
    .shuffle(seed=seed)
    .flatten_indices()
)

In [7]:
# Define the data collator to process the data in batches
import torch
from typing import List, Dict, Optional


class DataCollatorForChatGLM:
    """Pad (and, if needed, truncate) tokenized samples into batch tensors."""

    def __init__(
        self, pad_token_id: int, max_length: int = 2048, ignore_label_id: int = -100
    ):
        """init DataCollator

        Args:
            pad_token_id (int): token id used to pad ``input_ids``.
            max_length (int, optional): the max length of a batch; longer
                sequences are truncated to it. Defaults to 2048.
            ignore_label_id (int, optional): value used to pad ``labels``.
                Defaults to -100.
        """
        self.pad_token_id = pad_token_id  # padding sequences to the same length
        self.max_length = max_length
        self.ignore_label_id = ignore_label_id

    def __call__(self, batch_data: List[Dict[str, List]]) -> Dict[str, torch.Tensor]:
        """process the batch data

        Args:
            batch_data (List[Dict[str, List]]): one dict per sample, each holding
                the lists "input_ids" and "labels".

        Returns:
            Dict[str, torch.Tensor]: "input_ids" and "labels", each a LongTensor
            with one row per sample, rows ordered from longest to shortest sample.

        Raises:
            ValueError: if ``batch_data`` is empty.
        """
        if not batch_data:
            raise ValueError("batch_data must contain at least one sample")

        # compute the length of each sample in the batch
        len_list = [len(d["input_ids"]) for d in batch_data]
        batch_max_len = max(len_list)  # find the max length of the batch
        # final row width: the longest sample, capped at max_length
        target_len = min(batch_max_len, self.max_length)

        # Separate names for the accumulators and the per-sample lists: the
        # previous version reused `label` for both, which broke the collator.
        input_ids, labels = [], []
        for len_of_d, d in sorted(zip(len_list, batch_data), key=lambda x: -x[0]):
            pad_len = batch_max_len - len_of_d  # compute the length of padding

            # pad to the batch width, then cut down to target_len
            sample_ids = (d["input_ids"] + [self.pad_token_id] * pad_len)[:target_len]
            sample_labels = (d["labels"] + [self.ignore_label_id] * pad_len)[:target_len]

            # integrate the input_ids and labels into the batch
            input_ids.append(torch.LongTensor(sample_ids))
            labels.append(torch.LongTensor(sample_labels))

        return {"input_ids": torch.stack(input_ids), "labels": torch.stack(labels)}


# prepare the data collator
# The truncation cap follows the token budgets that tokenize_func truncates to
# (question budget + answer budget) instead of relying on the collator's
# hard-coded default, so changing the globals keeps both in sync.
data_collator = DataCollatorForChatGLM(
    pad_token_id=tokenizer.pad_token_id,
    max_length=max_input_length + max_output_length,
)

## Training


In [None]:
## Load the model
from transformers import AutoModel, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training


# Map from the short precision names used in this notebook to torch dtypes.
_compute_dtype_map = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}

# 4-bit NF4 quantization with double quantization.
# NOTE(review): the compute dtype is pinned to "bf16" here; the notebook-level
# `compute_dtype` setting is not consulted -- confirm which one is intended.
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=_compute_dtype_map["bf16"],
)

# load the model
# The BitsAndBytesConfig must be passed as `quantization_config`; `config` is
# the slot for the model's own configuration, so passing it there does not
# quantize the weights.
model = AutoModel.from_pretrained(
    model_name_or_path,
    quantization_config=q_config,
    trust_remote_code=True,
)

# Prepare the quantized model for k-bit training before LoRA adapters are added.
kbit_model = prepare_model_for_kbit_training(model)

### LoRA Config


In [None]:
from peft import TaskType, LoraConfig, get_peft_model
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING


# Modules that receive LoRA adapters: PEFT's built-in entry for "chatglm".
target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING["chatglm"]

# LoRA settings; rank, alpha and dropout come from the notebook-level globals.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=target_modules,
)

# Wrap the k-bit prepared model with the LoRA adapters.
qlora_model = get_peft_model(kbit_model, lora_config)
# Uncomment to print the trainable parameters of the model:
# qlora_model.print_trainable_parameters()

### Fine-tune Training


In [None]:
## Hyperparameters configuration
from transformers import TrainingArguments, Trainer

# Training hyperparameters. Intermediate checkpoints are written under
# models/<model_name_or_path> every 100 steps.
training_args = TrainingArguments(
    output_dir=f"models/{model_name_or_path}",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 16 * 4 = 64 samples per device per optimizer step
    learning_rate=1e-3,
    num_train_epochs=1,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    optim="adamw_torch",
    fp16=True,
)

trainer = Trainer(
    model=qlora_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)


# start training
# Honor the notebook-level `resume_from_checkpoint` setting; with its default
# of None this starts a fresh run exactly as a bare trainer.train() would.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)


# save the trained adapter (kept separate from the checkpoint directory above)
qlora_model.save_pretrained(f"models/demo/{model_name_or_path}")

## Inference


In [None]:
from peft import PeftConfig, PeftModel

model_name_or_path = "THUDM/chatglm3-6b"
peft_model_path = f"models/demo/{model_name_or_path}"

# Read the saved adapter's config, then load the base model without the
# adapter so its answer can be compared with the fine-tuned one.
config = PeftConfig.from_pretrained(peft_model_path)
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32,
)


# The BitsAndBytesConfig must be passed as `quantization_config`; `config` is
# the slot for the model's own configuration, so passing it there does not
# quantize the weights.
base_model = AutoModel.from_pretrained(
    model_name_or_path,
    quantization_config=q_config,
    trust_remote_code=True,
    device_map="auto",  # Transformers detects the available hardware and tries to place the parts of the model on the devices (e.g. multiple GPUs) in the best way
)

# Inference only: freeze all parameters and switch to eval mode.
base_model.requires_grad_(False)
base_model.eval()


input_text = "类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领"
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name_or_path, trust_remote_code=True
)

# Baseline answer, generated before the adapter is attached.
response, history = base_model.chat(tokenizer=tokenizer, query=input_text)
print(f"Before fine-tuning: {response}")


# Attach the saved LoRA adapter to the base model and ask the same question.
model = PeftModel.from_pretrained(base_model, peft_model_path)
response, history = model.chat(tokenizer=tokenizer, query=input_text)
print(f"After fine-tuning: {response}")