<a href="https://colab.research.google.com/github/matthew-mcateer/LLaMahack_submission/blob/main/Zephyr_7B_fine_Tuning_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stability.ai Zephyr-3B-beta fine-tuning demo

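An exploration of fine-tuning Stability.ai's Zephyr-3B model, released just 3 days prior: https://stability.ai/news/stablelm-zephyr-3b-stability-llm

**Note:** the code in this notebook currently loads the GPTQ-quantized Zephyr-7B-beta checkpoint (`TheBloke/zephyr-7B-beta-GPTQ`, set in `Config.MODEL_ID`) rather than the 3B model described above.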

![](https://ci3.googleusercontent.com/meips/ADKq_Naspq5gzr0LaoeQjhVlnbi_KaPlVY_npUcMvxzGYBEoUYW6jHXZxpY8kusoutD4dR2mdjgl4pEiPWy7YXwNsHB4whS1_jPw5NrQDOAocGW9PnVu2rtMk9p291O-IIVYaPgLCXQQcIT0sTbJ99ublCWhtgWRRgLbYnqkKWouYSdlwy6tdfyf7b8ERy4vW1BiHh0wwpsa18fUCDQy106fynOZzfI70r-peUFRQWbcBEMnSQEuXzQI8abjy_MQnEbeFFZ6-uHln6pXH3tKU6-fWz7pftG06mEAkPLKq9zEQytbTICDAO4zrLeJ=s0-d-e1-ft#https://substackcdn.com/image/fetch/w_1136,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffee7fcb0-d73f-46ce-b23b-6e481977e72e_568x278.png)

In [1]:
!pip install datasets transformers trl peft accelerate bitsandbytes auto-gptq autoawq optimum



In [2]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub. The training cell below pushes the
# fine-tuned adapter to the Hub (Config.PUSH_TO_HUB / trainer.push_to_hub()),
# which presumably requires a token with write access -- TODO confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Importing Dependencies

In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig, GPTQConfig, TrainingArguments
from trl import SFTTrainer



### Chatbot Config

In [4]:
class Config:
    """Central collection of settings read by ZephyrTrainer.

    All values are plain class attributes; the trainer instantiates this class
    and reads them as ``self.config.<NAME>``.
    """

    # --- Model and data sources ------------------------------------------
    MODEL_ID = "TheBloke/zephyr-7B-beta-GPTQ"
    DATASET_ID = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

    # --- Dataset columns ---------------------------------------------------
    INSTRUCTION_FIELD = "instruction"  # column holding the user query
    TARGET_FIELD = "response"  # column holding the expected answer
    CONTEXT_FIELD = ""  # not read by the trainer in this notebook
    DATASET_TEXT_FIELD = "text"  # column created to hold the full prompt

    # --- Quantized model loading (forwarded to GPTQConfig / from_pretrained)
    BITS = 4
    DISABLE_EXLLAMA = True
    DEVICE_MAP = "auto"
    USE_CACHE = False

    # --- LoRA adapter (forwarded to LoraConfig) ----------------------------
    LORA_R = 16
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05
    BIAS = "none"
    TASK_TYPE = "CAUSAL_LM"
    # Weni was having trouble when targeting all four attention projections
    # ["q_proj", "k_proj", "v_proj", "o_proj"], so only two are adapted here.
    TARGET_MODULES = ["q_proj", "v_proj"]

    # --- Optimisation (forwarded to TrainingArguments) ---------------------
    BATCH_SIZE = 8
    GRAD_ACCUMULATION_STEPS = 1
    OPTIMIZER = "paged_adamw_32bit"
    LR = 2e-4
    LR_SCHEDULER = "cosine"
    FP16 = True
    NUM_TRAIN_EPOCHS = 1
    # Both an epoch count and a step count are passed to TrainingArguments;
    # the recorded run in this notebook stopped at step 250.
    MAX_STEPS = 250

    # --- Logging, checkpointing and publishing -----------------------------
    LOGGING_STEPS = 50
    SAVE_STRATEGY = "epoch"
    OUTPUT_DIR = "zephyr-support-chatbot"
    PUSH_TO_HUB = True

    # --- SFTTrainer sequence handling --------------------------------------
    MAX_SEQ_LENGTH = 512
    PACKING = False

## Zephyr Trainer

In [5]:
class ZephyrTrainer:
    """Fine-tune a GPTQ-quantized Zephyr checkpoint with LoRA adapters.

    The trainer reads every setting from ``Config``, builds a prompt-formatted
    dataset, attaches LoRA modules to the already-quantized model and runs
    supervised fine-tuning with TRL's ``SFTTrainer``.

    Attributes:
        config: ``Config`` instance holding model, dataset and training settings.
        tokenizer: Tokenizer loaded from ``config.MODEL_ID``; its pad token is
            set to the EOS token.
    """

    # Horizontal rule printed above and below every progress banner.
    _BANNER_RULE = "\n====================================================================\n"

    # Prompt scaffolding wrapped around every training sample.
    # NOTE: the inference cell later in this notebook rebuilds the same prompt
    # by hand; keep the two texts identical or generations will degrade.
    _PROMPT_HEADER = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n"
    _PROMPT_ASSISTANT_TAG = "\n<|assistant|>\n"

    def __init__(self):
        """Load the configuration and the tokenizer used during training."""
        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # Reuse EOS as the padding token so batches can be padded.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def _print_banner(self, title, *details):
        """Print ``title`` (tab-indented) between two rules, followed by ``details``.

        Args:
            title: Banner caption.
            *details: Optional objects printed, one per ``print`` call, between
                the caption and the closing rule.
        """
        print(self._BANNER_RULE)
        print("\t\t\t" + title)
        for detail in details:
            print(detail)
        print(self._BANNER_RULE)

    def process_data_sample(self, example):
        """Format one dataset row as a single system/user/assistant prompt string.

        Args:
            example: Mapping-like row (dict or pandas Series) containing the
                ``config.INSTRUCTION_FIELD`` and ``config.TARGET_FIELD`` keys,
                both holding strings.

        Returns:
            The prompt text: system message, user instruction, assistant tag
            and the target response, concatenated in that order.

        Raises:
            KeyError: If either field is missing from ``example``.
            TypeError: If either field value is not a string.
        """
        return (
            self._PROMPT_HEADER
            + example[self.config.INSTRUCTION_FIELD]
            + self._PROMPT_ASSISTANT_TAG
            + example[self.config.TARGET_FIELD]
        )

    def create_dataset(self):
        """Download the training split and convert it to prompt-formatted text.

        Returns:
            A ``datasets.Dataset`` with a single ``config.DATASET_TEXT_FIELD``
            column holding one formatted prompt per source row.
        """
        data = load_dataset(self.config.DATASET_ID, split="train")
        self._print_banner("DOWNLOADED DATASET")

        df = data.to_pandas()
        prompt_columns = [self.config.INSTRUCTION_FIELD, self.config.TARGET_FIELD]
        # Row-wise apply: each row is passed to process_data_sample as a Series.
        df[self.config.DATASET_TEXT_FIELD] = df[prompt_columns].apply(
            self.process_data_sample, axis=1
        )
        self._print_banner("PROCESSED DATASET", df.iloc[0])

        # Only the formatted text column is handed to the trainer.
        return Dataset.from_pandas(df[[self.config.DATASET_TEXT_FIELD]])

    def prepare_model(self):
        """Load the GPTQ-quantized checkpoint and attach LoRA modules to it.

        The checkpoint is already quantized; the ``GPTQConfig`` passed here
        only controls loading options such as the exllama kernels.

        Returns:
            model: PEFT-wrapped model ready for fine-tuning.
            peft_config: The ``LoraConfig`` used to build the adapters.
        """
        # Going with GPTQ as a train-time example. I believe AWQ focuses on
        # inference time.
        gptq_config = GPTQConfig(
            bits=self.config.BITS,
            # `disable_exllama` is deprecated in favour of `use_exllama`
            # (see the transformers warning); the flag has inverted meaning.
            use_exllama=not self.config.DISABLE_EXLLAMA,
            tokenizer=self.tokenizer,
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=gptq_config,
            device_map=self.config.DEVICE_MAP,
        )
        self._print_banner("DOWNLOADED MODEL", model)

        model.config.use_cache = self.config.USE_CACHE
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        self._print_banner("MODEL CONFIG UPDATED")

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES,
        )
        model = get_peft_model(model, peft_config)
        self._print_banner("PREPARED MODEL FOR FINETUNING", model)

        return model, peft_config

    def set_training_arguments(self):
        """Build the ``TrainingArguments`` for the training loop from the config.

        Both ``num_train_epochs`` and ``max_steps`` are forwarded; per the
        Transformers documentation a positive ``max_steps`` takes precedence.

        Returns:
            The populated ``TrainingArguments`` instance.
        """
        return TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            max_steps=self.config.MAX_STEPS,
            fp16=self.config.FP16,
            push_to_hub=self.config.PUSH_TO_HUB,
        )

    def train(self):
        """Run the full pipeline: dataset, model, fine-tuning, optional Hub upload.

        The trained adapter is pushed to the Hugging Face Hub only when
        ``config.PUSH_TO_HUB`` is true.
        """
        data = self.create_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()
        self._print_banner("PREPARED FOR FINETUNING")

        # NOTE(review): `model` is already LoRA-wrapped by prepare_model();
        # `peft_config` is still passed as in the original code -- confirm the
        # installed TRL version does not wrap an existing PeftModel again.
        trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH,
        )
        trainer.train()
        self._print_banner("FINETUNING COMPLETED")

        # Honour the config switch instead of uploading unconditionally.
        if self.config.PUSH_TO_HUB:
            trainer.push_to_hub()

In [6]:
# Entry point: build the trainer (loads Config and the tokenizer) and run the
# whole fine-tuning pipeline. `zephyr_trainer` is left as a global so it can be
# inspected after the run.
if __name__ == "__main__":
    zephyr_trainer = ZephyrTrainer()
    zephyr_trainer.train()

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]



			DOWNLOADED DATASET




Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.




			PROCESSED DATASET
flags                                                          B
instruction     question about cancelling order {{Order Number}}
category                                                   ORDER
intent                                              cancel_order
response       I've understood you have a question regarding ...
text           <|system|>\n You are a support chatbot who hel...
Name: 0, dtype: object




config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



			DOWNLOADED MODEL
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLUActivation()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)




			MODEL CONFIG UPDATED




			PREPARED MODEL FOR FINETUNING
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(


Map:   0%|          | 0/26872 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.0492
100,0.6915
150,0.6447
200,0.6095
250,0.6077




			FINETUNING COMPLETED




In [7]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch

def process_data_sample(example):
    """Build the inference prompt for one user query.

    The prompt carries the same system and user sections used for the training
    samples and ends with the assistant tag, leaving the reply to the model.

    Args:
        example: Mapping with an ``"instruction"`` key holding the query string.

    Returns:
        The prompt string, ending with the assistant tag and a newline.
    """
    header = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n"
    assistant_tag = "\n<|assistant|>\n"
    return "".join((header, example["instruction"], assistant_tag))

# Load the tokenizer from the training output directory (presumably
# Config.OUTPUT_DIR resolved under Colab's /content working dir -- TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-support-chatbot")

# Build a prompt for a sample query; "{{Order Number}}" is a literal
# placeholder in the same form as the training data shown above.
inp_str = process_data_sample(
    {
        "instruction": "i have a question about cancelling order {{Order Number}}",
    }
)

# Tokenize to PyTorch tensors and move them to the GPU.
inputs = tokenizer(inp_str, return_tensors="pt").to("cuda")

# Load the fine-tuned adapter saved in the same directory as the tokenizer,
# in float16 on the GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/zephyr-support-chatbot",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda")

# NOTE(review): sampling is enabled but top_k=1 keeps only the single most
# likely token, so decoding should be effectively greedy -- confirm intent.
# EOS doubles as the pad token id during generation.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


In [8]:
import time

# Measure elapsed seconds for generation plus decoding/printing. perf_counter()
# is monotonic and high-resolution, so the interval cannot be skewed by system
# clock adjustments the way time.time() differences can.
st_time = time.perf_counter()
outputs = model.generate(**inputs, generation_config=generation_config)
# outputs[0] contains the prompt tokens followed by the generated reply.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(time.perf_counter() - st_time)

<|system|>
 You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.
<|user|>
i have a question about cancelling order {{Order Number}}
<|assistant|>
I'll take care of it! I understand that you have a question about canceling order number {{Order Number}}. Let me assist you with that. To cancel your order, you can reach out to our customer support team. They will be able to guide you through the cancellation process and provide you with any necessary information. Rest assured, we are here to help you every step of the way. Is there anything else I can assist you with? Feel free to let me know. Your satisfaction is our top priority!
36.53734612464905
