# Zephyr fine-tuning (LoRA)

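> Fine-tunes the GPTQ-quantized `TheBloke/zephyr-7B-alpha-GPTQ` model with LoRA adapters on the Bitext customer-support dataset, then loads the saved adapter and generates a sample reply.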



In [None]:
# Authenticate this session with the Hugging Face Hub (renders a login widget).
# Training below is configured with PUSH_TO_HUB = True, so a token is needed
# before the trainer uploads the fine-tuned adapter.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
! pip install datasets transformers trl peft accelerate bitsandbytes auto-gptq optimum
# !: Executes the following command as a shell command
# pip install: Installs Python packages using pip
# datasets: Library for ready-to-use datasets and tools to process and share them
# transformers: Library for state-of-the-art pre-trained NLP models
# trl: Integrates reinforcement learning with transformer models
# peft: For parameter-efficient fine-tuning of large models
# accelerate: Makes training and evaluation of deep learning models faster and easier
# bitsandbytes: Efficient implementations of optimization algorithms for large-scale deep learning
# auto-gptq: Automatically quantizes GPT models to reduce size and increase speed
# optimum: Optimizes transformers models for deployment on different hardware

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [2]:
import torch
# torch: A popular deep learning library providing tensors and dynamic neural networks in Python.

from datasets import load_dataset, Dataset
# load_dataset: Function to load a dataset from Hugging Face datasets library.
# Dataset: A class representing a dataset in the Hugging Face datasets library.

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
# LoraConfig: Configuration class for LoRA (Low-Rank Adaptation) training.
# prepare_model_for_kbit_training: Prepares a model for k-bit (quantized) training.
# get_peft_model: Gets a parameter-efficient fine-tuning (PEFT) model.

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
# AutoModelForCausalLM: Automatically loads a pre-trained causal language model.
# AutoTokenizer: Automatically loads the tokenizer for a pre-trained model.
# GPTQConfig: Configuration class for GPTQ quantization settings used when loading the model.
# TrainingArguments: Arguments for configuring the training process.

from trl import SFTTrainer
# SFTTrainer: Trainer for supervised fine-tuning (SFT) of transformer models.


In [3]:
class Config:
    """Single place for every tunable used by the fine-tuning run.

    All values are plain class attributes; ``ZephyrTrainer`` reads them
    through an instance (``self.config.<NAME>``).
    """

    # ------------------------------------------------------------------
    # Model and dataset
    # ------------------------------------------------------------------
    MODEL_ID = "TheBloke/zephyr-7B-alpha-GPTQ"  # Hub id of the GPTQ base checkpoint
    DATASET_ID = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"  # Hub id of the training data
    CONTEXT_FIELD = ""  # optional context column; empty here and not read by ZephyrTrainer
    INSTRUCTION_FIELD = "instruction"  # column holding the user query
    TARGET_FIELD = "response"  # column holding the expected answer
    DATASET_TEXT_FIELD = "text"  # column created for the assembled prompt + answer
    MAX_SEQ_LENGTH = 512  # forwarded to SFTTrainer(max_seq_length=...)
    PACKING = False  # forwarded to SFTTrainer(packing=...)

    # ------------------------------------------------------------------
    # Quantized model loading
    # ------------------------------------------------------------------
    BITS = 4  # bit width given to GPTQConfig
    DISABLE_EXLLAMA = True  # True -> ExLlama kernels are switched off in GPTQConfig
    DEVICE_MAP = "auto"  # forwarded to from_pretrained(device_map=...)
    USE_CACHE = False  # written to model.config.use_cache before training

    # ------------------------------------------------------------------
    # LoRA adapter
    # ------------------------------------------------------------------
    LORA_R = 16  # adapter rank
    LORA_ALPHA = 16  # adapter scaling factor
    LORA_DROPOUT = 0.05  # dropout applied inside the adapter
    BIAS = "none"  # bias mode handed to LoraConfig(bias=...)
    TARGET_MODULES = ["q_proj", "v_proj"]  # module names that receive adapters
    TASK_TYPE = "CAUSAL_LM"  # task type handed to LoraConfig

    # ------------------------------------------------------------------
    # Optimisation and schedule
    # ------------------------------------------------------------------
    BATCH_SIZE = 8  # per-device train batch size
    GRAD_ACCUMULATION_STEPS = 1  # batches accumulated per optimizer step
    OPTIMIZER = "paged_adamw_32bit"  # optimizer name for TrainingArguments(optim=...)
    LR = 2e-4  # learning rate
    LR_SCHEDULER = "cosine"  # learning-rate scheduler type
    NUM_TRAIN_EPOCHS = 1  # overridden by MAX_STEPS (the run log prints this notice)
    MAX_STEPS = 250  # hard cap on optimizer steps
    FP16 = True  # enable fp16 mixed-precision training

    # ------------------------------------------------------------------
    # Logging, checkpoints and publishing
    # ------------------------------------------------------------------
    OUTPUT_DIR = "zephyr-support-chatbot"  # where checkpoints and the final model go
    LOGGING_STEPS = 50  # log metrics every N steps
    SAVE_STRATEGY = "epoch"  # checkpoint save strategy
    PUSH_TO_HUB = True  # forwarded to TrainingArguments(push_to_hub=...)



In [None]:
class ZephyrTrainer:
    """LoRA fine-tuning pipeline for the GPTQ-quantized model named in ``Config``.

    The public entry point is :meth:`train`, which builds the text dataset,
    loads and wraps the model with LoRA adapters, trains with ``SFTTrainer``
    and finally persists the result (Hub upload or local save).
    """

    # Horizontal rule printed above and below every progress banner.
    _RULE = "\n====================================================================\n"

    def __init__(self):
        """Create the config and load the tokenizer for ``Config.MODEL_ID``."""
        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # Pad with the end-of-sequence token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def _banner(self, title, payload=None):
        """Print ``title`` (and optionally ``payload``) framed by two rules."""
        print(self._RULE)
        print("\t\t\t" + title)
        if payload is not None:
            print(payload)
        print(self._RULE)

    def process_data_sample(self, example):
        """Build one training text: system prompt, instruction, then response.

        Args:
            example: Mapping (dict or DataFrame row) that contains the
                ``Config.INSTRUCTION_FIELD`` and ``Config.TARGET_FIELD`` keys.

        Returns:
            The concatenated training string.
        """
        # NOTE(review): the generation output recorded later in this notebook
        # shows <|system|> / <|user|> / <|assistant|> markers that this literal
        # does not contain - confirm whether they were lost from the prompt.
        # The inference-time prompt builder must stay in sync with this text.
        processed_example = (
            "\n You are a support chatbot who helps with user queries "
            "chatbot who always responds in the style of a professional.\n\n"
            + example[self.config.INSTRUCTION_FIELD]
            + "\n\n"
            + example[self.config.TARGET_FIELD]
        )
        return processed_example

    def create_dataset(self):
        """Download the train split and reduce it to a single text column.

        Returns:
            A ``datasets.Dataset`` whose only column is
            ``Config.DATASET_TEXT_FIELD``.
        """
        raw = load_dataset(self.config.DATASET_ID, split="train")
        self._banner("DOWNLOADED DATASET")

        frame = raw.to_pandas()
        text_column = self.config.DATASET_TEXT_FIELD
        # Row-wise: assemble prompt + answer from the two source columns.
        frame[text_column] = frame[
            [self.config.INSTRUCTION_FIELD, self.config.TARGET_FIELD]
        ].apply(self.process_data_sample, axis=1)
        self._banner("PROCESSED DATASET", frame.iloc[0])

        return Dataset.from_pandas(frame[[text_column]])

    def prepare_model(self):
        """Load the quantized base model and attach LoRA adapters.

        Returns:
            Tuple ``(model, peft_config)``: the PEFT-wrapped model and the
            ``LoraConfig`` that was applied to it.
        """
        # `disable_exllama` is deprecated (see the warning in the run log);
        # `use_exllama` is its documented replacement with inverted meaning.
        quantization_config = GPTQConfig(
            bits=self.config.BITS,
            use_exllama=not self.config.DISABLE_EXLLAMA,
            tokenizer=self.tokenizer,
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=quantization_config,
            device_map=self.config.DEVICE_MAP,
        )
        self._banner("DOWNLOADED MODEL", model)

        model.config.use_cache = self.config.USE_CACHE
        model.config.pretraining_tp = 1
        # Trade compute for memory during back-propagation.
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        self._banner("MODEL CONFIG UPDATED")

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES,
        )
        model = get_peft_model(model, peft_config)
        self._banner("PREPARED MODEL FOR FINETUNING", model)

        return model, peft_config

    def set_training_arguments(self):
        """Translate the ``Config`` values into a ``TrainingArguments`` object."""
        training_arguments = TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            max_steps=self.config.MAX_STEPS,
            fp16=self.config.FP16,
            push_to_hub=self.config.PUSH_TO_HUB,
        )
        return training_arguments

    def train(self):
        """Run the full pipeline: data, model, training, then persistence.

        When ``Config.PUSH_TO_HUB`` is true the trained model is uploaded to
        the Hugging Face Hub; otherwise it is only saved to
        ``Config.OUTPUT_DIR``.
        """
        data = self.create_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()

        trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH,
        )
        trainer.train()
        self._banner("FINETUNING COMPLETED")

        # Honour the config flag instead of always uploading.
        if self.config.PUSH_TO_HUB:
            trainer.push_to_hub()
        else:
            trainer.save_model(self.config.OUTPUT_DIR)

In [None]:
zephyr_trainer = ZephyrTrainer()
# Build the trainer. ZephyrTrainer.__init__ instantiates Config and loads the
# tokenizer for Config.MODEL_ID (the tokenizer files are fetched in the output below).

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/983 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

In [None]:
# Run the whole pipeline: dataset download and formatting, model loading,
# LoRA wrapping, training, and publishing the result.
zephyr_trainer.train()

Downloading readme:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/zephyr-7B-alpha-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11.mlp.gate

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Map:   0%|          | 0/26872 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss


KeyboardInterrupt: 

In [None]:
from peft import AutoPeftModelForCausalLM
# AutoPeftModelForCausalLM: A class to automatically load a PEFT model for causal language modeling

from transformers import GenerationConfig
# GenerationConfig: Configuration class for text generation settings

from transformers import AutoTokenizer
# AutoTokenizer: Automatically loads the tokenizer for a pre-trained model

import torch
# torch: A popular deep learning library providing tensors and dynamic neural networks in Python

In [None]:
def process_data_sample(example):
    """Build the inference prompt for a single user instruction.

    Args:
        example: Mapping with an ``"instruction"`` key holding the user query.

    Returns:
        The system prompt followed by the instruction and a blank line; the
        model is expected to continue the text with its answer.
    """
    # NOTE(review): the generation output recorded at the end of this notebook
    # shows <|system|> / <|user|> / <|assistant|> markers that these literals
    # do not contain - confirm whether they were lost from the prompt. This
    # text must match the prompt used when the training data was built.
    system_prompt = (
        "\n You are a support chatbot who helps with user queries "
        "chatbot who always responds in the style of a professional.\n\n"
    )
    return "".join([system_prompt, example["instruction"], "\n\n"])


In [None]:

tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-support-chatbot")
# Load the tokenizer from the local fine-tuning output directory.
# NOTE(review): the absolute /content/... path presumably targets a Colab
# workspace and matches Config.OUTPUT_DIR there - adjust when running elsewhere.


In [None]:
inp_str = process_data_sample(
    {
        "instruction": "i have a question about cancelling order {{Order Number}}",
    }
)
# Build the prompt string for one sample query.
# Only the "instruction" key is supplied because process_data_sample reads
# nothing else; the answer part is left for the model to generate.


In [None]:
inputs = tokenizer(inp_str, return_tensors="pt").to("cuda")
# Tokenize the prompt into PyTorch tensors and move them to the GPU.
# Requires a CUDA device; the model below is loaded onto "cuda" as well.

In [None]:
model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/zephyr-support-chatbot",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda"
)
# Load the fine-tuned PEFT model from the same local directory the tokenizer
# was loaded from.
# low_cpu_mem_usage=True: ask the loader to limit CPU memory use while loading
# return_dict=True: have the model return its outputs as a dict-like object
# torch_dtype=torch.float16: load weights in 16-bit floating point
# device_map="cuda": place the model on the GPU


Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


In [None]:
# Decoding settings used for the sample generation below.
generation_config = GenerationConfig(
    do_sample=True,
    # Sampling is switched on ...

    top_k=1,
    # ... but with k=1 only the single most likely token is a candidate at
    # each step, so the output is effectively greedy.

    temperature=0.1,
    # Low temperature sharpens the distribution (little effect while top_k=1)

    max_new_tokens=256,
    # Upper bound on the number of generated tokens

    pad_token_id=tokenizer.eos_token_id
    # Use the tokenizer's end-of-sequence id as the padding id
)


In [None]:
import time
# time: standard-library module; perf_counter() is its clock for measuring intervals

st_time = time.perf_counter()
# Start the timer (time.time() is wall-clock time and can jump if the system
# clock is adjusted, so it is not used for the measurement)

outputs = model.generate(**inputs, generation_config=generation_config)
# Generate a continuation of the prompt with the configured decoding settings

decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Turn the first (only) generated sequence back into text, dropping special tokens

elapsed_seconds = time.perf_counter() - st_time
# Stop the timer before printing so console I/O is not part of the measurement

print(decoded_text)
print(elapsed_seconds)
# Show the generated text, then the seconds spent generating and decoding it

<|system|>
 You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.
<|user|>
i have a question about cancelling order {{Order Number}}
<|assistant|>
I've got it! I understand that you have a question about canceling order number {{Order Number}}. Let me assist you with that. To cancel your order, you can reach out to our customer support team. They will guide you through the process and provide you with all the necessary information. Rest assured, we are here to help you every step of the way. If you have any other questions or concerns, feel free to let me know. I'm here to support you!
13.943601369857788
