# Mistral Finetuning

In [1]:
# Installing necessary libraries for machine learning and model optimization
!pip install accelerate
# accelerate: A library to accelerate training and inference of machine learning models.

!pip install peft
# peft: Parameter-efficient fine-tuning, used to fine-tune pre-trained models efficiently.

!pip install bitsandbytes
# bitsandbytes: A lightweight library for 8-bit optimizers and quantization, useful for reducing the memory footprint.

!pip install trl py7zr auto-gptq optimum
# trl: Transformers Reinforcement Learning library, for training language models with reinforcement learning.
# py7zr: A library for handling 7zip archives, useful for compressing and decompressing model files.
# auto-gptq: Automatic model quantization library, for optimizing models to run efficiently on hardware.
# optimum: A library to optimize and deploy machine learning models on various hardware.

!pip install git+https://github.com/huggingface/transformers
# transformers: The Hugging Face Transformers library, installed directly from the GitHub repository to get the latest updates.

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12=

In [2]:
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from datasets import load_dataset,Dataset

In [5]:
#The “samsum” dataset contains dialogues that are useful for tasks such as text summarization.
df=load_dataset("samsum",split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [6]:
df.shape

(14732, 3)

In [7]:
df

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})

In [8]:
# Convert the dataset to a Pandas DataFrame
# This conversion is useful because it allows for easier data manipulation and analysis using Pandas’ powerful data handling capabilities.
datadf=df.to_pandas()

In [9]:
datadf.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [10]:
# Randomly sample 50 rows from the DataFrame
# The resulting DataFrame datadf now contains only these 50 randomly chosen rows.
datadf = datadf.sample(50)

In [11]:
# Combine the "dialogue" and "summary" columns into a new "text" column with a specific format
datadf["text"] = datadf[["dialogue", "summary"]].apply(
    lambda x: "### Human: Summarize this following dialogue: " + x["dialogue"] + "\n### Assistant: " + x["summary"],
    axis=1
)
#This transformation combines the dialogue and its summary into a single text string in a way that mimics a human-assistant interaction, which can be useful for training dialogue models or other natural language processing tasks.

In [12]:
datadf.head()

Unnamed: 0,id,dialogue,summary,text
12994,13818954,"Tom: How are you today?\r\nJenny: I'm fine, al...",Jenny and Mia are feeling fine after last night.,### Human: Summarize this following dialogue: ...
8628,13717285,Agatha: Can everyone please send me their actu...,"Agatha is getting married and needs Emily, Ana...",### Human: Summarize this following dialogue: ...
5078,13716204,"Christopher: Okay, I think I've added everyone...","Christopher, Joan, Lindsay, Brian and Kenneth ...",### Human: Summarize this following dialogue: ...
4212,13865395,"Henry: Charlie broke his arm, we’re going to t...",Charlie broke his arm. Henry and Victoria went...,### Human: Summarize this following dialogue: ...
9567,13820663,"Victoria: <file_other>\r\nMegan: oooo, my fav ...",Victoria sends Megan and Florence a link. Vict...,### Human: Summarize this following dialogue: ...


In [13]:
datadf["text"].iloc[0]

"### Human: Summarize this following dialogue: Tom: How are you today?\r\nJenny: I'm fine, already at the office\r\nMia: Me too!\r\nTom: Great, I was afraid it could be difficult after last night\n### Assistant: Jenny and Mia are feeling fine after last night."

In [14]:
datadf.head(1)

Unnamed: 0,id,dialogue,summary,text
12994,13818954,"Tom: How are you today?\r\nJenny: I'm fine, al...",Jenny and Mia are feeling fine after last night.,### Human: Summarize this following dialogue: ...


In [15]:
# Convert the Pandas DataFrame back to a Hugging Face Dataset
data = Dataset.from_pandas(datadf)

In [16]:
# Import necessary components from the Transformers library
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments

# AutoModelForCausalLM: A class for loading pre-trained causal language models. Causal language models are typically used for tasks such as text generation.
# AutoTokenizer: A class for loading pre-trained tokenizers that correspond to the pre-trained models. Tokenizers are used to convert text into a format suitable for model input (e.g., token IDs).
# GPTQConfig: This configuration class is likely used for specifying settings related to GPT-based models, particularly those involving quantization or other specialized configurations.
# TrainingArguments: A class for specifying training hyperparameters and arguments, such as learning rate, batch size, number of epochs, etc., when fine-tuning models.

In [17]:
# Load a pre-trained tokenizer from the specified model checkpoint
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [18]:
# Retrieve the end-of-sequence (EOS) token from the tokenizer
eos_token = tokenizer.eos_token

In [19]:
# Set the padding token to be the same as the end-of-sequence (EOS) token
tokenizer.pad_token = tokenizer.eos_token
# This configuration can help maintain consistency in how sequences are processed, especially when dealing with models that may handle padding and end-of-sequence markers similarly.

In [20]:
# Create a GPTQConfig object with specific quantization settings
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True, tokenizer=tokenizer)

# GPTQConfig: Configuration for quantization, which reduces the precision of the model weights to save memory and computation.
# bits=4: Specifies 4-bit quantization, which significantly reduces the model size.
# disable_exllama=True: Disables a specific feature or optimization (ExLlama) which may not be needed or compatible with the current setup.
# tokenizer=tokenizer: Passes the tokenizer object to the configuration, ensuring compatibility between the quantized model and the tokenizer.

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


In [21]:
# Load a pre-trained causal language model with quantization settings and automatic device mapping
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ",
    quantization_config=quantization_config_loading,
    device_map="auto"
)

# AutoModelForCausalLM.from_pretrained: Loads a pre-trained causal language model from the specified checkpoint.
# "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": The model checkpoint identifier, referring to the specific version of the Mistral-7B model.
# quantization_config=quantization_config_loading: Applies the previously defined quantization settings to the model.
# device_map="auto": Automatically maps the model to available devices (e.g., CPUs, GPUs) for efficient computation.

config.json:   0%|          | 0.00/963 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/Mistral-7B-Instruct-v0.1-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [22]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)


In [23]:
# Disable the use of cache during model execution
model.config.use_cache = False
# Set the pretraining tensor parallelism to 1
model.config.pretraining_tp = 1
# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# model.config.use_cache = False: Disables the use of cache, which might be done to save memory or ensure real-time processing.
# model.config.pretraining_tp = 1: Sets the pretraining tensor parallelism parameter to 1, which might be relevant for model parallelism settings.
# model.gradient_checkpointing_enable(): Enables gradient checkpointing, a technique used during training to save memory by not storing all intermediate activations.

In [24]:
# Import the function to prepare the model for k-bit training
from peft import prepare_model_for_kbit_training

# prepare_model_for_kbit_training: A function that prepares the model for k-bit training, which involves techniques for efficient training with reduced precision.

In [25]:
model = prepare_model_for_kbit_training(model)

In [26]:
# Import the LoraConfig class from the peft library
from peft import LoraConfig

# LoraConfig: A configuration class for Low-Rank Adaptation (LoRA), which is a technique to efficiently fine-tune large-scale models by reducing the number of trainable parameters.

In [28]:
# Create a LoRA configuration for fine-tuning the model
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

# r=16: Specifies the rank of the low-rank adaptation. Higher values can capture more information but increase the number of parameters.
# lora_alpha=16: A scaling factor for the LoRA updates, which helps to balance the adaptation strength.
# lora_dropout=0.05: Dropout rate applied to the LoRA layers to prevent overfitting.
# bias="none": Indicates that no bias term is added in the LoRA layers.
# task_type="CAUSAL_LM": Specifies the task type as Causal Language Modeling (LM), which involves generating text based on previous context.
# target_modules=["q_proj", "v_proj"]: Lists the specific modules (query projection and value projection) in the model to which LoRA should be applied.

In [29]:
# Import the function to get a model with the PEFT configuration applied
from peft import get_peft_model

# get_peft_model: A function that applies the PEFT (Parameter-Efficient Fine-Tuning) configuration to a model,
# enabling efficient fine-tuning with techniques such as LoRA.

In [30]:
model = get_peft_model(model,peft_config)

In [31]:
# Import the SFTTrainer class from the trl library
from trl import SFTTrainer

# SFTTrainer: A class for supervised fine-tuning (SFT) of transformer models.
# It is designed to facilitate the training process, including setting up the training loop, handling data, and optimizing the model.

In [32]:
# Import the TrainingArguments class from the transformers library
from transformers import TrainingArguments

# TrainingArguments: A class that defines the training hyperparameters and arguments for model training.
# This includes settings such as learning rate, batch size, number of epochs, output directory, and more.
# These arguments are crucial for configuring and controlling the training process.

In [33]:
# Define the training arguments for supervised fine-tuning
training_arguments = TrainingArguments(
    output_dir="mistral-finetuned-samsum",         # Directory where the model checkpoints and other outputs will be saved
    per_device_train_batch_size=8,                 # Batch size for training on each device (e.g., GPU)
    gradient_accumulation_steps=1,                 # Number of gradient accumulation steps to effectively increase the batch size
    optim="paged_adamw_32bit",                     # Optimizer to use, in this case, a variant of AdamW optimized for 32-bit precision
    learning_rate=2e-4,                            # Initial learning rate for training
    lr_scheduler_type="cosine",                    # Learning rate scheduler type, in this case, cosine annealing
    save_strategy="epoch",                         # Strategy for saving model checkpoints, here it's set to save at the end of each epoch
    logging_steps=100,                             # Interval for logging training metrics
    num_train_epochs=1,                            # Number of epochs to train the model
    max_steps=250,                                 # Maximum number of training steps
    fp16=True,                                     # Use mixed precision training (16-bit floating point)
    push_to_hub=True                               # Push the trained model to the Hugging Face Hub
)

# TrainingArguments: This object contains all the hyperparameters and settings required for training the model.
# It configures the training process, such as where to save outputs, how large the batches should be, the optimizer and learning rate settings,
# checkpoint saving strategy, logging frequency, number of epochs, and whether to use mixed precision training.
# Additionally, it includes the option to push the trained model to the Hugging Face Hub for sharing and deployment.

In [36]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
# Initialize the SFTTrainer for supervised fine-tuning
trainer = SFTTrainer(
    model=model,                    # The pre-trained model to be fine-tuned
    train_dataset=data,             # The dataset to use for training
    peft_config=peft_config,        # PEFT configuration for efficient fine-tuning
    dataset_text_field="text",      # The field in the dataset containing the text data
    args=training_arguments,        # Training arguments defined earlier
    tokenizer=tokenizer,            # Tokenizer to preprocess the text data
    packing=False,                  # Whether to use packing for efficient data loading
    max_seq_length=512              # Maximum sequence length for the inputs
)

# SFTTrainer: A class for supervised fine-tuning (SFT) of transformer models.
# - model=model: The pre-trained model that we are fine-tuning.
# - train_dataset=data: The dataset used for training, which has been converted to the Hugging Face Dataset format.
# - peft_config=peft_config: Configuration for parameter-efficient fine-tuning using techniques like LoRA.
# - dataset_text_field="text": The field in the dataset that contains the text data to be used for training.
# - args=training_arguments: The training arguments that define the hyperparameters and settings for the training process.
# - tokenizer=tokenizer: The tokenizer to preprocess the text data, ensuring compatibility with the model.
# - packing=False: Disables packing, which might be used for more efficient data loading and processing.
# - max_seq_length=512: The maximum sequence length for the input text data, ensuring that inputs are truncated or padded to this length.


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [38]:
trainer.train()



Step,Training Loss
100,1.2571
200,0.1225




TrainOutput(global_step=250, training_loss=0.5633915090560913, metrics={'train_runtime': 2246.3818, 'train_samples_per_second': 0.89, 'train_steps_per_second': 0.111, 'total_flos': 596811948097536.0, 'train_loss': 0.5633915090560913, 'epoch': 35.714285714285715})

In [None]:
trainer.push_to_hub()

events.out.tfevents.1717263551.e0cc96174c73.2107.0:   0%|          | 0.00/6.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ShehbazPatel/mistral-finetuned-samsum/commit/bd96297f0fd3f0fb9cf979088cbf73109891930b', commit_message='End of training', commit_description='', oid='bd96297f0fd3f0fb9cf979088cbf73109891930b', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! cp -r /content/mistral-finetuned-samsum /content/drive/MyDrive/

# infercening

In [39]:
# Import necessary classes and modules
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig, AutoTokenizer
import torch

# AutoPeftModelForCausalLM: A class for loading a pre-trained causal language model with parameter-efficient fine-tuning (PEFT) configurations.
# GenerationConfig: A class for configuring generation parameters for the model, such as maximum length, temperature, and top-k sampling.
# AutoTokenizer: A class for loading the pre-trained tokenizer corresponding to the model.
# torch: A popular deep learning library used for tensor operations, model training, and inference.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/content/mistral-finetuned-samsum")

In [None]:
# Tokenize the input text and convert it to PyTorch tensors
inputs = tokenizer("""
###Human: Summarize this following dialogue: Vasanth: I'm at the railway station in Chennai Karthik: No problems so far? Vasanth: no, everything's going smoothly Karthik: good. lets meet there soon!
###Assistant: """, return_tensors="pt").to("cuda")

# tokenizer: The tokenizer object is used to convert the input text into token IDs.
# return_tensors="pt": Specifies that the output should be in PyTorch tensor format.
# to("cuda"): Moves the tensor to the GPU for faster processing if a CUDA-enabled GPU is available.


In [None]:
# Load the fine-tuned model with PEFT configuration
model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/mistral-finetuned-samsum",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda"
)

# AutoPeftModelForCausalLM.from_pretrained: Loads the fine-tuned model with the PEFT configuration from the specified directory.
# "/content/mistral-finetuned-samsum": Path to the directory where the fine-tuned model is stored.
# low_cpu_mem_usage=True: Optimizes the model loading process to use less CPU memory.
# return_dict=True: Ensures that the model returns outputs as a dictionary.
# torch_dtype=torch.float16: Loads the model weights in 16-bit floating point precision to save memory and improve performance.
# device_map="cuda": Maps the model to the GPU for faster inference.

In [None]:
# Create a generation configuration for text generation
generation_config = GenerationConfig(
    do_sample=True,                    # Enable sampling for text generation
    top_k=1,                           # Use top-k sampling with k=1, meaning only the highest probability token is considered
    temperature=0.1,                   # Set the temperature to 0.1 to make the distribution sharper, focusing on high-probability tokens
    max_new_tokens=25,                 # Generate a maximum of 25 new tokens
    pad_token_id=tokenizer.eos_token_id # Use the EOS token as the padding token
)

# GenerationConfig: This object configures various parameters for generating text with the model.
# - do_sample=True: Enables sampling, allowing the model to generate more diverse outputs.
# - top_k=1: Only the highest probability token is considered at each step, making the generation more deterministic.
# - temperature=0.1: A low temperature value makes the distribution sharper, focusing on high-probability tokens.
# - max_new_tokens=25: Limits the generated text to 25 new tokens to control the length of the output.
# - pad_token_id=tokenizer.eos_token_id: Specifies the padding token ID, using the EOS token to pad sequences if necessary.

In [None]:
import time

# Record the start time
st_time = time.time()

# Generate text using the model with the specified generation configuration
outputs = model.generate(**inputs, generation_config=generation_config)

# Decode the generated tokens to get the output text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated text
print(output_text)

# Print the time taken for the generation process
print(time.time() - st_time)

# time.time(): Function to get the current time in seconds since the epoch.
# model.generate: Method to generate text based on the input and generation configuration.
# tokenizer.decode: Converts the generated token IDs back to a human-readable text string.
# skip_special_tokens=True: Option to remove special tokens (e.g., padding, EOS) from the output text.