# Training Mistral 7B Instruction v2.0 using DPO: 

Follow this blog for reference: https://medium.com/@mauryaanoop3/dpo-fine-tuning-for-enhanced-language-model-performance-466fec349a5e

In [1]:
!git config --global credential.helper store

# Install the required libraries
!pip install huggingface_hub trl bitsandbytes sentencepiece transformers peft datasets

Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting docstring-parser>=0.16 (from tyro>=0.5.11->trl)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownload

In [2]:
# Import the notebook_login method
from huggingface_hub import notebook_login

# Log in interactively
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("DhruvParth/Mistral-7B-Instruct-v2.0-PairRM-DPO-Dataset")
ds

Downloading readme:   0%|          | 0.00/706 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt_id', 'prompt', 'chosen', 'rejected', 'all_generated_resopnses', 'all_rm_scores'],
        num_rows: 50
    })
})

## 1. Environment Setup and Library Installation:

In [4]:
# Importing packages
import os
import gc
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import DPOTrainer, DPOConfig
import bitsandbytes as bnb

2024-08-05 00:27:28.278168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 00:27:28.278305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 00:27:28.406249: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


This section installs the necessary libraries:
- trl: Provides the DPO training functionalities.
- bitsandbytes: Enables 4-bit quantization for memory efficiency.
- sentencepiece: For tokenization with SentencePiece models.
- transformers: The core library for working with pre-trained models.
- peft: Offers Parameter-Efficient Fine-Tuning (PEFT) techniques, specifically LoRA.

## 2. Model and Tokenizer Initialization:

In [6]:
# Define model names and tokens
peft_model_name = "mistralai/Mistral-7B-Instruct-v0.2" # The model obtained after the SFT step
new_model = "Mistral-7B-Instruct-v0.2-DPO-v0.1" #the name of the DPO trained model

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(peft_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

We have already downloaded the dataset: 

In [7]:
ds.get("train")

Dataset({
    features: ['prompt_id', 'prompt', 'chosen', 'rejected', 'all_generated_resopnses', 'all_rm_scores'],
    num_rows: 50
})

In [8]:
train_dataset = ds.get("train")

In [9]:
def fix_format_for_DPO_trainer(row):
    row["chosen"] = row['chosen'][1]['content']
    row['rejected'] = row['rejected'][1]['content']
    return row

In [10]:
updated_train_dataset = train_dataset.map(fix_format_for_DPO_trainer)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [11]:
updated_train_dataset[0]['prompt']

'Can I spend the night alone in a tent in a forest outside Stockholm in -20°C without risking my life?\n\nThe backstory\nFrom the end of January, I\'m starting my studies in a suburb of Stockholm. I\'ve decided to, if it turns out plausible, not rent an apartment, but live in a tent. (This is not out of frugality, but out of a will to try something new.)\nI do have friends who I could visit once a week or so to prepare food and wash my clothes, so I think I can solve the practical problems, or at least those that I\'ve come to think of. I\'d camp in one of the forests, maybe 1 km from "civilisation". I\'d have access to showers etc at university every day.\nHowever: I don\'t want to freeze to death in my sleep! That\'s very important to me. I\'ve read that the nights can get as cold as -20°C (-4°F). With the proper preparations, would this be a plausible way of living, at least for a month or so?\nI do have camping experience, and have been hiking for three weeks, but only in summer.'

In [12]:
updated_train_dataset[0]['chosen']

"Living in a tent in a forest outside Stockholm during the winter months, when temperatures can drop to -20°C (-4°F), presents unique challenges that go beyond the practical considerations you mentioned. While it's possible to survive with the right equipment and knowledge, it's essential to understand the increased risks involved.\n\nHere are the key considerations for camping in such extreme cold:\n\n1. Extreme temperature: The most significant risk factor is the extreme cold, which can lead to frostbite and hypothermia. Both of these conditions can be dangerous or even life-threatening.\n\n2. Frostbite: Frostbite occurs when your extremities (fingers, toes, ears, and nose) freeze due to the lack of blood flow. Symptoms include numbness, tingling, and waxy-looking skin. Preventing frostbite involves keeping the affected areas warm and dry, as well as protecting them with insulated clothing.\n\n3. Hypothermia: Hypothermia is a more severe condition where your body temperature drops be

In [13]:
updated_train_dataset[0]['rejected']

'Given your camping experience and the availability of occasional access to warmer environments, spending a night alone in a tent in a forest outside Stockholm during extreme cold weather (-20°C or -4°F) carries some risks. Here are some factors and considerations to help you make an informed decision:\n\n1. Proper Equipment: Investing in a high-quality four-season tent, a good sleeping bag rated for extremely cold temperatures (around -20°C or lower), and an insulated sleeping pad are essential. Make sure that all the gear is in good condition and suited to the extreme climate.\n\n2. Additional Insulation: Consider adding an insulated underlay for your sleeping pad, and using a thermal blanket inside the tent as extra insulation. You may also want to insulate the ground under the tent with a barrier like a tarp or insulated mats.\n\n3. Shelter Access: Ensure that there is a nearby forest shelter or other warm emergency shelter located nearby. Keep in mind that this might require addit

## 3. LoRA Configuration and Model Loading:

If you face an issue loading the model following the above mentioned article, follow suggestions mentioned here: 
[How to fix the “Can’t find ‘adapter_config.json’” error with Hugging Face](https://medium.com/@Thimira/how-to-fix-the-cant-find-adapter-config-json-error-with-hugging-face-2e0a16643f74)

Additional Article to follow: [Mistral Mastery: Fine-Tuning & Fast Inference Guide](https://medium.com/@parikshitsaikia1619/mistral-mastery-fine-tuning-fast-inference-guide-62e163198b06)

GitHub Repo for Reference: [Fine_tune_a_Mistral_7b_model_with_DPO.ipynb](https://github.com/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb)

In [14]:
# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'v_proj', 'q_proj', 'dense']
)

# Load the base model with BitsAndBytes configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load base model
# Loads model from hugging face and device mapping
model = AutoModelForCausalLM.from_pretrained(
    peft_model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

model.config.use_cache = False

#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Here, we configure LoRA and load the base model:

- The peft_config defines the LoRA parameters, a PEFT technique that significantly reduces the number of trainable parameters, making the fine-tuning process more efficient.
- The bnb_config configures BitsAndBytes for 4-bit quantization, further reducing memory usage.
- We load the pre-trained model using AutoPeftModelForCausalLM, applying the specified LoRA and quantization configurations.

## 4. Training Arguments and DPO Trainer Initialization:

In [15]:
# Training arguments
training_args = DPOConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50, # we set up the max_steps to 50, due to free GPU useage
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=5,
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=updated_train_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1, # The parameter 'beta' is the hyperparameter of the implicit reward and is normally set from 0.1 to 0.5. It's important to note that if beta tends to zero, we tend to ignore the reference model.
    max_prompt_length=512,
    max_length=1024,
)


Deprecated positional argument(s) used in DPOTrainer, please use the DPOConfig to set these arguments instead.


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [16]:
gc.collect()
torch.cuda.empty_cache()

In [17]:
# Fine-tune model with DPO
dpo_trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.6931
3,0.6934
4,0.6911
5,0.692
6,0.6907
7,0.6269
8,0.5885
9,0.5346
10,0.4644


TrainOutput(global_step=50, training_loss=0.2680965988337994, metrics={'train_runtime': 4152.9059, 'train_samples_per_second': 0.096, 'train_steps_per_second': 0.012, 'total_flos': 0.0, 'train_loss': 0.2680965988337994, 'epoch': 8.0})

In [18]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

('final_checkpoint/tokenizer_config.json',
 'final_checkpoint/special_tokens_map.json',
 'final_checkpoint/tokenizer.model',
 'final_checkpoint/added_tokens.json',
 'final_checkpoint/tokenizer.json')

In [19]:
# Flush memory
del dpo_trainer, model
gc.collect()
torch.cuda.empty_cache()

In [21]:
# Reload model in FP16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    peft_model_name,
    return_dict=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

NameError: name 'model_name' is not defined

In [22]:
tokenizer = AutoTokenizer.from_pretrained(peft_model_name)

In [23]:
# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

In [24]:
# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('Mistral-7B-Instruct-v0.2-DPO-v0.1/tokenizer_config.json',
 'Mistral-7B-Instruct-v0.2-DPO-v0.1/special_tokens_map.json',
 'Mistral-7B-Instruct-v0.2-DPO-v0.1/tokenizer.model',
 'Mistral-7B-Instruct-v0.2-DPO-v0.1/added_tokens.json',
 'Mistral-7B-Instruct-v0.2-DPO-v0.1/tokenizer.json')

In [25]:
# Push them to the HF Hub
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhruvParth/Mistral-7B-Instruct-v0.2-DPO-v0.1/commit/9042417cae1f50db83c96379245749ef82809a59', commit_message='Upload tokenizer', commit_description='', oid='9042417cae1f50db83c96379245749ef82809a59', pr_url=None, pr_revision=None, pr_num=None)