In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl

In [2]:
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging



In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_wandb = user_secrets.get_secret("wandb")

In [4]:
!huggingface-cli login --token $secret_hf

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# Monitering the LLM
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning mistral 7B', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlifeofcoding[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231125_005916-9m4c9rde[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstill-dream-8[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/lifeofcoding/Fine%20tuning%20mistral%207B[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/lifeofcoding/Fine%20tuning%20mistral%207B/runs/9m4c9rde[0m


In [6]:
# Define out params
# This will error within kaggle using the base model,
# use base model directly from HF for production i.e. mistralai/Mistral-7B-v0.1
#base_model = "mistralai/Mistral-7B-v0.1"
base_model = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"
dataset_name = "databricks/databricks-dolly-15k"
new_model = "mistral-7b-dolly"
padding_side = "right"

In [7]:
# Importing a sample of our dataset
train_dataset = load_dataset(dataset_name, split="train[0:800]")
eval_dataset = load_dataset(dataset_name, split="train[800:1000]")

Downloading and preparing dataset json/databricks--databricks-dolly-15k to /root/.cache/huggingface/datasets/json/databricks--databricks-dolly-15k-ed25119c913eb841/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/databricks--databricks-dolly-15k-ed25119c913eb841/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [9]:
# Helper function to format the prompt
def generate_prompt(sample):
    full_prompt =f"""<s>[INST]{sample['instruction']}
{f"Here is some context: {sample['context']}" if len(sample["context"]) > 0 else None}
 [/INST] {sample['response']}
</s>"""
    return {"text": full_prompt}

In [10]:
generated_train_dataset = train_dataset.map(generate_prompt, remove_columns=list(train_dataset.features))
generated_val_dataset = eval_dataset.map(generate_prompt, remove_columns=list(train_dataset.features))

  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

In [11]:
# Inspect the data to make sure all looks well
generated_train_dataset[200]

{'text': "<s>[INST]What is wine?\nHere is some context: Wine is an alcoholic drink typically made from fermented grapes. Yeast consumes the sugar in the grapes and converts it to ethanol and carbon dioxide, releasing heat in the process. Different varieties of grapes and strains of yeasts are major factors in different styles of wine. These differences result from the complex interactions between the biochemical development of the grape, the reactions involved in fermentation, the grape's growing environment (terroir), and the wine production process. Many countries enact legal appellations intended to define styles and qualities of wine. These typically restrict the geographical origin and permitted varieties of grapes, as well as other aspects of wine production. Wines can be made by fermentation of other fruit crops such as plum, cherry, pomegranate, blueberry, currant and elderberry.\n [/INST] Wine is an alcoholic drink typically made from fermented grapes.\n</s>"}

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
# Load base model (Mistral 7B)
bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
#Adding the adapters in the layers
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1, # Coventional
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 92274688 || all params: 3844345856 || trainable%: 2.4002702008713337


In [15]:
# Hyperparamter
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb",
    evaluation_strategy="steps", # Evaluate the model every logging step
    eval_steps=25,               # Evaluate and save checkpoints every x steps
    do_eval=True,                # Perform evaluation at the end of training
)

In [16]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=generated_train_dataset,
    eval_dataset=generated_val_dataset,
    peft_config=peft_config,
    max_seq_length=None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
25,1.626,1.570537
50,1.7025,1.578838
75,1.52,1.522839
100,1.5614,1.497158
125,1.4389,1.441251
150,1.367,1.462093
175,1.4941,1.427628
200,1.313,1.454334


TrainOutput(global_step=200, training_loss=1.5028684043884277, metrics={'train_runtime': 5846.162, 'train_samples_per_second': 0.137, 'train_steps_per_second': 0.034, 'total_flos': 7171184247472128.0, 'train_loss': 1.5028684043884277, 'epoch': 1.0})

In [18]:
# Save the fine-tuned lora model
trainer.model.save_pretrained(new_model)
wandb.finish()
model.config.use_cache = True
model.eval()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                      eval/loss ‚ñà‚ñà‚ñÖ‚ñÑ‚ñÇ‚ñÉ‚ñÅ‚ñÇ
[34m[1mwandb[0m:                   eval/runtime ‚ñÑ‚ñÅ‚ñÖ‚ñÜ‚ñÜ‚ñà‚ñÖ‚ñÑ
[34m[1mwandb[0m:        eval/samples_per_second ‚ñÖ‚ñà‚ñÖ‚ñÉ‚ñÉ‚ñÅ‚ñÉ‚ñÖ
[34m[1mwandb[0m:          eval/steps_per_second ‚ñà‚ñà‚ñà‚ñÅ‚ñÅ‚ñÅ‚ñà‚ñà
[34m[1mwandb[0m:                    train/epoch ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà
[34m[1mwandb[0m:              train/global_step ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà
[34m[1mwandb[0m:            train/learning_rate ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
[34m[1mwandb[0m:                     train/loss ‚ñá‚ñà‚ñÖ‚ñÖ‚ñÉ‚ñÇ‚ñÑ‚ñÅ
[34m[1mwandb[0m:               train/total_flos ‚ñÅ
[34m[1mwandb[0m:               train/train_loss ‚ñÅ
[34m[1mwandb[0m:            train/train_runtime ‚ñÅ
[34m[1mwandb[0m:

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropout): ModuleDict(


In [19]:
# This will error within kaggle using the base model,
# use base model directly from HF for production i.e. mistralai/Mistral-7B-v0.1
try:
    trainer.model.push_to_hub(new_model, use_temp_dir=False)
except:
    print("An exception occurred")

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

An exception occurred


In [20]:
logging.set_verbosity(logging.CRITICAL)

prompt = """
What is a Plumbus? Here is some context: Plumbuses are made of organic tissue, fleebs, dinglebops, and grumbos.
"""
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, eos_token_id=model.config.eos_token_id, max_new_tokens=25)
result = pipe(f"<s>[INST] {prompt} [/INST]")
generated = result[0]['generated_text']
print(generated[generated.find('[/INST]')+8:])

Plumbuses are made of organic tissue, fleebs, dinglebops, and grumbos.
None


In [21]:
# Empty VRAM
del model
del pipe
del trainer

In [22]:
# Reload model in FP16 and merge it with LoRA weights
basemodel = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
#model = PeftModel.from_pretrained(basemodel, new_model) if you pushed lora to HF
model = PeftModel.from_pretrained(basemodel, './results/checkpoint-200')
model = model.merge_and_unload() # Merge lora back to base model

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = padding_side

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# This will error within kaggle using the base model,
# use base model directly from HF for production i.e. mistralai/Mistral-7B-v0.1
try:
    model.push_to_hub(new_model + "-merged", max_shard_size='2GB')
    tokenizer.push_to_hub(new_model + "-merged")
except:
    print("An exception occurred")

An exception occurred


That it! Find me at, or explore ML and SWE topics at [LivingTheCode.Life](https://livingthecode.life/)