<a href="https://colab.research.google.com/github/oabai/finetune/blob/main/Fine_tune_Mistral_7B_With_Travel_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'mistral/pytorch/7b-v0.1-hf/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F3899%2F5111%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240925%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240925T214314Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9a9951846af40302eab3d428bd420c3495388570b4888fdc5789d74ada6fc672ec494b9c7155f803ce237881902d871a2313b40edea73bf322e50fabc3c0501a67fe5f36e45ae3630d59938a740832636bdd87521b061655b10d7cbaf08baae1982c8e3ceda268cd06f8e2bb9ded1b9f0fdee2395363b808c6a1905c8449f337dff7313e538f61bdae05c39474f8d1643d1be8cd1ce4f9cbcdf9f62128eb69153d57b0f997173f061d4b5288254ce9142969ccb4280843323fe74a20ddfa18a8dd7770377c5412e325ad8aa64078f283bc38d434b1dcdbb841dd15fa9cd1c40dd2847b8732d037fe9cd8cce75068530de68c5a318675332f3ce3195212fd0780'

# Root directory the downloaded data sources are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created at startup and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist (mode 0o777 requested; existing directories are fine).
# The path constant is used throughout so the three calls cannot drift apart.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working relative to the
# current working directory; a link that already exists is left alone.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Each comma-separated entry of DATA_SOURCE_MAPPING has the form
# "<directory>:<url-encoded download URL>". Every archive is streamed into a
# temporary file and unpacked into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the first ':' is the separator;
    # splitting once keeps any later ':' inside the URL part.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be absent or non-numeric; fall back to 0 so the
            # progress bar simply stays empty instead of raising on int(None)
            # or dividing by zero.
            if total_length and total_length.strip().isdigit():
                total_bytes = int(total_length)
            else:
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes out and rewind so the archive readers see
            # the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for the rest of the notebook.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be handled before OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl
%pip install -U datasets


In [None]:
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging

In [None]:
!rm -rf /root/.cache/huggingface/token

# Fetch the credentials through kaggle_secrets.UserSecretsClient; they are
# stored under the secret names "HUGGINGFACE_TOKEN" and "wandb".
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")  # consumed by the huggingface-cli login cell
secret_wandb = user_secrets.get_secret("wandb")  # consumed by wandb.login

In [None]:
!huggingface-cli login --token $secret_hf

In [None]:
# Monitoring the LLM: authenticate with Weights & Biases and open the run
# that training reports to (TrainingArguments sets report_to="wandb").
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning mistral 7B with moroccan Darija',
    job_type="training",
    anonymous="allow"
)

In [None]:
# Define our params
# This will error within kaggle using the base model,
# use base model directly from HF for production i.e. mistralai/Mistral-7B-v0.1
#base_model = "mistralai/Mistral-7B-v0.1"
base_model = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"  # local copy unpacked by the data-source import cell
dataset_name = "Digicactus/moroccantravel"  # dataset id handed to load_dataset
new_model = "digicactus_7b_darija_moroccan"  # adapter save/push name; "-merged" is appended for the merged model
padding_side = "right"  # applied to the tokenizer

In [None]:
# Importing a sample of our dataset: both subsets are cut from the "train"
# split, using the slice [0:300] for training and [300:320] for evaluation.
train_dataset = load_dataset(dataset_name, split="train[0:300]")
eval_dataset = load_dataset(dataset_name, split="train[300:320]")

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = padding_side
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# NOTE(review): generate_prompt also writes literal "<s>" and "</s>" into every
# sample; check a tokenized example for duplicated BOS/EOS tokens.
# Bare expression: as the last line of the cell it displays both flags.
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# Helper function to format the prompt
def generate_prompt(sample):
    """Format one dataset row as a single instruction-style training string.

    Args:
        sample: Mapping providing an 'input' entry and a 'response' entry.

    Returns:
        A dict with one key, "text", holding the input wrapped in
        "<s>[INST]...[/INST]" followed by the response and a closing "</s>".
    """
    instruction = sample['input']
    answer = sample['response']
    return {"text": f"<s>[INST]{instruction}\n [/INST] {answer}\n</s>"}

In [None]:
# Replace the raw columns of each split with the single "text" column built
# by generate_prompt. Each split drops its own columns rather than borrowing
# the training split's schema.
generated_train_dataset = train_dataset.map(generate_prompt, remove_columns=list(train_dataset.features))
generated_val_dataset = eval_dataset.map(generate_prompt, remove_columns=list(eval_dataset.features))

In [None]:
# Inspect the data to make sure all looks well
generated_train_dataset[200]

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters require gradients.

    Walks ``model.named_parameters()``, totals the element counts, and prints
    the trainable count, the overall count, and the trainable percentage.

    Args:
        model: Object exposing ``named_parameters()`` whose parameters provide
            ``numel()`` and ``requires_grad``.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    summary = (
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    print(summary)

In [None]:
# Load base model (Mistral 7B)
# Quantization settings: 4-bit weights of type "nf4", bfloat16 compute dtype,
# double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
# The quantization is requested through quantization_config; the direct
# load_in_4bit flag below is intentionally left commented out.
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        #load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
# NOTE(review): presumably pins tensor-parallel degree to 1 — confirm this
# config field is read by the Mistral implementation in use.
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [None]:
# Adding the adapters in the layers
model = prepare_model_for_kbit_training(model)
# LoRA settings: rank 64, alpha 16, no bias terms, adapters attached to the
# listed attention projections plus gate_proj.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1, # Conventional
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
# NOTE(review): the same peft_config is passed to SFTTrainer later although
# the model is already wrapped here — confirm the trainer does not wrap twice.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

In [None]:
# Hyperparameters
# The notebook installs the latest transformers (`pip install -U`), so the
# current argument name `eval_strategy` is used instead of the deprecated
# `evaluation_strategy`.
training_arguments = TrainingArguments(
    output_dir="./results",      # checkpoints land in ./results/checkpoint-<step>
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,               # Write a checkpoint every 25 steps
    logging_steps=25,            # Log metrics every 25 steps
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,                # -1: run length is governed by num_train_epochs
    # NOTE(review): warmup_ratio normally has no effect with the plain
    # "constant" schedule ("constant_with_warmup" applies it) — confirm intent.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb",
    eval_strategy="steps",       # Evaluate on a step schedule (see eval_steps)
    eval_steps=25,               # Run evaluation every 25 steps
    do_eval=True,                # Enable evaluation on the eval dataset
)

In [None]:
# Setting sft parameters
# The trainer consumes the "text" column produced by generate_prompt and
# trains without packing samples together.
# NOTE(review): the install cell upgrades trl to the latest release; confirm
# that max_seq_length / dataset_text_field / tokenizer / packing are still
# accepted as direct SFTTrainer keyword arguments by the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=generated_train_dataset,
    eval_dataset=generated_val_dataset,
    peft_config=peft_config,
    max_seq_length=None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

In [None]:
trainer.train()

In [None]:
# Save the fine-tuned lora model
trainer.model.save_pretrained(new_model)  # written to the local directory named by new_model
wandb.finish()  # close the W&B run opened before training
model.config.use_cache = True  # re-enable the cache that was disabled for training
model.eval()  # switch the model to evaluation mode

In [None]:
# Best-effort upload of the LoRA adapter to the Hugging Face Hub.
# This will error within kaggle using the base model,
# use base model directly from HF for production i.e. mistralai/Mistral-7B-v0.1
try:
    trainer.model.push_to_hub(new_model, use_temp_dir=False)
except Exception as e:
    # Keep the notebook running, but report what actually failed. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(f"An exception occurred: {e!r}")

In [None]:
# Only surface critical messages from transformers during generation.
logging.set_verbosity(logging.CRITICAL)

prompt = """
Est-il facile de trouver des distributeurs automatiques de billets au Maroc?
"""
# Use a pipeline as a high-level helper
# NOTE(review): this loads the merged model from the Hub, which is only
# pushed in a later cell — confirm the intended cell order.
from transformers import pipeline
pipe = pipeline("text-generation", model="oabai/digicactus_7b_darija_moroccan-merged", max_new_tokens=25)

#pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, eos_token_id=model.config.eos_token_id, max_new_tokens=25)
result = pipe(f"<s>[INST] {prompt} [/INST]")
generated = result[0]['generated_text']
# Print only what follows the instruction block. Splitting on the marker
# avoids a hand-counted character offset and, when the marker is missing,
# falls back to the whole generated text instead of an arbitrary slice.
answer = generated.split('[/INST]', 1)[-1]
print(answer.strip())

In [None]:
# Empty VRAM
import gc

# Drop the notebook's references to the training-time objects.
del model
del pipe
del trainer
# Removing the names is not enough on its own: collect unreachable objects,
# then ask PyTorch to release its cached CUDA blocks back to the device.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload model in FP16 and merge it with LoRA weights
basemodel = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
#model = PeftModel.from_pretrained(basemodel, new_model) if you pushed lora to HF
# NOTE(review): the adapter is taken from the hard-coded step-50 checkpoint,
# not from the adapter saved under new_model after training — confirm that
# this checkpoint exists for the current run and is the one meant to be merged.
model = PeftModel.from_pretrained(basemodel, './results/checkpoint-50')
model = model.merge_and_unload() # Merge lora back to base model

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = padding_side

In [None]:

    # Upload the merged model (max_shard_size='2GB') and its tokenizer to the
    # Hub repository named "<new_model>-merged".
    model.push_to_hub(new_model + "-merged", max_shard_size='2GB')
    tokenizer.push_to_hub(new_model + "-merged")


In [None]:
# Make sure you have git-lfs installed (https://git-lfs.com)
!git lfs install
# Clone your model from Huggingface
!git clone https://huggingface.co/oabai/digicactus_7b_darija_moroccan-merged
# Clone llama.cpp's repository. They provide code to convert models into gguf.
!git clone https://github.com/ggerganov/llama.cpp.git


Git LFS initialized.
Cloning into 'digicactus_7b_darija_moroccan-merged'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 26 (delta 3), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (26/26), 467.90 KiB | 6.16 MiB/s, done.
error: unable to write file model-00002-of-00008.safetensors
error: unable to write file model-00007-of-00008.safetensors
error: unable to write file model-00005-of-00008.safetensors
error: unable to write file model-00003-of-00008.safetensors
error: unable to write file model-00004-of-00008.safetensors
Filtering content: 100% (9/9), 4.33 GiB | 37.42 MiB/s, done.
fatal: unable to checkout working tree
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'

fatal: could not create work tree dir 'llama.cpp': No space left on device


In [None]:
#if colab
!pip install -r /kaggle/working/llama.cpp/requirements.txt

#if local then cd to cloned repo and perform following line
# You can create venv as well
#!pip install -r requirements.txt


In [None]:
#for colab
#         path to convert.py ⬇︎         path of model ⬇︎
!python /kaggle/working/llama.cpp/convert.py /kaggle/working/digicactus_7b_darija_moroccan-merged  \
  --outfile finetuned-2.gguf \ # gguf model name that you want to assign
  --outtype q8_0 #quantize in 8-bit


That's it! Find me, or explore ML and SWE topics, at [LivingTheCode.Life](https://livingthecode.life/)