In [None]:
#install required libraries
!pip3 install datasets > /dev/null
!pip3 install transformers -U > /dev/null
!pip3 install peft > /dev/null
!pip3 install bitsandbytes -U > /dev/null

In [None]:
# Reboot Colab: send signal 9 to the notebook's own kernel process.
# (Presumably done so that the packages installed above are loaded on restart.)
import os

kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [None]:
#import packages
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

# Load the base model and its tokenizer.
# 4-bit quantization is requested through an explicit BitsAndBytesConfig: passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Hyper-parameters used for tokenization and for the LoRA adapter
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

# Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

# Padding token: Mixtral uses specific BOS/EOS tokens ("<s>" and "</s>" are special
# tokens for beginning and end of string, while [INST] and [/INST] are regular
# strings), so "!" is used for padding instead of reusing EOS.
# NOTE(review): "!" is also a regular text token; confirm that sharing its id with
# padding is acceptable for this dataset.
tokenizer.pad_token = "!"

# LoRA adapter configuration; the adapter is attached to modules named w1, w2 and w3
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=["w1", "w2", "w3"],
)
model = get_peft_model(model, config)

In [None]:
# Load a Q&A dataset built from the Vertex AI technical documentation
import datasets

dataset = datasets.load_dataset(path="fredmo/vertexai-qna-500")
print("dataset", dataset)

# Only the "train" split is used for tuning
train_data = dataset["train"]

# function to build the dataset in the required instruct format
def prompt(query):
    """Format one Q&A record as a Mixtral instruct training string.

    Args:
        query: mapping with the keys "input_text" (the question) and
            "output_text" (the expected answer).

    Returns:
        The string "<s> [INST] {system message}\\n{question}[/INST] {answer} </s>".
    """
    sys_msg = "help me answering this question about ML platform"
    instruction = f"<s> [INST] {sys_msg}\n{query['input_text']}[/INST] {query['output_text']} </s>"
    # The original built the string but never returned it, so callers received None.
    return instruction

def tokenize(prompt):
    """Tokenize one formatted prompt into fixed-length model inputs.

    Args:
        prompt: the formatted instruct string to tokenize.

    Returns:
        The tokenizer output, truncated and padded to CUTOFF_LEN tokens.
    """
    eos = tokenizer.eos_token
    # Append EOS only when the text does not already end with it, so a template
    # that closes with the EOS marker does not end up with a duplicated EOS token.
    text = prompt if prompt.rstrip().endswith(eos) else prompt + eos
    return tokenizer(
        text,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )

def _row_to_features(row):
    """Format one dataset row as an instruct prompt and tokenize it."""
    return tokenize(prompt(row))


# Shuffle the rows, tokenize each one, and drop the raw text columns
train_data = train_data.shuffle().map(
    _row_to_features,
    remove_columns=["input_text", "output_text"],
)

Downloading readme:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataset DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 531
    })
})


Map:   0%|          | 0/531 [00:00<?, ? examples/s]

In [None]:
# Trainer; note the output folder location, which you can change if needed
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir="mixtral-moe-lora-instruct-VertexAI"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False

train_result = trainer.train()

# `save_strategy="epoch"` only writes `checkpoint-*` subfolders. Also save the final
# tuned adapter layers at the root of the output folder, which is the path the
# second part of this notebook loads the adapter from.
trainer.save_model()

# Display the training summary as the cell output
train_result



Step,Training Loss
2,3.0815
4,2.2637
6,1.5117
8,1.2196
10,1.2533
12,1.0599
14,1.544
16,1.1762
18,1.3058
20,1.238




TrainOutput(global_step=792, training_loss=0.5151753302550677, metrics={'train_runtime': 10686.7512, 'train_samples_per_second': 0.298, 'train_steps_per_second': 0.074, 'total_flos': 2.2717128987220378e+17, 'train_loss': 0.5151753302550677, 'epoch': 5.97})

In [None]:
#### Second Part: merge the base model and tuned adapter layers then use the model
### important: For this part we recommend to run this on a machicne in Google Cloud such as: A2, A100 GPU 80 GB, 170 GB RAM, 450 GB ssd hdd

# install the required libraries
!pip3 install datasets > /dev/null
!pip3 install transformers -U > /dev/null
!pip3 install peft > /dev/null
!pip3 install bitsandbytes -U > /dev/null

In [None]:
# Reboot the notebook: send signal 9 to the notebook's own kernel process
import os

kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [None]:
# required packages
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import PeftConfig, prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

In [None]:
# The base model id and the path of the tuned adapter layers
base_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Select here the folder where you saved your tuned adapter layers.
# (The original line had a doubled "=", which is a SyntaxError.)
adapter_model_name = "mixtral-moe-lora-instruct-VertexAI/"

In [None]:
# Load the base model.
# 4-bit quantization is requested through an explicit BitsAndBytesConfig: passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated in transformers.
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
    )

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Load the base model along with the tuned adapter layers
tuned_model = PeftModel.from_pretrained(base_model, adapter_model_name)

# Merge the tuned adapter layers into the base model weights.
# `merge_and_unload()` returns the merged model, and that returned object is the
# one to save. The original assigned it to a misspelled, unused name and then
# saved the un-merged PeftModel wrapper instead.
merged_model = tuned_model.merge_and_unload()

# Save the merged (tuned) model
merged_model.save_pretrained("tuned_model/")

### We recommend that you save your tuned model in a Google Cloud bucket, Google Drive, or the Hugging Face Hub



In [None]:
# Inference
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
text = "<s>[INST] Help me with my ML platform questions [/INST] Of course!</s> [INST] What is the model garden? [/INST]"
# Put the input tensors on the same device as the model before generating
inputs = tokenizer(text, return_tensors="pt").to(base_model.device)

# Base model
# NOTE(review): if the merge cell was run in this session, `base_model` may already
# contain the merged adapter weights; reload the base model first for a fair
# comparison -- confirm against the PEFT documentation.
print("\n Base Model: \n")
base_outputs = base_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,  # the value generate() falls back to anyway; set explicitly
)
print(tokenizer.decode(base_outputs[0], skip_special_tokens=True))

# Load the base model along with the tuned adapter layers
tuned_model = PeftModel.from_pretrained(base_model, adapter_model_name)

# Tuned model
print("\n Tuned Model: \n")
tuned_outputs = tuned_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(tuned_outputs[0], skip_special_tokens=True))

### We recommend using an evaluation method such as Auto SxS to ensure the tuned model performs better than the base model

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Base Model: 

[INST] Help me with my ML platform questions [/INST] Of course!  [INST] What is the model garden? [/INST] A model garden is a collection of pre-built and pre-trained machine learning models that can be used for various tasks. It is a place where you can find models that have been created and trained by machine learning experts. These models can be used


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tuned Model: 

[INST] Help me with my ML platform questions [/INST] Of course!  [INST] What is the model garden? [/INST]The model garden is a platform that helps you discover, test, customize, and deploy AI models.


In [None]:
## We can see that the base model answers in a generic way,
## while the tuned model answers as described in the tuning dataset.
## In that sense, the tuned model is basing its answer on the custom content we passed to the tuning job.