# Instruct tuning the model

This notebook draws heavily on a similar one done for the [phi3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/sample_finetune.py) model.

The difference here is that this notebook focuses on a model's full fine-tuning process, going from a base model to a new instruction model, and should work for almost any model on HuggingFace.

At the end of the notebook are the steps to save this as a gguf format which will allow for fast and easy inference.

In [1]:
import sys
import logging

import datasets
from datasets import load_dataset
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import os
import json
import wandb


In [2]:
logger = logging.getLogger(__name__)
# Start the Weights & Biases run; the trainer is configured to report to it
# through the "report_to" entry below.
wandb.init(project="smollm-ft")
###################
# Hyper-parameters
###################
# Every key here is forwarded verbatim to TrainingArguments. Each key must
# appear only once: a repeated key in a dict literal silently overrides the
# earlier entry.
training_config = {
    "do_eval": False,
    "learning_rate": 5.0e-4,
    "weight_decay": 0.01,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "log_level": "info",
    "logging_steps": 150,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 5,
    "max_steps": -1,
    "output_dir": "./picard-smol-ft",
    "overwrite_output_dir": True,
    "remove_unused_columns": True,
    "save_steps": 500,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "warmup_ratio": 0.05,
    "report_to": "wandb",
    "neftune_noise_alpha": 5,
    "push_to_hub": True,
}

# Validated training configuration object used by the logging and training cells.
train_conf = TrainingArguments(**training_config)



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnoahpunintended[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
###############
# Setup logging
###############
# Send log records to stdout with a timestamped format.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Use the log level derived from the training arguments for this notebook's
# logger as well as for the datasets and transformers libraries.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)

hf_logging = transformers.utils.logging
hf_logging.set_verbosity(log_level)
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()

logger.info(f"Training/evaluation parameters {train_conf}")


2024-11-15 18:48:36 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_str

In [4]:


####################
# Base Model Loading
####################
# The weights come from the base (non-instruct) checkpoint.
checkpoint_path = "HuggingFaceTB/SmolLM2-360M"
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # only works on latest gpus, probably not worth it in most cases
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


###################
# Tokenizer Loading
###################

# The tokenizer is taken from the Instruct checkpoint; its chat template is
# what the data-processing cell uses to render conversations.
checkpoint_path = "HuggingFaceTB/SmolLM2-360M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.padding_side = 'right'
# Padding uses a token distinct from the tokenizer's end-of-turn token, see:
# https://stackoverflow.com/questions/76446228/setting-padding-token-as-eos-token-when-using-datacollatorforlanguagemodeling-fr
tokenizer.pad_token = "<|endoftext|>"  # note this is specific to smollm
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)


[INFO|configuration_utils.py:733] 2024-11-15 18:48:36,532 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-360M/snapshots/3ce05f63c246c44616da500b47b01f082f4d3bcc/config.json
[INFO|configuration_utils.py:800] 2024-11-15 18:48:36,534 >> Model config LlamaConfig {
  "_name_or_path": "HuggingFaceTB/SmolLM2-360M",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 960,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 15,
  "num_hidden_layers": 32,
  "num_key_value_heads": 5,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "torch_dtype": "

[INFO|modeling_utils.py:3678] 2024-11-15 18:48:36,608 >> loading weights file model.safetensors from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-360M/snapshots/3ce05f63c246c44616da500b47b01f082f4d3bcc/model.safetensors
[INFO|modeling_utils.py:1606] 2024-11-15 18:48:36,708 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-11-15 18:48:36,711 >> Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "use_cache": false
}

[INFO|modeling_utils.py:4507] 2024-11-15 18:48:37,299 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4515] 2024-11-15 18:48:37,300 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at HuggingFaceTB/SmolLM2-360M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

In [5]:
# Display the padding and end-of-sequence tokens (and their ids) currently set on the tokenizer.
tokenizer.pad_token, tokenizer.eos_token, tokenizer.eos_token_id, tokenizer.pad_token_id

('<|endoftext|>', '<|im_end|>', 2, 0)

### Setting up the fine-tune 

Now that the synthetic dataset is made, the next step is to ensure the model is capable of answering as we expect, without the large system prompt impacting latency.

The solution to this is to open up the dataset, replace the system prompt with something much simpler, and start training with that.

In [6]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
    system_prompt="You are Pi-Card.",
):
    """Render one conversation into a single training string.

    The conversation's system turn is replaced by ``system_prompt``. If the
    conversation does not start with a system turn (or is empty), a system
    turn is inserted at the front instead of overwriting the first message.

    Args:
        example: Dataset row whose ``example["messages"]["messages"]`` is a
            list of dicts with ``"role"`` and ``"content"`` keys. The list is
            modified in place.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)`` that returns a string.
        system_prompt: Text to use as the system turn.

    Returns:
        The same ``example`` dict with an added ``"text"`` key holding the
        rendered conversation, stripped of leading/trailing newlines.
    """
    messages = example["messages"]['messages']
    if messages and messages[0].get("role") == "system":
        messages[0]['content'] = system_prompt
    else:
        # No leading system turn: add one rather than clobbering a user turn.
        messages.insert(0, {"content": system_prompt, "role": "system"})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False).strip('\n')
    return example

# Load the SFT dataset and keep only its "train" split.
raw_dataset = load_dataset("nkasmanoff/pi-card-sft-data")
train_dataset = raw_dataset["train"]

# Add the rendered "text" field to every row, then shuffle the result
# with a fixed seed.
map_options = {
    "fn_kwargs": {"tokenizer": tokenizer},
    "num_proc": 10,
    "desc": "Applying chat template to train_sft",
}
processed_train_dataset = train_dataset.map(
    apply_chat_template, **map_options
).shuffle(seed=42)

Overwrite dataset info from restored data version if exists.


2024-11-15 18:48:38 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6


2024-11-15 18:48:38 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6


Found cached dataset pi-card-sft-data (/home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6)


2024-11-15 18:48:38 - INFO - datasets.builder - Found cached dataset pi-card-sft-data (/home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6)


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6


2024-11-15 18:48:38 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6


Process #0 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00000_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #0 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00000_of_00010.arrow


Process #1 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00001_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #1 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00001_of_00010.arrow


Process #2 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00002_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #2 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00002_of_00010.arrow


Process #3 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00003_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #3 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00003_of_00010.arrow


Process #4 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00004_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #4 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00004_of_00010.arrow


Process #5 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00005_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #5 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00005_of_00010.arrow


Process #6 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00006_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #6 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00006_of_00010.arrow


Process #7 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00007_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #7 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00007_of_00010.arrow


Process #8 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00008_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #8 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00008_of_00010.arrow


Process #9 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00009_of_00010.arrow


2024-11-15 18:48:39 - INFO - datasets.arrow_dataset - Process #9 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_00009_of_00010.arrow


Loading cached processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_*_of_00010.arrow


2024-11-15 18:48:41 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-22813cdd6ab427cc_*_of_00010.arrow


Concatenating 10 shards


2024-11-15 18:48:41 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Loading cached shuffled indices for dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-68cc631114fc8bcc.arrow


2024-11-15 18:48:41 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-68cc631114fc8bcc.arrow


In [7]:
# Spot-check one processed row: the messages plus the rendered "text" field.
processed_train_dataset[13]

{'messages': {'messages': [{'content': 'You are Pi-Card.', 'role': 'system'},
   {'content': 'How are clouds formed?', 'role': 'user'},
   {'content': 'Clouds are formed when water vapor in the air condenses into visible liquid droplets or ice crystals, typically as a result of changes in temperature or humidity.',
    'role': 'assistant'}]},
 'text': '<|im_start|>system\nYou are Pi-Card.<|im_end|>\n<|im_start|>user\nHow are clouds formed?<|im_end|>\n<|im_start|>assistant\nClouds are formed when water vapor in the air condenses into visible liquid droplets or ice crystals, typically as a result of changes in temperature or humidity.<|im_end|>'}

In [9]:
# Baseline check: greedy generation from the currently loaded model using a
# hand-written chat-formatted prompt.
model.eval();
system_prompt = "You are Pi-Card."  # defined for reference; not included in the prompt below
prompt = "I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?"
prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
output = model.generate(input_ids, max_new_tokens=256,  do_sample=False, pad_token_id=tokenizer.eos_token_id)
# decode() takes no pad_token_id argument, so none is passed; special tokens
# are kept so the turn delimiters remain visible.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep the first two "<|im_end|>"-terminated segments (the user turn and the
# first assistant turn) and drop anything generated after them.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:2]) + "<|im_end|>"
print(formatted_output_text)

<|im_start|>user
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?<|im_end|>
<|im_start|>assistant
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?

I'm going on a wildern

In [8]:
###########
# Training
###########

# Put the model in training mode before handing it to the trainer.
model.train();

sft_kwargs = {
    "model": model,
    "args": train_conf,
    "train_dataset": processed_train_dataset,
    "max_seq_length": 2048,
    "dataset_text_field": "text",
    "tokenizer": tokenizer,
    # "packing": True,
}
trainer = SFTTrainer(**sft_kwargs)

# Run the full training loop, then log and persist the resulting metrics
# and the trainer state.
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# trainer.push_to_hub()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2100] 2024-11-15 18:48:42,773 >> PyTorch: setting up devices
Loading cached processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-0c9a379ef98ce0cf.arrow


2024-11-15 18:48:43 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___pi-card-sft-data/default/0.0.0/ace7c218bc809b351ad35add013fe25169b7c2d6/cache-0c9a379ef98ce0cf.arrow


[INFO|trainer.py:2134] 2024-11-15 18:48:44,687 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-11-15 18:48:44,688 >>   Num examples = 5,660
[INFO|trainer.py:2136] 2024-11-15 18:48:44,689 >>   Num Epochs = 5
[INFO|trainer.py:2137] 2024-11-15 18:48:44,689 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2140] 2024-11-15 18:48:44,690 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:2141] 2024-11-15 18:48:44,690 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2142] 2024-11-15 18:48:44,691 >>   Total optimization steps = 3,540
[INFO|trainer.py:2143] 2024-11-15 18:48:44,692 >>   Number of trainable parameters = 361,821,120
[INFO|integration_utils.py:807] 2024-11-15 18:48:44,693 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
150,1.386
300,1.2111
450,1.2583
600,1.1989
750,1.1106
900,0.8435
1050,0.8381
1200,0.8115
1350,0.8577
1500,0.6348


[INFO|trainer.py:3503] 2024-11-15 19:07:03,094 >> Saving model checkpoint to ./picard-smol-ft/checkpoint-500
[INFO|configuration_utils.py:472] 2024-11-15 19:07:03,096 >> Configuration saved in ./picard-smol-ft/checkpoint-500/config.json
[INFO|configuration_utils.py:807] 2024-11-15 19:07:03,097 >> Configuration saved in ./picard-smol-ft/checkpoint-500/generation_config.json
[INFO|modeling_utils.py:2799] 2024-11-15 19:07:04,431 >> Model weights saved in ./picard-smol-ft/checkpoint-500/model.safetensors
[INFO|tokenization_utils_base.py:2684] 2024-11-15 19:07:04,433 >> tokenizer config file saved in ./picard-smol-ft/checkpoint-500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2693] 2024-11-15 19:07:04,434 >> Special tokens file saved in ./picard-smol-ft/checkpoint-500/special_tokens_map.json
[INFO|tokenization_utils_base.py:2684] 2024-11-15 19:07:07,822 >> tokenizer config file saved in ./picard-smol-ft/tokenizer_config.json
[INFO|tokenization_utils_base.py:2693] 2024-11-15 19:07:

***** train metrics *****
  epoch                    =        5.0
  total_flos               = 76087062GF
  train_loss               =     0.6602
  train_runtime            = 2:09:16.34
  train_samples_per_second =      3.649
  train_steps_per_second   =      0.456


# Evaluation and saving the model

In [None]:
# Load the model from the checkpoint

# Pick the "checkpoint-<step>" directory with the highest step number.
# Sorting every entry of the output directory by name is not reliable: the
# directory also contains non-checkpoint files (e.g. tokenizer_config.json),
# and lexicographic order ranks "checkpoint-500" after "checkpoint-3500".
checkpoint_dirs = [
    entry
    for entry in os.listdir(train_conf.output_dir)
    if entry.startswith("checkpoint-")
    and entry.rsplit("-", 1)[-1].isdigit()
    and os.path.isdir(os.path.join(train_conf.output_dir, entry))
]
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoint-<step> directory found in {train_conf.output_dir}"
    )
checkpoint_path = max(checkpoint_dirs, key=lambda entry: int(entry.rsplit("-", 1)[-1]))
checkpoint_path = os.path.join(train_conf.output_dir, checkpoint_path)
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [10]:
# Greedy generation from the loaded model, this time with the short
# "You are Pi-Card." system turn included in the prompt.
model.eval();
prompt = """I have 45 pills. Sofie dose is 1 pill in morning and half pill at night. How long will this last"""
prompt = f"<|im_start|>system\nYou are Pi-Card.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
output = model.generate(input_ids, max_new_tokens=256,  do_sample=False, pad_token_id=tokenizer.eos_token_id)
# decode() takes no pad_token_id argument, so none is passed; special tokens
# are kept so the turn delimiters remain visible.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep the first three "<|im_end|>"-terminated segments (system, user and the
# first assistant turn) and drop anything generated after them.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:3]) + "<|im_end|>"
print(formatted_output_text)

<|im_start|>system
You are Pi-Card.<|im_end|>
<|im_start|>user
I have 45 pills. Sofie dose is 1 pill in morning and half pill at night. How long will this last<|im_end|>
<|im_start|>assistant
To find the number of days the medication will last, we need to divide the total number of pills by the dose of each pill. Since there are 32 pills in a box for Sofie, and each box contains 4 pills, the medication will last for 12 days if taken as directed.<|im_end|>


# Saving to gguf
Reference: https://github.com/ggerganov/llama.cpp/discussions/2948




In [None]:
# Start by downloading llama-cpp if not already done

#!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r llama.cpp/requirements.txt

In [None]:
# Create gguf file

# Runs llama.cpp's convert_hf_to_gguf.py on a Hugging Face checkpoint directory and
# writes a single .gguf file (--outfile) using the f16 output type (--outtype).
# Please note you'll need to update the checkpoint path and model names to the one you want to convert & save
# NOTE(review): the checkpoint path and output name below do not match this notebook's
# output_dir ("./picard-smol-ft") — presumably left over from another run; confirm and
# replace them before running.
!python llama.cpp/convert_hf_to_gguf.py nature-buddy/checkpoint-2005 --outfile nature-buddy-0.135b-f16.gguf --outtype f16


The quantization output type is going to have an outsized impact on latency / performance.

While f16 is the default and good, it's worth noting the model was trained using bf16, a slightly different format, so that outtype may be worth testing.

Now that you have the gguf you can either work with that directly, or convert it to an ollama format, which can be easier to work with in some cases. 

For instructions on how to do this, please see the instructions in create ollama text file.