# Instruct tuning the model

This notebook draws heavily on a similar one written for the [phi3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/sample_finetune.py) model. 

The difference here is that this notebook focuses on a model's full fine-tuning process, going from a base model to a new instruction model, and it should work for almost any model on HuggingFace.

At the end of the notebook are the steps to save this as a gguf format which will allow for fast and easy inference.

In [1]:
import sys
import logging

import datasets
from datasets import load_dataset
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import os
import json
import wandb


In [2]:
logger = logging.getLogger(__name__)
# Start the Weights & Biases run; "report_to" below points the trainer at wandb.
wandb.init(project="smollm-ft")
###################
# Hyper-parameters
###################
# Keyword arguments unpacked into `TrainingArguments` below. Each key appears
# exactly once: a repeated key in a dict literal silently overrides the
# earlier value, which hides which setting is actually in effect.
training_config = {
    "do_eval": False,
    "learning_rate": 5.0e-04,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "log_level": "info",
    "logging_steps": 100,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 8,
    "max_steps": -1,
    "output_dir": "./nature-buddy",
    "overwrite_output_dir": True,
    "remove_unused_columns": True,
    "save_steps": 500,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "warmup_ratio": 0.05,
    "report_to": "wandb",
    "neftune_noise_alpha": 3,
    "push_to_hub": True,
}

train_conf = TrainingArguments(**training_config)



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnoahpunintended[0m ([33mfdlx[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
###############
# Setup logging
###############
# Route log records to stdout with a timestamped format so they appear in the
# cell output.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# Apply the log level reported by the training arguments to this notebook's
# logger and to the datasets / transformers library loggers.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Record the full training configuration in the log for later reference.
logger.info(f"Training/evaluation parameters {train_conf}")


2024-11-07 22:40:21 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_str

In [4]:


####################
# Base Model Loading
####################
# The weights come from the base (non-instruct) checkpoint.
checkpoint_path = "HuggingFaceTB/SmolLM2-135M"
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # only works on latest gpus, probably not worth it in most cases
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


###################
# Tokenizer Loading
###################

# The tokenizer is loaded from the Instruct checkpoint rather than the base
# one -- presumably so that its chat template is available for formatting the
# training conversations (TODO confirm).
checkpoint_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.padding_side = "right"
# Pad with "<|endoftext|>" (note this is specific to smollm) and keep the pad
# id in step with that token. Background on choosing a pad token:
# https://stackoverflow.com/questions/76446228/setting-padding-token-as-eos-token-when-using-datacollatorforlanguagemodeling-fr
tokenizer.pad_token = "<|endoftext|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)


config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

[INFO|configuration_utils.py:733] 2024-11-07 22:41:01,872 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/28e66ca6931668447a3bac213f23d990ad3b0e2b/config.json
[INFO|configuration_utils.py:800] 2024-11-07 22:41:01,875 >> Model config LlamaConfig {
  "_name_or_path": "HuggingFaceTB/SmolLM2-135M",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 576,
  "initializer_range": 0.041666666666666664,
  "intermediate_size": 1536,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 9,
  "num_hidden_layers": 30,
  "num_key_value_heads": 3,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

[INFO|modeling_utils.py:3678] 2024-11-07 22:41:27,852 >> loading weights file model.safetensors from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/28e66ca6931668447a3bac213f23d990ad3b0e2b/model.safetensors
[INFO|modeling_utils.py:1606] 2024-11-07 22:41:27,870 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-11-07 22:41:27,872 >> Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "use_cache": false
}

[INFO|modeling_utils.py:4507] 2024-11-07 22:41:28,342 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4515] 2024-11-07 22:41:28,343 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

[INFO|configuration_utils.py:993] 2024-11-07 22:41:28,411 >> loading configuration file generation_config.json from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/28e66ca6931668447a3bac213f23d990ad3b0e2b/generation_config.json
[INFO|configuration_utils.py:1038] 2024-11-07 22:41:28,411 >> Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0
}

[INFO|tokenization_utils_base.py:2269] 2024-11-07 22:41:28,758 >> loading file vocab.json from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/5a33ba103645800d7b3790c4448546c1b73efc71/vocab.json
[INFO|tokenization_utils_base.py:2269] 2024-11-07 22:41:28,758 >> loading file merges.txt from cache at /home/zeus/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/5a33ba103645800d7b3790c4448546c1b73efc71/merges.txt
[INFO|tokenization_utils_base.py:2269] 2024-11-07 22:41:28,759 >> loading file tokenizer.json from c

In [5]:
# Display the pad and EOS token ids as a tuple to check the tokenizer setup.
tokenizer.pad_token_id, tokenizer.eos_token_id

(0, 2)

### Setting up the fine-tune 

Now that the synthetic dataset is made, the next step is to ensure the model is capable of answering the way we expect, without a large system prompt impacting latency. 

The solution to this is to open up the dataset, replace the system prompt with something much simpler, and start training with that.

In [31]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one conversation into a single training string.

    Args:
        example: A dataset row (mapping) with a "messages" entry holding the
            conversation to render.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``
            that returns the rendered conversation as a string.

    Returns:
        The same ``example`` object, with the rendered conversation stored
        under "text" (leading and trailing newlines stripped).

    Raises:
        KeyError: If ``example`` has no "messages" entry.
        Exception: Any error raised while rendering is re-raised after the
            offending messages have been printed.
    """
    # Looked up outside the try block so the handler below can always print it;
    # a missing key surfaces as a plain KeyError.
    messages = example["messages"]

    try:
        rendered = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
    except Exception:
        # Show the conversation that could not be rendered, then fail loudly
        # instead of falling through and returning None for this row.
        print(messages)
        raise

    example["text"] = rendered.strip("\n")
    return example

# First data source: the nature_buddy_sft dataset.
raw_dataset = load_dataset("nkasmanoff/nature_buddy_sft") 


# Second data source: the everyday-conversations dataset.
# NOTE(review): `raw_datset2` is misspelled ("datset"); the same spelling is
# used where it is read below, so the code is consistent as written.
raw_datset2 = load_dataset("HuggingFaceTB/everyday-conversations-llama3.1-2k")
train_dataset = raw_dataset['train']
# Drop any rows from train_dataset where the messages are empty.
# This filter runs before the concatenation below, so it only applies to the
# first source.
train_dataset = train_dataset.filter(lambda x: len(x['messages']) > 0)




# The second source uses a 'train_sft' split; append it to the filtered rows.
train_dataset2 = raw_datset2['train_sft']
train_dataset = datasets.concatenate_datasets([train_dataset, train_dataset2])
# NOTE(review): `column_names` is not read anywhere else in the visible cells.
column_names = list(train_dataset.features)

# Add a "text" column to every row by rendering its messages with the
# tokenizer's chat template, using 10 worker processes.
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    desc="Applying chat template to train_sft",
)

# Shuffle the dataset with a fixed seed so the two sources are interleaved
# in a repeatable order.
processed_train_dataset = processed_train_dataset.shuffle(seed=42)

Overwrite dataset info from restored data version if exists.


2024-11-07 22:49:48 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b


2024-11-07 22:49:48 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b


Found cached dataset nature_buddy_sft (/home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b)


2024-11-07 22:49:48 - INFO - datasets.builder - Found cached dataset nature_buddy_sft (/home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b)


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b


2024-11-07 22:49:48 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b


Overwrite dataset info from restored data version if exists.


2024-11-07 22:49:49 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea


2024-11-07 22:49:49 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea


Found cached dataset everyday-conversations-llama3.1-2k (/home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea)


2024-11-07 22:49:49 - INFO - datasets.builder - Found cached dataset everyday-conversations-llama3.1-2k (/home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea)


Loading Dataset info from /home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea


2024-11-07 22:49:49 - INFO - datasets.info - Loading Dataset info from /home/zeus/.cache/huggingface/datasets/HuggingFaceTB___everyday-conversations-llama3.1-2k/default/0.0.0/451e129a6730488b7213951eb815af95f381eeea


Filter:   0%|          | 0/2821 [00:00<?, ? examples/s]

Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-415bb2ac2f21f986.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-415bb2ac2f21f986.arrow


Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


Process #0 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00000_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #0 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00000_of_00010.arrow


Process #1 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00001_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #1 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00001_of_00010.arrow


Process #2 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00002_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #2 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00002_of_00010.arrow


Process #3 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00003_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #3 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00003_of_00010.arrow


Process #4 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00004_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #4 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00004_of_00010.arrow


Process #5 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00005_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #5 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00005_of_00010.arrow


Process #6 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00006_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #6 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00006_of_00010.arrow


Process #7 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00007_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #7 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00007_of_00010.arrow


Process #8 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00008_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #8 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00008_of_00010.arrow


Process #9 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00009_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Process #9 will write at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00009_of_00010.arrow


Spawning 10 processes


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/5078 [00:00<?, ? examples/s]

Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00000_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00000_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00001_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00001_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00002_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00002_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00003_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00003_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00004_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00004_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00005_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00005_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00006_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00006_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00007_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00007_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00008_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00008_of_00010.arrow


Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00009_of_00010.arrow


2024-11-07 22:49:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-7e9bd9a539945fed_00009_of_00010.arrow


Concatenating 10 shards


2024-11-07 22:49:50 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Caching indices mapping at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-5b81e0e5fc7b6cef.arrow


2024-11-07 22:49:50 - INFO - datasets.arrow_dataset - Caching indices mapping at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-5b81e0e5fc7b6cef.arrow


In [32]:
# Number of rows in the combined, processed training set.
len(processed_train_dataset )

5078

In [33]:
# Sanity check before training: greedy-decode one reply to a sample question.
model.eval();
# Other questions to try: "What is the oort cloud?"
user_message = "I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?"
# Hand-built chat prompt with a single user turn, left open at the assistant turn.
# Variant with a system turn:
#prompt = f"<|im_start|>system\nYou are Pi-Card, the Raspberry Pi voice assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
prompt = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
input_ids = input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=256,  do_sample=False, pad_token_id=tokenizer.eos_token_id)
# `pad_token_id` is a generate() option, not a decode() one, so it is not
# passed here; special tokens are kept so the turn markers stay visible.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep only the user turn and the first assistant turn, then close that turn.
# NOTE(review): generation is not told to stop at "<|im_end|>", so it
# presumably runs on to max_new_tokens -- hence the trimming here.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:2]) + "<|im_end|>"
print(formatted_output_text)

<|im_start|>user
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?<|im_end|>
<|im_start|>assistant
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survival trip and I'm not sure how to find food. Can you help me?
I'm going on a wilderness survi

In [34]:
###########
# Training
###########

# Switch back to training mode; the sanity-check cell above put the model in
# eval mode.
model.train();
# NOTE(review): the recorded output of this cell warns that passing these
# arguments straight to SFTTrainer is deprecated in favour of SFTConfig.
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    train_dataset=processed_train_dataset,
    max_seq_length=2048,
    # "text" is the column written by apply_chat_template during the map step.
    dataset_text_field="text",
    tokenizer=tokenizer,
    #packing=True,
)
train_result = trainer.train()
# Log and persist the training metrics and trainer state, then upload to the Hub.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
trainer.push_to_hub()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2100] 2024-11-07 22:51:18,168 >> PyTorch: setting up devices


Map:   0%|          | 0/5078 [00:00<?, ? examples/s]

Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-73f8361304d3b2ba.arrow


2024-11-07 22:51:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/zeus/.cache/huggingface/datasets/nkasmanoff___nature_buddy_sft/default/0.0.0/d5d11b39304ec8b237019b762793d8d8c61cbf9b/cache-73f8361304d3b2ba.arrow


[INFO|trainer.py:2134] 2024-11-07 22:51:20,860 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-11-07 22:51:20,861 >>   Num examples = 5,078
[INFO|trainer.py:2136] 2024-11-07 22:51:20,861 >>   Num Epochs = 8
[INFO|trainer.py:2137] 2024-11-07 22:51:20,862 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2140] 2024-11-07 22:51:20,862 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:2141] 2024-11-07 22:51:20,863 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2142] 2024-11-07 22:51:20,863 >>   Total optimization steps = 5,080
[INFO|trainer.py:2143] 2024-11-07 22:51:20,864 >>   Number of trainable parameters = 134,515,008
[INFO|integration_utils.py:807] 2024-11-07 22:51:20,865 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
100,1.789
200,1.0453
300,1.0088
400,0.9274
500,0.9118
600,0.8458
700,0.6983
800,0.6493
900,0.6411
1000,0.6432


[INFO|trainer.py:3503] 2024-11-07 22:53:29,201 >> Saving model checkpoint to ./nature-buddy/checkpoint-500
[INFO|configuration_utils.py:472] 2024-11-07 22:53:29,203 >> Configuration saved in ./nature-buddy/checkpoint-500/config.json
[INFO|configuration_utils.py:807] 2024-11-07 22:53:29,204 >> Configuration saved in ./nature-buddy/checkpoint-500/generation_config.json
[INFO|modeling_utils.py:2799] 2024-11-07 22:53:29,693 >> Model weights saved in ./nature-buddy/checkpoint-500/model.safetensors
[INFO|tokenization_utils_base.py:2684] 2024-11-07 22:53:29,695 >> tokenizer config file saved in ./nature-buddy/checkpoint-500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2693] 2024-11-07 22:53:29,696 >> Special tokens file saved in ./nature-buddy/checkpoint-500/special_tokens_map.json
[INFO|tokenization_utils_base.py:2684] 2024-11-07 22:53:30,795 >> tokenizer config file saved in ./nature-buddy/tokenizer_config.json
[INFO|tokenization_utils_base.py:2693] 2024-11-07 22:53:30,796 >> Spec

KeyboardInterrupt: 

# Evaluation and saving the model

In [None]:
# Load the model from the checkpoint

# Find the checkpoint-<step> directory with the highest step number inside the
# output directory. Sorting the raw directory listing is not reliable for
# this: names compare as strings (so "checkpoint-500" sorts after
# "checkpoint-1000"), and the output directory also holds non-checkpoint
# entries such as the tokenizer files saved next to the checkpoints.
checkpoint_path = transformers.trainer_utils.get_last_checkpoint(train_conf.output_dir)
if checkpoint_path is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {train_conf.output_dir!r}; run training first."
    )
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [None]:
# Check the fine-tuned model: greedy-decode one reply to a sample question.
model.eval();
# Other questions to try: "What is the oort cloud?"
user_message = "What is the safest way to purify water in the wilderness?"
# Hand-built chat prompt with a single user turn, left open at the assistant turn.
# Variant with a system turn:
#prompt = f"<|im_start|>system\nYou are Pi-Card, the Raspberry Pi voice assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
prompt = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
input_ids = input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=256,  do_sample=False, pad_token_id=tokenizer.eos_token_id)
# `pad_token_id` is a generate() option, not a decode() one, so it is not
# passed here; special tokens are kept so the turn markers stay visible.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep only the user turn and the first assistant turn, then close that turn.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:2]) + "<|im_end|>"
print(formatted_output_text)

# Saving to gguf
Reference: https://github.com/ggerganov/llama.cpp/discussions/2948




In [None]:
# Start by downloading llama-cpp if not already done

# The clone is commented out; uncomment it on a first run so that the
# llama.cpp/ directory (and its requirements.txt) exists.
#!git clone https://github.com/ggerganov/llama.cpp.git
# Install the Python requirements listed by the llama.cpp checkout.
!pip install -r llama.cpp/requirements.txt

In [None]:
# Create gguf file

# Please note you'll need to update the checkpoint path and model names to the one you want to convert & save
# The checkpoint directory (nature-buddy/checkpoint-2005) is hard-coded and
# must exist on disk; --outfile is the gguf file to write and --outtype is
# set to f16 here.
!python llama.cpp/convert_hf_to_gguf.py nature-buddy/checkpoint-2005 --outfile nature-buddy-0.135b-f16.gguf --outtype f16


The quantization output type is going to have an outsized impact on latency / performance. 

While f16 is the default and good, it's worth noting the model was trained using bf16, a slightly different format, so the `bf16` outtype may be worth testing.

Now that you have the gguf you can either work with that directly, or convert it to an ollama format, which can be easier to work with in some cases. 

For instructions on how to do this, please see the instructions in create ollama text file.