In [None]:
import json
import logging
import os
import sys
from ast import literal_eval

import datasets
import torch
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer



In [None]:
# Notebook-wide logger; its level is set once the training arguments exist.
logger = logging.getLogger(__name__)
os.environ["WANDB_PROJECT"] = "llm-ft"  # name your W&B project

###################
# Hyper-parameters
###################
# Every entry is forwarded unchanged as a keyword argument to TrainingArguments.
training_config = dict(
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=20,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=5,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    remove_unused_columns=True,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
    report_to="wandb",
)

train_conf = TrainingArguments(**training_config)



In [None]:


###############
# Setup logging
###############
# Root logger: timestamped records written to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Use the level derived from the training arguments for this notebook's logger
# and for the datasets / transformers library loggers.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
for hf_logging in (datasets.utils.logging, transformers.utils.logging):
    hf_logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
# PEFT is not configured in this notebook, so there is no PEFT config to log:
#logger.info(f"PEFT parameters {peft_conf}")


In [None]:


################
# Module Loading
################
# Hub identifier of the base model to fine-tune.
checkpoint_path = "Qwen/Qwen2-0.5B-Instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

# Keyword arguments passed straight through to from_pretrained.
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # load the model with flash-attention support
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [None]:
# Tokenizer for the same checkpoint as the model; sequences are capped at 1024 tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 1024

# Use unk rather than eos as the padding token to prevent endless generation:
# collators that exclude pad positions from the loss (e.g. DataCollatorForLanguageModeling)
# would also exclude every EOS label when pad == eos, so the model never learns to stop.
# https://stackoverflow.com/questions/76446228/setting-padding-token-as-eos-token-when-using-datacollatorforlanguagemodeling-fr
# A pad token that the checkpoint already ships, and that differs from EOS, is kept as is.
if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    # Fall back to EOS only when the tokenizer has no unk token to use instead.
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

In [None]:
# Tally all model parameters and, separately, the ones that require gradients.
num_params = 0
num_trainable_params = 0
for param in model.parameters():
    param_size = param.numel()
    num_params += param_size
    if param.requires_grad:
        num_trainable_params += param_size

logger.info(f"Number of parameters: {num_params}")
logger.info(f"Number of trainable parameters: {num_trainable_params}")


In [None]:

# Load the conversations had with pi-card thus far.
# Each file in ../storage is read as text and parsed with ast.literal_eval (which only
# accepts Python literals); the parsed value becomes the "messages" entry of one record.
# sorted() keeps the record order reproducible, since os.listdir returns entries in
# arbitrary order.
conversations = sorted(os.listdir('../storage'))

assistant_messages = []
for conversation_name in conversations:
    conversation_path = os.path.join('../storage', conversation_name)
    if not os.path.isfile(conversation_path):
        # Skip sub-directories and other non-file entries instead of failing in open().
        continue
    # Context manager so the file handle is closed right after reading.
    with open(conversation_path) as conversation_file:
        conversation = conversation_file.read()

    # Newline-plus-space sequences are removed before parsing, exactly as before.
    assistant_messages.append({"messages": literal_eval(conversation.replace('\n ', ''))})


# Persist the records as JSON so they can be loaded with datasets.load_dataset('json', ...).
with open('assistant_messages.json', 'w') as f:
    json.dump(assistant_messages, f, indent=4)

In [None]:

##################
# Data Processing
##################
def apply_chat_template(example, tokenizer):
    """Render a record's chat messages to a single training string.

    Args:
        example: mapping with a "messages" entry; it is updated in place.
        tokenizer: object exposing ``apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)``.

    Returns:
        The same ``example`` object, with the rendered string stored under "text".
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,  # keep the result as text
        add_generation_prompt=False,
    )
    example["text"] = rendered
    return example

# Read the exported conversations back in as the 'train' split of a JSON dataset.
# (An alternative source tried earlier: load_dataset("vicgalle/alpaca-gpt4", trust_remote_code=True))
raw_dataset = load_dataset(
    'json',
    data_files='assistant_messages.json',
    split='train',
)

train_dataset = raw_dataset


def create_messages(example):
    """Turn an instruction/input/output record into a three-turn chat.

    Args:
        example: mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        ``{'messages': [...]}`` holding a fixed system turn, a user turn (the
        instruction, followed by a space and the input when the input is
        non-empty) and an assistant turn carrying the output.
    """
    has_extra_input = len(example['input']) > 0
    if has_extra_input:
        user_content = example['instruction'] + ' ' + example['input']
    else:
        user_content = example['instruction']

    return {
        'messages': [
            {'content': 'You are a helpful AI assistant', 'role': 'system'},
            {'content': user_content, 'role': 'user'},
            {'content': example['output'], 'role': 'assistant'},
        ]
    }


# Only needed for instruction/input/output style datasets:
#train_dataset = train_dataset.map(create_messages)
column_names = list(train_dataset.features)

# Render each conversation to a "text" column. The source columns are dropped
# (remove_columns) so the result has the same schema as the ultrachat splits
# processed below, which this dataset is concatenated with later.
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)




# Load an additional chat dataset to (hopefully) help with performance

raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k", trust_remote_code=True)
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
column_names = list(train_dataset.features)

# Options common to both splits: pass the tokenizer to apply_chat_template, use
# 10 worker processes, and drop the original columns so only "text" remains.
_ultrachat_map_options = {
    "fn_kwargs": {"tokenizer": tokenizer},
    "num_proc": 10,
    "remove_columns": column_names,
}

processed_train_dataset_ultrachat = train_dataset.map(
    apply_chat_template,
    desc="Applying chat template to train_sft",
    **_ultrachat_map_options,
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    desc="Applying chat template to test_sft",
    **_ultrachat_map_options,
)



In [None]:
# The ultrachat splits are huge: keep the first 15000 training rows and the
# first 5000 test rows.
processed_train_dataset_ultrachat = processed_train_dataset_ultrachat.select(range(0, 15000))
processed_test_dataset = processed_test_dataset.select(range(0, 5000))

# Training set = ultrachat subset followed by the pi-card conversations.
# NOTE: this rebinds processed_train_dataset from its own previous value, so
# running the cell a second time appends to the already-combined dataset.
processed_train_dataset = datasets.concatenate_datasets(
    [
        processed_train_dataset_ultrachat,
        processed_train_dataset,
    ]
)

In [None]:

###########
# Training
###########
# Full fine-tuning on the "text" column (the PEFT and packing options stay disabled).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_conf,
    # peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    # packing=True,
)

train_result = trainer.train()

# Log the training metrics, write them to disk, then save the trainer state.
metrics = train_result.metrics
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("train", metrics)
trainer.save_state()


In [None]:
# test model
model.eval();

# Build the prompt with the tokenizer's chat template instead of a hand-written string.
# add_generation_prompt=True appends the assistant turn header, which the previous
# hand-written prompt lacked (it ended right after the user turn), so generation now
# starts an assistant reply. It also keeps the format in sync with the training text.
messages = [
    {
        "role": "system",
        "content": "You are PiCard, a Raspbery Pi Voice Assistant. Answer questions truthfully in only a sentence.",
    },
    {"role": "user", "content": "Tell me a fun story about Jupiter."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
# Move the encoded prompt to the device the model lives on.
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# Greedy decoding; max_length bounds prompt plus generated tokens.
outputs = model.generate(**inputs, max_length=512, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
#############
# Evaluation
#############
tokenizer.padding_side = 'right'

# Evaluate on the trainer's eval dataset and record how many samples were used.
metrics = trainer.evaluate()
metrics.update(eval_samples=len(processed_test_dataset))
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)


############
# Save model
############
# Write the final model into the training output directory.
trainer.save_model(train_conf.output_dir)

In [None]:
# Save to gguf format
#https://github.com/ggerganov/llama.cpp/discussions/2948
!python llama.cpp/convert-hf-to-gguf-update.py checkpoint_dir/checkpoint-18800 \
  --outfile model_v2.gguf \
  --outtype q8_0




In [None]:
# Create modelfile for ollama