In [1]:
# Fine-Tune (Unsloth)
from unsloth import FastLanguageModel
import wandb
import torch
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset
from transformers import TrainingArguments
from datasets import load_dataset
import ollama

# Unload Model if it's active
ollama.generate(model='TRACHI', keep_alive=0)

# The model that you want to train from the Hugging Face hub
model_id = "unsloth/llama-3-8b-Instruct-bnb-4bit"

# The instruction dataset to use
dataset_name = "norygano/TRACHI"

# Fine-tuned model name
new_model = "Llama-3-TRACHI-8B-Instruct"

# Constants
model_name = model_id.split('/')[-1]

max_seq_length = 4096

# Load your dataset
raw_dataset = load_dataset("norygano/TRACHI")

# Load model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Function to apply chat template to each entry in the dataset
def apply_chat_template(batch):
    formatted_chats = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in batch['chat']]
    return {'formatted_chat': formatted_chats}

# Applying chat template to the dataset
formatted_dataset = raw_dataset.map(apply_chat_template, batched=True)

# Creating ConstantLengthDataset
train_dataset = ConstantLengthDataset(
    tokenizer=tokenizer,
    dataset=formatted_dataset['train'],
    dataset_text_field='formatted_chat',
    seq_length=max_seq_length,
    eos_token_id=tokenizer.eos_token_id
)

eval_dataset = ConstantLengthDataset(
    tokenizer=tokenizer,
    dataset=formatted_dataset['test'],
    dataset_text_field='formatted_chat',
    seq_length=max_seq_length,
    eos_token_id=tokenizer.eos_token_id
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.resize_token_embeddings(len(tokenizer))

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

args = TrainingArguments(
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = -1,
    lr_scheduler_type='cosine',
    learning_rate=4e-4,
    num_train_epochs=2,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 1,
    output_dir = "outputs",
    evaluation_strategy='steps',
    optim = "adamw_8bit",
    tf32=True,
    neftune_noise_alpha=5,
    seed = 3407,
)

trainer = SFTTrainer(
    model = model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field = "formatted_chat",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = args
    )

# Setup Wandb
wandb.init(project='TRACHI_Llama', entity='norygano', config=args.to_dict())

trainer.train()

# Finish Wandb session
wandb.finish()

# Save trained model
trainer.model.save_pretrained(new_model)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
[34m[1mwandb[0m: Currently logged in as: [33mnorygano[0m. Use [1m`wandb login --relogin`[0m to force relogin


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 493 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 62
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2918,3.48351
2,3.3067,3.153913


TypeError: unsupported operand type(s) for /: 'int' and 'NoneType'

In [4]:
model.save_pretrained_merged("Llama-3-TRACHI-8B-Instruct", save_method = "merged_16bit",)

Unsloth: You're not saving a tokenizer as well?
You can do it separately via `tokenizer.save_pretrained(...)`


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 35.13 out of 54.92 RAM for saving.


100%|██████████| 32/32 [00:01<00:00, 27.33it/s]



Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [5]:
# Cleanup #CUDA-OOM
if 'model' in locals():
  del model

# Quantize
import os
QUANTIZATION_METHODS = ["q5_k_m"]
new_model = "Llama-3-TRACHI-8B-Instruct"

# Convert to fp16
fp16 = f"{new_model}.fp16.bin"
model_path = os.path.join(new_model, fp16)
print(model_path)
!python llama.cpp/convert.py {new_model} --outtype f16 --outfile {model_path} --pad-vocab --vocab-type bpe

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{new_model}/{new_model}.{method.upper()}.gguf"
    !llama.cpp/quantize {model_path} {qtype} {method}

Llama-3-TRACHI-8B-Instruct/Llama-3-TRACHI-8B-Instruct.fp16.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading model file Llama-3-TRACHI-8B-Instruct/model-00001-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00001-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00002-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00003-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00004-of-00004.safetensors
params = Params(n_vocab=128256, n_embd=4096, n_layer=32, n_ctx=8192, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=500000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('Llama-3-TRACHI-8B-Instruct'))
Loaded vocab file PosixPath('Llama-3-TRACHI-8B-Instruct/tokenizer.json'), type 'bpe'
Vocab info: <BpeVocab with 128000 base tokens and 256 added tokens>
Special vocab info: <SpecialVocab with 280147 merges, special tokens {'bos': 128000, 'eos': 12800

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


main: build = 2736 (dba497e0)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing 'Llama-3-TRACHI-8B-Instruct/Llama-3-TRACHI-8B-Instruct.fp16.bin' to 'Llama-3-TRACHI-8B-Instruct/Llama-3-TRACHI-8B-Instruct.Q5_K_M.gguf' as Q5_K_M
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from Llama-3-TRACHI-8B-Instruct/Llama-3-TRACHI-8B-Instruct.fp16.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_leng

In [6]:
# Update Modelfile
import ollama
import os

path = os.path.join(os.getcwd(), 'modelfiles', 'Modelfile_TRACHI_L')
print(path)
ollama.create(model='TRACHI', path=path)

/home/nory/projects/discollama/modelfiles/Modelfile_TRACHI_L


{'status': 'success'}

In [None]:
# Push -> HF
from huggingface_hub import create_repo, HfApi
api = HfApi()

new_model = "Llama-3-TRACHI-8B-Instruct"
#api.create_repo(f'{new_model}-GGUF')

# Upload gguf files
api.upload_folder(
    folder_path=new_model,
    repo_id=f"norygano/{new_model}-GGUF",
    allow_patterns=f"*.gguf",
    repo_type="model",
)

In [None]:
# Reload (FP16) -> merge w/ LoRA weights
from datetime import datetime
from peft import LoraConfig, PeftModel
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    #MistralForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig,
)

device_map = {"": 0}

# Cleanup #CUDA-OOM
if 'model' in locals():
  del model
if 'pipe' in locals():
  del pipe
if 'trainer' in locals():
  del trainer
import gc
gc.collect()

# Reload the base model in bf16
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.resize_token_embeddings(len(tokenizer))
print(len(tokenizer))

# Assuming PeftModel is a custom or previously defined model class for handling post-training operations
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer consistent with the first step and apply the same configurations
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The following lines were commented out in the first step but included here for consistency
# Uncomment and adjust if necessary based on your specific requirements
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))

# This setting was not changed in the first step, but keep it if needed for your use case
tokenizer.padding_side = "right"

# Get the current date and time
now = datetime.now()

# Format the date and time as a string
formatted_now = now.strftime("%Y-%m-%d %H:%M:%S")

# Print the formatted date and time
print("Current Date and Time:", formatted_now)

model.save_pretrained(new_model)

In [None]:
# Cleanup #CUDA-OOM
if 'model' in locals():
  del model

# Quantize
import os
QUANTIZATION_METHODS = ["q5_k_m"]
new_model = "Llama-3-TRACHI-8B-Instruct"

# Convert to fp16
fp16 = f"{new_model}.fp16.bin"
model_path = os.path.join(new_model, fp16)
print(model_path)
!python llama.cpp/convert.py {new_model} --outtype f16 --outfile {model_path} --pad-vocab --vocab-type bpe

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{new_model}/{new_model}.{method.upper()}.gguf"
    !llama.cpp/quantize {model_path} {qtype} {method}

In [None]:
# TODO: Implement batch training
learning_rates = [4e-4, 4.5e-4, 5e-4]

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

for lr in learning_rates:

 # Load base model
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      attn_implementation="flash_attention_2",
      quantization_config=bnb_config,
      device_map=device_map
  )


  # Initialize data collator
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, return_tensors="pt", mlm=False)
  
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      num_train_epochs=num_train_epochs,
      per_device_train_batch_size=per_device_train_batch_size,
      gradient_accumulation_steps=gradient_accumulation_steps,
      evaluation_strategy='steps',
      optim=optim,
      save_steps=save_steps,
      logging_steps=logging_steps,
      learning_rate=lr,
      weight_decay=weight_decay,
      fp16=fp16,
      bf16=bf16,
      max_grad_norm=max_grad_norm,
      max_steps=max_steps,
      warmup_ratio=warmup_ratio,
      group_by_length=group_by_length,
      lr_scheduler_type=lr_scheduler_type,
      tf32=True,
      neftune_noise_alpha=5,
      report_to="wandb"
  )

  # Set supervised fine-tuning parameters
  trainer = SFTTrainer(
      model=model,
      train_dataset=dataset['train'],
      eval_dataset=dataset['test'],
      data_collator=data_collator,
      peft_config=peft_config,
      dataset_text_field="formatted_chat",
      max_seq_length=max_seq_length,
      tokenizer=tokenizer,
      args=training_arguments,
      packing=packing,
  )

  # Setup Wandb
  wandb.init(project='TRACHI_Llama', entity='norygano', config=training_arguments.to_dict())

  # Start training and let SFTTrainer handle evaluation
  trainer.train()

  # Finish Wandb session
  wandb.finish()

  # Save trained model
  trainer.model.save_pretrained(new_model)

  # Cleanup
  del model, trainer
  torch.cuda.empty_cache()
  gc.collect()



In [None]:
# TODO: Use unsloth Save when tokenizer is fixed
model.save_pretrained_gguf("Llama-3-TRACHI-8B-Instruct", tokenizer, quantization_method = "q4_k_m")