In [1]:
import ollama

# Unload Model if it's active
ollama.generate(model='TRACHI', keep_alive=0)

{'model': 'TRACHI',
 'created_at': '2024-04-29T19:34:49.369966186Z',
 'response': '',
 'done': True}

In [2]:
# Fine-Tune
from unsloth import FastLanguageModel
import wandb
import torch
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    #MistralForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig,
)
from peft import LoraConfig, PeftModel
from datasets import load_dataset
import ollama

# Unload Model if it's active
ollama.generate(model='TRACHI', keep_alive=0)


# The model that you want to train from the Hugging Face hub
model_id = "cognitivecomputations/dolphin-2.8-mistral-7b-v02"

# The instruction dataset to use
dataset_name = "norygano/TRACHI"

# Fine-tuned model name
new_model = "dolphin-mistral-TRACHI-7b-v02"

# Constants
model_name = model_id.split('/')[-1]

# Sequence Length
max_seq_length = 1024

# Load your dataset
dataset = load_dataset(dataset_name)

# bnb
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype = "bfloat16",
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
  print('No Padding Token set by tokenizer. Setting padding token to eos_token.')
  tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

dataset = load_dataset(dataset_name)

# Function to apply chat template to each entry in the dataset
def apply_chat_template(batch):
    # Apply the chat template with `add_generation_prompt=False`
    # Adjust the following line if your data structure is different
    formatted_chats = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in batch['chat']]
    return {'formatted_chat': formatted_chats}

# Applying chat template to the dataset
dataset = dataset.map(apply_chat_template, batched=True)

# Print some entries after applying chat template
'''
print("Sample entries after applying chat template:")
for i in range(3):  # Print the first 3 entries
    print(dataset['train'][i]['formatted_chat'])
'''

# Tokenize the formatted chats
def tokenize_function(batch):
    # Ensure this line correctly handles your data's structure
    return tokenizer(batch['formatted_chat'], padding=True, truncation=True, max_length=max_seq_length)

# Applying tokenization
dataset = dataset.map(tokenize_function, batched=True)

# Load model + tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
    device_map={"": 0}
)

model.use_flash_attention=True

model.config.use_cache = False
model.config.pretraining_tp = 1
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, return_tensors="pt", mlm=False)

# Do model patching and add fast LoRA weights
peft_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    task_type="CAUSAL_LM"
)

args = TrainingArguments(
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 8,
    warmup_steps=10,
    max_steps = -1,
    lr_scheduler_type='cosine',
    learning_rate=3e-4,
    num_train_epochs=2,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_strategy="steps",
    logging_steps = 1,
    output_dir = "outputs",
    evaluation_strategy='epoch',
    optim = "adamw_8bit",
    tf32=True,
    neftune_noise_alpha=5,
    seed = 3407,
    save_strategy="epoch",
    report_to="wandb"
)

trainer = SFTTrainer(
    model = model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field = "formatted_chat",
    peft_config=peft_config,
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = args
    )

# Setup Wandb
wandb.init(project='TRACHI_Llama', entity='norygano', config=args.to_dict())

trainer.train()

# Finish Wandb session
wandb.finish()

# Save trained model
trainer.model.save_pretrained(new_model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/129 [00:00<?, ? examples/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


TypeError: transformers.models.auto.auto_factory._BaseAutoModelClass.from_pretrained() got multiple values for keyword argument 'quantization_config'

In [1]:
# DEBUG: Reload deps and vars
from unsloth import FastLanguageModel
import wandb
import torch
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    #MistralForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig,
)
from peft import LoraConfig, PeftModel
from datasets import load_dataset
import ollama

# Unload Model if it's active
ollama.generate(model='TRACHI', keep_alive=0)


# The model that you want to train from the Hugging Face hub
model_id = "cognitivecomputations/dolphin-2.9-llama3-8b"

# The instruction dataset to use
dataset_name = "norygano/TRACHI"

# Fine-tuned model name
new_model = "dolphin-llama-3-TRACHI-8b"

# Constants
model_name = model_id.split('/')[-1]

In [2]:
# Reload (FP16) -> merge w/ LoRA weights
from datetime import datetime


# Cleanup #CUDA-OOM
if 'model' in locals():
  del model
if 'pipe' in locals():
  del pipe
if 'trainer' in locals():
  del trainer
import gc
gc.collect()

# The model that you want to train from the Hugging Face hub
#model_id = "cognitivecomputations/dolphin-2.2.1-mistral-7b"

# The instruction dataset to use
#dataset_name = "norygano/TRACHI"

# Fine-tuned model name
#new_model = "dolphin-mistral-TRACHI-7b"

# Reload the base model in bf16
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=False,
    return_dict=True,
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.resize_token_embeddings(len(tokenizer))
print(len(tokenizer))

# Assuming PeftModel is a custom or previously defined model class for handling post-training operations
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer consistent with the first step and apply the same configurations
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The following lines were commented out in the first step but included here for consistency
# Uncomment and adjust if necessary based on your specific requirements
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))

# This setting was not changed in the first step, but keep it if needed for your use case
tokenizer.padding_side = "right"

# Get the current date and time
now = datetime.now()

# Format the date and time as a strin
formatted_now = now.strftime("%Y-%m-%d %H:%M:%S")

# Print the formatted date and time
print("Current Date and Time:", formatted_now)

model.save_pretrained(new_model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


128258


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Current Date and Time: 2024-05-01 00:03:07


In [5]:
# Quantize
import os
QUANTIZATION_METHODS = ["q8_0"]

# Convert to fp16
fp16 = f"{new_model}.fp16.bin"
model_path = os.path.join(new_model, fp16)
print(model_path)
!python llama.cpp/convert.py {new_model} --outtype f16 --outfile {model_path} --vocab-type bpe
# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{new_model}/{new_model}.{method.upper()}.gguf"
    !llama.cpp/quantize {model_path} {qtype} {method}

dolphin-llama-3-TRACHI-8b/dolphin-llama-3-TRACHI-8b.fp16.bin
Loading model file dolphin-llama-3-TRACHI-8b/model-00001-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00001-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00002-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00003-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00004-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00005-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00006-of-00007.safetensors
Loading model file dolphin-llama-3-TRACHI-8b/model-00007-of-00007.safetensors
params = Params(n_vocab=128258, n_embd=4096, n_layer=32, n_ctx=8192, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=500000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=P

In [8]:
# Update Modelfile
import ollama
import os

path = os.path.join(os.getcwd(), 'modelfiles', 'Modelfile_TRACHI_D2')
print(path)
ollama.create(model='TRACHI', path=path)

/home/nory/projects/discollama/modelfiles/Modelfile_TRACHI_D2


{'status': 'success'}

In [5]:
# Push -> HF
from huggingface_hub import create_repo, HfApi
api = HfApi()

#api.create_repo(f'{new_model}-GGUF')

# Upload gguf files
api.upload_folder(
    folder_path=new_model,
    repo_id=f"norygano/{new_model}-GGUF",
    allow_patterns=f"*.gguf",
    repo_type="model",
)

Llama-3-TRACHI-8B-Instruct.Q5_K_M.gguf:   0%|          | 0.00/5.73G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/norygano/Llama-3-TRACHI-8B-Instruct-GGUF/commit/8275f7d8ea6316b2a61680716ce69133607bd5a9', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8275f7d8ea6316b2a61680716ce69133607bd5a9', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload (FP16) -> merge w/ LoRA weights
from datetime import datetime
from peft import LoraConfig, PeftModel
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    #MistralForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig,
)

device_map = {"": 0}

# Cleanup #CUDA-OOM
if 'model' in locals():
  del model
if 'pipe' in locals():
  del pipe
if 'trainer' in locals():
  del trainer
import gc
gc.collect()

# Reload the base model in bf16
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.resize_token_embeddings(len(tokenizer))
print(len(tokenizer))

# Assuming PeftModel is a custom or previously defined model class for handling post-training operations
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer consistent with the first step and apply the same configurations
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The following lines were commented out in the first step but included here for consistency
# Uncomment and adjust if necessary based on your specific requirements
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))

# This setting was not changed in the first step, but keep it if needed for your use case
tokenizer.padding_side = "right"

# Get the current date and time
now = datetime.now()

# Format the date and time as a string
formatted_now = now.strftime("%Y-%m-%d %H:%M:%S")

# Print the formatted date and time
print("Current Date and Time:", formatted_now)

model.save_pretrained(new_model)

In [None]:
# Cleanup #CUDA-OOM
if 'model' in locals():
  del model

# Quantize
import os
QUANTIZATION_METHODS = ["q5_k_m"]
new_model = "Llama-3-TRACHI-8B-Instruct"

# Convert to fp16
fp16 = f"{new_model}.fp16.bin"
model_path = os.path.join(new_model, fp16)
print(model_path)
!python llama.cpp/convert.py {new_model} --outtype f16 --outfile {model_path} --pad-vocab --vocab-type bpe

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{new_model}/{new_model}.{method.upper()}.gguf"
    !llama.cpp/quantize {model_path} {qtype} {method}

In [None]:
# TODO: Implement batch training
learning_rates = [4e-4, 4.5e-4, 5e-4]

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

for lr in learning_rates:

 # Load base model
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      attn_implementation="flash_attention_2",
      quantization_config=bnb_config,
      device_map=device_map
  )


  # Initialize data collator
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, return_tensors="pt", mlm=False)
  
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      num_train_epochs=num_train_epochs,
      per_device_train_batch_size=per_device_train_batch_size,
      gradient_accumulation_steps=gradient_accumulation_steps,
      evaluation_strategy='steps',
      optim=optim,
      save_steps=save_steps,
      logging_steps=logging_steps,
      learning_rate=lr,
      weight_decay=weight_decay,
      fp16=fp16,
      bf16=bf16,
      max_grad_norm=max_grad_norm,
      max_steps=max_steps,
      warmup_ratio=warmup_ratio,
      group_by_length=group_by_length,
      lr_scheduler_type=lr_scheduler_type,
      tf32=True,
      neftune_noise_alpha=5,
      report_to="wandb"
  )

  # Set supervised fine-tuning parameters
  trainer = SFTTrainer(
      model=model,
      train_dataset=dataset['train'],
      eval_dataset=dataset['test'],
      data_collator=data_collator,
      peft_config=peft_config,
      dataset_text_field="formatted_chat",
      max_seq_length=max_seq_length,
      tokenizer=tokenizer,
      args=training_arguments,
      packing=packing,
  )

  # Setup Wandb
  wandb.init(project='TRACHI_Llama', entity='norygano', config=training_arguments.to_dict())

  # Start training and let SFTTrainer handle evaluation
  trainer.train()

  # Finish Wandb session
  wandb.finish()

  # Save trained model
  trainer.model.save_pretrained(new_model)

  # Cleanup
  del model, trainer
  torch.cuda.empty_cache()
  gc.collect()



In [None]:
# TODO: Use unsloth Save when tokenizer is fixed
model.save_pretrained_gguf("Llama-3-TRACHI-8B-Instruct", tokenizer, quantization_method = "q4_k_m")