In [1]:
# Imports
import gc
import ollama
import torch
from torch.quantization import get_default_qat_qconfig, prepare_qat, convert
from datasets import load_dataset
import wandb
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    #MistralForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig,
)
import bitsandbytes
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from datasets import concatenate_datasets, load_dataset
from math import ceil

In [2]:
# Parameters

# The model that you want to train from the Hugging Face hub
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# The instruction dataset to use
dataset_name = "norygano/TRACHI"

# Fine-tuned model name
new_model = "Llama-3-TRACHI-8B-Instruct"

# Constants
model_name = model_id.split('/')[-1]

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.05

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 8

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 5e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.06

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 5

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 4096

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [3]:
# Fine-Tune

# Unload Model if it's active
ollama.generate(model='TRACHI', keep_alive=0)

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Assuming `model` is your model variable
if 'model' in locals():
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Load your dataset
torch.autograd.set_detect_anomaly(True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
#model.resize_token_embeddings(len(tokenizer))
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

dataset = load_dataset(dataset_name)

# Function to duplicate entries in the dataset
def duplicate_entries(dataset, duplication_factor):
    duplicated_datasets = [dataset for _ in range(duplication_factor)]
    concatenated_dataset = concatenate_datasets(duplicated_datasets)
    return concatenated_dataset.shuffle(seed=42)  # Shuffle to mix the entries

# Increase the weight of the dataset by duplicating its entries
#dataset = duplicate_entries(dataset, duplication_factor=1)

#model = AutoModelForCausalLM.from_pretrained(model_name)

# Function to apply chat template to each entry in the dataset
def apply_chat_template(batch):
    # Apply the chat template with `add_generation_prompt=False`
    # Adjust the following line if your data structure is different
    formatted_chats = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in batch['chat']]
    return {'formatted_chat': formatted_chats}

# Applying chat template to the dataset
dataset = dataset.map(apply_chat_template, batched=True)

# Tokenize the formatted chats
def tokenize_function(batch):
    # Ensure this line correctly handles your data's structure
    return tokenizer(batch['formatted_chat'], padding=True, truncation=True, max_length=max_seq_length)

# Applying tokenization
dataset = dataset.map(tokenize_function, batched=True)

learning_rates = [4e-4, 4.5e-4, 5e-4]

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

for lr in learning_rates:

 # Load base model
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      attn_implementation="flash_attention_2",
      quantization_config=bnb_config,
      device_map=device_map
  )
  model.config.use_cache = False
  model.config.pretraining_tp = 1
  model.resize_token_embeddings(len(tokenizer))

  # Initialize data collator
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, return_tensors="pt", mlm=False)
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      num_train_epochs=num_train_epochs,
      per_device_train_batch_size=per_device_train_batch_size,
      gradient_accumulation_steps=gradient_accumulation_steps,
      evaluation_strategy='steps',
      optim=optim,
      save_steps=save_steps,
      logging_steps=logging_steps,
      learning_rate=lr,
      weight_decay=weight_decay,
      fp16=fp16,
      bf16=bf16,
      max_grad_norm=max_grad_norm,
      max_steps=max_steps,
      warmup_ratio=warmup_ratio,
      group_by_length=group_by_length,
      lr_scheduler_type=lr_scheduler_type,
      tf32=True,
      neftune_noise_alpha=5,
      report_to="wandb"
  )

  # Set supervised fine-tuning parameters
  trainer = SFTTrainer(
      model=model,
      train_dataset=dataset['train'],
      eval_dataset=dataset['test'],
      data_collator=data_collator,
      peft_config=peft_config,
      dataset_text_field="formatted_chat",
      max_seq_length=max_seq_length,
      tokenizer=tokenizer,
      args=training_arguments,
      packing=packing,
  )

  # Setup Wandb
  wandb.init(project='TRACHI_Llama', entity='norygano', config=training_arguments.to_dict())

  # Start training and let SFTTrainer handle evaluation
  trainer.train()

  # Finish Wandb session
  wandb.finish()

  # Save trained model
  trainer.model.save_pretrained(new_model)

  # Cleanup
  del model, trainer
  torch.cuda.empty_cache()
  gc.collect()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.58k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/143 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mnorygano[0m. Use [1m`wandb login --relogin`[0m to force relogin


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
5,4.2085,2.551119
10,2.1066,1.733364
15,1.6815,1.589628
20,1.4402,1.498604
25,1.1339,1.425871
30,0.9908,1.405004
35,0.783,1.404863
40,0.5576,1.570406
45,0.5134,1.650714
50,0.3184,1.649087


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▃▂▂▁▁▁▂▃▂▃▃▃▃
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁▇████████████
eval/steps_per_second,▁▇████████████
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,█▂▁▂▅▂▃▂▅▃▂▂▁▁
train/learning_rate,███▇▇▆▅▄▃▃▂▂▁▁
train/loss,█▄▄▃▃▂▂▂▂▁▁▁▁▁

0,1
eval/loss,1.7561
eval/runtime,2.5529
eval/samples_per_second,15.277
eval/steps_per_second,1.959
total_flos,1.317639730987008e+16
train/epoch,8.0
train/global_step,72.0
train/grad_norm,0.76172
train/learning_rate,0.0
train/loss,0.1665


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss
5,4.1589,2.486262
10,2.0492,1.704173
15,1.6529,1.575486
20,1.4069,1.467543
25,1.0934,1.458351
30,0.9412,1.444205
35,0.7544,1.393081
40,0.5137,1.620442
45,0.4655,1.680198
50,0.2832,1.711103


VBox(children=(Label(value='0.003 MB of 0.017 MB uploaded\r'), FloatProgress(value=0.1583745622324754, max=1.0…

0,1
eval/loss,█▃▂▁▁▁▁▂▃▃▃▃▄▄
eval/runtime,█▃▁▄▄▂▂▃▄▂▂▃▂▄
eval/samples_per_second,▁▆█▅▄▇▇▆▅▇▇▆▇▅
eval/steps_per_second,▁▆█▅▄▇▇▆▅▇▇▆▇▅
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,█▁▁▂▄▂▂▂▃▂▁▂▁▁
train/learning_rate,███▇▇▆▅▄▃▃▂▂▁▁
train/loss,█▄▄▃▃▂▂▂▂▁▁▁▁▁

0,1
eval/loss,1.79526
eval/runtime,2.5295
eval/samples_per_second,15.418
eval/steps_per_second,1.977
total_flos,1.317639730987008e+16
train/epoch,8.0
train/global_step,72.0
train/grad_norm,0.66797
train/learning_rate,0.0
train/loss,0.1313


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss
5,4.1199,2.274294
10,1.9665,1.706309
15,1.625,1.551174
20,1.3446,1.430356
25,1.0655,1.419365
30,0.8591,1.488384
35,0.7155,1.409225
40,0.4714,1.684338
45,0.4072,1.687758
50,0.2385,1.765239


VBox(children=(Label(value='0.003 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.15377557078857884, max=1.…

0,1
eval/loss,█▃▂▁▁▂▁▃▃▄▄▄▄▄
eval/runtime,█▅▄▅▁▃▃▂▅▃▁▄▂▂
eval/samples_per_second,▁▄▅▄█▆▆▇▄▆█▅▇▇
eval/steps_per_second,▁▄▅▄█▆▆▇▄▆█▅▇▇
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▆▁▁▂▃▂▂▂▂▂▁█▁▁
train/learning_rate,███▇▇▆▅▄▃▃▂▂▁▁
train/loss,█▄▄▃▃▂▂▂▁▁▁▁▁▁

0,1
eval/loss,1.78836
eval/runtime,2.5112
eval/samples_per_second,15.53
eval/steps_per_second,1.991
total_flos,1.317639730987008e+16
train/epoch,8.0
train/global_step,72.0
train/grad_norm,0.50391
train/learning_rate,0.0
train/loss,0.1233


In [4]:
# Reload (FP16) -> merge w/ LoRA weights
from datetime import datetime

# Cleanup #CUDA-OOM
if 'model' in locals():
  del model
if 'pipe' in locals():
  del pipe
if 'trainer' in locals():
  del trainer
import gc
gc.collect()

# Reload the base model in bf16
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.resize_token_embeddings(len(tokenizer))
print(len(tokenizer))

# Assuming PeftModel is a custom or previously defined model class for handling post-training operations
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer consistent with the first step and apply the same configurations
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The following lines were commented out in the first step but included here for consistency
# Uncomment and adjust if necessary based on your specific requirements
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))

# This setting was not changed in the first step, but keep it if needed for your use case
tokenizer.padding_side = "right"


# Get the current date and time
now = datetime.now()

# Format the date and time as a string
formatted_now = now.strftime("%Y-%m-%d %H:%M:%S")

# Print the formatted date and time
print("Current Date and Time:", formatted_now)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

128256


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Current Date and Time: 2024-04-27 01:11:14


In [5]:
model.save_pretrained(new_model)

In [6]:
# Cleanup #CUDA-OOM
if 'model' in locals():
  del model

# Quantize
import os
QUANTIZATION_METHODS = ["q5_k_m"]
new_model = "Llama-3-TRACHI-8B-Instruct"

# Convert to fp16
fp16 = f"{new_model}.fp16.bin"
model_path = os.path.join(new_model, fp16)
print(model_path)
!python llama.cpp/convert.py {new_model} --outtype f16 --outfile {model_path} --pad-vocab --vocab-type bpe

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{new_model}/{new_model}.{method.upper()}.gguf"
    !llama.cpp/quantize {model_path} {qtype} {method}

Llama-3-TRACHI-8B-Instruct/Llama-3-TRACHI-8B-Instruct.fp16.bin
Loading model file Llama-3-TRACHI-8B-Instruct/model-00001-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00001-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00002-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00003-of-00004.safetensors
Loading model file Llama-3-TRACHI-8B-Instruct/model-00004-of-00004.safetensors
params = Params(n_vocab=128256, n_embd=4096, n_layer=32, n_ctx=8192, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=500000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('Llama-3-TRACHI-8B-Instruct'))
Loaded vocab file PosixPath('Llama-3-TRACHI-8B-Instruct/tokenizer.json'), type 'bpe'
Vocab info: <BpeVocab with 128000 base tokens and 256 added tokens>
Special vocab info: <SpecialVocab 

In [1]:
# Update Modelfile
import ollama
import os

path = os.path.join(os.getcwd(), 'modelfiles', 'Modelfile_TRACHI_L')
print(path)
ollama.create(model='TRACHI', path=path)

/home/nory/projects/discollama/modelfiles/Modelfile_TRACHI_L


{'status': 'success'}

In [8]:
# Push -> HF
from huggingface_hub import create_repo, HfApi
api = HfApi()

new_model = "Llama-3-TRACHI-8B-Instruct"
#api.create_repo(f'{new_model}-GGUF')

# Upload gguf files
api.upload_folder(
    folder_path=new_model,
    repo_id=f"norygano/{new_model}-GGUF",
    allow_patterns=f"*.gguf",
    repo_type="model",
)

In [10]:
template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "{{ .System }}<|eot_id|>\n"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "{{ .Prompt }}<|eot_id|>\n"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
print(f'{template}')

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>
<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>




In [10]:
# DEBUG: Push
import locale
locale.getpreferredencoding = lambda: "UTF-8"


model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/norygano/TRACHI-Llama-3-8B-Instruct/commit/93ec7517d24cad2f256813852ef03aa9e5b59049', commit_message='Upload tokenizer', commit_description='', oid='93ec7517d24cad2f256813852ef03aa9e5b59049', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# DEBUG: Inference
from transformers import pipeline

model.eval()

# Set the prompt from user input
prompt = "Who are you?"

# Specify the character or context you want to prompt
character_name = "Ganymede"

# Initialize the text-generation pipeline with your fine-tuned model
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Format the prompt to conform with ChatML template and include the system layer for the character
formatted_prompt = f"<conversation>\n  <system>{character_name}</system>\n  <exchange>\n    <user>{prompt}</user>\n    <assistant>"

# Generate the response using the pipeline
result = pipe(formatted_prompt)

# Extract the generated text. It's important to handle the output correctly based on how your model appends its response.
# Assuming the model generates the closing tags automatically. Adjust based on your model's behavior.
generated_text = result[0]['generated_text']

# Optionally, you might want to process `generated_text` to extract only the assistant's response.
# This processing step will depend on how the generated text structures the assistant's response and any closing tags.

print(generated_text)

In [None]:
# DEBUG: Empty VRAM
if 'new_model' in locals():
  del model
if 'pipe' in locals():
  del pipe
if 'trainer' in locals():
  del trainer
import gc
gc.collect()
gc.collect()

In [None]:
# DEBUG: Iterate through model parameters + print data types
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Type: {param.dtype}")

In [None]:
# DEBUG: Check Files
import os
import glob

# Construct the pattern to match .gguf files
pattern = os.path.join(new_model, '*.gguf')

print(pattern)

# Use glob to find all files in the directory that match the pattern
gguf_files = glob.glob(pattern)

# Iterate over the list of gguf files and print each one
for file_path in gguf_files:
    print(file_path)

In [None]:
# DEBUG: CUDA capability
import torch
torch.cuda.is_available()

In [None]:
# DEBUG: Install Pytorch & other libraries

#pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install  --upgrade \
  "transformers==4.38.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.41.1" \
  "trl==0.7.11" \
  "peft==0.8.2"