In [4]:
import json
import os

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer used for the token-length statistics computed before the model is
# loaded. It is taken from the same checkpoint that is fine-tuned later in this
# notebook so that the measured lengths match what the trainer will see.
# The model-loading cell rebinds `tokenizer` to the instance returned by
# FastLanguageModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
EOS_TOKEN = tokenizer.eos_token

# JSON file holding the raw conversations used for fine-tuning.
TRAINING_DATA_PATH = 'SherlockDataset2.json'

# Hugging Face access token, read from the environment instead of being stored
# in the notebook. None is acceptable for public checkpoints.
# NOTE: a token was previously committed on this line; it should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Add special tokens if not already present.
# The cell's last expression is the number of tokens actually added; the
# recorded output of 0 shows the checkpoint already defined all three.
special_tokens = {'additional_special_tokens': ['<|start_header_id|>', '<|end_header_id|>', '<|eot_id|>']}
tokenizer.add_special_tokens(special_tokens)

0

In [5]:
# Load the raw conversations from disk.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# prepare_training_data expects an iterable of conversations, each carrying a
# 'messages' list of {'role', 'content'} entries.
with open(TRAINING_DATA_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [6]:
def prepare_training_data(conversations):
    """Render every conversation as one header-delimited prompt string.

    Args:
        conversations: Iterable of mappings. Each must have a 'messages'
            entry: an iterable of mappings with 'role' and 'content' keys.

    Returns:
        A list with one ``{"text": prompt}`` dict per conversation, in input
        order. Each prompt starts with ``<|begin_of_text|>`` followed by one
        ``<|start_header_id|>role<|end_header_id|>\\ncontent<|eot_id|>\\n``
        segment per message.

    Raises:
        KeyError: If a conversation lacks 'messages' or a message lacks
            'role' or 'content'.
    """
    # NOTE(review): Meta's published Llama 3 template puts a blank line after
    # <|end_header_id|> and no newline after <|eot_id|>; confirm the
    # single-newline layout used here is intentional before changing it.
    def render_turn(message):
        return f"<|start_header_id|>{message['role']}<|end_header_id|>\n{message['content']}<|eot_id|>\n"

    return [
        {"text": "<|begin_of_text|>" + "".join(render_turn(turn) for turn in conversation['messages'])}
        for conversation in conversations
    ]

# Turn the raw conversations into prompt strings
training_data = prepare_training_data(dataset)

# Show one formatted conversation to check the prompt layout by eye
print("Example training instance:")
sample_record = training_data[50]
print(sample_record["text"])

Example training instance:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Sherlock Holmes, the famous detective known for your proficiency in observation, deduction, forensic science, and logical reasoning that you employ when investigating cases for a wide variety of clients. Respond in character. Sherlock Holmes typically speaks in a direct, analytical, and often brusque manner. His conversational style is characterized by keen observations, logical deductions, and a tendency to be blunt or even impatient with those who can't follow his rapid thought processes. Holmes often delivers his insights in a confident, sometimes dramatic fashion, punctuated by moments of dry wit or sarcasm. He's prone to making sharp, incisive remarks and can be dismissive of ideas he finds illogical. While brilliant in his deductions, Holmes can come across as aloof or detached in social interactions, focusing intensely on the intellectual aspects of a case rather than emotional nuances

In [7]:
# Report how many formatted conversations are available for training
instance_count = len(training_data)
print(f"\nTotal number of training instances: {instance_count}")


Total number of training instances: 107


In [8]:
# Wrap the prompt strings in a Dataset with a single "text" column.
# NOTE: this rebinds `dataset`, which until now held the raw JSON data.
prompt_texts = [record["text"] for record in training_data]
dataset = Dataset.from_dict({"text": prompt_texts})

print("\nExample instance from the dataset object:")
print(dataset[50]['text'])


Example instance from the dataset object:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are Sherlock Holmes, the famous detective known for your proficiency in observation, deduction, forensic science, and logical reasoning that you employ when investigating cases for a wide variety of clients. Respond in character. Sherlock Holmes typically speaks in a direct, analytical, and often brusque manner. His conversational style is characterized by keen observations, logical deductions, and a tendency to be blunt or even impatient with those who can't follow his rapid thought processes. Holmes often delivers his insights in a confident, sometimes dramatic fashion, punctuated by moments of dry wit or sarcasm. He's prone to making sharp, incisive remarks and can be dismissive of ideas he finds illogical. While brilliant in his deductions, Holmes can come across as aloof or detached in social interactions, focusing intensely on the intellectual aspects of a case rather than e

In [9]:
# Length in tokens of the longest example, encoded without automatically
# added special tokens. An empty dataset yields 0.
token_counts = (
    len(tokenizer.encode(sample, add_special_tokens=False, max_length=None))
    for sample in dataset['text']
)
max_tokens = max(token_counts, default=0)

# Add a small buffer to the maximum token count
buffer = 10
max_seq_length = max_tokens + buffer

print(f"\nMaximum number of tokens in a single instance: {max_tokens}")
print(f"Set max_seq_length to: {max_seq_length}")


Maximum number of tokens in a single instance: 678
Set max_seq_length to: 688


In [10]:
from unsloth import FastLanguageModel
import torch

# Loading options for the base checkpoint.
dtype = None  # None for auto detection. Bfloat16 for Ampere+. Float16 for Tesla T4 & V100.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# Collect the keyword arguments first, then load the model and tokenizer.
# This rebinds `tokenizer`, replacing the instance created earlier.
base_model_options = dict(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,  # add a Hugging Face access token if using a private or gated model
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Laptop GPU. Max memory: 7.996 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.24. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [11]:
# LoRA adapter configuration, kept in one mapping so the values are easy to scan.
lora_options = {
    "r": 16,
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}

# NOTE: `model` is rebound to the wrapped model, so re-running this cell
# would pass the already-wrapped model back into get_peft_model.
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
bf16_available = torch.cuda.is_bf16_supported()

# Optimisation hyperparameters. The run length is set by max_steps; the
# recorded output notes that max_steps overrides num_train_epochs.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=2,
    # num_train_epochs=3,
    max_steps=200,
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning over the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map: 100%|██████████| 107/107 [00:00<00:00, 2187.42 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [13]:
# Run the fine-tuning loop. The result is kept in a variable and left as the
# last expression so the notebook still displays the TrainOutput summary.
train_result = trainer.train()
train_result

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 107 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 200
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.5393
2,2.8242
3,2.7645
4,2.2673
5,2.1242
6,1.8105
7,1.5812
8,1.6022
9,1.5123
10,1.2283


TrainOutput(global_step=200, training_loss=0.5832809573225677, metrics={'train_runtime': 566.0044, 'train_samples_per_second': 0.707, 'train_steps_per_second': 0.353, 'total_flos': 8407769099378688.0, 'train_loss': 0.5832809573225677, 'epoch': 3.7383177570093458})

In [14]:
# Hugging Face token used for uploading, read from the environment instead of
# being stored in the notebook. HF_WRITE_TOKEN takes precedence over HF_TOKEN.
# When neither is set TOKEN is None and is passed on to push_to_hub unchanged.
# NOTE: a token was previously committed on this line; it should be revoked.
TOKEN = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")

In [15]:
model_name = "Robo8998/SherelockV5"  # Your desired model name on Hugging Face

# Upload the model first and then the tokenizer to the same Hub repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name, token=TOKEN)

print(f"Saved model to https://huggingface.co/{model_name}")

adapter_model.safetensors: 100%|██████████| 168M/168M [00:16<00:00, 10.4MB/s] 


Saved model to https://huggingface.co/Robo8998/SherelockV5
Saved model to https://huggingface.co/Robo8998/SherelockV5


In [16]:
# Merge the LoRA weights into the base model and write a 16-bit copy, together
# with the tokenizer, to model5/16bitVLLM.
model.save_pretrained_merged(
    "model5/16bitVLLM",
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.92 out of 15.47 RAM for saving.


  0%|          | 0/32 [00:00<?, ?it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:14<00:00,  2.14it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [14]:
python llama.cpp/convert_hf_to_gguf.py /home/robo/unslothstuff/model5/16bitVLLM   --outfile V5Sherlock8bit.gguf   --outtype q8_0

SyntaxError: invalid decimal literal (1423099815.py, line 1)