In [1]:
import torch; torch.version.cuda
import torch; torch.__version__

'2.1.0+cu121'

!pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton --index-url https://download.pytorch.org/whl/cu121
!pip install --upgrade pip
!pip install --no-deps --no-build-isolation packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

!pip install "unsloth[cu121-ampere] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/mtman/Documents/Repos/yakwith.ai/models/Mistral-7B-OpenOrca",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.22.post7. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
import datasets

from datasets import load_dataset
def format_text(text):
    return {'text':str(text)}
dataset = load_dataset('csv', data_files='/home/mtman/Documents/Repos/yakwith.ai/data/1711405462.204613_881/function_call_data.tsv', split='train')
dataset = dataset.map(format_text)

In [6]:
len(dataset); dataset[0]['text']

'{"<|im_start|>system\\nYou are a clever conversation analyst. You are given a conversation between a waiter (the \'assistant\') and a customer (the \'user\') at a restaurant. Your TASK is to generate three parameters required by a FUNCTION that keeps track of the current state of the customer\'s ORDER. To understand the context of the customers final request, you will need to review the conversation history. The parameters must be returned in JSON format using the following schema:\\n                     {\'item\': the name of the current menu_item being discussed,\\n                      \'action\': how the ORDER should be changed: \'add\',\'remove\',\'update\',\\n                      \'variation\': the choice or modification that the customer has asked for. If they have not made any choices then this value should be \'none\'\\n                      }<|im_end|>\\n<|im_start|>user\\nHello, can I see the menu for the cocktails?<|im_end|>\\n<|im_start|>analyst\\n[{\'action\': \'none\',

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 5,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        max_steps = 25,
        learning_rate = 1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        warmup_ratio=0.05,
        seed = 3407,
        output_dir = "outputs",
    ),
)


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 83 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 5 | Gradient Accumulation steps = 1
\        /    Total batch size = 5 | Total steps = 25
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
1,1.8556
2,1.8643
3,1.7701
4,1.3949
5,1.2518
6,1.0199
7,0.8178
8,0.6774
9,0.5433
10,0.3848


In [9]:
FastLanguageModel.for_inference(model)

In [11]:
message = [{"role":"system",'content':'{""food_items"": [{""name"": ""EGGS POACHED OR FRIED"", ""price"": ""$13"", ""descriptions"": [""Served on Sourdough or Ciabatta""], ""other"": [""Add Cheese - $1, Smashed Avo - $2""]}, {""name"": ""RUSTIC TOAST"", ""price"": ""$8.50"", ""descriptions"": [""Sourdough or Ciabatta"", ""Fig & Lemon Jam"", ""Orange Marmalade"", ""Strawberry Jam"", ""Peanut Butter or Vegemite""]}, {""name"": ""MEXICAN eggs"", ""price"": ""$23"", ""descriptions"": [""Mixed Beans Cooked with Chili, Garlic, Onion and Herbs in a Rich Tomato Sauce"", ""Baked with Mozzarella"", ""Two Poached Eggs"", ""Ciabatta""]}, {""name"": ""HOT CHEESE"", ""price"": ""$9.50"", ""descriptions"": [""With Tomato & Oregano on Sourdough""]}], ""tea"": {""T-NOMICS ORGANIC LEAF TEA"": ""$5"", ""Varieties"": [""English Breakfast"", ""Earl Grey"", ""Peppermint"", ""Lemongrass & Ginger"", ""Golden Oolong Green""]}, ""immune_booster_shake"": {""price"": ""$14"", ""Ingredients"": [""Green apple"", ""Banana"", ""Spinach"", ""Mint"", ""Lime"", ""Ginger""]}, ""protein_shake"": {""price"": ""$14"", ""Ingredients"": [""Peanut butter"", ""Modern pea protein"", ""Almond milk"", ""Banana"", ""Honey & ice""], ""Additions"": [""Add shot of coffee $0.5""]}, ""extra_items"": {""Salmon, Gourmet Pork Sausages"": ""$6"", ""Bacon, Haloumi"": ""$5"", ""Avocado, Grilled Tomato"": ""$4"", ""Free-Range Egg"": ""$3"", ""Gluten-Free Bread"": ""$2""}}'},{"role":"user","content":'For the Hot Cheese, can I have haloumi instead of the regular cheese?'}]

In [18]:
message = [{"role":"system",'content':'{""food_items"": [{""name"": ""EGGS POACHED OR FRIED"", ""price"": ""$13"", ""descriptions"": [""Served on Sourdough or Ciabatta""], ""other"": [""Add Cheese - $1, Smashed Avo - $2""]}, {""name"": ""RUSTIC TOAST"", ""price"": ""$8.50"", ""descriptions"": [""Sourdough or Ciabatta"", ""Fig & Lemon Jam"", ""Orange Marmalade"", ""Strawberry Jam"", ""Peanut Butter or Vegemite""]}, {""name"": ""MEXICAN eggs"", ""price"": ""$23"", ""descriptions"": [""Mixed Beans Cooked with Chili, Garlic, Onion and Herbs in a Rich Tomato Sauce"", ""Baked with Mozzarella"", ""Two Poached Eggs"", ""Ciabatta""]}, {""name"": ""HOT CHEESE"", ""price"": ""$9.50"", ""descriptions"": [""With Tomato & Oregano on Sourdough""]}], ""tea"": {""T-NOMICS ORGANIC LEAF TEA"": ""$5"", ""Varieties"": [""English Breakfast"", ""Earl Grey"", ""Peppermint"", ""Lemongrass & Ginger"", ""Golden Oolong Green""]}, ""immune_booster_shake"": {""price"": ""$14"", ""Ingredients"": [""Green apple"", ""Banana"", ""Spinach"", ""Mint"", ""Lime"", ""Ginger""]}, ""protein_shake"": {""price"": ""$14"", ""Ingredients"": [""Peanut butter"", ""Modern pea protein"", ""Almond milk"", ""Banana"", ""Honey & ice""], ""Additions"": [""Add shot of coffee $0.5""]}, ""extra_items"": {""Salmon, Gourmet Pork Sausages"": ""$6"", ""Bacon, Haloumi"": ""$5"", ""Avocado, Grilled Tomato"": ""$4"", ""Free-Range Egg"": ""$3"", ""Gluten-Free Bread"": ""$2""}}'},{"role":"user","content":'Hi, can I just get the rustic toast thanks?'}]

In [13]:
message = [{"role":"system",'content':'{""food_items"": [{""name"": ""eggs poached or fried"", ""price"": ""$13"", ""descriptions"": [""Served on Sourdough or Ciabatta""], ""other"": [""Add Cheese - $1, Smashed Avo - $2""]}, {""name"": ""RUSTIC TOAST"", ""price"": ""$8.50"", ""descriptions"": [""Sourdough or Ciabatta"", ""Fig & Lemon Jam"", ""Orange Marmalade"", ""Strawberry Jam"", ""Peanut Butter or Vegemite""]}, {""name"": ""MEXICAN eggs"", ""price"": ""$23"", ""descriptions"": [""Mixed Beans Cooked with Chili, Garlic, Onion and Herbs in a Rich Tomato Sauce"", ""Baked with Mozzarella"", ""Two Poached Eggs"", ""Ciabatta""]}, {""name"": ""HOT CHEESE"", ""price"": ""$9.50"", ""descriptions"": [""With Tomato & Oregano on Sourdough""]}], ""tea"": {""T-NOMICS ORGANIC LEAF TEA"": ""$5"", ""Varieties"": [""English Breakfast"", ""Earl Grey"", ""Peppermint"", ""Lemongrass & Ginger"", ""Golden Oolong Green""]}, ""immune_booster_shake"": {""price"": ""$14"", ""Ingredients"": [""Green apple"", ""Banana"", ""Spinach"", ""Mint"", ""Lime"", ""Ginger""]}, ""protein_shake"": {""price"": ""$14"", ""Ingredients"": [""Peanut butter"", ""Modern pea protein"", ""Almond milk"", ""Banana"", ""Honey & ice""], ""Additions"": [""Add shot of coffee $0.5""]}, ""extra_items"": {""Salmon, Gourmet Pork Sausages"": ""$6"", ""Bacon, Haloumi"": ""$5"", ""Avocado, Grilled Tomato"": ""$4"", ""Free-Range Egg"": ""$3"", ""Gluten-Free Bread"": ""$2""}}'},{"role":"user","content":'Which teas are organic?'}]

** Function Call Messages **

In [10]:
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

In [11]:
system_prompt = {"role":"system", "content":"You are a clever conversation analyst. You are given a conversation between a waiter (the 'assistant') and a customer (the 'user') at a restaurant. You have 3 TASKS to complete. You must return the answers in json format with the following schema: {'item': TASK 1 - identify the menu item that is being currently discussed; 'action': TASK 2 - decide if the customers intention is to 'order' ('add') an item from the menu , 'remove' from the ORDER or 'update' an item in the ORDER, 'variation': TASK 3 - state the list of selections, changes or variations to ingredients or size that the customer is requesting}.\nExample Response 1: {'item':'italian omlete','action':'update','variation':'['remove garlic', 'add chilli']'. If the customer refers to more than one menu item then repeat the 3 TASKS for each menu item."}

In [12]:
a=[system_prompt,{"role":"user","content":"I'd like to get the big breakfast thanks?"}]

In [13]:
b = [system_prompt, {"role":"user","content":"I'd like to get the big breakfast thanks?"},{'role':'assistant','content':'Sure. Anything else?'},{"role":"user","content":"Can I get tofu instead of sausages on that"}]

In [14]:
c = [system_prompt, {"role":"user","content":"Can I get the big breakfast thanks?"},{'role':'assistant','content':'Sure. Anything else?'},{"role":"user","content":"Can I get tofu instead of sausages on that"},{'role':'assistant','content':'Okay. Anything else?'}, {"role":"user","content":"Actually, don't worry about the tofu. The sausages will be fine."}]

In [15]:
d = [system_prompt, {"role":"user","content":"Can I get the big breakfast thanks?"},{'role':'assistant','content':'Sure. Anything else?'},{"role":"user","content":"Can I get tofu instead of sausages on that"},{'role':'assistant','content':'Okay. Anything else?'}, {"role":"user","content":"Actually, don't worry about the tofu. The sausages will be fine."},{'role':'assistant','content':'Okay. Anything else?'},{'role':'assistant','content':"Actually, forget the big breakfast. I'll just get a coffee instead.Sorry"}]

In [16]:
messages = [a,b,c,d]

In [17]:
for i in range(len(messages)):
    inputs = tokenizer.apply_chat_template(messages[i],tokenize=False, add_generation_prompt=False)
    inputs = inputs+'<|im_start|>analyst\n'
    inputs = tokenizer(inputs, padding=True, return_tensors='pt')
    input = inputs['input_ids'].to('cuda')
    _ = model.generate(input_ids = input, streamer=text_streamer, max_new_tokens = 256, use_cache = True, temperature = 0.01 )
    print ('==================================')

# tokenizer.batch_decode(outputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s><|im_start|> system
You are a clever conversation analyst. You are given a conversation between a waiter (the 'assistant') and a customer (the 'user') at a restaurant. You have 3 TASKS to complete. You must return the answers in json format with the following schema: {'item': TASK 1 - identify the menu item that is being currently discussed; 'action': TASK 2 - decide if the customers intention is to 'order' ('add') an item from the menu , 'remove' from the ORDER or 'update' an item in the ORDER, 'variation': TASK 3 - state the list of selections, changes or variations to ingredients or size that the customer is requesting}.
Example Response 1: {'item':'italian omlete','action':'update','variation':'['remove garlic', 'add chilli']'. If the customer refers to more than one menu item then repeat the 3 TASKS for each menu item.<|im_end|><|im_start|> user
I'd like to get the big breakfast thanks?<|im_end|><|im_start|> analyst


[{'action': 'order', 'item': 'big breakfast', 'variation': []}]<|im_end|>


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s><|im_start|> system
You are a clever conversation analyst. You are given a conversation between a waiter (the 'assistant') and a customer (the 'user') at a restaurant. You have 3 TASKS to complete. You must return the answers in json format with the following schema: {'item': TASK 1 - identify the menu item that is being currently discussed; 'action': TASK 2 - decide if the customers intention is to 'order' ('add') an item from the menu , 'remove' from the ORDER or 'update' an item in the ORDER, 'variation': TASK 3 - state the list of selections, changes or variations to ingredients or size that the customer is requesting}.
Example Response 1: {'item':'italian omlete','action':'update','variation':'['remove garlic', 'add chilli']'. If the customer refers to more than one menu item then repeat the 3 TASKS for each menu item.<|im_end|><|im_start|> user
I'd like to get the big breakfast thanks?<|im_end|><|im_start|> assistant
Sure. Anything else?<|im_end|><|im_start|> user
Can I get to

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s><|im_start|> system
You are a clever conversation analyst. You are given a conversation between a waiter (the 'assistant') and a customer (the 'user') at a restaurant. You have 3 TASKS to complete. You must return the answers in json format with the following schema: {'item': TASK 1 - identify the menu item that is being currently discussed; 'action': TASK 2 - decide if the customers intention is to 'order' ('add') an item from the menu , 'remove' from the ORDER or 'update' an item in the ORDER, 'variation': TASK 3 - state the list of selections, changes or variations to ingredients or size that the customer is requesting}.
Example Response 1: {'item':'italian omlete','action':'update','variation':'['remove garlic', 'add chilli']'. If the customer refers to more than one menu item then repeat the 3 TASKS for each menu item.<|im_end|><|im_start|> user
Can I get the big breakfast thanks?<|im_end|><|im_start|> assistant
Sure. Anything else?<|im_end|><|im_start|> user
Can I get tofu ins

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s><|im_start|> system
You are a clever conversation analyst. You are given a conversation between a waiter (the 'assistant') and a customer (the 'user') at a restaurant. You have 3 TASKS to complete. You must return the answers in json format with the following schema: {'item': TASK 1 - identify the menu item that is being currently discussed; 'action': TASK 2 - decide if the customers intention is to 'order' ('add') an item from the menu , 'remove' from the ORDER or 'update' an item in the ORDER, 'variation': TASK 3 - state the list of selections, changes or variations to ingredients or size that the customer is requesting}.
Example Response 1: {'item':'italian omlete','action':'update','variation':'['remove garlic', 'add chilli']'. If the customer refers to more than one menu item then repeat the 3 TASKS for each menu item.<|im_end|><|im_start|> user
Can I get the big breakfast thanks?<|im_end|><|im_start|> assistant
Sure. Anything else?<|im_end|><|im_start|> user
Can I get tofu ins

In [18]:
model.save_pretrained_merged("/home/mtman/Documents/Repos/yakwith.ai/models/OrderManager-Mistral-7B-OpenOrca", tokenizer, save_method="merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 40.75 out of 61.94 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 112.50it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
