<a href="https://colab.research.google.com/github/nafi-rahman/ML-AI/blob/main/travel_chatml_unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# The %%capture cell magic above hides this cell's (long) pip output.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the listed packages, without their own dependencies
# (presumably so the versions resolved by the unsloth install above stay untouched — TODO confirm).
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch
# Model-loading settings; max_seq_length is reused later when building the trainer.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: this cell never reads `fourbit_models`; the checkpoint that is
# actually loaded is the literal `model_name` passed to from_pretrained below.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model together with its tokenizer using the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the wrapped version.
# The adapters target the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down) listed in `target_modules`.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed (same value as the trainer's `seed`)
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


MY OWN CHATML DATASET

In [None]:
# import pandas as pd
# from google.colab import drive
# import json

# # Mount Google Drive
# drive.mount('/content/drive')

# # Path to the JSON file in your Google Drive
# file_path = '/content/drive/MyDrive/Database/structured_dialogues.json'

# # Load the JSON data
# with open(file_path, 'r') as file:
#     data = json.load(file)

# chatml_template = ""
# for dialogue in data:
#     for turn in dialogue["turns"]:
#         role = "user" if turn["author"] == "user" else "assistant"
#         chatml_template += f'<|im_start|>{role}\n{turn["text"]}\n<|im_end|>\n'

# print(chatml_template)

In [None]:
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
import pandas as pd
from google.colab import drive
import json

# Mount Google Drive
drive.mount('/content/drive')

# Path to the JSON file in your Google Drive
file_path = '/content/drive/MyDrive/Database/structured_dialogues.json'

# Load the JSON data
df = pd.read_json(file_path)

# Convert the DataFrame into a list of per-row dicts; check_and_reformat_data below
# iterates these records and reads each one's 'turns' entry.
data = df.to_dict(orient='records')

# Check the structure of the data
def check_and_reformat_data(data):
    """Convert raw dialogue records into ShareGPT-style role/content message lists.

    Args:
        data: Iterable of conversation records. Each record is indexed with
            ``'turns'`` and every turn with ``'author'`` and ``'text'``.

    Returns:
        A list with one entry per conversation, each entry being a list of
        ``{"role": ..., "content": ...}`` dicts. Turns authored by ``"user"``
        get role ``"human"``; every other author gets role ``"gpt"``.
    """
    def _to_message(turn):
        # Anything that is not the user is treated as the assistant side.
        speaker = "human" if turn['author'] == "user" else "gpt"
        return {"role": speaker, "content": turn['text']}

    return [
        [_to_message(turn) for turn in conversation['turns']]
        for conversation in data
    ]

# Reformat the data
formatted_conversations = check_and_reformat_data(data)

# Step 3: Set up the tokenizer
# Rebinds `tokenizer` to a ChatML-templated version. The mapping names the message keys
# ("role"/"content") and the role values ("human"/"gpt") that check_and_reformat_data
# produces for the user and assistant sides.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "role", "content": "content", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

# Step 4: Define the formatting function
def formatting_prompts_func(examples):
    """Render every conversation into one training string using the chat template.

    Args:
        examples: Mapping with a ``"conversations"`` entry holding a sequence of
            conversations (each a list of message dicts).

    Returns:
        ``{"text": [...]}`` with one rendered (untokenized) string per conversation,
        in input order. No generation prompt is appended.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Step 5: Format the dataset
# formatting_prompts_func is called directly on a plain dict here (not via Dataset.map),
# so `formatted_data` is {"text": [one rendered string per conversation]}.
formatted_data = formatting_prompts_func({"conversations": formatted_conversations})

# Convert formatted data to a HuggingFace Dataset
dataset = Dataset.from_dict(formatted_data)

# Save the dataset to the relative directory 'formatted_dataset'
dataset.save_to_disk('formatted_dataset')


Unsloth: Will map <|im_end|> to EOS = </s>.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving the dataset (0/1 shards):   0%|          | 0/1369 [00:00<?, ? examples/s]

In [None]:
from datasets import Dataset
import pandas as pd
from google.colab import drive
import json
from unsloth.chat_templates import get_chat_template
# Mount Google Drive
drive.mount('/content/drive')

# Path to the JSON file in your Google Drive
file_path = '/content/drive/MyDrive/Database/formatted_dialogues.json'

# Load the JSON data
df = pd.read_json(file_path)

# Convert the DataFrame into a list of per-row dicts; check_and_reformat_data below
# iterates these records and reads each one's 'turns' entry.
data = df.to_dict(orient='records')

# Function to format the intent and entities
def format_intent_and_entities(intent, entities):
    """Build the "Intent: ..., Entities: ..." prefix for a user utterance.

    Args:
        intent: Intent label; interpolated into the string as-is.
        entities: Mapping of entity name to value. Any falsy value (empty dict,
            ``None``) is rendered as ``"NONE"``.

    Returns:
        A string of the form ``"Intent: <intent>, Entities: <k: v, k: v | NONE>"``.
    """
    if not entities:
        entity_text = "NONE"
    else:
        entity_text = ', '.join(f"{name}: {value}" for name, value in entities.items())
    return f"Intent: {intent}, Entities: {entity_text}"

# Check the structure of the data
def check_and_reformat_data(data):
    """Expand annotated dialogue turns into ShareGPT-style human/gpt message pairs.

    Each turn yields two messages: a ``"human"`` message whose content is the
    intent/entities prefix (from ``format_intent_and_entities``) followed by
    ``", "`` and the turn's ``'utterance'``, then a ``"gpt"`` message holding the
    turn's ``'response'``. A missing ``'intent'`` defaults to ``'NONE'`` and a
    missing ``'entities'`` to ``{}``.

    Args:
        data: Iterable of conversation records, each indexed with ``'turns'``.

    Returns:
        A list with one list of ``{"role", "content"}`` dicts per conversation.
    """
    def _turn_pair(turn):
        # Prefix is computed first, then utterance and response are read, in that order.
        prefix = format_intent_and_entities(turn.get('intent', 'NONE'), turn.get('entities', {}))
        return [
            {"role": "human", "content": f"{prefix}, {turn['utterance']}"},
            {"role": "gpt", "content": turn['response']},
        ]

    return [
        [message for turn in conversation['turns'] for message in _turn_pair(turn)]
        for conversation in data
    ]

# Reformat the data: one list of human/gpt messages per conversation record
formatted_conversations = check_and_reformat_data(data)


# Step 4: Define the formatting function
def formatting_prompts_func(examples):
    """Render every conversation into one training string using the chat template.

    Args:
        examples: Mapping with a ``"conversations"`` entry holding a sequence of
            conversations (each a list of message dicts).

    Returns:
        ``{"text": [...]}`` with one rendered (untokenized) string per conversation,
        in input order. No generation prompt is appended.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Step 5: Format the dataset
# formatting_prompts_func is called directly on a plain dict here (not via Dataset.map),
# so `formatted_data` is {"text": [one rendered string per conversation]}.
formatted_data = formatting_prompts_func({"conversations": formatted_conversations})

# Convert formatted data to a HuggingFace Dataset
# NOTE: this reuses the `dataset` name and the 'formatted_dataset' path of the earlier
# structured_dialogues cell, so this later dataset is the one the trainer receives.
dataset = Dataset.from_dict(formatted_data)

# Save the dataset to the relative directory 'formatted_dataset'
dataset.save_to_disk('formatted_dataset')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving the dataset (0/1 shards):   0%|          | 0/16142 [00:00<?, ? examples/s]

In [None]:
    # from unsloth.chat_templates import get_chat_template

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
#     mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
#     map_eos_token = True, # Maps <|im_end|> to </s> instead
# )

# def formatting_prompts_func(examples):
#     convos = examples["conversations"]
#     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
#     return { "text" : texts, }
# pass

# from datasets import load_dataset
# dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
# dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9033 [00:00<?, ? examples/s]

Map:   0%|          | 0/9033 [00:00<?, ? examples/s]

Let's see how the `ChatML` format works by printing the element at index 5 (the 6th conversation)

In [None]:
# dataset[5]["conversations"]

KeyError: 'conversations'

In [None]:
# Show the ChatML-rendered training text of the example at index 5.
print(dataset[5]["text"])

<|im_start|>user
Hey, i Want to go to St. Louis on the 17th of August<|im_end|>
<|im_start|>assistant
For how many days?<|im_end|>
<|im_start|>user
I need to back by the 31st<|im_end|>
<|im_start|>assistant
What will your departure city be?<|im_end|>
<|im_start|>user
I’m from Calgary<|im_end|>
<|im_start|>assistant
I have a direct flight leaving from Calgary on August 18 and returning on August 28th. Does that work for you?<|im_end|>
<|im_start|>user
Yes that sounds perfect may you please tell me about the destination place<|im_end|>
<|im_start|>assistant
It is the Glorious Cloak inn hotel in St. Louis. It has a 3-star rating.<|im_end|>
<|im_start|>user
That sounds lavish, what type of flight would that be?<|im_end|>
<|im_start|>assistant
You would be flying business class, for a grand total of only $1858.14<|im_end|>
<|im_start|>user
Are there any possible flights with economy class?<|im_end|>
<|im_start|>assistant
I have a package at the Lunar whcih has a 5 star rating on an economy 

If you're looking to make your own chat template, that also is possible! You must use the Jinja templating regime. We provide our own stripped down version of the `Unsloth template` which we find to be more efficient, and leverages ChatML, Zephyr and Alpaca styles.

More info on chat templates on [our wiki page!](https://github.com/unslothai/unsloth/wiki#chat-templates)

In [None]:
# Custom Jinja chat template: BOS, a fixed system line, then every turn rendered as
# ">>> User: ..." or ">>> Assistant: ...<eos>", and finally an open ">>> Assistant: "
# header when a generation prompt is requested.
# Every {% if %} / {% for %} must have exactly one matching {% endif %} / {% endfor %};
# a stray {% endif %} directly after the system line made the template unparsable
# and has been removed.
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
unsloth_eos_token = "eos_token"

# Disabled by default: flip to True to use the custom template instead of ChatML.
if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 40 steps (`max_steps = 40`) to speed things up, but you can set `num_train_epochs=1` for a full run and turn off the step limit with `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 per device * 4 accumulation steps = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 40, # caps the run at 40 optimizer steps (overrides num_train_epochs)
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/16142 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline taken before training; `start_gpu_memory` and `max_memory` are read again
# by the "Show final memory and time stats" cell.
gpu_stats = torch.cuda.get_device_properties(0)
# Bytes -> GB (three divisions by 1024), rounded to 3 decimals.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
14.213 GB of memory reserved.


In [None]:
# Run fine-tuning; `trainer_stats.metrics` is read by the final memory/time stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,142 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 40
 "-____-"     Number of trainable parameters = 41,943,040


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 

In [None]:
#@title Show final memory and time stats
# Requires `start_gpu_memory` / `max_memory` from the "Show current memory stats" cell
# and `trainer_stats` from a completed trainer.train() call.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Memory attributed to training = peak reserved now minus the pre-training baseline.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1080.1887 seconds used for training.
18.0 minutes used for training.
Peak reserved memory = 6.131 GB.
Peak reserved memory for training = 1.631 GB.
Peak reserved memory % of max memory = 41.572 %.
Peak reserved memory for training % of max memory = 11.059 %.


<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `ChatML`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the ChatML template, now mapping ShareGPT-style "from"/"value" message keys,
# which is the shape used by `messages` below.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Is there a package to Mannheim which offers a cheaper hotel? I am flexible with my dates of arrival and departure."},
]
# Tokenize the conversation with the assistant header appended and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# NOTE(review): no attention_mask is passed to generate(); transformers warns about this
# in the cell output.
outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: displays the decoded sequences (prompt included,
# special tokens kept).
tokenizer.batch_decode(outputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['<|im_start|>user\nIs there a package to Mannheim which offers a cheaper hotel? I am flexible with my dates of arrival and departure.<|im_end|>\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>ass']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?"},
]
# Tokenize the conversation with the assistant header appended and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# The streamer prints text as it is generated, so the return value of generate()
# is discarded (`_`).
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<|im_start|>user
Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?<|im_end|> 
<|im_start|>assistant
Sure! Where are you leaving from?<|im_end|>


MY OWN INFERENCE

In [None]:
from unsloth.chat_templates import get_chat_template
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Baseline experiment: run the same ChatML prompt through stock GPT-2.
# The GPT-2 objects get their own names (gpt2_tokenizer / gpt2_model) so that this
# cell no longer overwrites the fine-tuned `model` and `tokenizer`; the cells that
# follow assume those are still the fine-tuned ones.

# Initialize the tokenizer
gpt2_tokenizer = get_chat_template(
    AutoTokenizer.from_pretrained('gpt2'),  # replace with the appropriate tokenizer
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to the tokenizer's EOS token
)

# Load the model (half precision, on the GPU)
gpt2_model = AutoModelForCausalLM.from_pretrained('gpt2').half().cuda()  # replace with the appropriate model

# Enable native 2x faster inference if available
# Note: 'FastLanguageModel.for_inference(model)' is specific to certain frameworks.
#       Here, we enable faster inference using standard PyTorch methods.

# Example message based on the data format
messages = [
    {
        "from": "human",
        "value": " I'm bored. Can you suggest something interesting to do?"
    }
]

# Prepare inputs for the model
inputs = gpt2_tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# Debug: Print the structure and shape of the inputs
print("Inputs shape:", inputs.shape)

# Generate outputs from the model
outputs = gpt2_model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)

# Decode the outputs (prompt included, special tokens skipped)
decoded_outputs = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(decoded_outputs)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Inputs shape: torch.Size([1, 33])
["<|im_start|>user\n I'm bored. Can you suggest something interesting to do?\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>assistant\n\n<|im_start|>ass"]


MY OWN INFERENCE
with context

In [None]:
from unsloth.chat_templates import get_chat_template
import torch

# Assuming you have already initialized `tokenizer` and `model`
# (both are taken from the notebook's global state; this cell only re-templates the tokenizer).
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to the tokenizer's EOS token
)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Initialize conversation history
# Shared, module-level list of {"from": ..., "value": ...} messages, oldest first;
# re-running this cell resets it.
conversation_history = []

# Function to add a new message to the conversation history
def add_message(role, content):
    """Append one turn to the module-level ``conversation_history``.

    Args:
        role: Speaker tag stored under ``"from"`` (the chat loop uses
            ``"human"`` and ``"gpt"``).
        content: Message text stored under ``"value"``.
    """
    entry = {"from": role, "value": content}
    conversation_history.append(entry)

# Function to generate a response from the model
def generate_response():
    """Generate the assistant's next reply for the current conversation.

    Renders the module-level ``conversation_history`` through the tokenizer's chat
    template (with the generation prompt appended), runs ``model.generate`` and
    decodes only the newly generated tokens.

    Returns:
        str: The assistant's reply, without the echoed prompt and with surrounding
        whitespace stripped.
    """
    inputs = tokenizer.apply_chat_template(
        conversation_history,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        # Single, unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(inputs),
        max_new_tokens=64,
        use_cache=True,
    )

    # generate() returns prompt + continuation. Keep only the continuation; otherwise
    # the whole transcript is stored back into the history as the "reply" and the
    # prompt doubles in size on every turn.
    prompt_length = inputs.shape[-1]
    new_tokens = outputs[:, prompt_length:]
    response = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

    return response.strip()

# Main loop for conversation.
# Type "exit" or "quit" (or interrupt the cell) to end the chat cleanly instead of
# stopping it with a KeyboardInterrupt traceback.
while True:
    # Get user input
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        print("\nConversation ended.")
        break

    if user_input.strip().lower() in {"exit", "quit"}:
        print("Conversation ended.")
        break

    # Add user input to the conversation history
    add_message("human", user_input)

    # Generate response from the model
    response = generate_response()

    # Print the model's response
    print(f"GPT: {response}")

    # Add model's response to the conversation history
    add_message("gpt", response)


Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


You: hello sir


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


GPT: <|im_start|>user
hello sir 
<|im_start|>assistant
Hello!
You: do u know anything beautiful to visit in summer ?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


GPT: <|im_start|>user
hello sir 
<|im_start|>assistant
<|im_start|>user
hello sir 
<|im_start|>assistant
Hello! 
<|im_start|>user
do u know anything beautiful to visit in summer ? 
<|im_start|>assistant
I have a 10 day package to Punta Cana for 1200.00.
You: what does punta cana offer?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


GPT: <|im_start|>user
hello sir 
<|im_start|>assistant
<|im_start|>user
hello sir 
<|im_start|>assistant
Hello! 
<|im_start|>user
do u know anything beautiful to visit in summer ? 
<|im_start|>assistant
<|im_start|>user
hello sir 
<|im_start|>assistant
<|im_start|>user
hello sir 
<|im_start|>assistant
Hello! 
<|im_start|>user
do u know anything beautiful to visit in summer ? 
<|im_start|>assistant
I have a 10 day package to Punta Cana for 1200.00. 
<|im_start|>user
what does punta cana offer? 
<|im_start|>assistant
It is a 3 star hotel with a 6.8/10 guest rating. It is near a park and a museum.


KeyboardInterrupt: Interrupted by user