### Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth

else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    !pip install torchinfo


### Unsloth

In [None]:

# If a CUDA version error persists (incompatibility issue), try this:
# import os
# os.environ["BNB_CUDA_VERSION"] = "117"

In [None]:
import unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options; max_seq_length is reused by the SFTTrainer cell further down.
max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the model and its tokenizer from the checkpoint named below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # NOTE(review): absolute path under /content (Colab layout) - adjust when running elsewhere.
    cache_dir='/content/unsloth-models'
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Alpaca-style prompt template. Its three {} slots are filled, in order, with the
# instruction, the input and the response (the response slot is left empty when
# the template is used to build an inference prompt).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to each training prompt by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Rewrite one dataset record into an Alpaca-style training prompt.

    The record's ``"text"`` field is expected to hold a conversation of the
    form ``<HUMAN>: question <ASSISTANT>: answer``. The text after the last
    ``<HUMAN>:`` marker is split at ``<ASSISTANT>:``: the part before the
    marker becomes the prompt input, the part after it becomes the response.
    The same fixed instruction is used for every record.

    Args:
        examples: A single record (mapping) with a ``"text"`` string. Despite
            the plural name, the function handles one example per call.

    Returns:
        dict: ``{"text": prompt}`` where ``prompt`` is ``alpaca_prompt`` filled
        with the instruction, input and response, followed by ``EOS_TOKEN``.

    Raises:
        ValueError: If no ``<ASSISTANT>:`` marker follows the last
            ``<HUMAN>:`` marker, so no response can be extracted.
    """
    instructions = "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone."

    # Parse the last human turn once and reuse the pieces for input and output.
    last_turn = examples['text'].split('<HUMAN>:')[-1]
    turn_parts = last_turn.split('<ASSISTANT>:')
    if len(turn_parts) < 2:
        # Fail with a readable message instead of an IndexError deep inside map().
        raise ValueError(
            "Record has no '<ASSISTANT>:' reply after the last '<HUMAN>:' turn: "
            f"{examples['text'][:80]!r}"
        )
    inputs = turn_parts[0].strip()
    outputs = turn_parts[1].strip()

    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    text = alpaca_prompt.format(instructions, inputs, outputs) + EOS_TOKEN
    return { "text" : text, }

from datasets import load_dataset
# Fetch the train split and rewrite every record into the Alpaca prompt format.
dataset = load_dataset(
    "heliosbrahma/mental_health_chatbot_dataset",
    split = "train",
).map(formatting_prompts_func)

In [None]:
# Hold out 20% of the formatted examples for evaluation. The split is seeded
# (same seed value the LoRA and trainer configuration use) so that train/test
# membership is identical on every Restart & Run All.
split_dataset = dataset.train_test_split(test_size=0.2, seed=3407)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']
# Row count of the training split (no need to materialize the text column).
len(train_dataset)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Attach LoRA adapters to the projection modules listed in target_modules.
# NOTE: this rebinds `model`; re-running the cell passes the already-wrapped
# model back into get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Same seed value as TrainingArguments(seed=...) below
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# import os
# # from dotenv import load_dotenv
# from huggingface_hub import login
# import wandb

# # load_dotenv()
# # hf_token = os.environ['HF_TOKEN']
# # wb_key = os.environ['WANDB_KEY']


# # login(token=hf_token)
# wandb.login()

# # from datasets import load_dataset
# # data = load_dataset("heliosbrahma/mental_health_chatbot_dataset", split='train')
# # data


In [None]:
# '''
# # to change the /cache/hugging_face download folder to custom path
# !echo 'export HF_HOME=/DATA1/sayantan/hf_cache/' >> ~/.bashrc
# !source ~/.bashrc
# '''

In [None]:
# Show the first formatted training prompt.
print(dataset[0]['text'])

In [None]:
# dataset[0:100]['text']

<a name="Train"></a>
### Train the model


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column of train_dataset.
# Only the training split is passed; test_dataset is used later by the
# Testing section, not as an eval_dataset here.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 examples per step x 4 accumulation steps = 8 examples per optimizer update per device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        # NOTE(review): confirm this value is intentional - LoRA recipes commonly use ~2e-4.
        learning_rate = 2e-6,
        # Exactly one of fp16/bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = 'none', #"wandb", # Use this for WandB etc
    ),
)

In [None]:
# @title Show current memory stats
# Snapshot device properties and the peak reserved memory before training.
# start_gpu_memory and max_memory are read again by the final-stats cell,
# so this cell must run before trainer.train().
gpu_stats = torch.cuda.get_device_properties(0)
# Byte counts are divided by 1024 three times and reported as GB.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# import torch

# # Inspect the data types of normalization layers without changing them
# for name, module in trainer.model.named_modules():
#     if "norm" in name:
#         print(f"Layer: {name}, Data Type: {module.weight.dtype if hasattr(module, 'weight') else 'No weight parameter'}")

# trainer.model


import torch
from torchinfo import summary

# Assuming your model is already initialized as 'model'
# Define input shape based on your model requirements
# For a Mistral model, this would typically be [batch_size, sequence_length]
# Shape of the dummy token-id batch fed to torchinfo: [batch_size, seq_length].
batch_size = 1
seq_length = 512

# Generate the summary
model_summary = summary(
    trainer.model,  # Your PeftModelForCausalLM instance
    # Placeholder input: an integer tensor of ones with the shape defined above.
    input_data=torch.ones(batch_size, seq_length, dtype=torch.long),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    depth=10,  # Adjust depth to control how detailed the hierarchy is shown
    # NOTE(review): device is taken from `model` while the summary is of `trainer.model` - presumably the same object; confirm.
    device=next(model.parameters()).device,  # Use the same device as your model
    verbose=1
)

# print(model_summary)

In [None]:
# Run fine-tuning; trainer_stats.metrics is read by the final-stats cell below.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, divided by 1024 three times and reported as GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Growth relative to the pre-training snapshot (start_gpu_memory from the earlier stats cell).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures expressed as a percentage of max_memory (total device memory from the earlier cell).
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)**

In [None]:
import re

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone.", # instruction
        "How can I prevent anxiety and depression?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
# Decoded without skip_special_tokens, so `text` keeps special markers such as EOS.
text = tokenizer.batch_decode(outputs)

# Capture everything after "### Response:" up to the next "###" heading or,
# if there is none, the end of the decoded string.
pattern = r"### Response:(.*?)(?=###|\Z)"

# Extract with re.DOTALL to include newlines in the match
match = re.search(pattern, text[0], re.DOTALL)

if match:
    # Drop the EOS marker that the raw decode leaves at the end of the answer.
    response_text = match.group(1).replace(tokenizer.eos_token, "").strip()
    print(response_text)
else:
    print("No response found")

In [None]:
# Raw decoded batch (a list) from the previous cell: prompt plus completion.
print(text)

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# alpaca_prompt = Copied from above
# Same prompt as the previous inference cell, but tokens are printed as they are generated.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone.", # instruction
        "How can I prevent anxiety and depression?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
# NOTE(review): max_new_tokens equals max_seq_length (1024), so prompt + completion
# can exceed the length the model was loaded with - confirm this is acceptable.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024)

In [None]:
# alpaca_prompt = Copied from above
# Streaming generation again; here the instruction additionally asks the model
# not to answer anything outside medical or health-related topics.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone. If the topic asked other than medical or health-related topics, do not provide any answer", # instruction
        "How to overcome depression", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256)

<a name="Save"></a>
### Saving, loading finetuned models

In [None]:
# Save the model and tokenizer into the local "mistral-7b-v0.3" directory.
model.save_pretrained("mistral-7b-v0.3")  # Local saving
tokenizer.save_pretrained("mistral-7b-v0.3")
# NOTE(review): the Testing section loads "sayantanBiswas/mistral-7b-v0.3", but the
# uploads below are commented out - confirm that repo already exists, or point
# MODEL_ID at the local directory saved above.
# model.push_to_hub("sayantanBiswas/mistral-7b-v0.3") # Online saving
# tokenizer.push_to_hub("sayantanBiswas/mistral-7b-v0.3") # Online saving

### Testing

In [None]:
from unsloth import FastLanguageModel
# message history part remains
import re
import os
# If a CUDA version error persists (incompatibility issue), try this:
# os.environ["BNB_CUDA_VERSION"] = "117"
# print(os.environ)


import torch
from transformers import TextIteratorStreamer

# Set environment variable to control model cache location (optional)
# Uncomment and modify the path if you want to change the default cache location
# os.environ["HF_HOME"] = "/path/to/your/model/cache"



# Model configuration
MODEL_ID = "sayantanBiswas/mistral-7b-v0.3"  # Change this to your preferred model
MAX_SEQ_LENGTH = 2048
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Query bf16 support only when a CUDA device is present, mirroring DEVICE's CPU
# fallback; without CUDA the query is skipped and float16 is selected.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

class ChatModel:
    """Evaluation wrapper around a causal LM loaded through Unsloth.

    Holds the model, its tokenizer and a fixed set of sampling options, and
    turns a formatted Alpaca-style example into a (reference answer, model
    output) pair.
    """

    def __init__(self, model_id):
        """Load ``model_id`` with its tokenizer and set the generation options.

        Args:
            model_id: Model name or path passed to
                ``FastLanguageModel.from_pretrained``.
        """
        print(f"Loading model {model_id} on {DEVICE} with {DTYPE}...")

        # Load the model with Unsloth
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_id,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=DTYPE,
            cache_dir='/content/unsloth-models'
        )

        # Configure generation parameters (sampling is enabled, so outputs
        # vary between runs).
        self.generation_config = {
            "max_new_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.2,
            "do_sample": True,
            "use_cache": True,
        }

        print("Model loaded successfully!")

    @staticmethod
    def _split_prompt_and_reference(history):
        """Split a formatted example into ``(prompt, reference_answer)``.

        The reference is the first non-empty ``### Response:`` section of
        ``history`` (stripped); the prompt is everything that precedes it, so
        it ends right after the ``### Response:`` heading and its trailing
        whitespace. When no non-empty response section exists, ``history`` is
        returned unchanged together with an empty reference.
        """
        response_pattern = r'### Response:\s+(.*?)(?=###|\Z)'
        for found in re.finditer(response_pattern, history, re.DOTALL):
            answer = found.group(1).strip()
            if answer:
                return history[:found.start(1)], answer
        return history, ""

    def generate_response(self, history):
        """Generate a completion for one example.

        The reference answer contained in ``history`` is removed from the
        prompt before generation, so the model never sees the text it is
        evaluated against.

        Args:
            history: Alpaca-formatted text, with or without a filled-in
                ``### Response:`` section.

        Returns:
            tuple[str, str]: ``(reference, generated)`` where ``reference`` is
            the answer found in ``history`` (``""`` when there is none) and
            ``generated`` is the decoded prompt plus the model's completion,
            with special tokens skipped.
        """
        prompt, reference = self._split_prompt_and_reference(history)

        # Tokenize the prompt and move every tensor to the target device.
        inputs = self.tokenizer([prompt], return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        # Generate without streaming
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self.generation_config
            )

        # Decode the generated tokens (the output sequence includes the prompt).
        generated_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return reference, generated_response

In [None]:

chat = ChatModel(MODEL_ID)
tokenizer = chat.tokenizer
from tqdm import tqdm, trange

# Each entry pairs the answer extracted from the model output with the
# reference answer returned by generate_response.
val_data = []
pattern = r"### Response:(.*?)(?=###|\Z)"
for i in trange(len(test_dataset)):
    original, response = chat.generate_response(test_dataset[i]['text'])

    # Keep only the text under the "### Response:" heading of the decoded output.
    match = re.search(pattern, response, re.DOTALL)
    clean_response = match.group(1).strip() if match else "Failed to extract response"

    val_data.append((clean_response, original))

In [None]:
import math
import re
from collections import Counter
import sacrebleu
from tqdm import tqdm

def simple_tokenize(text):
    """Lowercase ``text`` and split it into word tokens.

    Every character that is neither a word character nor whitespace is first
    replaced by a space, so punctuation never appears in the returned list.
    """
    depunctuated = re.sub(r'[^\w\s]', ' ', text)
    lowered = depunctuated.lower()
    return lowered.split()

def calculate_bleu_simple(reference, candidate, max_n=4):
    """Compute a simplified, unsmoothed BLEU score (no external dependencies).

    Args:
        reference: List of reference token lists.
        candidate: Candidate token list.
        max_n: Highest n-gram order to score. Orders longer than the candidate
            are skipped, so short candidates are scored on fewer orders.

    Returns:
        float: Brevity penalty times the geometric mean of the clipped n-gram
        precisions. ``0.0`` when the candidate is empty, no reference is given,
        no n-gram order can be scored, or any precision is zero.
    """
    # Nothing to score: avoids dividing by len(candidate) == 0 and min() of an
    # empty reference-length list further down.
    if not candidate or not reference:
        return 0.0

    candidate_len = len(candidate)
    precisions = []
    for n in range(1, min(max_n, candidate_len) + 1):
        candidate_counts = Counter(
            tuple(candidate[i:i+n]) for i in range(candidate_len - n + 1)
        )

        # For every n-gram, the largest count observed in any single reference.
        max_counts = Counter()
        for ref in reference:
            ref_counts = Counter(tuple(ref[i:i+n]) for i in range(len(ref) - n + 1))
            for ngram, count in ref_counts.items():
                max_counts[ngram] = max(max_counts[ngram], count)

        # Clip each candidate count by the reference maximum.
        numerator = sum(min(count, max_counts[ngram]) for ngram, count in candidate_counts.items())
        denominator = sum(candidate_counts.values())
        precisions.append(numerator / denominator if denominator > 0 else 0)

    # No order was scored (max_n <= 0) or some order had no overlap at all.
    if not precisions or not all(p > 0 for p in precisions):
        return 0.0

    # Brevity penalty against the reference whose length is closest to the candidate's.
    closest_ref_len = min((len(ref) for ref in reference), key=lambda x: abs(x - candidate_len))
    bp = math.exp(1 - closest_ref_len / candidate_len) if candidate_len < closest_ref_len else 1.0

    return bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))

def calculate_sacrebleu(reference, hypothesis):
    """Return sacreBLEU's sentence-level score of ``hypothesis`` against one ``reference`` string."""
    references = [reference]
    result = sacrebleu.sentence_bleu(hypothesis, references)
    return result.score

def calculate_perplexity_4gram(text1, text2):
    """Perplexity of ``text1`` under a 4-gram model estimated from ``text2``.

    Both texts have the literal ``</s>`` removed, are lowercased and split into
    word and punctuation tokens. Each 4-gram of ``text1`` gets the add-one
    smoothed probability ``(count4 + 1) / (count3 + V)``, with counts taken
    from ``text2`` and ``V`` the size of the joint vocabulary.

    Returns:
        float: ``2 ** (-mean log2 probability)``, or ``inf`` when ``text1``
        has fewer than four tokens.
    """
    token_pattern = r'\w+|[^\w\s]'

    def tokenize(text):
        return re.findall(token_pattern, text.replace('</s>', '').lower())

    def build_ngrams(tokens, n):
        return [tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

    generated_tokens = tokenize(text1)
    reference_tokens = tokenize(text2)

    eval_fourgrams = build_ngrams(generated_tokens, 4)
    if not eval_fourgrams:
        # Nothing to average over.
        return float('inf')

    fourgram_counts = Counter(build_ngrams(reference_tokens, 4))
    trigram_counts = Counter(build_ngrams(reference_tokens, 3))
    vocab_size = len(set(generated_tokens) | set(reference_tokens))

    log_sum = 0
    for fourgram in eval_fourgrams:
        context = fourgram[:3]
        smoothed = (fourgram_counts[fourgram] + 1) / (trigram_counts[context] + vocab_size)
        log_sum += math.log2(smoothed)

    return math.pow(2, -(log_sum / len(eval_fourgrams)))

def calculate_jaccard_similarity(text1, text2):
    """Jaccard overlap of the lowercase word/punctuation token sets of two texts.

    Returns ``1.0`` when neither text yields any token.
    """
    token_pattern = r'\w+|[^\w\s]'
    first = set(re.findall(token_pattern, text1.lower()))
    second = set(re.findall(token_pattern, text2.lower()))
    combined = first | second
    if not combined:
        return 1.0
    return len(first & second) / len(combined)

def score_calculator(machine_generated, original):
    """Score one generated answer against its reference with all four metrics.

    Args:
        machine_generated: Model output text.
        original: Reference text.

    Returns:
        tuple: ``(bleu, sacrebleu, perplexity, jaccard)`` in that order. The
        simple BLEU works on ``simple_tokenize`` tokens; the other three
        metrics receive the raw strings.
    """
    candidate = simple_tokenize(machine_generated)
    reference = [simple_tokenize(original)]
    return (
        calculate_bleu_simple(reference, candidate),
        calculate_sacrebleu(original, machine_generated),
        calculate_perplexity_4gram(machine_generated, original),
        calculate_jaccard_similarity(machine_generated, original),
    )


In [None]:
# One (bleu, sacrebleu, perplexity, jaccard) tuple per (generated, reference) pair.
scores = [score_calculator(machine, original) for machine, original in tqdm(val_data)]


In [None]:
# scores
import numpy as np
# Column means in score_calculator order: BLEU, sacreBLEU, perplexity, Jaccard.
# Perplexity is inf for outputs shorter than four tokens, and a plain mean would
# turn that whole column into inf, so non-finite entries are left out of the average.
np.nanmean(np.where(np.isfinite(scores), scores, np.nan), axis = 0)