# Load Fine Tuned Models

In [1]:
# Import necessary libraries
import os
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig


# Function to load and prepare the model for inference
def load_model(model_name):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=512,
        dtype=None,  # Auto-detect dtype
        load_in_4bit=True,
        device_map="auto",
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    return model, tokenizer

# Load the sleep model
sleep_model_name = "thinkersloop/sleep-llm-model"
sleep_model, sleep_tokenizer = load_model(sleep_model_name)

# Load the car model
car_model_name = "thinkersloop/car-llm-model"
car_model, car_tokenizer = load_model(car_model_name)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Mistral patching release 2024.7
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.
Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


==((====))==  Unsloth: Fast Mistral patching release 2024.7
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [2]:
# Function to perform inference
def generate_response(model, tokenizer, prompt):
    try:
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=64,
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(
            outputs[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True
        ).replace("\n", "")
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return None

In [3]:
# Example prompts for inference
sleep_prompt = "What are the benefits of sleep for mental health?"
car_prompt = "What were the key innovations that led to the development of the first gasoline-powered automobiles?"

# Generate responses
sleep_response = generate_response(sleep_model, sleep_tokenizer, sleep_prompt)
car_response = generate_response(car_model, car_tokenizer, car_prompt)

# Print the responses
print("Sleep Model Response:")
print(sleep_response)
print("-" * 50)
print("Car Model Response:")
print(car_response)

Sleep Model Response:
Sleep is essential for overall health, including mental health. Poor sleep quality or quantity has been linked to various mental health issues, such as anxiety, depression, and cognitive impairment. On the other hand, good sleep can improve mental health by promoting emotional regulation, memory consolidation, and stress management. In this article
--------------------------------------------------
Car Model Response:
The first gasoline-powered automobiles were developed through a series of key innovations, including the development of the first practical internal combustion engine by Nicolas-Joseph Cugnot in 1769, the creation of the first modern automobile by Carl Benz in 


# Data Prep for Evaluation

In [4]:
from datasets import load_dataset

# Load test datasets
sleep_test_dataset = load_dataset("thinkersloop/sleep-dataset-llm", split="test")
car_test_dataset = load_dataset("thinkersloop/car-dataset-llm", split="test")

In [5]:
import json
from sklearn.metrics import accuracy_score, classification_report


# Function to evaluate the model
def evaluate_model(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        messages = json.loads(example["messages"])
        prompt = messages["messages"][0]["content"]
        reference = messages["messages"][1]["content"]
        response = generate_response(model, tokenizer, prompt).replace("\n", "")
        predictions.append(response)
        references.append(reference)

    return predictions, references


# Evaluate sleep model
print("Evaluating sleep model...")
sleep_predictions, sleep_references = evaluate_model(
    sleep_model, sleep_tokenizer, sleep_test_dataset
)

# Evaluate car model
print("Evaluating car model...")
car_predictions, car_references = evaluate_model(
    car_model, car_tokenizer, car_test_dataset
)

Evaluating sleep model...
Evaluating car model...


In [6]:
sleep_predictions

['Insomnia , hypersomnia , and narcolepsy are all sleep disorders that can have a significant impact on an individual ’s quality of life . However , they differ in terms of their underlying causes , symptoms , and treatment approaches .Insomnia is a sleep disorder characterized by difficulty falling',
 'The different stages of sleep are NREM and REM sleep . NREM sleep is further divided into stages N1 , N2 , and N3 . N1 is the lightest stage of sleep , where the person is drowsy and can be easily awakened . N2 is a',
 'Sleep spindles and slow oscillations are two important brain wave patterns that occur during non-rapid eye movement (NREM) sleep. Sleep spindles are brief bursts of high-frequency brain activity that typically last for 0.5 to 1.5 seconds and occur every']

In [7]:
sleep_references

['Insomnia , hypersomnia , and narcolepsy are distinct sleep disorders with different diagnostic criteria . Insomnia is characterized by difficulty falling asleep or staying asleep , while hypersomnia involves excessive sleepiness even after adequate sleep . Narcolepsy is a neurological disorder characterized by excessive daytime sleepiness , hallucinations , and cataplexy ( sudden loss of muscle tone ) . The diagnostic criteria for these disorders do not overlap directly , but they may co - occur in some individuals . For instance , a person with insomnia may also experience excessive daytime sleepiness , which is a symptom of hypersomnia or narcolepsy . However , a proper diagnosis would require a comprehensive evaluation by a healthcare professional .',
 'The different stages of sleep include N1 , N2 , N3 ( also referred to as N4 ) , and REM sleep . N1 is the drowsy stage where brain waves and muscle activity begin to decrease . N2 is a light sleep stage where eye movement stops , b

### Performation Evaluation

In [8]:
sample_inputs = [
    "What are the benefits of sleep for mental health?",
    "What were the key innovations that led to the development of the first gasoline-powered automobiles?",
]

# Tokenize inputs
encoded_inputs = [
    sleep_tokenizer.encode(text, return_tensors="pt").to("cuda")
    for text in sample_inputs
]

In [9]:
import time as python_time
import numpy as np
import torch


def measure_inference_time(model, input_ids, num_repeats=3):
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
        enable_timing=True
    )
    timings = np.zeros((num_repeats, 1))

    # GPU warm-up
    for _ in range(3):
        _ = model.generate(input_ids, max_new_tokens=64)

    # Measure performance
    with torch.no_grad():
        for rep in range(num_repeats):
            torch.cuda.synchronize()
            starter.record()
            _ = model.generate(input_ids, max_new_tokens=64)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[rep] = curr_time

    return timings.flatten()


def calculate_throughput(model, input_ids, batch_size, duration=10):
    start_time = python_time.time()
    count = 0
    while python_time.time() - start_time < duration:
        _ = model.generate(input_ids.repeat(batch_size, 1), max_new_tokens=20)
        count += batch_size

    throughput = count / duration
    return throughput


def measure_gpu_utilization():
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated()
    else:
        return None

In [10]:
# Measure inference time
inference_times = []
for input_ids in encoded_inputs:
    times = measure_inference_time(sleep_model, input_ids)
    inference_times.extend(times)

# Calculate statistics
mean_time = np.mean(inference_times)
std_time = np.std(inference_times)
percentile_95 = np.percentile(inference_times, 95)

# Measure throughput (QPS)
batch_size = 4  # Adjust based on your GPU memory
qps = calculate_throughput(sleep_model, encoded_inputs[0], batch_size)

# Measure GPU utilization
gpu_utilization = measure_gpu_utilization()

# Print results
print(f"Average Inference Time: {mean_time:.2f} ms")
print(f"Standard Deviation of Inference Time: {std_time:.2f} ms")
print(f"95th Percentile Latency: {percentile_95:.2f} ms")
print(f"Queries Per Second (QPS): {qps:.2f}")
print(
    f"GPU Utilization: {gpu_utilization:.2%}"
    if gpu_utilization
    else "GPU Utilization: N/A"
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Average Inference Time: 2259.29 ms
Standard Deviation of Inference Time: 10.02 ms
95th Percentile Latency: 2274.62 ms
Queries Per Second (QPS): 2.80
GPU Utilization: 98.75%


In [11]:
rpm = qps * 60
print(f"Requests Per Minute (RPM): {rpm:.2f}")

Requests Per Minute (RPM): 168.00


### ROUGE Evaluation

In [12]:
from rouge_score import rouge_scorer


def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = []

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)  # Note that reference comes first here
        scores.append(score)

    # Average scores
    avg_scores = {
        key: sum(score[key].fmeasure for score in scores) / len(scores)
        for key in scores[0].keys()
    }
    return avg_scores


sleep_rouge_scores = calculate_rouge(sleep_predictions, sleep_references)
print("Sleep Model ROUGE Scores:", sleep_rouge_scores)


car_rouge_scores = calculate_rouge(car_predictions, car_references)
print("Car Model ROUGE Scores:", car_rouge_scores)


Sleep Model ROUGE Scores: {'rouge1': 0.3012691012691013, 'rouge2': 0.1251693017946506, 'rougeL': 0.18873348873348875}
Car Model ROUGE Scores: {'rouge1': 0.33112472515337554, 'rouge2': 0.10982220912707506, 'rougeL': 0.21737507194154593}
