In [None]:
# Import necessary libraries
import os
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig


# Function to load and prepare the model for inference
def load_model(model_name, max_seq_length=512, load_in_4bit=True):
    """Load a model and its tokenizer through Unsloth and switch it to inference mode.

    Args:
        model_name: Identifier passed to ``FastLanguageModel.from_pretrained``
            as ``model_name``.
        max_seq_length: Maximum sequence length forwarded to the loader.
            Defaults to 512, the value this notebook previously hard-coded.
        load_in_4bit: Whether to request 4-bit loading. Defaults to True,
            matching the previous hard-coded behaviour.

    Returns:
        A ``(model, tokenizer)`` tuple as returned by
        ``FastLanguageModel.from_pretrained``.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,  # None lets the loader pick the dtype
        load_in_4bit=load_in_4bit,
        device_map="auto",
    )
    # Called for its side effect on `model`; the return value is not used.
    FastLanguageModel.for_inference(model)
    return model, tokenizer


# Function to perform inference
def generate_response(model, tokenizer, prompt, max_length=512):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        model: Model exposing ``generate`` (e.g. the one returned by ``load_model``).
        tokenizer: Tokenizer paired with ``model``.
        prompt: Text to condition the generation on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Note that in Hugging Face ``generate`` this bounds the *total*
            length (prompt tokens plus generated tokens). Defaults to 512.

    Returns:
        The decoded continuation, with the prompt tokens sliced off and
        special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # The tokenizer output is not placed on any particular device, while the
    # model is loaded with device_map="auto"; move the inputs next to the
    # model so generate() does not hit a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    outputs = model.generate(**inputs, max_length=max_length)

    # The returned sequence starts with the prompt tokens; drop them so only
    # the continuation is decoded.
    prompt_length = len(inputs["input_ids"][0])
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
# Identifiers of the two models used below (presumably Hugging Face Hub repo ids)
sleep_model_name = "thinkersloop/sleep-llm-model"
car_model_name = "thinkersloop/car-llm-model"

# Load each model together with its tokenizer: sleep first, then car
sleep_model, sleep_tokenizer = load_model(sleep_model_name)
car_model, car_tokenizer = load_model(car_model_name)


In [None]:
# One example question per model
sleep_prompt = "What are the benefits of sleep for mental health?"
car_prompt = (
    "What were the key innovations that led to the development of the first gasoline-powered automobiles?"
)

# Ask each model its own question (sleep model first, then car model)
sleep_response = generate_response(sleep_model, sleep_tokenizer, sleep_prompt)
car_response = generate_response(car_model, car_tokenizer, car_prompt)

# Show each answer under its heading; the second heading starts with a blank line
for heading, answer in (
    ("Sleep Model Response:", sleep_response),
    ("\nCar Model Response:", car_response),
):
    print(heading)
    print(answer)