In [1]:
!pip install --no-deps accelerate peft trl==0.15.2 cut_cross_entropy
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install torchinfo gradio

Collecting trl==0.15.2
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Downloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cut_cross_entropy-25.1.1-py3-none-any.whl (22 kB)
Installing collected packages: trl, cut_cross_entropy
Successfully installed cut_cross_entropy-25.1.1 trl-0.15.2
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.meta

In [2]:
pip uninstall bitsandbytes triton xformers -y

[0mFound existing installation: triton 3.2.0
Uninstalling triton-3.2.0:
  Successfully uninstalled triton-3.2.0
[0m

In [3]:
# message history part remains
import re
import os
# If a CUDA version error persists (an incompatibility issue), try this:
# os.environ["BNB_CUDA_VERSION"] = "117"
# print(os.environ)


import torch
import gradio as gr
from transformers import TextIteratorStreamer

# Set environment variable to control model cache location (optional)
# Uncomment and modify the path if you want to change the default cache location
# os.environ["HF_HOME"] = "/path/to/your/model/cache"



# Model configuration
MODEL_ID = "sayantanBiswas/mistral-7b-v0.3"  # Change this to your preferred model
MAX_SEQ_LENGTH = 4096
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Probe bf16 support only when a GPU is actually present: without CUDA the
# answer is irrelevant, and the probe may raise on some torch releases
# (TODO confirm for the pinned torch version). CPU runs keep float16 as before.
DTYPE = (
    torch.bfloat16
    if DEVICE == "cuda" and torch.cuda.is_bf16_supported()
    else torch.float16
)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class ChatModel:
    """Causal language model plus tokenizer, prompted in Alpaca format.

    ``history`` arguments are Gradio-style chat histories: a list of
    ``[user_message, assistant_message]`` pairs, oldest first, where the last
    pair holds the message that is waiting for an answer.
    """

    # Text placed in the "### Instruction:" section of every prompt.
    _INSTRUCTION = (
        "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone."
    )

    # Alpaca prompt layout. It is assembled from explicit lines so that the
    # indentation of this source file never leaks into the text sent to the
    # model (a triple-quoted literal inside a method would carry it along).
    _PROMPT_TEMPLATE = (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )

    def __init__(self, model_id, max_length=4096):
        """Load the tokenizer and the model for ``model_id``.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            max_length: Accepted for interface compatibility; this class does
                not currently use it.
        """
        print(f"Loading model {model_id} on {DEVICE} with {DTYPE}...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        # NOTE(review): the model is pinned to the CPU even when DEVICE is
        # "cuda", so the message above can misreport the device — confirm
        # that CPU-only loading is intended.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=DTYPE
        ).to('cpu')

        # Sampling settings forwarded unchanged to ``model.generate``.
        self.generation_config = {
            "max_new_tokens": 1024,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.2,
            "do_sample": True,
            "use_cache": True,
        }

        print("Model loaded successfully!")

    def generate_response(self, history):
        """Yield the decoded model output for the latest turn of ``history``.

        This is a generator that yields exactly once. The yielded string is
        the decoded first output sequence with special tokens removed; the
        caller in this notebook looks for the text after the
        "### Response:" marker in it.
        """
        formatted_prompt = self.format_chat_history(history)
        print('model input: ', formatted_prompt)

        # Keep the input tensors on the same device as the model.
        inputs = self.tokenizer([formatted_prompt], return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self.generation_config
            )

        generated_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        yield generated_response

    def extract_input_response(self, text):
        """Collect the bodies of all "### Input:" and "### Response:" sections.

        Each section body runs from its marker up to the next "###" or the end
        of ``text``. Bodies are stripped and empty ones are dropped.

        Returns:
            A dict that maps 'Input' and/or 'Response' to a list of section
            bodies, or None when neither kind of section has a non-empty body.
        """
        results = {}
        for section in ("Input", "Response"):
            pattern = rf'### {section}:\s+(.*?)(?=###|\Z)'
            bodies = [
                body.strip()
                for body in re.findall(pattern, text, re.DOTALL)
                if body.strip()
            ]
            if bodies:
                results[section] = bodies
        return results or None

    def format_chat_history(self, history):
        """Format prompt using only the last turn, optionally with one prior turn for context.

        Side effect: appends ``str(history)`` to ``history.txt`` in the
        current working directory on every call.

        Args:
            history: Chat history; must contain at least one pair.

        Returns:
            The Alpaca-style prompt string, ending with the "### Response:"
            marker line.
        """
        # Debug log of every history this method is asked to format.
        with open('history.txt', 'a', encoding='utf-8') as f:
            f.write(str(history)+'\n'+'*'*10+'\n')

        context = ""

        # Include one previous exchange as context (optional)
        if len(history) > 1:
            print('history: ', history, len(history))

            prev_turn = history[-2]
            prev_user = prev_turn[0] or ""
            prev_assistant = prev_turn[1] or ""

            # A stored reply that still carries Alpaca section markers is
            # parsed as before. Replies that were already reduced to plain
            # text contain no markers, so the turn is used verbatim instead
            # of being silently dropped.
            parsed = self.extract_input_response(prev_assistant)
            if parsed is not None:
                prev_user = '\n'.join(parsed.get('Input', []))
                prev_assistant = '\n'.join(parsed.get('Response', []))

            if prev_user or prev_assistant:
                context = f"\n{prev_user}\n\n{prev_assistant}\n"

        current_user_input = history[-1][0]
        full_input = context + f"\n{current_user_input}"

        return self._PROMPT_TEMPLATE.format(self._INSTRUCTION, full_input)


# Load the model once; the UI callbacks below share this instance.
chat_model = ChatModel(MODEL_ID)

# Assemble the chat UI: header, conversation pane, input box and a clear button.
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 1rem">
            <h1>Unsloth LLM Chat</h1>
            <p>Chat with a large language model powered by Unsloth's FastLanguageModel.</p>
        </div>
    """)

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(
        lines=1,
        label="Message",
        placeholder="Type your message here and press Enter",
        submit_btn=True,
    )
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Open a new turn for the submitted message and empty the textbox."""
        print("user_message, history: ", user_message, history)
        extended_history = history + [[user_message, ""]]
        return "", extended_history

    def bot(history):
        """Write the model's answer into the open (last) turn of ``history``."""
        # Captures what follows the response marker, up to the next "###" or the end.
        answer_pattern = r"### Response:(.*?)(?=###|\Z)"

        for raw_output in chat_model.generate_response(history):
            found = re.search(answer_pattern, raw_output, re.DOTALL)
            answer = found.group(1).strip() if found else "Failed to extract response"

            # Only the reply slot of the newest turn is overwritten.
            history[-1][1] = answer
            yield history

    # Submitting first records the user message, then lets the model answer.
    msg.submit(
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
    ).then(fn=bot, inputs=chatbot, outputs=chatbot)

    # The clear button resets the conversation pane to an empty history.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot, queue=False)

# Start the app when this code runs as the main module.
if __name__ == "__main__":
    demo.queue().launch(share=True)  # Set share=False for no public link



Loading model sayantanBiswas/mistral-7b-v0.3 on cpu with torch.float16...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/862 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

class ChatModel:
    """4-bit quantized variant of the chat model, prompted in Alpaca format.

    This definition replaces any earlier ``ChatModel`` in the kernel, so it
    carries its own prompt helpers rather than a placeholder; without them
    ``generate_response`` would fail on ``self.format_chat_history``.

    ``history`` arguments are Gradio-style chat histories: a list of
    ``[user_message, assistant_message]`` pairs, oldest first, where the last
    pair holds the message that is waiting for an answer.
    """

    # Text placed in the "### Instruction:" section of every prompt.
    _INSTRUCTION = (
        "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone."
    )

    # Alpaca prompt layout, assembled from explicit lines so that source
    # indentation never ends up in the text sent to the model.
    _PROMPT_TEMPLATE = (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )

    def __init__(self, model_id, max_length=4096):
        """Load the tokenizer and the model for ``model_id`` with a 4-bit config.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            max_length: Accepted for interface compatibility; this class does
                not currently use it.
        """
        print(f"Loading model {model_id} on {DEVICE} with {DTYPE}...")

        # Optional: If you're using 4-bit model
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=DTYPE,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=DTYPE,
            quantization_config=bnb_config  # Remove this line if model is not quantized
        )

        # Sampling settings forwarded unchanged to ``model.generate``.
        self.generation_config = {
            "max_new_tokens": 1024,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.2,
            "do_sample": True,
            "use_cache": True,
        }

        print("Model loaded successfully!")

    def generate_response(self, history):
        """Yield the decoded model output for the latest turn of ``history``.

        This is a generator that yields exactly once. The yielded string is
        the decoded first output sequence with special tokens removed.
        """
        formatted_prompt = self.format_chat_history(history)
        print('model input: ', formatted_prompt)

        # Keep the input tensors on the same device as the model.
        inputs = self.tokenizer([formatted_prompt], return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self.generation_config
            )

        generated_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        yield generated_response

    def extract_input_response(self, text):
        """Collect the bodies of all "### Input:" and "### Response:" sections.

        Each section body runs from its marker up to the next "###" or the end
        of ``text``. Bodies are stripped and empty ones are dropped.

        Returns:
            A dict that maps 'Input' and/or 'Response' to a list of section
            bodies, or None when neither kind of section has a non-empty body.
        """
        results = {}
        for section in ("Input", "Response"):
            pattern = rf'### {section}:\s+(.*?)(?=###|\Z)'
            bodies = [
                body.strip()
                for body in re.findall(pattern, text, re.DOTALL)
                if body.strip()
            ]
            if bodies:
                results[section] = bodies
        return results or None

    def format_chat_history(self, history):
        """Format prompt using only the last turn, optionally with one prior turn for context.

        Side effect: appends ``str(history)`` to ``history.txt`` in the
        current working directory on every call.

        Args:
            history: Chat history; must contain at least one pair.

        Returns:
            The Alpaca-style prompt string, ending with the "### Response:"
            marker line.
        """
        # Debug log of every history this method is asked to format.
        with open('history.txt', 'a', encoding='utf-8') as f:
            f.write(str(history)+'\n'+'*'*10+'\n')

        context = ""

        # Include one previous exchange as context (optional)
        if len(history) > 1:
            print('history: ', history, len(history))

            prev_turn = history[-2]
            prev_user = prev_turn[0] or ""
            prev_assistant = prev_turn[1] or ""

            # A stored reply that still carries Alpaca section markers is
            # parsed into its sections; a plain-text reply has no markers and
            # is used verbatim together with the user message of that turn.
            parsed = self.extract_input_response(prev_assistant)
            if parsed is not None:
                prev_user = '\n'.join(parsed.get('Input', []))
                prev_assistant = '\n'.join(parsed.get('Response', []))

            if prev_user or prev_assistant:
                context = f"\n{prev_user}\n\n{prev_assistant}\n"

        current_user_input = history[-1][0]
        full_input = context + f"\n{current_user_input}"

        return self._PROMPT_TEMPLATE.format(self._INSTRUCTION, full_input)