In [1]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install torchinfo gradio

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl==0.15.2
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.5.6-py3-none-any.whl.metadata (8.0 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━

In [2]:
import unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:

from unsloth import FastLanguageModel
# message history part remains
import re
import os
# If a CUDA version incompatibility error persists, try setting this:
# os.environ["BNB_CUDA_VERSION"] = "117"
# print(os.environ)


import torch
import gradio as gr
from transformers import TextIteratorStreamer

# Set environment variable to control model cache location (optional)
# Uncomment and modify the path if you want to change the default cache location
# os.environ["HF_HOME"] = "/path/to/your/model/cache"



# Model configuration
MODEL_ID = "sayantanBiswas/mistral-7b-v0.3"  # Change this to your preferred model
MAX_SEQ_LENGTH = 4096
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Only probe bf16 support when a CUDA device is actually present: on some
# PyTorch releases is_bf16_supported() inspects the current CUDA device and
# fails on CPU-only machines. Without CUDA we fall back to float16 as before.
DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float16

class ChatModel:
    """Chat wrapper around an Unsloth ``FastLanguageModel`` using Alpaca-style prompts.

    Attributes:
        model, tokenizer: Objects returned by ``FastLanguageModel.from_pretrained``.
        generation_config: Keyword arguments forwarded to ``model.generate``.
        history_log_path: File that every formatted history is appended to.
            Set it to ``None`` (or ``""``) to disable the log.
    """

    # Debug log of the raw chat histories passed to format_chat_history().
    history_log_path = 'history.txt'

    # Fixed instruction placed in the "### Instruction:" section of every prompt.
    INSTRUCTION = "Answer the user's question accurately, thoroughly, and helpfully. Provide clear explanations with relevant details. If asked about medical or health-related topics, give informative responses while maintaining a balanced and educational tone."

    # Alpaca template. It is assembled from explicit lines instead of an
    # indented triple-quoted literal so that no source-code indentation leaks
    # in front of the "###" headers or after "### Response:".
    ALPACA_PROMPT = (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )

    def __init__(self, model_id, max_length=4096):
        """Load the model and tokenizer and set the sampling parameters.

        Args:
            model_id: Model name passed to ``FastLanguageModel.from_pretrained``.
            max_length: Value passed as ``max_seq_length`` when loading.
        """
        print(f"Loading model {model_id} on {DEVICE} with {DTYPE}...")

        # Load the model with Unsloth
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_id,
            max_seq_length=max_length,
            dtype=DTYPE
        )

        # Switch Unsloth to its inference path before any generate() call.
        FastLanguageModel.for_inference(self.model)

        # Configure generation parameters
        self.generation_config = {
            "max_new_tokens": 1024,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "repetition_penalty": 1.2,
            "do_sample": True,
            "use_cache": True,
        }

        print("Model loaded successfully!")

    def generate_response(self, history):
        """Generate a reply for the latest turn of ``history``.

        Args:
            history: Sequence of ``[user_message, bot_message]`` pairs; the last
                pair holds the message to answer.

        Yields:
            A single string: the decoded first output sequence with special
            tokens removed. Callers are expected to locate the
            ``### Response:`` section inside it themselves.
        """
        # Format the chat history for the model
        formatted_prompt = self.format_chat_history(history)
        print('model input: ', formatted_prompt)

        # Tokenize the input and move every tensor to the target device
        inputs = self.tokenizer([formatted_prompt], return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        # Generate without streaming
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self.generation_config
            )

        # Kept as a generator so callers can iterate over the result.
        yield self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    @staticmethod
    def _find_sections(label, text):
        """Return the stripped, non-empty bodies of every ``### <label>:`` section."""
        pattern = r'### ' + re.escape(label) + r':\s+(.*?)(?=###|\Z)'
        stripped = (body.strip() for body in re.findall(pattern, text, re.DOTALL))
        return [body for body in stripped if body]

    def extract_input_response(self, text):
        """Extract ``### Input:`` and ``### Response:`` sections from ``text``.

        Returns:
            A dict with an ``'Input'`` and/or ``'Response'`` key, each mapping
            to the list of non-empty section bodies found, or ``None`` when
            neither kind of section is present.
        """
        results = {}
        for label in ('Input', 'Response'):
            bodies = self._find_sections(label, text)
            if bodies:
                results[label] = bodies

        # If nothing was found, return None
        return results or None

    def format_chat_history(self, history):
        """Format prompt using the last turn, with one prior turn for context.

        Args:
            history: Non-empty sequence of ``[user_message, bot_message]``
                pairs. ``history[-1][0]`` is the message to answer.

        Returns:
            The Alpaca prompt string, ending with ``"### Response:\\n"``.
        """
        if self.history_log_path:
            # Explicit UTF-8 so non-ASCII chat text cannot break logging on
            # systems whose default encoding is not UTF-8.
            with open(self.history_log_path, 'a', encoding='utf-8') as log_file:
                log_file.write(str(history) + '\n' + '*' * 10 + '\n')

        prev_user = ""
        prev_assistant = ""

        # Include one previous exchange as context (optional)
        if len(history) > 1:
            print('history: ', history, len(history))

            prev_question = history[-2][0] or ""
            prev_reply = history[-2][1] or ""

            # A stored reply that still carries "### Input:/### Response:"
            # markers is parsed as before. A reply that is already plain text
            # would yield no sections, so the raw previous turn is used instead
            # of silently dropping the context.
            sections = self.extract_input_response(prev_reply)
            if sections is not None:
                prev_user = '\n'.join(sections.get('Input', []))
                prev_assistant = '\n'.join(sections.get('Response', []))
            else:
                prev_user, prev_assistant = prev_question, prev_reply

        current_user_input = history[-1][0] or ""

        # Join only the non-empty pieces so the Input section never starts
        # with a stray blank line.
        segments = (prev_user, prev_assistant, current_user_input)
        full_input = '\n\n'.join(
            segment.strip() for segment in segments if segment and segment.strip()
        )

        return self.ALPACA_PROMPT.format(self.INSTRUCTION, full_input)


# Initialize the model. The configured context length is passed explicitly so
# that changing MAX_SEQ_LENGTH actually takes effect instead of being shadowed
# by the constructor's default.
chat_model = ChatModel(MODEL_ID, max_length=MAX_SEQ_LENGTH)

# Build the Gradio chat UI: header, chat window, message box and clear button
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 1rem">
            <h1>Unsloth LLM Chat</h1>
            <p>Chat with a large language model powered by Unsloth's FastLanguageModel.</p>
        </div>
    """)

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(
        label="Message",
        placeholder="Type your message here and press Enter",
        lines=1,
        submit_btn=True
    )
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Append the submitted message as a new turn with an empty reply.

        Returns the new textbox value (empty string, which clears the box)
        and the extended history.
        """
        print("user_message, history: ", user_message, history)
        extended_history = history + [[user_message, ""]]
        return "", extended_history

    def bot(history):
        """Fill in the reply of the latest turn and yield the updated history."""
        for decoded_text in chat_model.generate_response(history):
            # Keep only the text between "### Response:" and the next "###"
            # marker (or the end of the decoded text).
            found = re.search(r"### Response:(.*?)(?=###|\Z)", decoded_text, re.DOTALL)
            reply = found.group(1).strip() if found else "Failed to extract response"

            # Overwrite the placeholder reply of the most recent turn in place
            history[-1][1] = reply
            yield history

    # First record the user's turn, then run the model on the updated history
    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    submit_event.then(bot, chatbot, chatbot)

    # The clear button resets the chat window to an empty history
    clear.click(lambda: [], None, chatbot, queue=False)

# Start the app only when this code runs as the main module
if __name__ == "__main__":
    queued_demo = demo.queue()
    # share=True requests a public share link; set share=False for no public link
    queued_demo.launch(share=True)

Loading model sayantanBiswas/mistral-7b-v0.3 on cuda with torch.float16...
==((====))==  Unsloth 2025.5.4: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2025.5.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Model loaded successfully!


  chatbot = gr.Chatbot(height=600)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://10dd9d7dfcd2f68836.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
