In [5]:
pip install gradio



In [1]:
pip install unsloth



In [8]:
import gradio as gr
from transformers import TextStreamer
import torch
from unsloth import FastLanguageModel

# Model Initialization
# Checkpoint identifier and loading options, kept as module-level names.
model_name = "mana0/lora_model"
max_seq_length = 2048
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype - confirm in Unsloth docs

# Collect the loader arguments in one place, then load model and tokenizer.
_load_kwargs = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

# Prepare the loaded model for generation through Unsloth's inference hook.
FastLanguageModel.for_inference(model)

# Response generation function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
) -> str:
    """Generate the assistant's reply for one chat turn.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
            Empty members of a pair are skipped.
        system_message: System prompt; left out of the conversation when blank.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only, with
        surrounding whitespace stripped.
    """
    # Construct conversation messages; a blank system prompt is not sent.
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    for turn in history or []:
        if turn[0]:  # User message
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:  # Assistant response
            messages.append({"role": "assistant", "content": turn[1]})

    # Add the latest user input
    messages.append({"role": "user", "content": message})

    # Tokenize the prompt and place it on the same device as the model.
    input_ids = tokenizer.apply_chat_template(
        conversation=messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # A single prompt carries no padding, so every position is attended to.
    # Comparing against pad_token_id would fail when it is None and would
    # wrongly mask real tokens when the pad id is shared with another token.
    attention_mask = torch.ones_like(input_ids)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=int(max_tokens),
        use_cache=True,
        # temperature / top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # The output starts with the prompt tokens; keep only what follows them.
    # Slicing by token position avoids splitting on the word "assistant",
    # which would truncate any reply that contains that word.
    prompt_length = input_ids.shape[-1]
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Gradio ChatInterface
# Extra controls handed to `respond`; their order matches its parameters
# system_message, max_tokens, temperature, top_p.
_extra_controls = [
    gr.Textbox(value="You are a helpful and friendly assistant.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]

demo = gr.ChatInterface(
    respond,
    additional_inputs=_extra_controls,
    title="Custom Chatbot",
    description="Interact with an AI assistant using a custom unsloth model.",
)

# Start the web UI only when this file is run as the main program.
if __name__ == "__main__":
    demo.launch(debug=True)


==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://89048e7bb537e76e86.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7863 <> https://89048e7bb537e76e86.gradio.live
