<a href="https://colab.research.google.com/github/peremartra/FinLLMOpt/blob/Llama-FinSent-SI/Llama-FinSent-S-Instruct/Gradio_Interface_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gradio


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.0/322.0 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_mac

In [3]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Detect device: use CUDA if available, otherwise check for MPS (Apple Silicon), else CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu"

# Cache for loaded models to avoid reloading on every request.
model_cache = {}


In [4]:
def load_model(model_name):
    """
    Loads and caches the tokenizer and model from Hugging Face.
    """
    if model_name not in model_cache:
        print(f"Loading {model_name} on {device} ...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.to(device)
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]

def load_model_action(model_choice):
    """
    Action triggered by the 'Load Model' button.
    Loads the selected model and returns a status message.
    """
    try:
        load_model(model_choice)
        return f"Model '{model_choice}' loaded successfully."
    except Exception as e:
        return f"Error loading model '{model_choice}': {str(e)}"

def chat(model_choice, message, history):
    """
    Appends the new message to the conversation history, builds the input context,
    generates a response from the chosen model, and returns the updated conversation.
    """
    if history is None:
        history = []

    # Build conversation context from history.
    conversation = ""
    for user_msg, bot_msg in history:
        conversation += f"User: {user_msg}\nBot: {bot_msg}\n"
    conversation += f"User: {message}\nBot:"

    tokenizer, model = load_model(model_choice)

    # Encode the conversation and generate a response.
    input_ids = tokenizer.encode(conversation, return_tensors="pt").to(device)
    output_ids = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + 100,  # adjust max_length as needed
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the output and strip unwanted tokens.
    output_text = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    output_text = output_text.strip()

    # Append the new turn to the conversation history.
    history.append((message, output_text))
    return history, history

# Build the Gradio interface using Blocks.
with gr.Blocks() as demo:
    gr.Markdown("# Hugging Face Model Chatbot")
    # Row for model selection and load button.
    with gr.Row():
        model_choice = gr.Dropdown(
            choices=["meta-llama/Llama-3.2-1B-Instruct", "oopere/Llama-FinSent-S"],
            value="meta-llama/Llama-3.2-1B-Instruct",
            label="Select Model"
        )
        load_button = gr.Button("Load Model")
        load_status = gr.Textbox(label="Model Status", interactive=False)

    chatbot = gr.Chatbot(label="Chat Conversation")

    # Row for user input and send button.
    with gr.Row():
        message = gr.Textbox(label="Your Message", placeholder="Type your message here...", lines=1)
        send_button = gr.Button("Send")

    # State to store conversation history.
    state = gr.State([])

    # Link the "Load Model" button to load the model and update the status.
    load_button.click(fn=load_model_action, inputs=model_choice, outputs=load_status)

    # Link both the Send button and pressing Enter in the textbox to send a message.
    send_button.click(fn=chat, inputs=[model_choice, message, state], outputs=[chatbot, state])
    message.submit(fn=chat, inputs=[model_choice, message, state], outputs=[chatbot, state])



In [6]:
# Launch the interface.
demo.launch(debug=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2a5d6c8ccee4e29215.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Loading meta-llama/Llama-3.2-1B-Instruct on cuda ...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Loading oopere/Llama-FinSent-S on cuda ...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2a5d6c8ccee4e29215.gradio.live


