<a href="https://colab.research.google.com/github/peremartra/FinLLMOpt/blob/FinChat-XS-Instruct/Gradio_Interface_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FinChat-XS

Example of use for FinChat. This notebook uses a Gradio interface to chat with FinChat and

In [1]:
!pip install -q gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.0/322.0 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Detect device: use CUDA if available, otherwise check for MPS (Apple Silicon), else CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu"

# Cache for loaded models to avoid reloading on every request.
model_cache = {}


In [3]:
def load_model(model_name):
    """
    Loads and caches the tokenizer and model from Hugging Face.
    """
    if model_name not in model_cache:
        print(f"Loading {model_name} on {device} ...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.to(device)
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]

def load_model_action(model_choice):
    """
    Action triggered by the 'Load Model' button.
    Loads the selected model and returns a status message.
    """
    try:
        load_model(model_choice)
        return (f"Model '{model_choice}' loaded successfully.", [])
    except Exception as e:
        return (f"Error loading model '{model_choice}': {str(e)}", [])



In [4]:
def chat(model_choice, message, history):
    """
    Appends the new message to the conversation history, builds the chat input
    using the tokenizer's apply_chat_template method, generates a response, and returns
    the updated conversation.
    """
    # Initialize history as list of (user, assistant) tuples if not provided
    if history is None:
        history = []

    # Convert the history (stored as tuples) into a list of message dictionaries
    conversation = []
    conversation.append({"role": "system", "content": "You are FinChat, a helpful and concise AI assistant. Respond in a friendly, professional tone. Prioritize accuracy, and if unsure, ask clarifying questions."})
    for user_text, bot_text in history:
        conversation.append({"role": "user", "content": user_text})
        conversation.append({"role": "assistant", "content": bot_text})
    # Append the new user message
    conversation.append({"role": "user", "content": message})

    tokenizer, model = load_model(model_choice)

    # Format the conversation using the chat template method provided by the tokenizer.
    # This method converts the list of message dicts into a model-specific formatted string.
    input_text = tokenizer.apply_chat_template(conversation, tokenize=False)

    # Encode the formatted input and generate a response
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output_ids = model.generate(
        inputs,
        max_new_tokens=150,  # adjust as needed
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
    )

    # Decode only the newly generated tokens
    output_text = tokenizer.decode(output_ids[0][inputs.shape[1]:], skip_special_tokens=True).strip()
    # Add code to remove leading "assistant:" or similar prefixes
    output_text = output_text.lstrip("assistant").strip()  # Simple removal

    # Append the assistant's reply to history (as a tuple for display)
    history.append((message, output_text))
    return history, history

In [13]:
# Build the Gradio interface using Blocks.
with gr.Blocks() as demo:
    gr.Markdown("# Hugging Face Model Chatbot")
    # Row for model selection and load button.
    with gr.Row():
        model_choice = gr.Dropdown(
            choices=["oopere/your-own-llm-model", "HuggingFaceTB/SmolLM2-360M-Instruct", "meta-llama/Llama-3.2-1B-Instruct"],
            value="oopere/your-own-llm-model",
            label="Select Model"
        )
        load_button = gr.Button("Load Model")
        load_status = gr.Textbox(label="Model Status", interactive=False)

    chatbot = gr.Chatbot(label="Chat Conversation", height=200)

    # Row for user input and send button.
    with gr.Row():
        message = gr.Textbox(label="Your Message", placeholder="Type your message here...", lines=1)
        send_button = gr.Button("Send")

    # State to store conversation history.
    state = gr.State([])

    # Link the "Load Model" button to load the model and update the status.
    load_button.click(fn=load_model_action, inputs=model_choice, outputs=[load_status, state])

    # Link both the Send button and pressing Enter in the textbox to send a message.
    send_button.click(fn=chat, inputs=[model_choice, message, state], outputs=[chatbot, state])
    message.submit(fn=chat, inputs=[model_choice, message, state], outputs=[chatbot, state])

In [14]:
# Launch the interface.
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1390e43419b83a9ed5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Loading oopere/your-own-llm-model on cuda ...


tokenizer_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/922 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2096, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1643, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
      

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1390e43419b83a9ed5.gradio.live


