# L5: Chat with any LLM! 💬

Load your HF API key and relevant Python libraries

In [None]:
import os
import io
import IPython.display
from PIL import Image
import base64 
import requests 
# NOTE(review): this only assigns a module attribute on `requests.adapters`;
# confirm that requests actually reads it. The documented way to bound a
# request is the per-call `timeout=` argument.
requests.adapters.DEFAULT_TIMEOUT = 60

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Raises KeyError when HF_API_KEY is absent from the process environment
# (including anything loaded from .env above).
hf_api_key = os.environ['HF_API_KEY']

In [None]:
# Helper function
import requests, json
from huggingface_hub import InferenceClient

# SmolLM-360M-Instruct
client = InferenceClient(
    "HuggingFaceTB/SmolLM-360M-Instruct",
    token=hf_api_key,
)

# # non-streamable chat_completion
# message =client.chat_completion(
# 	messages=[{"role": "user", "content": "What is the capital of France?"}],
# 	max_tokens=500,
# 	stream=False,
# )
# print(message.choices[0].message.content)

# streamable chat_completion: replay a short two-turn conversation and print
# the model's answer to the last user turn as the tokens arrive.
for message in client.chat_completion(
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris, the city of light and glamour! "},
        {"role": "user", "content": "what about the capital of Spain?"},
    ],
    max_tokens=50,
    stream=True,
):
    # A chunk's delta may carry no text (the streaming `respond` later in this
    # notebook guards against `None` for the same reason); print nothing for
    # such chunks instead of the literal string "None".
    print(message.choices[0].delta.content or "", end="", flush=True)

## Building an app to chat with any LLM

Here we'll be using the [HuggingFaceTB/SmolLM-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM-360M-Instruct) model, called through the Hugging Face `InferenceClient`.

In [None]:
# Question sent as the only user turn, preceded by a generic system message.
prompt = "Has math been invented or discovered?"
# Non-streaming call; the reply text is the cell's last expression, so the
# notebook displays it as the cell output.
client.chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
    max_tokens=100,
    stream=False,
).choices[0].message.content

In [None]:
#Back to Lesson 2, time flies!
import gradio as gr
def generate(input, slider):
    """Ask the model a single question and return its reply text.

    Args:
        input: The user's prompt, sent as the only message of the conversation.
        slider: Forwarded to ``chat_completion`` as ``max_tokens``.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    user_turn = {"role": "user", "content": input}
    completion = client.chat_completion(messages=[user_turn], max_tokens=slider)
    # The completion is an attribute-style object, not a dict.
    first_choice = completion.choices[0]
    return first_choice.message.content

# Single-turn demo: a prompt textbox plus a slider that sets the reply budget
# passed to `generate` as its second argument.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="User input:"),
        # step=1 keeps both the default (50) and the maximum (1024) on the
        # slider's grid; stepping by 10 from a minimum of 1 only produced
        # 1, 11, 21, ... so neither value was actually selectable.
        gr.Slider(label="Max tokens", value=50, maximum=1024, minimum=1, step=1),
    ],
    outputs=[gr.Textbox(label="ChatBot output:")],
)

gr.close_all()
# share=True also requests a public share URL in addition to the local server.
demo.launch(share=True)

## `gr.Chatbot()`

- `gr.Chatbot()` allows you to save the chat history (between the user and the LLM) as well as display the dialogue in the app.
- Define your `fn` to take in a `gr.Chatbot()` object.  
  - Within your defined `fn` function, append a tuple (or a list) containing the user message and the LLM's response:
`chatbot_object.append( (user_message, llm_message) )`

- Include the chatbot object in both the inputs and the outputs of the app.
- The chatbot cells below use the Hugging Face `InferenceClient` (the SmolLM-360M-Instruct model) to generate the LLM's responses.

In [None]:
import random

def respond(message, chat_history):
    """Answer ``message`` with a canned phrase and record the turn.

    No LLM is involved: the reply is picked at random from a fixed set of
    phrases. The ``(message, reply)`` pair is appended to ``chat_history``
    in place.

    Returns:
        ``("", chat_history)`` -- an empty string followed by the same
        history object that was passed in.
    """
    canned_replies = [
        "Tell me more about it",
        "Cool, but I'm not interested",
        "Hmmmm, ok then",
    ]
    reply = random.choice(canned_replies)
    chat_history.append((message, reply))
    return "", chat_history

with gr.Blocks() as demo:
    # Chat transcript; the height is kept small so it fits in the notebook.
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The button click and pressing Enter in the textbox share one handler;
    # its two return values go to the textbox and the chatbot respectively.
    btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

gr.close_all()
demo.launch(share=True)

In [None]:
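# NOTE: the string-prompt helper below is left commented out because the inference endpoint it was written for is not available;
# the `respond` function in this cell calls `client.chat_completion` instead.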
# def format_chat_prompt(message, chat_history):
#     prompt = ""
#     for turn in chat_history:
#         user_message, bot_message = turn
#         prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
#     prompt = f"{prompt}\nUser: {message}\nAssistant:"
#     return prompt
import requests, json
from huggingface_hub import InferenceClient
import gradio as gr

# Inference client bound to the SmolLM-360M-Instruct model, authenticated
# with the Hugging Face API key loaded at the top of the notebook.
client = InferenceClient(model="HuggingFaceTB/SmolLM-360M-Instruct", token=hf_api_key)

def respond(message, chat_history):
    """Send the conversation plus ``message`` to the model and record its reply.

    Args:
        message: The new user prompt.
        chat_history: List of ``(user_message, bot_message)`` turns. The new
            turn is appended to it in place. ``None`` is treated as an empty
            history.

    Returns:
        ``("", chat_history)`` -- an empty string followed by the updated
        history.

    Any exception raised while calling the model is printed and stored as the
    reply text (``"Error: ..."``) instead of being propagated.
    """
    if chat_history is None:
        chat_history = []
    # Placeholder turn; its reply is filled in below.
    chat_history.append((message, None))

    try:
        # Expand every turn into its user message followed by the assistant
        # reply. The turn still awaiting a reply (reply is None) contributes
        # only its user message, so the list ends on the new prompt.
        messages = []
        for user_message, past_reply in chat_history:
            messages.append({"role": "user", "content": user_message})
            if past_reply is not None:
                messages.append({"role": "assistant", "content": past_reply})

        response = client.chat_completion(
            messages=messages,
            max_tokens=128,
            stream=False,
        )
        bot_message = response.choices[0].message.content
        chat_history[-1] = (message, bot_message)
    except Exception as e:
        print(e)
        chat_history[-1] = (message, f"Error: {str(e)}")

    return "", chat_history

with gr.Blocks() as demo:
    # Chat transcript; the height is kept small so it fits in the notebook.
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The button click and pressing Enter in the textbox share one handler;
    # its two return values go to the textbox and the chatbot respectively.
    btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

gr.close_all()
demo.launch(share=True)

### Adding other advanced features

In [None]:
import requests, json
from huggingface_hub import InferenceClient
import gradio as gr

# Inference client bound to the SmolLM-360M-Instruct model, authenticated
# with the Hugging Face API key loaded at the top of the notebook.
client = InferenceClient(model="HuggingFaceTB/SmolLM-360M-Instruct", token=hf_api_key)

def respond(message, chat_history, instruction="You are a helpful assistant", temperature=0.7, top_p=0.9, stop=None, stream=False, max_tokens=128):
    """Send the conversation plus ``message`` to the model and record its reply.

    Args:
        message: The new user prompt.
        chat_history: List of ``(user_message, bot_message)`` turns. The new
            turn is appended to it in place. ``None`` is treated as an empty
            history.
        instruction: Text of the system message placed before the history.
        temperature, top_p, stop, max_tokens: Forwarded unchanged to
            ``client.chat_completion``.
        stream: Forwarded to ``client.chat_completion``. When true, the
            streamed chunks are concatenated into one reply before returning.

    Returns:
        ``("", chat_history)`` -- an empty string followed by the updated
        history.

    Any exception raised while calling the model is printed and stored as the
    reply text (``"Error: ..."``) instead of being propagated.
    """
    if chat_history is None:
        chat_history = []
    # Placeholder turn; its reply is filled in below.
    chat_history.append((message, None))

    try:
        # System message first, honouring the caller-supplied instruction.
        messages = [{"role": "system", "content": instruction}]
        # Expand every turn into its user message followed by the assistant
        # reply. The turn still awaiting a reply (reply is None) contributes
        # only its user message, so the list ends on the new prompt.
        for user_message, past_reply in chat_history:
            messages.append({"role": "user", "content": user_message})
            if past_reply is not None:
                messages.append({"role": "assistant", "content": past_reply})

        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        if stream:
            # Streaming yields chunks whose delta may carry no text.
            bot_message = "".join(
                chunk.choices[0].delta.content or "" for chunk in response
            )
        else:
            bot_message = response.choices[0].message.content
        chat_history[-1] = (message, bot_message)
    except Exception as e:
        print(e)
        chat_history[-1] = (message, f"Error: {str(e)}")

    return "", chat_history

with gr.Blocks() as demo:
    # Chat transcript; the height is kept small so it fits in the notebook.
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # Only the prompt and the history are wired in, so `respond` runs with
    # the default values of its remaining parameters.
    btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

gr.close_all()
demo.launch(share=True)

### Streaming

- If your LLM can provide its tokens one at a time in a stream, you can accumulate those tokens in the chatbot object.
- The `for` loop in the following function goes through all the tokens that are in the stream and appends them to the most recent conversational turn in the chatbot's message history.

In [50]:
def respond(message, chat_history, system="You are a helpful assistant", temperature=0.7, top_p=0.9, stop=["\n"], stream=True, max_tokens=128):
    """Stream the model's reply to ``message`` into the chat history.

    This is a generator: each yielded value is ``("", chat_history)``, where
    the last turn of ``chat_history`` holds the reply text received so far.

    Args:
        message: The new user prompt.
        chat_history: List of ``(user_message, bot_message)`` turns. The new
            turn is appended to it in place. ``None`` is treated as an empty
            history.
        system: Text of the system message placed before the history.
        temperature, top_p, stop, max_tokens: Forwarded unchanged to
            ``client.chat_completion``. ``stop`` is never mutated here.
        stream: Forwarded to ``client.chat_completion``. When false, the
            complete reply is yielded once instead of chunk by chunk.

    Yields:
        ``("", chat_history)`` after every chunk that carries text, or once
        with an ``"Error: ..."`` reply if calling the model raises.
    """
    if chat_history is None:
        chat_history = []
    # Placeholder turn; its reply is filled in as chunks arrive.
    chat_history.append((message, None))

    try:
        messages = [{"role": "system", "content": system}]
        # Expand every turn into its user message followed by the assistant
        # reply. The turn still awaiting a reply (reply is None) contributes
        # only its user message, so the list ends on the new prompt.
        for user_message, past_reply in chat_history:
            messages.append({"role": "user", "content": user_message})
            if past_reply is not None:
                messages.append({"role": "assistant", "content": past_reply})

        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        if stream:
            bot_message = ""
            for chunk in response:
                piece = chunk.choices[0].delta.content
                if piece is not None:
                    bot_message += piece
                    chat_history[-1] = (message, bot_message)
                    yield "", chat_history
        else:
            chat_history[-1] = (message, response.choices[0].message.content)
            yield "", chat_history

    except Exception as e:
        print(e)
        chat_history[-1] = (message, f"Error: {str(e)}")
        # Without this yield the generator would end silently and the error
        # text would never reach the caller.
        yield "", chat_history

with gr.Blocks() as demo:
    # Chat transcript; the height is kept small so it fits in the notebook.
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    # Collapsed by default; exposes the system message and the temperature.
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(
            label="System message",
            lines=2,
            value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.",
        )
        temperature = gr.Slider(
            label="temperature",
            minimum=0.1,
            maximum=1,
            value=0.7,
            step=0.1,
        )
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The inputs are passed to `respond` positionally, in this order:
    # message, chat_history, system, temperature.
    btn.click(fn=respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(fn=respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])

gr.close_all()
demo.queue().launch(share=True)

Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7883
Running on public URL: https://3b836e5eee11b486fa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Notice that the cell above uses `demo.queue().launch()` instead of `demo.launch()`. The queue helps improve the performance of your demo. See [setting up a demo for maximum performance](https://www.gradio.app/guides/setting-up-a-demo-for-maximum-performance) for more details.

In [None]:
gr.close_all()