In [2]:
import requests
import time
import gradio as gr

def ollama_llm(history, question):
    """Send one question to the local Ollama server and record the reply.

    Only ``question`` is sent to the model; earlier turns stored in
    ``history`` are not included in the prompt.

    Args:
        history: List of ``(user_message, bot_message)`` tuples. It is
            mutated in place: exactly one tuple is appended per call,
            holding either the model reply or an error message.
        question: The user's question. Only the first 1000 characters are
            placed in the prompt.

    Returns:
        A ``(history, info)`` tuple. ``history`` is the same list object that
        was passed in. ``info`` is a multi-line token/timing summary on
        success, or the error message on failure.

    Failures (non-200 status, connection problems, malformed JSON, ...) are
    not raised; they are reported through the returned values.
    """
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": "deepseek-r1:70b",
        # Truncate the question so the prompt does not grow too long.
        "prompt": f"用繁體中文簡潔回答！\n\nQuestion: {question[:1000]}",
        "stream": False,
        "options": {
            # Ollama's option for capping generated tokens is `num_predict`;
            # `max_tokens` is not a documented Ollama option name.
            "num_predict": 1000
        }
    }
    # (connect, read) timeouts in seconds. The read timeout is generous
    # because a 70B model can take a long time to answer, but it still
    # prevents the UI handler from hanging forever on a stalled server.
    request_timeout = (10, 1800)

    try:
        # perf_counter is monotonic, so the measured wall time cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        response = requests.post(
            url, json=payload, headers=headers, timeout=request_timeout
        )
        elapsed_time = time.perf_counter() - start_time

        if response.status_code != 200:
            error_msg = f"API 請求失敗：{response.status_code} - {response.text}"
            history.append((question, error_msg))
            return history, error_msg

        response_data = response.json()

        # Truncate overly long replies, then double the newlines so
        # paragraphs are visually separated in the chat window.
        generated_text = response_data.get("response", "")[:2000]
        generated_text = generated_text.replace('\n', '\n\n')

        # Token statistics reported by Ollama. total_duration is divided by
        # 1e9 to convert it to seconds (Ollama reports nanoseconds).
        eval_count = response_data.get("eval_count", 0)
        prompt_eval_count = response_data.get("prompt_eval_count", 0)
        total_duration = response_data.get("total_duration", 0) / 1e9

        total_tokens = eval_count + prompt_eval_count
        tokens_per_second = total_tokens / total_duration if total_duration > 0 else 0

        # Record the turn; the reply is wrapped in a fenced code block so the
        # model's raw output is displayed as-is.
        history.append((
            question,
            f"```\n{generated_text}\n```"
        ))

        token_info = (
            f"執行時間：{elapsed_time:.2f} 秒\n"
            f"Token 使用：\n"
            f"- 輸入 Tokens: {prompt_eval_count}\n"
            f"- 輸出 Tokens: {eval_count}\n"
            f"- 總 Tokens: {total_tokens}\n"
            f"- Token 處理速度: {tokens_per_second:.2f} tokens/秒"
        )

        return history, token_info

    except Exception as e:
        # UI boundary: surface any failure in the chat instead of crashing
        # the Gradio event handler.
        error_msg = f"系統異常：{str(e)}"
        history.append((question, error_msg))
        return history, error_msg

# Gradio 介面設定
def create_interface():
    """Assemble the Gradio Blocks chat UI and return it without launching.

    The layout contains a per-session history state, a chatbot panel, a
    question textbox, a send button and a token-usage textbox. Both clicking
    the button and submitting the textbox invoke ``ollama_llm`` with
    ``[history, msg]`` as inputs and ``[chatbot, token_info]`` as outputs.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    # Chatbot display options, tuned for long replies.
    chatbot_options = {
        "label": "對話歷史",
        "height": 600,                # taller panel
        "layout": "bubble",           # bubble layout
        "bubble_full_width": False,   # let bubble width adapt to content
        "render_markdown": True,      # render Markdown in messages
        "show_copy_button": True,     # add a copy button
        "line_breaks": True,          # line-break handling for message text
    }

    with gr.Blocks() as demo:
        history = gr.State([])
        chatbot = gr.Chatbot(**chatbot_options)

        msg = gr.Textbox(label="輸入您的問題")
        submit_btn = gr.Button("發送")
        token_info = gr.Textbox(label="Token 使用資訊", lines=5)

        # Register the same handler for the button click and for submitting
        # the textbox, in that order.
        for register_listener in (submit_btn.click, msg.submit):
            register_listener(
                ollama_llm,
                inputs=[history, msg],
                outputs=[chatbot, token_info],
            )

    return demo


# Launch the Gradio application: build the UI with create_interface() and
# start serving it. The Blocks object is kept in `iface` at module level.
# show_error=True is forwarded to launch(); see the Gradio launch() docs for
# its exact effect on how handler errors are reported.
iface = create_interface()
iface.launch(show_error=True)



* Running on local URL:  http://127.0.0.1:7878

To create a public link, set `share=True` in `launch()`.


