In [None]:
import requests
import time
import gradio as gr

# Function to interact with Ollama API
def ollama_llm(question, timeout=(10, None)):
    """Send *question* to a local Ollama server and return a formatted report.

    The prompt is prefixed with an instruction to always answer in Traditional
    Chinese and is posted, non-streamed, to the ``/api/generate`` endpoint.

    Args:
        question: The user's question; interpolated into the prompt as-is.
        timeout: Passed straight to ``requests.post``. The default bounds only
            the connection attempt (10 s) and leaves the read unbounded,
            because a non-streamed generation returns nothing until it is
            finished. Pass e.g. ``(10, 600)`` to also cap the wait for the
            answer.

    Returns:
        On HTTP 200, a multi-line string containing the generated text, the
        wall-clock duration of the request, and token statistics read from the
        response body. On any other status code, or if an exception occurs, a
        string starting with ``"An error occurred: "``. Errors are reported in
        the return value rather than raised.
    """
    url = "http://localhost:11434/api/generate"  # Adjust the URL if your Ollama server is hosted elsewhere
    headers = {
        "Content-Type": "application/json"
    }
    payload = {
        "model": "llama3.3",
        "prompt": f"總是用繁體中文回答！\n\nQuestion: {question}",
        "stream": False,  # Ask for a single JSON object instead of a stream
    }

    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments during a long generation.
        start_time = time.perf_counter()
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        elapsed_time = time.perf_counter() - start_time

        if response.status_code != 200:
            return f"An error occurred: {response.status_code} - {response.text}"

        response_data = response.json()
        generated_text = response_data.get("response", "")

        # Statistics fields may be absent (or null) in the response; fall back
        # to 0 so the report shows a number instead of "False"/"None".
        eval_count = response_data.get("eval_count") or 0
        prompt_eval_count = response_data.get("prompt_eval_count") or 0
        total_duration = (response_data.get("total_duration") or 0) / 1e9  # ns -> seconds

        # Overall throughput: prompt + completion tokens over the total
        # duration reported by the server.
        total_tokens = eval_count + prompt_eval_count
        tokens_per_second = total_tokens / total_duration if total_duration > 0 else 0

        return (
            f"回答內容：\n{generated_text}\n\n"
            f"完整執行時間：{elapsed_time:.2f} 秒\n"
            f"Token 使用量：\n"
            f"- Prompt Tokens: {prompt_eval_count}\n"
            f"- Completion Tokens: {eval_count}\n"
            f"- Total Tokens: {total_tokens}\n"
            f"- Total Duration: {total_duration:.6f} seconds\n"
            f"Tokens per Second: {tokens_per_second:.2f}"
        )
    except Exception as e:
        # Deliberately broad: this function is the boundary to the UI, and
        # callers expect a displayable message rather than an exception.
        return f"An error occurred: {str(e)}"

# Gradio callback: forwards the submitted question to ollama_llm
def get_important_facts(question):
    """Return whatever ``ollama_llm`` produces for *question*."""
    answer = ollama_llm(question)
    return answer

# Build the Gradio UI: a two-line question box as input, plain text as output
_question_box = gr.Textbox(lines=2, placeholder="請輸入您的問題")

iface = gr.Interface(
    fn=get_important_facts,
    inputs=_question_box,
    outputs="text",
    title="Ollama Chat",
    description="使用 Llama3 模型直接回答您的問題，並顯示執行時間和 Token 使用量。",
)

# Start serving the app
iface.launch()