# All imports and inits

In [10]:
# import pandas as pd
# import glob
# import concurrent.futures
import gradio as gr

# import numpy as np
# import textwrap
# import PyPDF2
# import requests
import os

# import pinecone
import time
# import asyncio

# ------------------------- Streaming Implementation -------------------------
# from groq import AsyncGroq
# from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from dotenv import load_dotenv

# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.schema.document import Document
# from langchain.schema import AIMessage, HumanMessage, SystemMessage
# from tkinter import scrolledtext, messagebox
from transformers import AutoModel, AutoTokenizer

# from typing import List, Tuple
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
from groq import Groq

load_dotenv()

DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
# PINECONE_ENV = os.getenv("PINECONE_ENV")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL")

NVIDIA_API = os.getenv("NVIDIA_API")
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL")

# Configure headers for Groq API requests
GROQ_HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json",
}
# LLM_MODEL = "llama3-70b-8192"
LLM_MODEL = "llama-3.3-70b-versatile"


# NVidia Embedding import
client = OpenAI(
    api_key=NVIDIA_API,
    base_url=NVIDIA_BASE_URL,
)

"""
Input:
    - Context window: 128K
Ouput:
    - Output Max Tokens: 32,768

"""


def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
        return result

    return wrapper


# # EMBEDDING_MODEL = "llama3-405b-8192-embed"

# vo = voyageai.Client()


# Init Pinecone

In [None]:
pc = Pinecone(api_key=PINECONE_API)


# Embedding Function



In [None]:
# Connect to the index
# index = pc.Index("ai-coach")
# index = pc.Index("ahsan-400pg-pdf-doc-test")
index = pc.Index("surgical-tech-complete")  # -- COMPLETE SURGICAL TECH BOOTCAMP


# embedding_model = AutoModel.from_pretrained(
#     'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)


# # Function to generate embeddings without tokenization
# def get_embedding(data):
#     embeddings = embedding_model.encode(data).tolist()
#     return embeddings


@track_time
def get_embedding(text="None"):
    response = client.embeddings.create(
        input=text,
        model="nvidia/nv-embed-v1",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"},
    )

    # print(response.data[0].embedding)
    # print(count_tokens(response.data[0].embedding))
    return response.data[0].embedding


get_embedding("None")


# Query Pinecone


In [None]:
# Function to query Pinecone index using embeddings
@track_time
def query_pinecone(embedding):
    # Use keyword arguments to pass the embedding and other parameters
    result = index.query(vector=embedding, top_k=5, include_metadata=True)
    return result["matches"]


print(query_pinecone(get_embedding("Pediatric surgery definition")))


[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0002 seconds
User Query Tokens: 7
[Time Tracker] `get_embedding` took 0.6402 seconds
[Time Tracker] `query_pinecone` took 1.3696 seconds
[Time Tracker] `query_groq` took 0.5144 seconds


# Query Groq Inference

In [None]:
# Function to query Groq LLM
# def query_groq(prompt: str) -> str:
#     response = requests.post(
#         GROQ_CHAT_URL,
#         headers=GROQ_HEADERS,
#         json={
#             "model": LLM_MODEL,
#             "messages": [{"role": "user", "content": prompt}],
#             "temperature": 0.5,
#             # "max_tokens": 8192  # max from groq website
#         },
#     )

#     if response.status_code != 200:
#         raise Exception(f"Error querying Groq: {response.text}")

#     return response.json()["choices"][0]["message"]["content"]


# def query_groq(prompt: str) -> str:
#     client = Groq()
#     completion = client.chat.completions.create(
#         messages=[{"role": "user", "content": prompt}],
#         # Change model to compound-beta to use agentic tooling
#         # model: "llama-3.3-70b-versatile",
#         model=LLM_MODEL,
#     )
#     # print(completion.choices[0].message.content)
#     return completion.choices[0].message.content


# @track_time
# def query_groq(prompt: str) -> str:
#     client = Groq()
#     completion = client.chat.completions.create(
#         messages=[{"role": "user", "content": prompt}],
#         model=LLM_MODEL,
#         temperature=1,  # Set temperature to 0.5
#     )
#     return completion.choices[0]


# query_groq("Hello")


# # Modified query_groq function with more explicit streaming handling
@track_time
def query_groq(prompt):
    client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # Always use streaming mode
    return client.chat.completions.create(
        model=LLM_MODEL,  # or whichever model you're using
        temperature=0.5,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )


# Print all tool calls
# print(completion.choices[0].message.executed_tools)


# Tokenizer to count number of tokens
"""
Putting tokenizer outside of the function to avoid reinitialization and optimize performance.
"""
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")


@track_time
def count_tokens(text: str) -> int:
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)


# Process User Query

### Gradio GUI TEST

In [None]:
# # ------------------------------------------- WORKING 3 Enter key submits user query -------------------------------------------
# # Initialize empty conversation history (list of tuples)
# conversation_history = []


# @track_time
# def process_user_query(user_query: str, conversation_history: list):
#     print(f"User Query Tokens: {count_tokens(user_query)}")

#     # Generate embedding and get relevant context
#     embedding = get_embedding(user_query)
#     relevant_chunks = query_pinecone(embedding)
#     context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)
#     # print("CONTEXT:", context)

#     # Format conversation history for the prompt
#     history_str = "\n".join(
#         f"User: {user}\nCoach: {response}" for user, response in conversation_history
#     )

#     # Create structured prompt
#     prompt = f"""You are an expert, knowledgeable, and friendly coach. Follow these guidelines carefully:

#     1. Provide clear, step-by-step explanations to ensure deep understanding.
#     2. Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
#     3. Ask guiding questions to encourage critical thinking.
#     4. Adapt your explanation to match the student's knowledge level.
#     5. Strictly use terminologies provided in the given context.
#     6. Provide short, ideal examples (2-3) to illustrate your points clearly.
#     7. Only answer based on the provided context—do not speculate or include external information.
#     8. Always provide all specific relevant sources from the context in your responses urls, video names, video timestamps , links , resources , ebook names, lesson names , lesson numbers and anything else you think would be relevant to the user query.
#     9. Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
#     10. Must provide all relevant video timestamp from where to start watching and where to end watching
#     Context from learning materials:
#     {context}

#     Conversation history:
#     {history_str}

#     New student question:
#     "{user_query}"

#     Provide a thoughtful and contextually accurate response now:"""

#     # --------------------- Reasoning Prompt --------------------------------------------------------------
#     # prompt = f"""You are an expert, knowledgeable, and friendly coach. Follow this structured response framework:

#     # ### Response Requirements
#     # 1. **Reasoning Process**:
#     # - Analyze the question against provided context
#     # - Identify relevant context snippets with source citations
#     # - Perform sentiment analysis on conversation history

#     # 2. **Response Format**:
#     # - [Understanding] Paraphrase the question to confirm comprehension
#     # - [Relevant Context] Cite exact source material with location references
#     # - [Step-by-Step Explanation] Break down concepts using chain-of-thought
#     # - [Examples] Provide 2-3 ideal examples from context
#     # - [Guiding Questions] Pose 1-2 reflective questions to deepen learning
#     # - [Summary] Concise answer reiterating key points

#     # 3. **Style Guidelines**:
#     # - Use terminology strictly from context
#     # - Adapt complexity to student's history
#     # - Maintain empathetic tone based on sentiment analysis

#     # ### Context Materials:
#     # {context}

#     # ### Conversation History:
#     # {history_str}

#     # ### New Question:
#     # "{user_query}"

#     # ### Your Response:
#     # [Understanding] First, let me clarify what you're asking...
#     # [Relevant Context] According to [Source X, Section Y]...
#     # [Step-by-Step Explanation] The process works as follows:
#     # 1. First concept...
#     # 2. Second concept...
#     # 3. Practical application...
#     # [Examples] For instance:
#     # - Example 1...
#     # - Example 2...
#     # [Guiding Questions] Have you considered...? How might this apply to...?
#     # [Summary] To recap the key points..."""

#     # Get LLM response
#     groq_response = query_groq(prompt)
#     print(f"Response Toke   ns: {count_tokens(groq_response.message.content)}")

#     # Return updated history with new interaction
#     return conversation_history + [(user_query, groq_response.message.content)]


# # Gradio Interface
# with gr.Blocks() as interface:
#     gr.Markdown("# 🧑‍🏫 AI Coaching Assistant")
#     gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")

#     # State management
#     chat_history = gr.State(conversation_history)

#     with gr.Row():
#         chatbot = gr.Chatbot(height=500)
#         with gr.Column(scale=0.5):
#             context_display = gr.Textbox(label="Relevant Context", interactive=False)

#     user_input = gr.Textbox(label="Your Question", placeholder="Type here...")

#     with gr.Row():
#         submit_btn = gr.Button("Submit", variant="primary")
#         undo_btn = gr.Button("Undo Last")
#         clear_btn = gr.Button("Clear History")

#     def handle_submit(user_input, history):
#         if not user_input.strip():
#             return gr.update(), history, ""

#         # Process query and update history
#         new_history = process_user_query(user_input, history)

#         # Get latest context for display
#         latest_context = "\n".join(
#             [
#                 chunk["metadata"]["text"]
#                 for chunk in query_pinecone(get_embedding(user_input))
#             ][:]
#         )  # Show top 3 context snippets

#         return "", new_history, latest_context

#     # Component interactions
#     submit_btn.click(
#         handle_submit,
#         [user_input, chat_history],
#         [user_input, chat_history, context_display],
#     ).then(lambda x: x, [chat_history], [chatbot])

#     # Add submit on Enter key press
#     user_input.submit(
#         handle_submit,
#         [user_input, chat_history],
#         [user_input, chat_history, context_display],
#     ).then(lambda x: x, [chat_history], [chatbot])

#     undo_btn.click(
#         lambda history: history[:-1] if history else [], [chat_history], [chat_history]
#     ).then(lambda x: x, [chat_history], [chatbot])

#     clear_btn.click(lambda: [], None, [chat_history]).then(
#         lambda: ([], ""), None, [chatbot, context_display]
#     )

# interface.launch(share=True)

# ------------------------- Gradio converted into Gradio Function -------------------------------------------------------
# Initialize empty conversation history (list of tuples)
# conversation_history = []


# @track_time
# def process_user_query(user_query: str, conversation_history: list):
#     print(f"User Query Tokens: {count_tokens(user_query)}")

#     # Generate embedding and get relevant context
#     embedding = get_embedding(user_query)
#     relevant_chunks = query_pinecone(embedding)
#     context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)

#     # Format conversation history for the prompt
#     history_str = "\n".join(
#         f"User: {user}\nCoach: {response}" for user, response in conversation_history
#     )

#     # Create structured prompt
#     prompt = f"""You are an expert, knowledgeable, and friendly coach. Follow these guidelines carefully:

#     1. Provide clear, step-by-step explanations to ensure deep understanding.
#     2. Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
#     3. Ask guiding questions to encourage critical thinking.
#     4. Adapt your explanation to match the student's knowledge level.
#     5. Strictly use terminologies provided in the given context.
#     6. Provide short, ideal examples (2-3) to illustrate your points clearly.
#     7. Only answer based on the provided context—do not speculate or include external information.
#     8. Always provide all specific relevant sources from the context in your responses urls, video names, video timestamps , links , resources , ebook names, lesson names , lesson numbers and anything else you think would be relevant to the user query.
#     9. Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
#     10. Must provide all relevant video timestamp from where to start watching and where to end watching
#     Context from learning materials:
#     {context}

#     Conversation history:
#     {history_str}

#     New student question:
#     "{user_query}"

#     Provide a thoughtful and contextually accurate response now:"""

#     # Get LLM response
#     groq_response = query_groq(prompt)
#     print(f"Response Tokens: {count_tokens(groq_response.message.content)}")

#     # Return updated history with new interaction
#     return conversation_history + [(user_query, groq_response.message.content)]


# @track_time
# def create_gradio_interface(conversation_history):
#     with gr.Blocks() as interface:
#         gr.Markdown("# 🧑‍🏫 AI Coaching Assistant")
#         gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")

#         # State management
#         chat_history = gr.State(conversation_history)

#         with gr.Row():
#             chatbot = gr.Chatbot(height=500)
#             with gr.Column(scale=0.5):
#                 context_display = gr.Textbox(
#                     label="Relevant Context", interactive=False
#                 )

#         user_input = gr.Textbox(label="Your Question", placeholder="Type here...")

#         with gr.Row():
#             submit_btn = gr.Button("Submit", variant="primary")
#             undo_btn = gr.Button("Undo Last")
#             clear_btn = gr.Button("Clear History")

#         def handle_submit(user_input, history):
#             if not user_input.strip():
#                 return gr.update(), history, ""

#             # Process query and update history
#             new_history = process_user_query(user_input, history)

#             # Get latest context for display
#             latest_context = "\n".join(
#                 [
#                     chunk["metadata"]["text"]
#                     for chunk in query_pinecone(get_embedding(user_input))
#                 ][:]
#             )

#             return "", new_history, latest_context

#         # Component interactions
#         submit_btn.click(
#             handle_submit,
#             [user_input, chat_history],
#             [user_input, chat_history, context_display],
#         ).then(lambda x: x, [chat_history], [chatbot])

#         # Add submit on Enter key press
#         user_input.submit(
#             handle_submit,
#             [user_input, chat_history],
#             [user_input, chat_history, context_display],
#         ).then(lambda x: x, [chat_history], [chatbot])

#         undo_btn.click(
#             lambda history: history[:-1] if history else [],
#             [chat_history],
#             [chat_history],
#         ).then(lambda x: x, [chat_history], [chatbot])

#         clear_btn.click(lambda: [], None, [chat_history]).then(
#             lambda: ([], ""), None, [chatbot, context_display]
#         )

#     return interface


# def main():
#     # Initialize conversation history
#     initial_conversation_history = []

#     # Create and launch the interface
#     interface = create_gradio_interface(initial_conversation_history)
#     interface.launch(share=True)


# if __name__ == "__main__":
#     main()


## Groq and Gradio with Streaming Enabled

- ### i.e. will start showing text as soon as it gets generated from groq inference
- ### faster than optimized version

## Query:
- pediatic surgery
## Response Time:
User Query Tokens: 6
[Time Tracker] `get_embedding` took 0.4752 seconds
[Time Tracker] `query_pinecone` took 0.2222 seconds
[Time Tracker] `query_groq` took 0.5060 seconds

Total time: 1.19 seconds

In [None]:
# # Modified query_groq function with more explicit streaming handling
# @track_time
# def query_groq(prompt):
#     client = Groq(api_key=os.environ["GROQ_API_KEY"])

#     # Always use streaming mode
#     return client.chat.completions.create(
#         model="llama3-70b-8192",  # or whichever model you're using
#         messages=[{"role": "user", "content": prompt}],
#         stream=True,
#     )

# --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled -----------------------------------------------------
# Modified process_user_query to properly yield streaming updates
@track_time
def process_user_query(user_query: str, conversation_history: list):
    print(f"User Query Tokens: {count_tokens(user_query)}")

    # Generate embedding and get relevant context
    embedding = get_embedding(user_query)
    relevant_chunks = query_pinecone(embedding)
    context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)

    # Format conversation history for the prompt
    history_str = "\n".join(
        f"User: {user}\nCoach: {response}" for user, response in conversation_history
    )

    # Create structured prompt
    prompt = f"""You are an expert, knowledgeable, and friendly coach. Follow these guidelines carefully:

    1. Provide clear, step-by-step explanations to ensure deep understanding.
    2. Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
    3. Ask guiding questions to encourage critical thinking.
    4. Adapt your explanation to match the student's knowledge level.
    5. Strictly use terminologies provided in the given context.
    6. Provide short, ideal examples (2-3) to illustrate your points clearly.
    7. Only answer based on the provided context—do not speculate or include external information.
    8. Always provide all specific relevant sources from the context in your responses urls, video names, video timestamps , links , resources , ebook names, lesson names , lesson numbers and anything else you think would be relevant to the user query.
    9. Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
    10. Must provide all relevant video timestamp from where to start watching and where to end watching 
    Context from learning materials:
    {context}

    Conversation history:
    {history_str}

    New student question:
    "{user_query}"
    
    Provide a thoughtful and contextually accurate response now:"""

    # Get streaming LLM response
    stream_response = query_groq(prompt)

    # The function now directly yields the stream chunks for the Gradio interface to use
    full_response = ""

    # First, yield a response with empty text to set up the message
    # This creates the user message immediately
    temp_history = conversation_history.copy()
    temp_history.append((user_query, ""))
    yield temp_history, context

    # Process the stream
    for chunk in stream_response:
        if (
            hasattr(chunk.choices[0].delta, "content")
            and chunk.choices[0].delta.content is not None
        ):
            content_chunk = chunk.choices[0].delta.content
            full_response += content_chunk

            # Create a temporary history with the current response
            temp_history = conversation_history.copy()
            temp_history.append((user_query, full_response))

            # Yield the updated history for display
            yield temp_history, context

    # Return the final history with the complete response
    final_history = conversation_history.copy()
    final_history.append((user_query, full_response))
    yield final_history, context


@track_time
def create_gradio_interface(conversation_history):
    with gr.Blocks() as interface:
        gr.Markdown("# 🧑‍🏫 AI Coaching Assistant")
        gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")

        # State management
        chat_history = gr.State(conversation_history)

        with gr.Row():
            chatbot = gr.Chatbot(height=500)
            with gr.Column(scale=0.5):
                context_display = gr.Textbox(
                    label="Relevant Context", interactive=False
                )

        user_input = gr.Textbox(label="Your Question", placeholder="Type here...")

        with gr.Row():
            submit_btn = gr.Button("Submit", variant="primary")
            undo_btn = gr.Button("Undo Last")
            clear_btn = gr.Button("Clear History")

        def handle_submit(user_query, history):
            if not user_query.strip():
                return gr.update(), history, ""

            # Use the generator directly from process_user_query
            # This will yield incremental updates as they arrive
            response_generator = process_user_query(user_query, history)

            for updated_history, context in response_generator:
                # Directly update the chatbot with each streaming chunk
                yield "", updated_history, context, updated_history

        # Component interactions with streaming support
        submit_btn.click(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        # Add submit on Enter key press
        user_input.submit(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        undo_btn.click(
            lambda history: history[:-1] if history else [],
            [chat_history],
            [chat_history],
        ).then(lambda x: x, [chat_history], [chatbot])

        clear_btn.click(lambda: [], None, [chat_history]).then(
            lambda: ([], ""), None, [chatbot, context_display]
        )

    return interface


# def main():
#     # Initialize conversation history
#     initial_conversation_history = []

#     # Create and launch the interface
#     interface = create_gradio_interface(initial_conversation_history)
#     interface.launch(share=True)


# if __name__ == "__main__":
#     main()


def main():
    """
    Main entry point for the application.

    Initializes the conversation history with a welcome message,
    creates the Gradio interface, and launches the web app.
    """
    # Initialize conversation history with welcome message
    welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
    initial_conversation_history = [("", welcome_message)]

    # Create and launch the interface
    interface = create_gradio_interface(initial_conversation_history)
    interface.launch(share=True)


if __name__ == "__main__":
    main()


  chatbot = gr.Chatbot(height=500)


[Time Tracker] `create_gradio_interface` took 0.1209 seconds
* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://26250d83b41b1ff906.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0002 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.8142 seconds
[Time Tracker] `query_pinecone` took 1.5735 seconds
[Time Tracker] `query_groq` took 0.5627 seconds
[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0002 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.5136 seconds
[Time Tracker] `query_pinecone` took 0.2343 seconds
[Time Tracker] `query_groq` took 0.5483 seconds


## Optimized Version
- ### slower than previous version
- ### slightly better responses and source citations

## Query:
- pediatic surgery
## Response Time:
User Query Tokens: 7
[Time Tracker] `get_embedding` took 0.7417 seconds
[Time Tracker] `query_pinecone` took 1.4344 seconds
[Time Tracker] `query_groq` took 0.4740 seconds

Total time = 2.64 seconds

In [None]:
"""
-- Optimized Version ---
Streaming AI Coaching Assistant

This module implements a Gradio-based chatbot that uses streaming responses from Groq API
to provide real-time feedback to the user. It uses RAG (Retrieval-Augmented Generation)
to provide contextually relevant answers from a knowledge base.

Key components:
- query_groq: Interfaces with Groq API for streaming LLM responses
- process_user_query: Manages the streaming process and context retrieval 
- create_gradio_interface: Creates and configures the Gradio UI

The streaming implementation uses Python generators (with yield statements) to provide
real-time updates to the Gradio UI as tokens arrive from the LLM, rather than waiting
for the complete response.
"""


# Modified query_groq function with optimized streaming
@track_time
def query_groq(prompt):
    """
    Send a prompt to Groq API with streaming enabled.

    Args:
        prompt (str): The prompt to send to the LLM

    Returns:
        A stream of response chunks from the Groq API
    """
    client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # Always use streaming mode
    return client.chat.completions.create(
        model="llama3-70b-8192",  # or whichever model you're using
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        # Add parameters to potentially improve response time
        temperature=0.7,
        top_p=0.9,
        max_tokens=2048,  # Adjust based on your needs
    )


# Optimized process_user_query with better streaming performance
@track_time
def process_user_query(user_query: str, conversation_history: list):
    """
    Process a user query using streaming responses.

    This generator function:
    1. Retrieves relevant context from Pinecone
    2. Sends the query to Groq
    3. Yields updates as chunks arrive from the LLM

    Args:
        user_query (str): The user's question
        conversation_history (list): List of previous (query, response) tuples

    Yields:
        Tuples of (updated_history, context) as response chunks arrive
    """
    print(f"User Query Tokens: {count_tokens(user_query)}")

    # Generate embedding and get relevant context - do this once upfront
    embedding = get_embedding(user_query)
    relevant_chunks = query_pinecone(embedding)
    context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)

    # Format conversation history for the prompt - do this once
    history_str = "\n".join(
        f"User: {user}\nCoach: {response}" for user, response in conversation_history
    )

    # Create structured prompt
    prompt = f"""You are an expert, knowledgeable, and friendly coach. Follow these guidelines carefully:

    1. Provide clear, step-by-step explanations to ensure deep understanding.
    2. Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
    3. Ask guiding questions to encourage critical thinking.
    4. Adapt your explanation to match the student's knowledge level.
    5. Strictly use terminologies provided in the given context.
    6. Provide short, ideal examples (2-3) to illustrate your points clearly.
    7. Only answer based on the provided context—do not speculate or include external information.
    8. Always provide all specific relevant sources from the context in your responses urls, video names, video timestamps, links, resources, ebook names, lesson names, lesson numbers and anything else you think would be relevant to the user query.
    9. Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
    10. Must provide all relevant video timestamp from where to start watching and where to end watching 
    Context from learning materials:
    {context}

    Conversation history:
    {history_str}

    New student question:
    "{user_query}"
    
    Provide a thoughtful and contextually accurate response now:"""

    # Get streaming LLM response
    stream_response = query_groq(prompt)

    # Initialize response
    full_response = ""

    # First, yield a response with empty text to set up the message
    # This creates the user message immediately
    temp_history = conversation_history.copy()
    temp_history.append((user_query, ""))
    yield temp_history, context

    # For efficiency, create these objects once outside the loop
    buffer = ""
    buffer_size = 10  # Characters to buffer before updating UI
    update_frequency = 0  # Counter to track updates

    # Process the stream
    for chunk in stream_response:
        if (
            hasattr(chunk.choices[0].delta, "content")
            and chunk.choices[0].delta.content is not None
        ):
            content_chunk = chunk.choices[0].delta.content
            buffer += content_chunk
            full_response += content_chunk

            # Only update UI after buffer reaches certain size or every few chunks
            # This reduces the number of updates while still maintaining responsiveness
            update_frequency += 1
            if len(buffer) >= buffer_size or update_frequency >= 5:
                temp_history = conversation_history.copy()
                temp_history.append((user_query, full_response))
                yield temp_history, context
                buffer = ""
                update_frequency = 0

    # Ensure final state is yielded if buffer has content
    if buffer or update_frequency > 0:
        final_history = conversation_history.copy()
        final_history.append((user_query, full_response))
        yield final_history, context


@track_time
def create_gradio_interface(conversation_history):
    """
    Create a Gradio interface for the coaching assistant.

    This function sets up the UI components and defines how they interact.
    The interface includes real-time streaming of LLM responses.

    Args:
        conversation_history (list): Initial conversation history

    Returns:
        gr.Blocks: The configured Gradio interface
    """
    # Welcome message to display when a new user starts
    welcome_message = "Hi there! I'm your AI learning coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"

    # Add initial welcome message to conversation history if it's empty
    if not conversation_history:
        conversation_history = [("", welcome_message)]

    with gr.Blocks() as interface:
        gr.Markdown("# 🧑‍🏫 AI Coaching Assistant")
        gr.Markdown(
            "Ask questions about your course materials and get real-time, personalized help."
        )

        # State management
        chat_history = gr.State(conversation_history)

        with gr.Row():
            chatbot = gr.Chatbot(value=conversation_history, height=500)
            with gr.Column(scale=0.5):
                context_display = gr.Textbox(
                    label="Relevant Learning Materials", interactive=False, visible=True
                )

        user_input = gr.Textbox(
            label="Your Question",
            placeholder="Type your question here and press Enter...",
            autofocus=True,
        )

        with gr.Row():
            submit_btn = gr.Button("Submit", variant="primary")
            undo_btn = gr.Button("Undo Last")
            clear_btn = gr.Button("Clear History")

        def handle_submit(user_query, history):
            """
            Handle user query submission with streaming response.

            This generator function processes the user query and yields
            updates to the UI as they become available.

            Args:
                user_query (str): The user's question
                history (list): Current conversation history

            Yields:
                Updates to the UI components
            """
            if not user_query.strip():
                return gr.update(), history, ""

            # Use the generator directly from process_user_query
            # This will yield incremental updates as they arrive
            response_generator = process_user_query(user_query, history)

            for updated_history, context in response_generator:
                # Directly update the chatbot with each streaming chunk
                yield "", updated_history, context, updated_history

        # Component interactions with streaming support
        submit_btn.click(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        # Add submit on Enter key press
        user_input.submit(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        undo_btn.click(
            lambda history: history[:-1] if history else [],
            [chat_history],
            [chat_history],
        ).then(lambda x: x, [chat_history], [chatbot])

        clear_btn.click(lambda: [("", welcome_message)], None, [chat_history]).then(
            lambda x: x, [chat_history], [chatbot]
        ).then(lambda: "", None, [context_display])

    return interface


def main():
    """
    Main entry point for the application.

    Initializes the conversation history with a welcome message,
    creates the Gradio interface, and launches the web app.
    """
    # Initialize conversation history with welcome message
    welcome_message = "Hi there! I'm your AI learning coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
    initial_conversation_history = [("", welcome_message)]

    # Create and launch the interface
    interface = create_gradio_interface(initial_conversation_history)
    interface.launch(share=True)


if __name__ == "__main__":
    main()


  chatbot = gr.Chatbot(value=conversation_history, height=500)


[Time Tracker] `create_gradio_interface` took 0.1014 seconds
* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://8eb664d3e4cd4dc734.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0002 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.9302 seconds
[Time Tracker] `query_pinecone` took 1.4134 seconds
[Time Tracker] `query_groq` took 0.5820 seconds
[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0001 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.6390 seconds
[Time Tracker] `query_pinecone` took 0.2117 seconds
[Time Tracker] `query_groq` took 0.5676 seconds
[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0001 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.6471 seconds
[Time Tracker] `query_pinecone` took 0.2543 seconds
[Time Tracker] `query_groq` took 0.6574 seconds
[Time Tracker] `process_user_query` took 0.0000 seconds
[Time Tracker] `count_tokens` took 0.0002 seconds
User Query Tokens: 4
[Time Tracker] `get_embedding` took 0.5

## V3 -- Working well but total time is incorrect


In [None]:
import os
import time
import gradio as gr
from groq import Groq
# Make sure you have these implemented elsewhere:
#   get_embedding(text: str) -> List[float]
#   query_pinecone(embedding: List[float]) -> List[{"metadata": {"text": str}}]
#   count_tokens(text: str) -> int

# — track_time decorator (unchanged, accumulates total) —
_total_tracked = 0.0


def track_time(func):
    def wrapper(*args, **kwargs):
        global _total_tracked
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        _total_tracked += elapsed
        print(f"[Time Tracker] {func.__name__} took {elapsed:.4f} seconds")
        print(f"[Time Tracker] total accumulated: {_total_tracked:.4f} seconds")
        return result

    return wrapper


# Groq streaming query
@track_time
def query_groq(prompt: str):
    client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
    return client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.7,
        top_p=0.9,
        max_tokens=2048,
    )


@track_time
def process_user_query(user_query: str, history: list):
    # 1) Show user message immediately
    temp_history = history + [(user_query, "")]
    yield temp_history, ""

    # 2) Retrieval: embeddings + Pinecone
    print(f"Token count: {count_tokens(user_query)}")
    emb = get_embedding(user_query)
    chunks = query_pinecone(emb)
    context = "\n".join(c["metadata"]["text"] for c in chunks)

    # 3) Build prompt once
    hist_str = "\n".join(f"User: {u}\nCoach: {r}" for u, r in history)
    prompt = f"""You are an expert coach. Follow these rules:
1. Step-by-step explanations
2. Chain-of-thought reasoning
3. Guiding questions
4. Adapt to student's level
5. Use only provided context
6. Give 2–3 examples
7. Answer ONLY from context
8. Cite videos, timestamps, links, ebooks, lessons
9. Adapt sentiment & tone
10. Provide all relevant video timestamps

Context:
{context}

History:
{hist_str}

New question:
"{user_query}"

Respond now:"""

    # 4) Stream and yield per token
    full = ""
    for chunk in query_groq(prompt):
        delta = getattr(chunk.choices[0].delta, "content", None)
        if not delta:
            continue
        full += delta
        temp_history[-1] = (user_query, full)
        yield temp_history, context


@track_time
def create_gradio_interface(initial_history):
    welcome = "Hi! I'm your AI coach—ask me anything about your course materials."
    if not initial_history:
        initial_history = [("", welcome)]

    # CSS for modern dark UI
    css = """
    body { background: #1e1e2e; color: #e0e0e0; }
    .header { text-align:center; padding:1rem 0; }
    .header h1 { margin:0; font-size:2.5rem; }
    .header p { margin:0.25rem 0 1rem; color:#a0a0a0; }

    .chat-sidebar { display:grid; grid-template-columns:3fr 1fr; gap:1rem; padding:1rem; }
    .gr-chatbot { background:#2a2a3b !important; border-radius:12px !important; }
    .sidebar { background:#2a2a3b; border-radius:12px; padding:1rem; }
    .sidebar h3 { margin-top:0; color:#fff; }
    .context-box { background:#1e1e2e; border:1px solid #3b3b4f; border-radius:8px; color:#e0e0e0; padding:0.5rem; }
    .time-badge { margin-top:1rem; padding:0.5rem 1rem; background:#ff825c; color:#fff; font-weight:500; text-align:center; border-radius:8px; }

    .input-area { display:flex; gap:0.5rem; padding:1rem; align-items:stretch; }
    .input-area .gr-textbox { flex:1; background:#2a2a3b; border:1px solid #3b3b4f; border-radius:12px; color:#e0e0e0; padding:0.75rem 1rem; font-size:1rem; min-height:4rem; }
    .input-area .gr-button { border-radius:12px; padding:0 1.5rem; font-weight:600; transition:background 0.2s ease; }
    .submit { background:#ff825c !important; color:#fff !important; }
    .submit:hover { background:#e06d45 !important; }
    .undo, .clear { background:#44475a !important; color:#fff !important; }
    .undo:hover, .clear:hover { background:#3b3b4f !important; }
    .message .timestamp { display:none !important; }
    """

    with gr.Blocks(css=css) as demo:
        # Header
        gr.HTML(
            "<div class='header'><h1>🧑‍🏫 AI Coaching Assistant</h1>"
            "<p>Your personal learning coach with real-time guidance</p></div>"
        )

        history_state = gr.State(initial_history)

        # Chat & Sidebar
        with gr.Row(elem_classes="chat-sidebar"):
            chatbot = gr.Chatbot(value=initial_history, height=550)
            with gr.Column(elem_classes="sidebar"):
                gr.HTML("<h3>Relevant Learning Materials</h3>")
                context_box = gr.Textbox(
                    interactive=False,
                    lines=15,
                    elem_classes="context-box",
                    show_label=False,
                )
                time_display = gr.HTML("<div class='time-badge'>⏱ 0.0000s</div>")

        # Input row
        with gr.Row(elem_classes="input-area"):
            user_input = gr.Textbox(
                placeholder="Type your question here and press Enter…",
                show_label=False,
                autofocus=True,
            )
            submit_btn = gr.Button("Submit", elem_classes="submit")
            undo_btn = gr.Button("Undo", elem_classes="undo")
            clear_btn = gr.Button("Clear", elem_classes="clear")

        # Submission logic: track until first token
        def handle_submit(q, hist):
            if not q.strip():
                return (
                    gr.update(),
                    hist,
                    "",
                    hist,
                    "<div class='time-badge'>⏱ 0.0000s</div>",
                )
            start = time.perf_counter()
            first_time = None
            for new_hist, ctx in process_user_query(q, hist):
                # on first non-empty response, record time
                resp = new_hist[-1][1]
                if first_time is None and resp:
                    first_time = time.perf_counter() - start
                badge = (
                    f"<div class='time-badge'>⏱ {first_time:.4f}s</div>"
                    if first_time is not None
                    else "<div class='time-badge'>⏱ ...</div>"
                )
                yield "", new_hist, ctx, new_hist, badge

        submit_btn.click(
            handle_submit,
            [user_input, history_state],
            [user_input, history_state, context_box, chatbot, time_display],
        )
        user_input.submit(
            handle_submit,
            [user_input, history_state],
            [user_input, history_state, context_box, chatbot, time_display],
        )

        undo_btn.click(
            lambda h: h[:-1] if h else [], [history_state], [history_state]
        ).then(lambda x: x, [history_state], [chatbot])

        clear_btn.click(lambda: [("", welcome)], None, [history_state]).then(
            lambda x: x, [history_state], [chatbot]
        )
        clear_btn.click(lambda: "", None, [context_box])

    return demo


def main():
    initial = [
        ("", "Hi! I'm your AI coach—ask me anything about your course materials.")
    ]
    demo = create_gradio_interface(initial)
    demo.launch(share=True)


if __name__ == "__main__":
    main()


## V4


In [None]:
import os
import time
import gradio as gr
from groq import Groq
# Make sure these are implemented elsewhere:
#   get_embedding(), query_pinecone(), count_tokens()

# -- track_time decorator --
_total_tracked = 0.0


def track_time(func):
    def wrapper(*args, **kwargs):
        global _total_tracked
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        _total_tracked += elapsed
        print(f"[Time Tracker] {func.__name__} took {elapsed:.4f} seconds")
        print(f"[Time Tracker] total accumulated: {_total_tracked:.4f} seconds")
        return result

    return wrapper


# -- Core AI Functions --
@track_time
def query_groq(prompt: str):
    client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
    return client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.7,
        top_p=0.9,
        max_tokens=2048,
    )


@track_time
def process_user_query(user_query: str, history: list):
    # Immediate user message display
    temp_history = history + [(user_query, "")]
    yield temp_history, ""

    # Context retrieval
    emb = get_embedding(user_query)
    chunks = query_pinecone(emb)
    context = "\n".join(c["metadata"]["text"] for c in chunks)

    # Prompt construction
    hist_str = "\n".join(f"User: {u}\nCoach: {r}" for u, r in history)
    prompt = f"""You are an expert coach. Follow these rules:
1. Step-by-step explanations
2. Chain-of-thought reasoning
3. Guiding questions
4. Adapt to student's level
5. Use only provided context
6. Give 2–3 examples
7. Answer ONLY from context
8. Cite videos, timestamps, links, ebooks, lessons
9. Adapt sentiment & tone
10. Provide all relevant video timestamps

Context:
{context}

History:
{hist_str}

New question:
"{user_query}"

Respond now:"""

    # Buffered streaming response
    full_response = ""
    buffer = ""
    buffer_size = 12  # Characters per chunk
    update_counter = 0

    for chunk in query_groq(prompt):
        delta = getattr(chunk.choices[0].delta, "content", "")
        if delta:
            full_response += delta
            buffer += delta
            update_counter += 1

            # Update UI when buffer fills or every 3 chunks
            if len(buffer) >= buffer_size or update_counter >= 3:
                temp_history[-1] = (user_query, full_response)
                yield temp_history, context
                buffer = ""
                update_counter = 0

    # Final update for remaining content
    if buffer or update_counter > 0:
        temp_history[-1] = (user_query, full_response)
        yield temp_history, context


# -- Gradio Interface --
@track_time
def create_gradio_interface(initial_history):
    welcome = "Hi! I'm your AI coach—ask me anything about your course materials."
    if not initial_history:
        initial_history = [("", welcome)]

    # Modern dark UI CSS
    css = """
    body { background: #1e1e2e; color: #e0e0e0; }
    .header { text-align:center; padding:1rem 0; }
    .header h1 { margin:0; font-size:2.5rem; }
    .header p { margin:0.25rem 0 1rem; color:#a0a0a0; }

    .chat-sidebar { display:grid; grid-template-columns:3fr 1fr; gap:1rem; padding:1rem; }
    .gr-chatbot { background:#2a2a3b !important; border-radius:12px !important; }
    .sidebar { background:#2a2a3b; border-radius:12px; padding:1rem; }
    .sidebar h3 { margin-top:0; color:#fff; }
    .context-box { background:#1e1e2e; border:1px solid #3b3b4f; border-radius:8px; color:#e0e0e0; padding:0.5rem; }
    .time-badge { margin-top:1rem; padding:0.5rem 1rem; background:#ff825c; color:#fff; font-weight:500; text-align:center; border-radius:8px; }

    .input-area { display:flex; gap:0.5rem; padding:1rem; align-items:stretch; }
    .input-area .gr-textbox { flex:1; background:#2a2a3b; border:1px solid #3b3b4f; border-radius:12px; color:#e0e0e0; padding:0.75rem 1rem; font-size:1rem; min-height:4rem; }
    .input-area .gr-button { border-radius:12px; padding:0 1.5rem; font-weight:600; transition:background 0.2s ease; }
    .submit { background:#ff825c !important; color:#fff !important; }
    .submit:hover { background:#e06d45 !important; }
    .undo, .clear { background:#44475a !important; color:#fff !important; }
    .undo:hover, .clear:hover { background:#3b3b4f !important; }
    .message .timestamp { display:none !important; }
    """

    with gr.Blocks(css=css) as demo:
        # Header
        gr.HTML(
            "<div class='header'><h1>🧑‍🏫 AI Coaching Assistant</h1>"
            "<p>Your personal learning coach with real-time guidance</p></div>"
        )

        history_state = gr.State(initial_history)

        # Chat & Sidebar
        with gr.Row(elem_classes="chat-sidebar"):
            chatbot = gr.Chatbot(value=initial_history, height=550)
            with gr.Column(elem_classes="sidebar"):
                gr.HTML("<h3>Relevant Learning Materials</h3>")
                context_box = gr.Textbox(
                    interactive=False,
                    lines=15,
                    elem_classes="context-box",
                    show_label=False,
                )
                time_display = gr.HTML("<div class='time-badge'>⏱ 0.0000s</div>")

        # Input row
        with gr.Row(elem_classes="input-area"):
            user_input = gr.Textbox(
                placeholder="Type your question here and press Enter…",
                show_label=False,
                autofocus=True,
            )
            submit_btn = gr.Button("Submit", elem_classes="submit")
            undo_btn = gr.Button("Undo", elem_classes="undo")
            clear_btn = gr.Button("Clear", elem_classes="clear")

        # Submission handler with streaming
        def handle_submit(q, hist):
            if not q.strip():
                return (
                    gr.update(),
                    hist,
                    "",
                    hist,
                    "<div class='time-badge'>⏱ 0.0000s</div>",
                )

            start_time = time.perf_counter()
            first_token_received = False

            for new_hist, ctx in process_user_query(q, hist):
                # Track time to first token
                if not first_token_received and new_hist[-1][1]:
                    first_token_time = time.perf_counter() - start_time
                    first_token_received = True

                # Update time badge
                badge_time = (
                    first_token_time
                    if first_token_received
                    else (time.perf_counter() - start_time)
                )
                badge = f"<div class='time-badge'>⏱ {badge_time:.4f}s</div>"

                yield "", new_hist, ctx, new_hist, badge

        # Event handlers
        submit_btn.click(
            handle_submit,
            [user_input, history_state],
            [user_input, history_state, context_box, chatbot, time_display],
        )
        user_input.submit(
            handle_submit,
            [user_input, history_state],
            [user_input, history_state, context_box, chatbot, time_display],
        )

        undo_btn.click(
            lambda h: h[:-1] if h else [], [history_state], [history_state]
        ).then(lambda x: x, [history_state], [chatbot])

        clear_btn.click(lambda: [("", welcome)], None, [history_state]).then(
            lambda x: x, [history_state], [chatbot]
        )
        clear_btn.click(lambda: "", None, [context_box])

    return demo


def main():
    initial = [
        ("", "Hi! I'm your AI coach—ask me anything about your course materials.")
    ]
    demo = create_gradio_interface(initial)
    demo.launch(share=True)


if __name__ == "__main__":
    main()
