# All imports and inits

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from tkinter import scrolledtext, messagebox
from transformers import AutoModel , AutoTokenizer
#from pinecone import Pinecone, ServerlessSpec
import pinecone
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

import os
import requests
import PyPDF2
import textwrap
import numpy as np
import streamlit as st
import tkinter as tk


# Important: Import pinecone-client properly
# Load environment variables from .env file
load_dotenv()

DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_ENV = os.getenv("PINECONE_ENV")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
print("PINECONE_API", PINECONE_API)


# Groq API settings
GROQ_EMBED_URL = "https://api.groq.com/openai/v1/embeddings"
GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"
EMBEDDING_MODEL = "llama3-405b-8192-embed"
LLM_MODEL = "llama3-70b-8192"


# Configure headers for Groq API requests
GROQ_HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}


# PDF loader

In [None]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


# documents = load_documents()
# documents[0]


# def extract_text_from_pdf(pdf_path: str) -> str:
#     """Extract text from a PDF file."""
#     with open(pdf_path, 'r') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         text = ""
#         for page_num in range(len(pdf_reader.pages)):
#             page = pdf_reader.pages[page_num]
#             text += page.extract_text() + "\n"
#     return text
# extract_text_from_pdf(DATA_PATH)


# Text Splitting \ Chunking using Langchain

In [None]:


def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False  # considers separators like '\n\n'if true
    )
    docs = text_splitter.split_documents(documents)
    return docs


# chunks = split_documents(documents)
# chunks


# Init Pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=PINECONE_API)
print(PINECONE_API)


#  --------------- initialize pinecone -----------------------------
# pc.create_index_for_model(
#     name="test-index",
#     cloud="aws",
#     region="us-east-1",
#     embed={
#         "model":"llama-text-embed-v2",
#         "field_map":{"text": "page_content"}
#     }
# )


### When to Use What:
**Use Upsert:**

When you're adding new vectors or want to replace existing vectors with new data (including changing the vector values).
When you need to add a completely new document or vector.
When you want to update both the vector values and metadata.

**Use Update:**

When you're only modifying the metadata of an existing vector.
When the vector values (embeddings) themselves are correct and only extra information like text, author, or document-related metadata needs to be updated.
Summary:
Upsert: Adds or replaces both the vector values and metadata. Use when inserting or completely replacing data.
Update: Modifies the metadata without changing the vector values. Use when the vectors are correct, but metadata needs an update.
For your case, if you just want to add or update the page_content or any other metadata for existing vectors, use update. If you want to re-upload vectors with new embeddings or metadata, use upsert.









## Creating Embeddings Via AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en'  and Upsert each to Pinecone one by one


In [None]:
# Connect to the index
index = pc.Index("test-index")


embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
#user_query = "user query"
# Function to generate embeddings without tokenization
def get_embedding(data):
    embeddings = embedding_model.encode(data).tolist()
    return embeddings

# def upsert_chunks_to_pinecone(index, chunks):
#   count = 1
#   for chunk in chunks:
#     #embedding = embedding_model.encode(chunk.page_content).tolist()
#     embedding = get_embedding(chunk.page_content)
#     # Extract metadata
#     metadata = chunk.metadata
#     text = chunk.page_content
#     # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
#     vector_id = f"vec_{count}"
    
#     # Upsert the embedding along with its metadata
#     index.upsert(vectors=[(vector_id, embedding, metadata, text)])
    
#     print(f"Embedding {count} upserted to Pinecone with metadata")
#     count += 1
#       # Ensure data is written immediately
#   print(f"All {count} Embeddings have been upserted to pinecone")


def upsert_chunks_to_pinecone(index, chunks):
    count = 1
    for chunk in chunks:
        # Get the embedding for the chunk
        embedding = get_embedding(chunk.page_content)
        
        # Extract metadata and add text as part of the metadata
        metadata = chunk.metadata
        metadata["text"] = chunk.page_content  # Store text in metadata
        
        # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
        vector_id = f"vec_{count}"
        
        # Upsert the embedding along with its metadata
        index.upsert(vectors=[(vector_id, embedding, metadata)])
        
        print(f"Embedding {count} upserted to Pinecone with metadata")
        count += 1
    
    print(f"All {count-1} Embeddings have been upserted to Pinecone")

# upsert_chunks_to_pinecone(index, chunks)

# query_embeddings = embedding_model.encode(user_query).tolist()
# query_embeddings


# Update Vectors Function

In [None]:
def update_pinecone_chunks(index, chunks):
    count = 1
    for chunk in chunks:
        # Get updated embedding
        embedding = get_embedding(chunk.page_content)
        
        # Extract metadata and page content
        metadata = chunk.metadata
        text = chunk.page_content
        
        # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
        vector_id = f"vec_{count}"
        
        # Update the embedding and metadata
        index.update(id=vector_id, values=embedding, set_metadata=metadata)
        
        print(f"Embedding {count} updated in Pinecone with new metadata")
        count += 1
    
    print(f"All {count-1} embeddings have been updated in Pinecone")

#update_pinecone_chunks(index, chunks)


Since your application is designed to answer a wide range of student queries and suggest relevant material, you want to retrieve enough content to cover different facets of a topic without overwhelming the LLM with too much information.

# Starting Point:
- A common starting point is to set top_k between **5 and 10.**
- **top_k=5:** This can work well if your curated content is highly relevant and precise, ensuring that the top 5 matches are very close to the query.
-  **top_k=10:** If you want the coach to consider a broader range of content—perhaps to provide diverse perspectives or cover a topic more comprehensively—increasing top_k to around 10 might be beneficial.

# Experiment and Adjust:
- The “best” value depends on factors such as the diversity of your content, how densely your data covers the topics, and the quality of the embedding matches. It’s a good idea to experiment with different top_k values and evaluate the quality and relevance of the responses in your specific


# Query Pinecone


In [None]:
# Function to query Pinecone index using embeddings
def query_pinecone(embedding):
    # Use keyword arguments to pass the embedding and other parameters
    result = index.query(vector=embedding, top_k=10, include_metadata=True)
    return result['matches']


# Query Groq Inference

In [None]:
# Function to query Groq LLM
def query_groq(prompt: str) -> str:
    response = requests.post(
        GROQ_CHAT_URL,
        headers=GROQ_HEADERS,
        json={
            "model": LLM_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.5,
            "max_tokens": 8192 # max from groq website
        }
    )

    if response.status_code != 200:
        raise Exception(f"Error querying Groq: {response.text}")

    return response.json()["choices"][0]["message"]["content"]


# Tokenizer to count number of tokens
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")

def count_tokens(text: str) -> int:
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)



# Process User Query

In [47]:
# Main function to handle user query
# def process_user_query(user_query: str):
#     print(f"User Query Tokens : {count_tokens(user_query)}")
#     # Step 1: Generate embedding for the user query
#     embedding = get_embedding(user_query)

#     # Step 2: Query Pinecone for relevant chunks
#     relevant_chunks = query_pinecone(embedding)
#     print(f"Relevant Chunks : {relevant_chunks[0]}")
#     # Step 3: Prepare the content for the Groq LLM
#     context = "\n".join([chunk['metadata']["text"] for chunk in relevant_chunks])
#     print("------------------------------------ Context ------------------------------------------ : ", context)
#     # Step 4: Send the retrieved content as the prompt to Groq LLM
#     groq_response = query_groq(context)
#     print(f"Groq Response Tokens : {count_tokens(groq_response)}")
#     return groq_response


# # Example usage
# if __name__ == "__main__":
#     user_query = "What are the Link Layer?"
#     response = process_user_query(user_query)
#     print(response)

# def process_user_query(user_query: str):
#     print(f"User Query Tokens : {count_tokens(user_query)}")

#     # Step 1: Generate embedding for the user query
#     embedding = get_embedding(user_query)

#     # Step 2: Query Pinecone for relevant chunks
#     relevant_chunks = query_pinecone(embedding)
#     #print(f"Relevant Chunks : {relevant_chunks}")
    
#     # Step 3: Prepare the content (context) for the LLM
#     #context = "\n".join([chunk['metadata']["text"] for chunk in relevant_chunks])
#     #print("------------------------------------ Context ------------------------------------------ : ", context)

#     # Step 4: Craft a good coach prompt for the LLM
#     prompt = f"""
#     You are a knowledgeable and friendly coach. Your goal is to help students understand concepts in a detailed and easy-to-understand manner. 
#     Be patient, ask guiding questions, and provide step-by-step explanations where needed. Adapt your responses to the student's knowledge level 
#     and help them build confidence in their learning. Refer relevant material to the student and encourage them to explore further.

#     Context from the student's material:
#     {relevant_chunks}

#     The student has asked the following question:
#     "{user_query}"

#     Based on the context and the student's question, provide a thoughtful and detailed explanation. Encourage them to think about the topic and 
#     offer further guidance if needed.
#     """
    
#     # Step 5: Send the prepared prompt (with context and user query) to the LLM
#     groq_response = query_groq(prompt)
#     print(f"Groq Response Tokens : {count_tokens(groq_response)}")
    
#     return groq_response


# # # Example usage
# if __name__ == "__main__":
#     while True:
#         user_query = input("Enter your query or press 0 to exit: ")
#         if user_query == "0":
#             break
#         response = process_user_query(user_query)
#         print(response)


# Initialize an empty list to store conversation history
conversation_history = []

def process_user_query(user_query: str, conversation_history: list):
    print(f"User Query Tokens : {count_tokens(user_query)}")

    # Step 1: Generate embedding for the user query
    embedding = get_embedding(user_query)

    # Step 2: Query Pinecone for relevant chunks
    relevant_chunks = query_pinecone(embedding)

    # Prepare the context from relevant chunks
    context = "\n".join([chunk['metadata']["text"] for chunk in relevant_chunks])

    # Step 3: Combine conversation history with current user query
    conversation_history_str = "\n".join(conversation_history)

    # Step 4: Craft a good coach prompt for the LLM
    prompt = f"""
    You are a knowledgeable and friendly coach. Your goal is to help students understand concepts in a detailed and easy-to-understand manner. 
    Be patient, ask guiding questions, and provide step-by-step explanations where needed. Adapt your responses to the student's knowledge level 
    and help them build confidence in their learning. Refer relevant material to the student and encourage them to explore further.

    Context from the student's material:
    {context}

    Conversation history:
    {conversation_history_str}

    The student has asked the following question:
    "{user_query}"

    Based on the context and the student's question, provide a thoughtful and detailed explanation. Encourage them to think about the topic and 
    offer further guidance if needed.
    """

    # Step 5: Send the prepared prompt (with context and user query) to the LLM
    groq_response = query_groq(prompt)
    print(f"Groq Response Tokens : {count_tokens(groq_response)}")

    # Step 6: Append the user query and model's response to conversation history
    conversation_history.append(f"User: {user_query}")
    conversation_history.append(f"Coach: {groq_response}")

    return groq_response


# Example usage
if __name__ == "__main__":
    while True:
        user_query = input("Enter your query or press 0 to exit: ")
        if user_query == "0":
            break
        response = process_user_query(user_query, conversation_history)
        print(response)




User Query Tokens : 88
Groq Response Tokens : 329
I totally understand how you're feeling! It's like you're staring at a puzzle, and you can see all the pieces, but you're not quite sure how to put them together. That's completely normal, especially when dealing with complex concepts like Cyclic Redundancy Check (CRC) and error-detection techniques.

Let's take a step back and break it down together. From what I understand, you're trying to understand how to compute the remainder R such that D . 2r XOR R = nG. You're correct that the pieces are there, but they're not quite fitting together.

Here's a suggestion: let's focus on the equation D . 2r = nG XOR R. Can you tell me what you think this equation is saying? What do you think the variables D, 2r, nG, and R represent? 

Also, have you tried working through some examples or exercises related to CRC? Sometimes, seeing how the concepts are applied in different scenarios can help clarify things. The online interactive exercises provide

Exception: Error querying Groq: {"error":{"message":"Rate limit reached for model `llama3-70b-8192` in organization `org_01jnzr4w48e9p9y1m591dxwa6x` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 4052, Requested 4154. Please try again in 22.053s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing","type":"tokens","code":"rate_limit_exceeded"}}


# Streamlit Interface (Too heavy)

In [None]:
# Separate function to handle the Streamlit interface
# def run_streamlit_app():
#     st.title("AI Student Coach")
#     st.write("Ask your question to the AI student coach, and it will provide a thoughtful response based on your learning material.")

#     # Chatbox: User's input
#     user_input = st.text_input("Enter your question:", "")

#     # Button to submit the query
#     if st.button("Submit"):
#         if user_input:
#             with st.spinner('Processing your query...'):
#                 # Call the function to process user query
#                 response = handle_user_query(user_input)
#                 # Display the response from the LLM
#                 st.markdown(f"### AI Response:")
#                 st.write(response)
#         else:
#             st.warning("Please enter a question before submitting.")


# # Separate function to handle user query and call process_user_query
# def handle_user_query(user_query: str):
#     try:
#         # Process the user query using the relevant function
#         response = process_user_query(user_query)
#         return response
#     except Exception as e:
#         # Handle any errors that occur during query processing
#         st.error(f"An error occurred: {str(e)}")
#         return "Sorry, something went wrong."


# # Main entry point for the app
# if __name__ == "__main__":
#     print("Running Streamlit app...")
#     run_streamlit_app()


**How to fix it:**
To run Streamlit properly, you need to execute it with the streamlit run command from the terminal:

- Open your terminal (or command prompt).
- Navigate to the folder where your Streamlit script is located.

**Run the following command:**
- streamlit run your_script.py

- Replace your_script.py with the name of your Python file containing the Streamlit code.

**For example:**
- streamlit run app.py

Once you run Streamlit in this way, the warning should disappear, and the app will launch in your browser, providing full functionality.

# GUI Interface

In [None]:
# import tkinter as tk
# from tkinter import scrolledtext

# # Function to process user query (replace this with your actual implementation)
# # def process_user_query(query):
# #     # Example: Simulate actual processing of the query
# #     response = process_user_query(query)
# #     #print(response)
# #     return response

# # Function to handle the submit button click
# def handle_user_query():
#     user_input = query_entry.get("1.0", "end-1c").strip()
    
#     if user_input:
#         # Update status
#         status_label.config(text="Processing...")
#         submit_button.config(state=tk.DISABLED)
        
#         # Clear previous results
#         result_box.config(state=tk.NORMAL)
#         result_box.delete(1.0, tk.END)
        
#         # Process the query
#         response = process_user_query(user_input)  # Correctly call process_user_query
        
#         # Display the response
#         result_box.insert(tk.END, response)
#         result_box.config(state=tk.DISABLED)
        
#         # Reset status
#         status_label.config(text="Ready")
#         submit_button.config(state=tk.NORMAL)
#     else:
#         result_box.config(state=tk.NORMAL)
#         result_box.delete(1.0, tk.END)
#         result_box.insert(tk.END, "Please enter a query.")
#         result_box.config(state=tk.DISABLED)

# # Set up the main window
# window = tk.Tk()
# window.title("AI Coach")
# window.geometry("600x500")
# window.configure(bg="#f5f5f5")

# # Create padding frame
# main_frame = tk.Frame(window, bg="#f5f5f5")
# main_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=15)

# # Title label
# title_label = tk.Label(main_frame, text="AI Coach", font=("Arial", 16, "bold"), bg="#f5f5f5")
# title_label.pack(pady=(0, 15))

# # Query section
# query_frame = tk.Frame(main_frame, bg="#f5f5f5")
# query_frame.pack(fill=tk.X, pady=5)

# query_label = tk.Label(query_frame, text="Your question:", font=("Arial", 11), bg="#f5f5f5", anchor="w")
# query_label.pack(fill=tk.X)

# query_entry = scrolledtext.ScrolledText(main_frame, height=4, wrap=tk.WORD, font=("Arial", 11))
# query_entry.pack(fill=tk.X, pady=5)
# query_entry.focus_set()

# # Button frame
# button_frame = tk.Frame(main_frame, bg="#f5f5f5")
# button_frame.pack(fill=tk.X, pady=10)

# # Submit button
# submit_button = tk.Button(button_frame, text="Ask Coach", font=("Arial", 11), 
#                          bg="#4a86e8", fg="white", padx=15, pady=8,
#                          command=handle_user_query)
# submit_button.pack(side=tk.RIGHT)

# # Status label
# status_label = tk.Label(button_frame, text="Ready", font=("Arial", 10), fg="#555555", bg="#f5f5f5")
# status_label.pack(side=tk.LEFT, pady=10)

# # Response section
# response_label = tk.Label(main_frame, text="AI Coach's Response:", font=("Arial", 11), bg="#f5f5f5", anchor="w")
# response_label.pack(fill=tk.X, pady=(10, 5))

# result_box = scrolledtext.ScrolledText(main_frame, height=10, wrap=tk.WORD, font=("Arial", 11), state=tk.DISABLED)
# result_box.pack(fill=tk.BOTH, expand=True)

# # Start the GUI event loop
# window.mainloop()
