### This file now only contains the upsert and updating Pipeline


# All imports and inits

In [20]:
import pandas as pd
import glob
import concurrent.futures
import gradio as gr
import numpy as np
import textwrap
import PyPDF2
import requests
import os
import pinecone
import time
import pickle
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from tkinter import scrolledtext, messagebox
from transformers import AutoModel, AutoTokenizer
from typing import List, Tuple
from openai import OpenAI
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone_text.sparse import BM25Encoder
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType

# from pinecone import Pinecone, ServerlessSpec


# import voyageai


# Important: Import pinecone-client properly

# Load environment variables from .env file

load_dotenv()


DATA_PATH = os.getenv("DATA_PATH")

PINECONE_API = os.getenv("PINECONE_API")

# PINECONE_ENV = os.getenv("PINECONE_ENV")

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
NVIDIA_API = os.getenv("NVIDIA_API")

# NVidia Embedding import
client = OpenAI(
    api_key=NVIDIA_API,
    base_url="https://integrate.api.nvidia.com/v1",
)

"""
Input:
    - Context window: 128K
Ouput:
    - Output Max Tokens: 32,768

"""


def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
        return result

    return wrapper


# Initialize BM25 encoder once and fit it on your corpus
bm25_encoder = BM25Encoder()


# PDF Parser


In [None]:
def pdf_load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


# documents = pdf_load_documents()
# documents


# def extract_text_from_pdf(pdf_path: str) -> str:
#     """Extract text from a PDF file."""
#     with open(pdf_path, 'r') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         text = ""
#         for page_num in range(len(pdf_reader.pages)):
#             page = pdf_reader.pages[page_num]
#             text += page.extract_text() + "\n"
#     return text
# extract_text_from_pdf(DATA_PATH)


### Single Directory Parser

In [None]:
def load_documents():
    """
    Load PDF and Excel files from DATA_PATH.
    Returns a list of documents with content and metadata and a list of filenames.
    """
    documents = []
    file_names = []

    # Load PDFs
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH)
    pdf_docs = pdf_loader.load()
    for doc in pdf_docs:
        documents.append(
            {
                "content": doc.page_content,
                "metadata": {
                    "source": doc.metadata.get("source", "unknown"),
                    "file_type": "pdf",
                },
            }
        )
        file_names.append(doc.metadata.get("source", "unknown"))

    # Load Excel files
    excel_files = glob.glob(os.path.join(DATA_PATH, "*.xlsx"))
    for file in excel_files:
        df = pd.read_excel(file)
        headers = df.columns.tolist()
        for _, row in df.iterrows():
            content = " ".join([f"{col}: {str(row[col])}" for col in headers])
            documents.append(
                {"content": content, "metadata": {"source": file, "file_type": "excel"}}
            )
        file_names.append(file)

    if not documents:
        print(f"No PDF or Excel files found in {DATA_PATH}")
    else:
        print(f"Loaded {len(documents)} documents")

    return documents, file_names


# Load documents
# documents, file_names = load_documents()

# # Print file names
# print("Files parsed:")
# for name in file_names:
#     print(f"- {name}")

# print(f"\nTotal documents loaded: {len(documents)}")


### Multiple Directory Parser

In [21]:
def load_documents():
    """
    Load PDF and Excel files from DATA_PATH and its subdirectories.
    Returns a list of documents with content and metadata, and a list of filenames.
    """
    documents = []
    file_names = []
    processed_files = []

    # Load PDFs from all subdirectories
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH, recursive=True)
    pdf_docs = pdf_loader.load()
    print("PDF DOCS len: \n\n", len(pdf_docs))
    # print("PDF DOCS: \n\n",pdf_docs)
    count = 0
    for doc in pdf_docs:
        source = doc.metadata.get("source", "unknown")
        if source not in processed_files:
            documents.append(
                {
                    "content": doc.page_content,
                    "metadata": {"source": source, "file_type": "pdf"},
                }
            )
            # print(f"PARSED DATA {source}: \n",documents[-1])

            processed_files.append(source)
            file_names.append(source)
            # print(f"Parsed PDF file : {source}")
        # print("COUNT: ", count)
        count += 1
    # return

    # Load Excel files from all subdirectories
    for root, _, files in os.walk(DATA_PATH):
        for file in files:
            if file.endswith(".xlsx"):
                file_path = os.path.join(root, file)
                if file_path not in processed_files:
                    df = pd.read_excel(file_path)
                    headers = df.columns.tolist()
                    for _, row in df.iterrows():
                        content = " ".join(
                            [f"{col}: {str(row[col])}" for col in headers]
                        )
                        documents.append(
                            {
                                "content": content,
                                "metadata": {"source": file_path, "file_type": "excel"},
                            }
                        )
                        # print("PARSED DATA: \n\n\n",documents[-1])
                    processed_files.append(file_path)
                    file_names.append(file_path)
                    print(f"Parsed Excel file: {file_path}")

    if not documents:
        print(f"No PDF or Excel files found in {DATA_PATH}")
    else:
        print(f"Loaded {len(documents)} documents")
    # Print file names
    print("Files parsed:")
    for name in processed_files:
        print(f"- {name}")
    return documents, file_names


# Load documents
documents, file_names = load_documents()

# Extract content for BM25 fitting (corpus must be a list of strings)
corpus = [doc["content"] for doc in documents]

# Fit the BM25 encoder on the corpus (which is a list of strings) so it can adjust weights according to the raw data
bm25_encoder.fit(corpus)

# Save the fitted BM25 model so it can also be userd in query pipeline
with open("bm25_model.pkl", "wb") as f:
    pickle.dump(bm25_encoder, f)

print("BM25 encoder fitted on the corpus")
print(f"\nTotal documents loaded: {len(documents)}")


PDF DOCS len: 

 3631
Parsed Excel file: D:\Disrupt Labs\Rag Experiments\env\Rag-pipelines-experiments\Surgical Technologist Bootcamp\Course 1 - Introduction to Surgical Technology\Assessments\Exam 01.xlsx
Parsed Excel file: D:\Disrupt Labs\Rag Experiments\env\Rag-pipelines-experiments\Surgical Technologist Bootcamp\Course 1 - Introduction to Surgical Technology\Assessments\Lesson 01 - Quiz.xlsx
Parsed Excel file: D:\Disrupt Labs\Rag Experiments\env\Rag-pipelines-experiments\Surgical Technologist Bootcamp\Course 1 - Introduction to Surgical Technology\Assessments\Lesson 02 - Quiz.xlsx
Parsed Excel file: D:\Disrupt Labs\Rag Experiments\env\Rag-pipelines-experiments\Surgical Technologist Bootcamp\Course 1 - Introduction to Surgical Technology\Assessments\Lesson 03 - Quiz.xlsx
Parsed Excel file: D:\Disrupt Labs\Rag Experiments\env\Rag-pipelines-experiments\Surgical Technologist Bootcamp\Course 1 - Introduction to Surgical Technology\Assessments\Lesson 04 - Quiz.xlsx
Parsed Excel file: D:\

  0%|          | 0/1098 [00:00<?, ?it/s]

BM25 encoder fitted on the corpus

Total documents loaded: 1098


## Text Splitting \ Chunking for llama text embed v2 via pinecone

In [None]:
def count_tokens(text: str) -> int:
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)


def split_documents(documents):
    # Each chunk is ~800-1000 tokens to leave room for metadata tokens if needed

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=32000,  # fit comfortably within 2048 token limit
        chunk_overlap=16000,  # helps retain context between chunks
        length_function=len,  # use token length if tokenizer available
        is_separator_regex=True,  # respect newline and semantic breaks
    )

    chunks = []

    for doc in documents:
        split_texts = text_splitter.split_text(doc["content"])

        for i, chunk in enumerate(split_texts):
            chunks.append(
                {"content": chunk, "metadata": {**doc["metadata"], "chunk_id": i}}
            )
    return chunks


chunks = split_documents(documents)

# print(len(chunks))
print(chunks[0])
print(
    "Tokens: ", count_tokens(chunks[0]["content"])
)  # Check if splitting looks reasonable


## Manually Creating Pinecone index and Init Pinecone


In [None]:
index_name = "hybrid-search-ai-coach"
pc = Pinecone(api_key=PINECONE_API)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=4096,  # dimension of dense vector
        metric="dotproduct",  # setting for sparse values
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


# Init Pinecone

In [None]:
# pc = Pinecone(api_key=PINECONE_API)
# # pc = Pinecone(api_key=PINECONE_API, headers={
# #     "X-Pinecone-API-Version": "2025-04"
# # })

# print(PINECONE_API)


# -------- Integrated Emebeddings ---------------------------
# # Create a dense index with integrated inference
# index_name = "llama-text-embed-v2"

# pc.create_index_for_model(
#     name=index_name,
#     cloud="aws",
#     region="us-east-1",
#     embed={
#         "model": "llama-text-embed-v2",
#         "dimensions": 2046,
#         "field_map": {
#             "text": "text"  # Map the record field to be embedded
#         }
#     }
# )
# index = pc.Index(index_name)


### When to Use What:
**Use Upsert:**

When you're adding new vectors or want to replace existing vectors with new data (including changing the vector values).
When you need to add a completely new document or vector.
When you want to update both the vector values and metadata.

**Use Update:**

When you're only modifying the metadata of an existing vector.
When the vector values (embeddings) themselves are correct and only extra information like text, author, or document-related metadata needs to be updated.
Summary:
Upsert: Adds or replaces both the vector values and metadata. Use when inserting or completely replacing data.
Update: Modifies the metadata without changing the vector values. Use when the vectors are correct, but metadata needs an update.
For your case, if you just want to add or update the page_content or any other metadata for existing vectors, use update. If you want to re-upload vectors with new embeddings or metadata, use upsert.









### Creating Embeddings Via **nvidia/nv-embed-v1** and Sparse Vectors **Bm25Encoder** for Upserting **Hybrid Vectors** to **Pinecone**

In [None]:
# # Connect to the index
# # index = pc.Index("ai-coach") -- COURSE 1 ONLY
# index = pc.Index("surgical-tech-complete")  # -- COMPLETE SURGICAL TECH BOOTCAMP


# def get_dense_embedding(text="None"):
#     response = client.embeddings.create(
#         input=text,
#         model="nvidia/nv-embed-v1",
#         encoding_format="float",
#         extra_body={"input_type": "query", "truncate": "NONE"},
#     )

#     # print(response.data[0].embedding)
#     # print(count_tokens(response.data[0].embedding))
#     return response.data[0].embedding


# def get_sparse_embedding(text="None"):
#     bm25_encoder = BM25Encoder()
#     # applying tfidf values on sentences
#     bm25_encoder.fit(text)
#     sparse_vector = bm25_encoder.encode_documents(text)
#     # Access indices and values
#     return sparse_vector


# print(get_sparse_embedding("Surgical Conscience"))


# def upsert_chunks_to_pinecone(index, chunks):
#     count = 0
#     for chunk in chunks:
#         # Ensure the chunk has the correct structure
#         content = chunk.get("content")
#         metadata = chunk.get("metadata", {})

#         # Get the embedding for the chunk
#         # embedding = get_embedding(content).data[0]['values']
#         embedding = get_embedding(content)

#         # Add the text as part of the metadata
#         metadata["text"] = content  # Store text in metadata
#         # metadata["token_count"] = count_tokens(content)

#         # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
#         vector_id = f"vec_{count}"

#         # Upsert the embedding along with its metadata
#         index.upsert(vectors=[(vector_id, embedding, metadata)])

#         count += 1
#         print(f"Embedding {count} upserted to Pinecone with metadata")

#     print(f"All {count} embeddings have been upserted to Pinecone")


# upsert_chunks_to_pinecone(index, chunks)


# from pinecone_text.sparse import BM25Encoder


# # Fit BM25 encoder on your corpus
# bm25_encoder = BM25Encoder()
# bm25_encoder.fit(corpus)  # 'corpus' should be a list of documents


def get_dense_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="nvidia/nv-embed-v1",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"},
    )
    return response.data[0].embedding


# def get_dense_embedding(text="None"):
#     response = client.embeddings.create(
#         input=text,
#         model="nvidia/nv-embed-v1",
#         encoding_format="float",
#         extra_body={"input_type": "query", "truncate": "NONE"},
#     )

#     # print(response.data[0].embedding)
#     # print(count_tokens(response.data[0].embedding))
#     return response.data[0].embedding


def get_sparse_embedding(text="None"):
    # bm25_encoder = BM25Encoder()
    # # applying tfidf values on sentences
    # bm25_encoder.fit(text)
    # Load the pre-trained BM25 model
    with open('bm25_model.pkl', 'rb') as f:
        bm25_encoder = pickle.load(f)
    sparse_vector = bm25_encoder.encode_documents(text)
    # Access indices and values
    return sparse_vector


# print(get_sparse_embedding("Surgical Conscience"))

# -------------------------- Chatgpt code trying to upsert all at once ------------------------
# def upsert_chunks_to_pinecone(index, chunks):
#     vectors = []
#     for count, chunk in enumerate(chunks):
#         content = chunk.get("content")
#         metadata = chunk.get("metadata", {})
#         metadata["text"] = content  # Store text in metadata

#         dense_vector = get_dense_embedding(content)
#         sparse_vector = get_sparse_embedding(content)

#         vector = {
#             "id": f"vec_{count}",
#             "values": dense_vector,
#             "sparse_values": sparse_vector,
#             "metadata": metadata,
#         }
#         vectors.append(vector)

#     index.upsert(vectors=vectors)
#     print(f"Upserted {len(vectors)} hybrid vectors to Pinecone.")


# ------------------------------------ Personal Code I modified -----------------------------------------
def upsert_chunks_to_pinecone(index, chunks):
    count = 0
    for chunk in chunks:
        # Ensure the chunk has the correct structure
        content = chunk.get("content")
        metadata = chunk.get("metadata", {})

        # Get the embedding for the chunk
        # embedding = get_embedding(content).data[0]['values']
        # embedding = get_embedding(content)
        dense_vector = get_dense_embedding(content)
        sparse_vector = get_sparse_embedding(content)

        # Add the text as part of the metadata
        metadata["text"] = content  # Store text in metadata
        # metadata["token_count"] = count_tokens(content)

        # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
        vector_id = f"vec_{count}"

        vector = {
            "id": vector_id,
            "values": dense_vector,
            "sparse_values": sparse_vector,
            "metadata": metadata,
        }

        # Upsert the embedding along with its metadata
        index.upsert(vectors=[vector])

        count += 1
        print(f"Hybrid Embedding {count} upserted to Pinecone with metadata")

    print(f"All {count} embeddings have been upserted to Pinecone")


upsert_chunks_to_pinecone(index, chunks)


# Update Vectors Function

In [None]:
def update_pinecone_chunks(index, chunks):
    count = 0
    for chunk in chunks:
        # Get updated embedding
        embedding = get_embedding(chunk.page_content)

        # Extract metadata and page content
        metadata = chunk.metadata
        text = chunk.page_content

        # Create a unique vector ID for each chunk (e.g., based on count or some unique identifier)
        vector_id = f"vec_{count}"

        # Update the embedding and metadata
        index.update(id=vector_id, values=embedding, set_metadata=metadata)

        count += 1
        print(f"Embedding {count} updated in Pinecone with new metadata")

    print(f"All {count} embeddings have been updated in Pinecone")


# update_pinecone_chunks(index, chunks)


Since your application is designed to answer a wide range of student queries and suggest relevant material, you want to retrieve enough content to cover different facets of a topic without overwhelming the LLM with too much information.

# Starting Point:
- A common starting point is to set top_k between **5 and 10.**
- **top_k=5:** This can work well if your curated content is highly relevant and precise, ensuring that the top 5 matches are very close to the query.
-  **top_k=10:** If you want the coach to consider a broader range of content—perhaps to provide diverse perspectives or cover a topic more comprehensively—increasing top_k to around 10 might be beneficial.

# Experiment and Adjust:
- The “best” value depends on factors such as the diversity of your content, how densely your data covers the topics, and the quality of the embedding matches. It’s a good idea to experiment with different top_k values and evaluate the quality and relevance of the responses in your specific
