### This file now only contains the upsert and updating Pipeline


# All imports and inits

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from tkinter import scrolledtext, messagebox
from transformers import AutoModel, AutoTokenizer
# from pinecone import Pinecone, ServerlessSpec
import pinecone
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

import os
import requests
import PyPDF2
import textwrap
import numpy as np
import streamlit as st
import tkinter as tk
import gradio as gr
from typing import List, Tuple
import concurrent.futures
import glob
import pandas as pd
import numpy as np
# Important: Import pinecone-client properly
# Load environment variables from .env file
load_dotenv()

# Runtime configuration, all read from the environment (None when a variable is unset).
DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_ENV = os.getenv("PINECONE_ENV")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Report only whether the key is present: printing the key itself would store
# the secret in the saved cell output.
print("PINECONE_API configured:", bool(PINECONE_API))


# Groq API settings
GROQ_EMBED_URL = "https://api.groq.com/openai/v1/embeddings"
GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"
EMBEDDING_MODEL = "llama3-405b-8192-embed"
LLM_MODEL = "llama3-70b-8192"


# Configure headers for Groq API requests
GROQ_HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}


# PDF Parser


In [None]:
def pdf_load_documents():
    """Load the PDFs found in DATA_PATH and return whatever the loader produces."""
    return PyPDFDirectoryLoader(DATA_PATH).load()


# documents = pdf_load_documents()
# documents


# def extract_text_from_pdf(pdf_path: str) -> str:
#     """Extract text from a PDF file."""
#     with open(pdf_path, 'r') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         text = ""
#         for page_num in range(len(pdf_reader.pages)):
#             page = pdf_reader.pages[page_num]
#             text += page.extract_text() + "\n"
#     return text
# extract_text_from_pdf(DATA_PATH)


### Single Directory Parser

In [None]:
def load_documents():
    """
    Load PDF and Excel files from DATA_PATH (top-level directory only).

    Returns:
        A ``(documents, file_names)`` tuple.  ``documents`` is a list of dicts
        with a ``"content"`` string and a ``"metadata"`` dict holding
        ``"source"`` and ``"file_type"``; there is one entry per document
        returned by the PDF loader and one per Excel row.  ``file_names``
        lists each source file once, in first-seen order.
    """
    documents = []
    file_names = []
    seen_files = set()

    def _remember(path):
        # The PDF loader returns several documents for the same source file
        # (one per page), so record each file name only once.
        if path not in seen_files:
            seen_files.add(path)
            file_names.append(path)

    # Load PDFs
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH)
    for doc in pdf_loader.load():
        source = doc.metadata.get("source", "unknown")
        documents.append({
            "content": doc.page_content,
            "metadata": {"source": source, "file_type": "pdf"}
        })
        _remember(source)

    # Load Excel files: every row becomes one "col: value" text document
    for file in glob.glob(os.path.join(DATA_PATH, "*.xlsx")):
        df = pd.read_excel(file)
        headers = df.columns.tolist()
        for _, row in df.iterrows():
            content = " ".join([f"{col}: {str(row[col])}" for col in headers])
            documents.append({
                "content": content,
                "metadata": {"source": file, "file_type": "excel"}
            })
        _remember(file)

    if not documents:
        print(f"No PDF or Excel files found in {DATA_PATH}")
    else:
        print(f"Loaded {len(documents)} documents")

    return documents, file_names


# Load documents from the flat DATA_PATH directory.
# NOTE(review): load_documents is defined again further down and both
# `documents` and `file_names` are reassigned there -- confirm this earlier
# load is still needed.
documents, file_names = load_documents()

# # Print file names
# print("Files parsed:")
# for name in file_names:
#     print(f"- {name}")

# print(f"\nTotal documents loaded: {len(documents)}")


### Multiple Directory Parser

In [None]:
def load_documents():
    """
    Load PDF and Excel files from DATA_PATH and its subdirectories.

    Every document returned by the PDF loader (one per page) is kept, and
    every Excel row becomes its own document.  Source files are tracked
    separately so that each file name is reported once.

    Returns:
        A ``(documents, file_names)`` tuple.  ``documents`` is a list of dicts
        with a ``"content"`` string and a ``"metadata"`` dict holding
        ``"source"`` and ``"file_type"``.  ``file_names`` lists each parsed
        file once, in first-seen order.
    """
    documents = []
    file_names = []
    seen_files = set()  # O(1) membership test; file_names keeps the order

    def _register(path):
        """Record *path* once; return True when it had not been seen before."""
        if path in seen_files:
            return False
        seen_files.add(path)
        file_names.append(path)
        return True

    # Load PDFs from all subdirectories
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH, recursive=True)
    pdf_docs = pdf_loader.load()
    print("PDF DOCS len: \n\n", len(pdf_docs))
    for doc in pdf_docs:
        source = doc.metadata.get("source", "unknown")
        # Keep every page.  Only the file name is de-duplicated: skipping
        # documents whose source was already seen would discard every page
        # after the first one of each PDF.
        documents.append({
            "content": doc.page_content,
            "metadata": {"source": source, "file_type": "pdf"}
        })
        _register(source)

    # Load Excel files from all subdirectories
    for root, _, files in os.walk(DATA_PATH):
        for file in files:
            if not file.endswith(".xlsx"):
                continue
            file_path = os.path.join(root, file)
            if not _register(file_path):
                continue
            df = pd.read_excel(file_path)
            headers = df.columns.tolist()
            # Flatten each row into a single "col: value col: value ..." string
            for _, row in df.iterrows():
                content = " ".join(
                    [f"{col}: {str(row[col])}" for col in headers])
                documents.append({
                    "content": content,
                    "metadata": {"source": file_path, "file_type": "excel"}
                })
            print(f"Parsed Excel file: {file_path}")

    if not documents:
        print(f"No PDF or Excel files found in {DATA_PATH}")
    else:
        print(f"Loaded {len(documents)} documents")
    # Print file names
    print("Files parsed:")
    for name in file_names:
        print(f"- {name}")
    return documents, file_names


# Load documents with the load_documents() defined directly above; this
# rebinds the module-level `documents` and `file_names`.
documents, file_names = load_documents()


# Report how many document dicts were loaded in total.
print(f"\nTotal documents loaded: {len(documents)}")


# Text Splitting / Chunking using LangChain

In [None]:
def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: Iterable of dicts with a ``'content'`` string and a
            ``'metadata'`` dict.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of dicts, one per chunk, each with the chunk text under
        ``'content'`` and a copy of the parent document's metadata extended
        with ``'chunk_id'`` (the chunk's index within that document).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # False: separators are matched as literal strings, not regex patterns
        is_separator_regex=False
    )
    docs = []
    for doc in documents:
        # Tolerate documents that carry no metadata at all
        base_metadata = doc.get('metadata') or {}
        for i, chunk in enumerate(text_splitter.split_text(doc['content'])):
            docs.append({
                "content": chunk,
                # Fresh dict per chunk so chunks never share metadata objects
                "metadata": {**base_metadata, "chunk_id": i}
            })
    return docs


# Split every loaded document into chunks; `chunks` is the list that is
# embedded and upserted further down.
chunks = split_documents(documents)
# print(len(chunks))


# Init Pinecone

In [None]:
# Create the Pinecone client from the API key read from the environment.
# The key is deliberately not printed: cell output is saved with the notebook
# and would expose the secret.
pc = Pinecone(api_key=PINECONE_API)
print("Pinecone client initialised")


### When to Use What:
**Use Upsert:**

When you're adding new vectors or want to replace existing vectors with new data (including changing the vector values).
When you need to add a completely new document or vector.
When you want to update both the vector values and metadata.

**Use Update:**

When you're only modifying the metadata of an existing vector.
When the vector values (embeddings) themselves are correct and only extra information like text, author, or document-related metadata needs to be updated.
**Summary:**

- **Upsert:** adds a new vector or replaces an existing one, both the vector values and the metadata. Use it when inserting or completely replacing data.
- **Update:** modifies an existing vector by ID, typically only its metadata, leaving the vector values unchanged. Use it when the vectors are correct but the metadata needs an update.

For your case, if you just want to add or update the page_content or any other metadata for existing vectors, use update. If you want to re-upload vectors with new embeddings or metadata, use upsert.









## Creating embeddings via AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en') and upserting each one to Pinecone one by one


In [None]:
# Connect to the index
# NOTE(review): nothing in this file creates the "ai-coach" index, so it
# presumably has to exist already -- confirm.
index = pc.Index("ai-coach")


# Load the Jina embedding model used by get_embedding().
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally -- confirm the repository is trusted.
embedding_model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

def get_embedding(data):
    """Encode *data* with the module-level embedding model.

    The input is passed straight to ``embedding_model.encode`` (no
    tokenization is performed in this function) and the result is converted
    with ``.tolist()`` before being returned.
    """
    encoded = embedding_model.encode(data)
    return encoded.tolist()


def upsert_chunks_to_pinecone(index, chunks):
    """Embed each chunk and upsert it into Pinecone, one vector per request.

    Args:
        index: Pinecone index handle; only ``index.upsert(vectors=[...])`` is
            called on it.
        chunks: Iterable of dicts with a ``'content'`` string and an optional
            ``'metadata'`` dict.

    Vector IDs are positional (``vec_1``, ``vec_2``, ...), so the ID of a
    chunk depends on its position in ``chunks``.  The chunk text is stored in
    the vector's metadata under ``'text'``.

    The caller's chunk dicts are not modified: the metadata sent to Pinecone
    is a fresh dict built for each vector.
    """
    upserted = 0
    for position, chunk in enumerate(chunks, start=1):
        content = chunk.get('content')

        # Get the embedding for the chunk
        embedding = get_embedding(content)

        # Copy the metadata (tolerating a missing/None value) and add the
        # text, instead of writing 'text' into the caller's own dict.
        metadata = {**(chunk.get('metadata') or {}), 'text': content}

        # Positional vector ID for this chunk
        vector_id = f"vec_{position}"

        # Upsert the embedding along with its metadata
        index.upsert(vectors=[(vector_id, embedding, metadata)])

        print(f"Embedding {position} upserted to Pinecone with metadata")
        upserted = position

    print(f"All {upserted} embeddings have been upserted to Pinecone")


# Example usage
# Assuming `index` is your Pinecone index and `chunks` is the list of chunked documents
# Runs when the cell executes: embeds and upserts every chunk in `chunks`.
upsert_chunks_to_pinecone(index, chunks)

# query_embeddings = embedding_model.encode(user_query).tolist()
# query_embeddings


# Update Vectors Function

In [None]:
def update_pinecone_chunks(index, chunks):
    """Re-embed each chunk and update the matching Pinecone vector in place.

    Args:
        index: Pinecone index handle; only
            ``index.update(id=..., values=..., set_metadata=...)`` is called.
        chunks: Iterable of chunks.  Each chunk is either a dict with
            ``'content'`` / ``'metadata'`` keys (the shape produced by
            ``split_documents``) or an object exposing ``page_content`` and
            ``metadata`` attributes.

    Vector IDs are positional (``vec_1``, ``vec_2``, ...), matching the scheme
    used by ``upsert_chunks_to_pinecone``, so ``chunks`` must be in the same
    order as when the vectors were upserted.
    """
    updated = 0
    for position, chunk in enumerate(chunks, start=1):
        # Accept both the dict chunks built in this file and Document-like
        # objects; attribute access alone fails on the dict form.
        if isinstance(chunk, dict):
            text = chunk.get('content')
            base_metadata = chunk.get('metadata')
        else:
            text = chunk.page_content
            base_metadata = chunk.metadata

        # Get updated embedding
        embedding = get_embedding(text)

        # Store the text next to the new embedding (as the upsert path does)
        # so the stored text cannot go stale; built as a new dict so the
        # chunk's own metadata is left untouched.
        metadata = {**(base_metadata or {}), 'text': text}

        # Positional vector ID for this chunk
        vector_id = f"vec_{position}"

        # Update the embedding and metadata
        index.update(id=vector_id, values=embedding, set_metadata=metadata)

        print(f"Embedding {position} updated in Pinecone with new metadata")
        updated = position

    print(f"All {updated} embeddings have been updated in Pinecone")

# update_pinecone_chunks(index, chunks)


Since your application is designed to answer a wide range of student queries and suggest relevant material, you want to retrieve enough content to cover different facets of a topic without overwhelming the LLM with too much information.

# Starting Point:
- A common starting point is to set top_k between **5 and 10.**
- **top_k=5:** This can work well if your curated content is highly relevant and precise, ensuring that the top 5 matches are very close to the query.
-  **top_k=10:** If you want the coach to consider a broader range of content—perhaps to provide diverse perspectives or cover a topic more comprehensively—increasing top_k to around 10 might be beneficial.

# Experiment and Adjust:
- The “best” value depends on factors such as the diversity of your content, how densely your data covers the topics, and the quality of the embedding matches. It’s a good idea to experiment with different top_k values and evaluate the quality and relevance of the responses in your specific
