In [16]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone
from langchain.vectorstores import Pinecone as PineconeStore

In [2]:
# Load API credentials and index settings from a .env file / the process environment.
# os.getenv returns None for any variable that is not set.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
# Raw string: in a normal literal "\R" and "\F" are invalid escape sequences,
# which Python reports with a warning. The resulting path value is unchanged.
PDF_PATH = r"E:\RAG\FUNDAMENTAL OF MATHEMATICAL STATISTICS-S C GUPTA & V K KAPOOR.pdf"

  PDF_PATH = "E:\RAG\FUNDAMENTAL OF MATHEMATICAL STATISTICS-S C GUPTA & V K KAPOOR.pdf"


In [3]:
# Load the PDF at PDF_PATH into a list of documents (raw_docs) for chunking below.
loader = PyPDFLoader(PDF_PATH)
raw_docs = loader.load()
# Report how many documents the loader produced as a quick sanity check.
print(f"Loaded {len(raw_docs)} raw docs.")


Loaded 1303 raw docs.


In [4]:
# Split the loaded documents into overlapping chunks for embedding.
# CHUNK_SIZE / CHUNK_OVERLAP are forwarded to the splitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunked_docs = splitter.split_documents(raw_docs)
# Report the resulting chunk count.
print(f"Split into {len(chunked_docs)} chunks.")

Split into 3097 chunks.


In [8]:
# Extract the plain text of every chunk, preserving chunk order
# (texts[i] is the page_content of chunked_docs[i]).
texts = [doc.page_content for doc in chunked_docs]


In [13]:
# Configure the google.generativeai client with the API key read from the environment.
genai.configure(api_key=GEMINI_API_KEY)

def embed_texts(texts, model="models/text-embedding-004", batch_size=100):
    """Embed texts with the Google Generative AI embedding API.

    Texts are sent in batches instead of one request per text, which cuts the
    number of network round trips for large inputs.

    Args:
        texts: Iterable of strings to embed. A single ``str`` is treated as
            one text rather than being iterated character by character.
        model: Embedding model name passed to ``genai.embed_content``.
        batch_size: Maximum number of texts sent per ``embed_content`` call.

    Returns:
        A list with one embedding per input text, in input order. An empty
        input yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Guard against a bare string, which would otherwise be embedded per character.
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialize so generators can be sliced into batches.
        texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # When content is a list, response["embedding"] holds one vector per item.
        response = genai.embed_content(model=model, content=batch)
        embeddings.extend(response["embedding"])
    return embeddings

In [14]:
# Embed every chunk text via embed_texts; all_embeddings[i] corresponds to texts[i].
# This calls the remote embedding API, so it can take a while for many chunks.
print("Getting embeddings from Google Generative AI...")
all_embeddings = embed_texts(texts)
# Report the embedding count; it should match the number of chunks.
print(f"Generated {len(all_embeddings)} embeddings.")

Getting embeddings from Google Generative AI...
Generated 3097 embeddings.


In [18]:
# Connect to the Pinecone index named by PINECONE_INDEX_NAME.
pc = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

print("Uploading vectors to Pinecone (may take a while for many chunks)...")

# zip() silently drops trailing items when its inputs differ in length, which
# would upload partial or misaligned data if the embeddings and chunks come
# from different runs. Fail loudly instead.
if len(all_embeddings) != len(chunked_docs):
    raise ValueError(
        f"Embedding/chunk count mismatch: {len(all_embeddings)} embeddings "
        f"for {len(chunked_docs)} chunks."
    )

# One record per chunk: the id is the chunk's position, the vector is its
# embedding, and the chunk text is stored as metadata under "text".
vectors = [
    {"id": str(i), "values": embedding, "metadata": {"text": doc.page_content}}
    for i, (embedding, doc) in enumerate(zip(all_embeddings, chunked_docs))
]

# Upsert in fixed-size batches; the batch total is computed once for the progress output.
BATCH_SIZE = 100
total_batches = (len(vectors) - 1) // BATCH_SIZE + 1
for batch_number, start in enumerate(range(0, len(vectors), BATCH_SIZE), start=1):
    batch = vectors[start:start + BATCH_SIZE]
    pinecone_index.upsert(vectors=batch)
    print(f"Uploaded batch {batch_number}/{total_batches}")

print("Upload complete!")


Uploading vectors to Pinecone (may take a while for many chunks)...
Uploaded batch 1/31
Uploaded batch 2/31
Uploaded batch 3/31
Uploaded batch 4/31
Uploaded batch 5/31
Uploaded batch 6/31
Uploaded batch 7/31
Uploaded batch 8/31
Uploaded batch 9/31
Uploaded batch 10/31
Uploaded batch 11/31
Uploaded batch 12/31
Uploaded batch 13/31
Uploaded batch 14/31
Uploaded batch 15/31
Uploaded batch 16/31
Uploaded batch 17/31
Uploaded batch 18/31
Uploaded batch 19/31
Uploaded batch 20/31
Uploaded batch 21/31
Uploaded batch 22/31
Uploaded batch 23/31
Uploaded batch 24/31
Uploaded batch 25/31
Uploaded batch 26/31
Uploaded batch 27/31
Uploaded batch 28/31
Uploaded batch 29/31
Uploaded batch 30/31
Uploaded batch 31/31
Upload complete!
