In [1]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader

# Raw string: "\l" and "\d" are not valid escape sequences, so the plain
# literal only worked by accident and emits a SyntaxWarning on newer Pythons.
# The runtime value of the path is unchanged.
data_dir = Path(r"D:\llm-app\data")

# Fail loudly instead of silently producing an empty corpus when the
# directory is missing or mistyped.
if not data_dir.is_dir():
    raise FileNotFoundError(f"Data directory not found: {data_dir}")

# Recursively collect every PDF and DOCX under data_dir into one flat list
# of loaded documents.
docs = []
# sorted() makes the load order (and therefore the downstream chunk order)
# reproducible; a bare glob yields entries in filesystem-dependent order.
for path in sorted(data_dir.glob("**/*")):
    # "**/*" also yields directories; only regular files can be loaded.
    if not path.is_file():
        continue

    # Case-insensitive extension match so ".PDF" / ".Docx" are picked up too.
    suf = path.suffix.lower()

    if suf == ".pdf":
        docs.extend(PyPDFLoader(str(path)).load())
    elif suf == ".docx":
        docs.extend(Docx2txtLoader(str(path)).load())

len(docs)




  from .autonotebook import tqdm as notebook_tqdm


52

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings, measured in characters: each chunk holds at most 1000
# characters and consecutive chunks share a 200-character overlap.
splitter = RecursiveCharacterTextSplitter(
    **{"chunk_size": 1000, "chunk_overlap": 200}
)

# Break every loaded document into overlapping chunks.
chunks = splitter.split_documents(docs)

# Cell output: number of chunks produced.
len(chunks)



200

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the dimension probe and the Pinecone upsert below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


In [4]:
# Measure the embedding width by embedding a throwaway query rather than
# hard-coding it; the index creation cell uses this value as its dimension.
dim = len(
    embeddings.embed_query("hello")
)

# Cell output: the vector dimension.
dim


384

In [None]:
from dotenv import load_dotenv
from pathlib import Path
import os


# The .env file lives in the sibling backend folder; resolve() turns the
# cwd-relative path into an absolute one so the diagnostics are unambiguous.
env_path = Path("../backend/.env").resolve()

# Diagnostics: show where the notebook is running and whether the file exists
# before trying to load it.
for label, value in (
    ("Notebook cwd:", os.getcwd()),
    ("Trying to load:", env_path),
    ("Exists?", env_path.exists()),
):
    print(label, value)

# override=True lets values from the file replace variables already set in
# the process environment.
load_dotenv(env_path, override=True)

# Report only whether the key is present; never print the secret itself.
print("PINECONE_API_KEY present?", "PINECONE_API_KEY" in os.environ)



Notebook cwd: d:\llm-app\notebook
Trying to load: D:\llm-app\backend\.env
Exists? True
PINECONE_API_KEY present? True


In [6]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

INDEX_NAME = "rag-index"

# The API key is read from the environment (loaded from .env earlier);
# strip() removes stray whitespace or newlines around the value.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"].strip())

# Create the index only when it is missing so the cell can be re-run safely.
existing = pc.list_indexes().names()
if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        dimension=dim,  # must equal the embedding model's vector size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
else:
    # An index left over from an earlier run may have been created for a
    # different embedding model. Catch the mismatch here with a clear message
    # instead of letting the upsert fail later.
    existing_dim = pc.describe_index(INDEX_NAME).dimension
    if existing_dim != dim:
        raise ValueError(
            f"Index {INDEX_NAME!r} has dimension {existing_dim}, but the "
            f"embedding model produces {dim}-dimensional vectors. "
            "Delete the index or use a different INDEX_NAME."
        )

print("Ready index:", INDEX_NAME)


Ready index: rag-index


In [9]:
from langchain_pinecone import PineconeVectorStore

import hashlib
import json


def _chunk_id(doc):
    """Return a stable vector ID derived from a chunk's metadata and text.

    The same chunk always hashes to the same ID, so re-running the upsert
    overwrites existing vectors instead of adding duplicates. Chunks that are
    identical in both metadata and text share one ID and are stored once.
    """
    payload = json.dumps(doc.metadata, sort_keys=True, default=str)
    payload += "\n" + doc.page_content
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


# Embed every chunk and upsert it into the Pinecone index. Explicit ids make
# the cell idempotent: without them each run gets fresh random ids and the
# index accumulates a new copy of the whole corpus.
vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=INDEX_NAME,
    ids=[_chunk_id(chunk) for chunk in chunks],
)

print("Upsert complete ✅")


Upsert complete ✅
