In [None]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import pickle, os

# Load PDF
# Extract the text of every page and join the pages with a newline.
# Joining with an empty string fuses the last word of one page with the
# first word of the next; a newline keeps them apart. `or ""` keeps the
# join safe if a page yields no text (None or an empty string).
reader = PdfReader("../data/AI_Training_Document.pdf")
raw_text = "\n".join(page.extract_text() or "" for page in reader.pages)

# Chunk text
# Split the document into overlapping pieces: a target size of 300 with an
# overlap of 50 between neighbours, measured by the splitter's length
# function. Separators are tried in the order listed, from paragraph
# breaks down to sentence-ending punctuation.
# NOTE(review): the list has no whitespace or empty-string fallback, so a
# run of text containing none of these separators presumably cannot be cut
# down to the target size -- confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?"],
    chunk_size=300,
    chunk_overlap=50,
)
chunks = text_splitter.split_text(raw_text)

# Save chunks
# Pickle the chunk list to ../chunks/chunks.pkl, creating the directory
# first if it does not exist yet.
os.makedirs("../chunks", exist_ok=True)
with open("../chunks/chunks.pkl", "wb") as f:
    f.write(pickle.dumps(chunks))

# Embedding and indexing need at least one chunk: with an empty list there
# is no embedding width to size the index with, so fail early with a clear
# message instead of an opaque IndexError on `embeddings.shape[1]`.
if not chunks:
    raise ValueError(
        "No text chunks to embed: `chunks` is empty "
        "(did the PDF contain extractable text?)"
    )

# Encode every chunk with the all-MiniLM-L6-v2 sentence-transformer model.
# The result is used below as a 2-D array: one row per chunk, one column
# per embedding dimension.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks)

# FAISS index
# Flat L2 index sized to the embedding width; rows are added in chunk
# order, so vector i in the index corresponds to chunks[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save
# Write the FAISS index and a pickled copy of the chunk list side by side
# under ../vectordb, creating the directory first if needed.
os.makedirs("../vectordb", exist_ok=True)
faiss.write_index(index, "../vectordb/index.faiss")

with open("../vectordb/chunks.pkl", "wb") as f:
    f.write(pickle.dumps(chunks))

