In [1]:
!pip install pymupdf faiss-cpu sentence-transformers




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import faiss
import pickle
import fitz
from sentence_transformers import SentenceTransformer

# Identifier of the SentenceTransformer model used to embed every text chunk.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
# Loaded once at notebook level; build_faiss_from_folder reads this global.
embedder = SentenceTransformer(EMBED_MODEL)
# Embedding size reported by the model; passed to faiss.IndexFlatL2 when the
# index is created.
dimension = embedder.get_sentence_embedding_dimension()

def _split_into_chunks(text, chunk_size):
    """Return consecutive ``chunk_size``-character slices of ``text``, dropping blank ones."""
    pieces = (text[start:start + chunk_size] for start in range(0, len(text), chunk_size))
    # Whitespace-only slices carry no content and would only add noise vectors.
    return [piece for piece in pieces if piece.strip()]


def build_faiss_from_folder(folder_path, index_file="faiss.index", metadata_file="index.pkl", chunk_size=800):
    """Embed the text of every PDF in ``folder_path`` and persist a FAISS index.

    Each page is split into fixed-size character chunks. Every chunk is
    embedded with the module-level ``embedder`` and added to a
    ``faiss.IndexFlatL2`` of size ``dimension`` (also module-level).

    Args:
        folder_path: Directory scanned (non-recursively) for ``*.pdf`` files;
            the extension match is case-insensitive.
        index_file: Path the FAISS index is written to.
        metadata_file: Path of the pickle file that stores
            ``{"documents": [...], "metadata": [...]}``. The two lists are
            parallel: entry ``i`` describes the ``i``-th vector added to the
            index, and each metadata entry is ``"<pdf path> - page <n>"``
            with 1-based page numbers.
        chunk_size: Maximum number of characters per chunk (default 800).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if no text chunk
            could be extracted (no PDFs, or only blank pages). In that case
            neither output file is written.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    documents, metadata = [], []

    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore the index row ids) stable between runs.
    pdf_files = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(".pdf")
    ]
    print(f"Found {len(pdf_files)} PDF files in '{folder_path}'")

    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file}")
        # The context manager closes the document even if extraction raises.
        with fitz.open(pdf_file) as doc:
            for page_num, page in enumerate(doc, start=1):
                source = f"{pdf_file} - page {page_num}"
                for chunk in _split_into_chunks(page.get_text("text"), chunk_size):
                    documents.append(chunk)
                    metadata.append(source)

    print(f"Total chunks created: {len(documents)}")
    if not documents:
        # Fail early with a clear message instead of handing an empty batch
        # to the embedder / FAISS and leaving partial output behind.
        raise ValueError(f"No extractable text found in PDF files under '{folder_path}'")

    print("Generating embeddings...")
    embeddings = embedder.encode(documents, convert_to_numpy=True)

    print("Building FAISS index...")
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    faiss.write_index(index, index_file)
    with open(metadata_file, "wb") as f:
        pickle.dump({"documents": documents, "metadata": metadata}, f)

    print(f"Saved {len(documents)} chunks into {index_file} and {metadata_file}")

# Folder holding the source PDFs, given as a relative path.
books_folder = "Text Books Of Medicine"  
# Uses the function's default output paths: "faiss.index" and "index.pkl".
build_faiss_from_folder(books_folder)


Found 30 PDF files in 'Text Books Of Medicine'
Processing: Text Books Of Medicine\(Churchill’s Pocketbook  Differential Diagnosis 3ed.pdf
Processing: Text Books Of Medicine\ABC of urology 3ed.pdf
Processing: Text Books Of Medicine\Andrews’ Diseases of the Skin Clinical Atlas.pdf
Processing: Text Books Of Medicine\Blueprints Of Family Medicine 3ed.pdf
Processing: Text Books Of Medicine\Blueprints Of Neurology 3ed.pdf
Processing: Text Books Of Medicine\Bryan Epidemiology and Biostatistics 2ed [Shared by Ussama Maqbool].pdf
Processing: Text Books Of Medicine\Color Atlas of Autopsy [Shared by Ussama Maqbool].pdf
Processing: Text Books Of Medicine\Color Atlas of Forensic Medicine [Shared by Ussama Maqbool].pdf
Processing: Text Books Of Medicine\Conrad Fischer - Master the Boards USMLE Step 2 CK 2019.pdf
Processing: Text Books Of Medicine\Davidson_s Essentials of Medicine 2ed.pdf
Processing: Text Books Of Medicine\Davidson’s Principles and practice of medicine 23ed.pdf
Processing: Text Books