In [None]:
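# Uncomment to install the dependencies into this kernel's environment (consider pinning versions):
# %pip install openai PyMuPDF faiss-cpu numpy python-dotenv langchain langchain-openai langchain-community tqdm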

In [None]:
import os
import sqlite3
import fitz  # PyMuPDF
import numpy as np
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from tqdm import tqdm
import glob

# Load settings from a .env file before any API client is constructed
# (presumably this supplies the OpenAI credentials used by the embeddings client — confirm).
load_dotenv()

In [None]:
# Root directories that are searched recursively for PDF files.
root_folders = [
    "folder1",
    "folder2",
]  # Replace with your actual folders

# Collect every *.pdf under each root, keeping the roots in the order listed above.
pdf_files = []
for root_folder in root_folders:
    # glob gives no ordering guarantee (it depends on the file system), so sort
    # each folder's matches to keep chunk order reproducible across runs.
    found_pdfs = sorted(glob.glob(os.path.join(root_folder, "**/*.pdf"), recursive=True))
    pdf_files.extend(found_pdfs)

print(f"Discovered {len(pdf_files)} PDFs:")
for pdf_file in pdf_files:
    print(f"- {pdf_file}")

In [None]:
def extract_pdf_text(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, in page order, with
        nothing inserted between pages.
    """
    # The context manager closes the document (and its file handle) even when
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return ''.join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk re-joins
    its words with single spaces, so the original whitespace is not preserved.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk. Must be at least 1.

    Returns:
        list[str]: The chunks in order. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. A negative size would
            otherwise silently yield no chunks and drop the whole text.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def process_pdfs(pdf_files):
    """Extract and chunk the text of each PDF.

    Args:
        pdf_files: Iterable of PDF paths, handled in the order given.

    Returns:
        tuple[list, list]: ``(chunks, sources)`` as parallel lists, where
        ``sources[i]`` is the path of the PDF that ``chunks[i]`` came from.
    """
    all_chunks = []
    all_sources = []
    for path in pdf_files:
        pieces = chunk_text(extract_pdf_text(path))
        all_chunks += pieces
        # Repeat the path once per chunk so both lists stay aligned.
        all_sources += [path] * len(pieces)
    return all_chunks, all_sources

def init_db(db_name="chunks.db"):
    """Open the SQLite database and ensure the ``chunks`` table exists.

    Args:
        db_name: Database path handed to ``sqlite3.connect``.

    Returns:
        sqlite3.Connection: An open connection; the caller must close it.
    """
    connection = sqlite3.connect(db_name)
    # IF NOT EXISTS makes this safe to call against an existing database.
    connection.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY,
            source TEXT,
            chunk TEXT
        )
    """)
    connection.commit()
    return connection

def store_chunks(conn, chunks, sources):
    """Insert one row per chunk into the ``chunks`` table and commit.

    Args:
        conn: Open SQLite connection whose database has a ``chunks`` table.
        chunks: Chunk texts.
        sources: Source label for each chunk, in the same order as ``chunks``.
    """
    # Rows are (source, chunk) pairs; zip stops at the shorter of the two inputs.
    rows = zip(sources, chunks)
    conn.executemany("INSERT INTO chunks (source, chunk) VALUES (?, ?)", rows)
    conn.commit()

# Extract, chunk and persist the text of every discovered PDF.
# NOTE(review): the table is created with IF NOT EXISTS and rows are only ever
# inserted, so re-running this cell appends the same chunks again — confirm
# whether the table should be cleared first.
conn = init_db()
try:
    chunks, sources = process_pdfs(pdf_files)
    store_chunks(conn, chunks, sources)
finally:
    # Release the SQLite handle even if a PDF fails to parse or the insert
    # raises; otherwise the connection stays open for the life of the kernel.
    conn.close()

print(f"Processed and stored chunks from {len(pdf_files)} PDFs.")

In [None]:
# Embedding client shared by every call to batch_embeddings.
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

def batch_embeddings(chunks, metadatas, batch_size=100):
    """Build a FAISS vector store from the chunks, one batch at a time.

    The first batch creates the store with ``FAISS.from_texts`` using the
    module-level ``embeddings_model``; each later batch is appended with
    ``add_texts``.

    Args:
        chunks: Sequence of chunk texts.
        metadatas: Sequence of metadata dicts, sliced in step with ``chunks``.
        batch_size: Number of chunks sent per call.

    Returns:
        The populated vector store, or ``None`` when ``chunks`` is empty.
    """
    store = None
    batch_starts = range(0, len(chunks), batch_size)
    for start in tqdm(batch_starts, desc="Creating embeddings"):
        stop = start + batch_size
        texts = chunks[start:stop]
        metas = metadatas[start:stop]

        if store is not None:
            store.add_texts(texts, metadatas=metas)
            continue

        # Only the first batch reaches this point: it creates the store.
        store = FAISS.from_texts(texts, embeddings_model, metadatas=metas)

    return store

# One metadata dict per chunk, recording which PDF the chunk came from.
metadata_list = [{"source": source} for source in sources]

vectorstore = batch_embeddings(chunks, metadata_list, batch_size=100)

# batch_embeddings returns None when there are no chunks (for example when no
# PDFs were found). Fail with an actionable message instead of an
# AttributeError on None.save_local.
if vectorstore is None:
    raise RuntimeError(
        "No text chunks were produced, so there is nothing to embed or save. "
        "Check that root_folders points at directories containing PDFs."
    )

vectorstore.save_local("faiss_index_directory")

print("FAISS vectorstore and metadata successfully saved.")