# In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import pickle

# Build a FAISS vector index from every PDF in a folder and save it to disk.

# Folder containing the source PDF files.
pdf_folder = "./Datasets"  # Update with your folder path

# Collect the PDF paths. The extension check is case-insensitive so files
# named "*.PDF" are not silently skipped, and the listing is sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the document (and therefore index) order vary between runs.
pdf_files = [
    os.path.join(pdf_folder, file)
    for file in sorted(os.listdir(pdf_folder))
    if file.lower().endswith(".pdf")
]
if not pdf_files:
    # Fail here with a clear message instead of deep inside the index build.
    raise FileNotFoundError(f"No PDF files found in {pdf_folder!r}")

# Load each PDF and gather everything the loaders return into one list.
documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    documents.extend(loader.load())

# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=30).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)
chunks = text_splitter.split_documents(documents)
if not chunks:
    # Nothing to index (e.g. PDFs without extractable text); stop before the
    # embedding model is loaded rather than failing later with an opaque error.
    raise ValueError(
        f"No text chunks were produced from {len(pdf_files)} PDF file(s) "
        f"in {pdf_folder!r}"
    )

# Embedding function used to vectorise the chunks.
# NOTE(review): multi_process=True presumably starts worker processes; when
# this runs as a plain script (not a notebook) on a spawn-based platform it
# likely needs an `if __name__ == "__main__":` guard — confirm.
embedding_function = HuggingFaceEmbeddings(show_progress=True, multi_process=True)

# Create the FAISS vector store from the chunks.
vector_store = FAISS.from_documents(chunks, embedding_function)

# Save the FAISS index under ./faiss_index/faiss_index.
faiss_folder = "./faiss_index"
os.makedirs(faiss_folder, exist_ok=True)
faiss_index_path = os.path.join(faiss_folder, "faiss_index")
vector_store.save_local(faiss_index_path)

# Also pickle the embedding function object (optional). This stores the
# configured embedding wrapper itself, not the computed vectors — those are
# part of the index saved above.
# NOTE(review): unpickling executes arbitrary code, so only load this file
# back from a trusted location.
with open(os.path.join(faiss_folder, "faiss_embeddings.pkl"), "wb") as f:
    pickle.dump(embedding_function, f)