In [5]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

Step 1: Load the raw PDF files

In [2]:
# Relative path of the directory that is scanned for PDF files.
DATA_PATH = "data/"
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only
            entries matching the glob ``*.pdf`` are loaded, each one
            through ``PyPDFLoader``.

    Returns:
        The list returned by ``DirectoryLoader.load()``.
        NOTE(review): with ``PyPDFLoader`` this is presumably one document
        per PDF page rather than per file - confirm.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()
    
# Read everything under DATA_PATH, then report how many documents came back.
documents = load_pdf_files(DATA_PATH)
print("Length of pdf", len(documents))

Length of pdf 759


Step 2: Create Chunks

In [3]:
def creat_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter.
            Defaults to 50.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
# Chunk the loaded documents and report how many pieces were produced.
text_chunks = creat_chunks(documents)
print("Length of chunks:", len(text_chunks))

Length of chunks: 7080


Step 3: Create Vector Embeddings

In [4]:
def get_embedding_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the embedding model used to vectorise text chunks.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, so
            calling with no arguments behaves as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Build the embedding model once; the FAISS indexing cell below reuses this object.
embedding_model = get_embedding_model()

Step 4: Store Embeddings in FAISS

In [6]:
# Directory where the FAISS index is saved.
DB_FAISS_PATH = "vectorstore/db_faiss"

# Building an index from zero documents fails inside FAISS.from_documents
# with an unhelpful error, so stop early with an actionable message.
if not text_chunks:
    raise ValueError(
        "No text chunks to index; check that the data directory contains readable PDF files."
    )

# Embed every chunk into a FAISS index, then write that index to disk.
db = FAISS.from_documents(text_chunks, embedding_model)
db.save_local(DB_FAISS_PATH)