In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_chroma import Chroma

In [3]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before any
# API client is constructed.
# NOTE(review): presumably this is where the Google API key used by the
# embedding model comes from — confirm.
load_dotenv()

True

In [4]:
# Step 1: Load documents

# Directory passed to load_documents() as the location of the source PDFs
# (relative to the notebook's working directory).
DATA_PATH = 'data/'

def load_documents(data_path):
    """Load the PDF files found under ``data_path``.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as its ``path``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.
    """
    # Gather the loader configuration first so it reads as plain data.
    loader_options = dict(
        path=data_path,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    return DirectoryLoader(**loader_options).load()

# Run the load once at cell scope; the chunking cell below reads `documents`.
documents = load_documents(data_path=DATA_PATH)

In [None]:
# len(documents)

759

In [6]:
# Step 2: Split documents and create chunks

def create_chunks(data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        data: Documents to split; forwarded unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value this notebook previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            50, the value this notebook previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # Sizes are parameters so they can be tuned per call without redefining
    # the function; the defaults keep existing calls behaving as before.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

# Chunk the loaded documents; `text_chunks` is what the vector-store cell embeds.
text_chunks=create_chunks(documents)

In [None]:
# len(text_chunks)

7080

In [8]:
# Step 3: Create Embedding Model

def get_embedding_model():
    """Build the embedding client for this notebook.

    Returns:
        A ``GoogleGenerativeAIEmbeddings`` instance constructed with
        ``model='models/embedding-001'``.
    """
    return GoogleGenerativeAIEmbeddings(model='models/embedding-001')

# Instantiated once at notebook scope: create_vector_store() reads this global.
embedding_model = get_embedding_model()

In [None]:
# embedding_model

GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002847C1ACB90>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [10]:
# Step 4: Create Vector Store
def create_vector_store(docs, embedding=None, collection_name='my_collection',
                        persist_directory='chroma_db'):
    """Embed ``docs`` and store them in a Chroma collection.

    Args:
        docs: Documents (chunks) forwarded to ``Chroma.from_documents``.
        embedding: Embedding model to use. When ``None`` (the default) the
            notebook-level ``embedding_model`` global is used, which is what
            this function always did before the parameter existed.
        collection_name: Name of the Chroma collection. Defaults to
            ``'my_collection'``.
        persist_directory: Directory passed to Chroma as
            ``persist_directory``. Defaults to ``'chroma_db'``.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.

    Raises:
        NameError: If ``embedding`` is omitted and no ``embedding_model``
            global has been defined yet (e.g. cells run out of order).
    """
    if embedding is None:
        # Backward-compatible fallback to the notebook global, so existing
        # one-argument calls keep working; pass `embedding` explicitly to
        # avoid depending on cell execution order.
        embedding = embedding_model

    # NOTE(review): no `ids` are passed, so re-running this against an existing
    # persist_directory looks like it will append a second copy of every chunk
    # rather than overwrite — confirm, and consider deterministic ids or
    # clearing the collection before a rebuild.
    return Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

# Build the store from every chunk.
# NOTE(review): this presumably calls the embedding service for all chunks on
# each run — confirm cost/rate limits before re-executing this cell.
vector_store=create_vector_store(text_chunks)

In [None]:
# vector_store._collection.get()['ids']

['250679af-5737-4b5d-a640-2cf8f6114bdc',
 'c13ae70d-c83b-47d3-b3ca-74caae23513b',
 '43568d11-9e06-4196-a259-c0ecfaf58cad',
 '3ce67330-dfc7-4c4f-9ba6-5d32470b391b',
 '0a6ebc84-a31d-41a2-b1b5-794e14bc77cd',
 '3955117f-d407-4ec6-babd-cdcf57199b36',
 '2d5cf9c1-69a4-48a5-8974-581c57a8d35e',
 '740d7be7-40f0-425d-8e12-3b5d16a955d0',
 '86f146da-7906-4dc8-b6d9-efc90a773203',
 'e90c937d-8c3c-4545-a28c-cdf281fd6367',
 '14c3ba00-c99b-4389-8e7c-8d27604f7d3c',
 'c4a11711-a8e4-48fb-a894-73971c2ed322',
 'e0eaea05-45c7-4fec-a070-b3bf7aaefd18',
 '7d94df79-738e-4463-accc-e7e24bd3c367',
 '438b1eb6-d90c-438d-b946-baa55c244978',
 'f48fa34d-2f99-4d47-8c1a-218fd7cbbddc',
 '51ec42d8-6ec6-448b-a2df-734582b75b8c',
 '8282b293-dc09-45a6-a849-375b9a3c0160',
 '364a9eed-022f-4a92-b60d-aeb480fd7886',
 'cee1b66b-33e8-4ddc-829e-0a8a81f0f330',
 '4d4b4a03-fbbe-4182-b79b-fab459249d08',
 '6f632a08-2678-46bb-bd46-7fb94d3fb71a',
 '4ebdde11-92e6-4f58-a987-c8b28a841704',
 '2b850a93-221f-4f24-91cd-f69d2b99d5f9',
 '71ea1415-7b1c-

In [None]:
# len(vector_store._collection.get()['ids'])

7080

In [None]:
# vector_store._collection.get(ids='250679af-5737-4b5d-a640-2cf8f6114bdc')['documents']

['The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION']

In [None]:
# vector_store._collection.get(ids='250679af-5737-4b5d-a640-2cf8f6114bdc')['metadatas']

[{'subject': '',
  'creationdate': '2017-05-01T10:37:35-07:00',
  'moddate': '2017-05-01T10:37:35-07:00',
  'keywords': '',
  'title': '',
  'author': '',
  'creator': '',
  'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND (1).pdf',
  'page_label': '1',
  'total_pages': 759,
  'producer': 'GPL Ghostscript 9.10',
  'page': 0}]