#### On my machine (Mike) I use conda env "trials"

In [186]:
from dotenv import load_dotenv
import os
# Load settings from a .env file (python-dotenv), then read the OpenAI key
# from the environment; the value is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

#### Get protocol reference, vectorize and store in Qdrant

In [187]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source PDF for this run; swap the active line to ingest the protocol instead.
# file_path = './documents/protocol.pdf'
file_path = './documents/consent.pdf'

# The loader returns a list of documents (one per PDF page, see the count
# printed below); copy them into a fresh list for the stitching step that follows.
loader = PyMuPDFLoader(file_path)
page = loader.load()
separate_pages = [*page]
print(f"Number of separate pages: {len(separate_pages)}")


Number of separate pages: 19


In [189]:

# PyMuPDFLoader loads every page into its own document.
# The splitter only chunks within a single text, so the pages are stitched
# into ONE string first; that way chunks (and their overlap) can span the
# boundary between two PDF pages.
document_string = "".join(pdf_page.page_content for pdf_page in separate_pages)
print(f"Length of the document string: {len(document_string)}")


Length of the document string: 44940


In [190]:
import tiktoken

# CHOP IT UP
def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the gpt-4o tokenizer."""
    encoder = tiktoken.encoding_for_model("gpt-4o")
    return len(encoder.encode(text))

# Split on a token budget: chunk size and overlap are measured with
# tiktoken_len rather than by character count.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,
    chunk_size=200,
    chunk_overlap=50,
)
text_chunks = text_splitter.split_text(document_string)
print(f"Number of chunks: {len(text_chunks)} ")

# NOTE: this maximum is in CHARACTERS (len of the string), not tokens, so it
# is expected to be larger than chunk_size. It is 0 when there are no chunks.
max_chunk_size = max(map(len, text_chunks), default=0)
print(f"Maximum chunk size: {max_chunk_size}")

# Wrap each text chunk in a Document so metadata can be attached later.
document = [Document(page_content=chunk_text) for chunk_text in text_chunks]
print(f"Length of  document: {len(document)}")



Number of chunks: 67 
Maximum chunk size: 1031
Length of  document: 67


In [191]:
from qdrant_client import QdrantClient
import defaults

# Shared settings are read from the project's `defaults` module: the Qdrant
# endpoint and the embedding model used for both storing and querying.
qdrant_url = defaults.default_url
embedding_model = defaults.default_embedding_model


client=<openai.resources.embeddings.Embeddings object at 0x10bc89150> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x10da0c4d0> model='text-embedding-3-small' dimensions=None deployment='text-embedding-ada-002' openai_api_version=None openai_api_base=None openai_api_type=None openai_proxy=None embedding_ctx_length=8191 openai_api_key=None openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True
:memory:
http://localhost:6333


In [211]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from langchain_qdrant import QdrantVectorStore
import hashlib

"""
This code creates a hash for every chunk and checks to see if that chunk already exists in the
vector database.  We only want one collection in Qdrant, but want to make sure that if a user
selects a document that has already been embedded and stored, it does not get stored again.  We
also add metadata for the document title, so that we can make our retriever focus on documents of
interest.  For example, after some usage, the application might have 20 documents for the user to 
select from.  We want the retriever to be exactly right for the documents that they selected.

This could also be useful if different versions of documents are in existence.  We would not want to
recreate a large vectorstore.  But the user could select the most recent version.
"""


def get_document_hash(doc_content):
    """Return the hexadecimal MD5 digest of `doc_content` (encoded as UTF-8).

    The digest serves as a fingerprint for de-duplicating chunks; it is not
    used for anything security related.
    """
    hasher = hashlib.md5()
    hasher.update(doc_content.encode("utf-8"))
    return hasher.hexdigest()

# Single place for the collection name used by every Qdrant call in this cell.
COLLECTION_NAME = "protocol_collection"

# Tag every chunk with a content hash (used for de-duplication) and with the
# source file name (used by the retriever to restrict results to the
# documents the user selected).
document_title = file_path.split('/')[-1]
for doc in document:
    doc.metadata['content_hash'] = get_document_hash(doc.page_content)
    doc.metadata['document_title'] = document_title

client = QdrantClient(url=qdrant_url)

# If the collection exists, then we need to check to see if our document is already
# present, in which case we would not want to store it again.
if client.collection_exists(COLLECTION_NAME):
    print("Collection exists")
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=COLLECTION_NAME,
        url=qdrant_url
    )

    # Find which of THIS document's chunks are already stored. The title is
    # part of the condition on purpose: a chunk whose text also occurs in a
    # different stored document must still be stored under this title,
    # otherwise a title-filtered retriever could never return it.
    scroll_filter = rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.document_title",
                match=rest.MatchValue(value=document_title)
            ),
            rest.FieldCondition(
                key="metadata.content_hash",
                match=rest.MatchAny(
                    any=[doc.metadata['content_hash'] for doc in document]
                )
            ),
        ]
    )

    # Page through all matches instead of trusting a single limited page, so
    # the check stays correct even if more points match than one page holds.
    existing_hashes = set()
    next_offset = None
    while True:
        points, next_offset = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=scroll_filter,
            limit=256,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )
        existing_hashes.update(
            point.payload.get('metadata', {}).get('content_hash')
            for point in points
        )
        if next_offset is None:
            break

    # Keep only the chunks whose hash is not stored yet for this title.
    new_docs = [
        doc for doc in document
        if doc.metadata['content_hash'] not in existing_hashes
    ]

    if new_docs:
        qdrant_vectorstore.add_documents(new_docs)

    # Report skipped chunks as "everything that was not added"; counting the
    # unique hashes would under-report when a document repeats a chunk.
    print(f"Added {len(new_docs)} new documents")
    print(f"Skipped {len(document) - len(new_docs)} existing documents")
else:
    print("Collection does not exist")                           #So we go ahead and just add the documents
    qdrant_vectorstore = QdrantVectorStore.from_documents(
        documents=document,
        embedding=embedding_model,
        collection_name=COLLECTION_NAME,
        url=qdrant_url
    )


Collection exists
Added 0 new documents
Skipped 67 existing documents


In [208]:
from qdrant_client.http import models as rest

"""
This code sets up the search type but more importantly it has the filter
set up correctly.  We get a list of document titles that we want to include
in the filter, and pass it into the function, returning the retriever.

"""

def create_protocol_retriever(document_titles, k=5, fetch_k=50):
    """Build an MMR retriever restricted to the given document titles.

    Args:
        document_titles: Title (file name) or iterable of titles to search in.
            A single string is treated as a one-element list.
        k: Number of documents the MMR search returns.
        fetch_k: Number of candidates the MMR search evaluates before
            picking the final `k`.

    Returns:
        A retriever over the module-level `qdrant_vectorstore` that only
        returns chunks whose `metadata.document_title` is one of the titles.
    """
    # A bare string would otherwise be passed where a list of values is expected.
    if isinstance(document_titles, str):
        titles = [document_titles]
    else:
        titles = list(document_titles)

    title_filter = rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.document_title",
                match=rest.MatchAny(any=titles)
            )
        ]
    )
    return qdrant_vectorstore.as_retriever(
        search_type='mmr',                                  #mmr is experiment for me
        search_kwargs={
            'filter': title_filter,
            'k': k,                                         # for the mmr search it will only return k
            'fetch_k': fetch_k,                             # but will evaluate fetch_k candidates
        }
    )

# Example: build a retriever limited to the two documents of interest.
document_titles = [
    "consent.pdf",
    "protocol.pdf",
]
protocol_retriever = create_protocol_retriever(document_titles)



#### Test protocol retriever

In [210]:
# pprint is used below, so it has to be imported for this cell to run on a
# fresh kernel.
from pprint import pprint

# `invoke` is the current retriever entry point; `get_relevant_documents`
# is deprecated in recent LangChain releases.
risks = protocol_retriever.invoke("Risks of study")
for i, doc in enumerate(risks, 1):
    print(f"Document {i}")
    print("-" * 50)
    pprint(doc.page_content)
    print("-" * 50)

Document 1
--------------------------------------------------
('49\n'
 '9.1.2\n'
 'Study Procedures and Materials\n'
 '. . . . . . . . . . . . . . . . . . . . .\n'
 '50\n'
 '9.1.3\n'
 'Potential Risks of Study Participation\n'
 '. . . . . . . . . . . . . . . . . .\n'
 '51\n'
 '9.1.4\n'
 'Alternatives to Study Participation . . . . . . . . . . . . . . . . . . . .\n'
 '52\n'
 '9.2\n'
 'Adequacy of Protection Against Risks . . . . . . . . . . . . . . . . . . . . '
 '. .\n'
 '52\n'
 '9.2.1\n'
 'Parental Permission, Informed Consent and Assent . . . . . . . . . . .\n'
 '52\n'
 '9.2.2\n'
 'Institutional Review Board and Human Research Protection\n'
 '. . . . . .\n'
 '54\n'
 '9.2.3')
--------------------------------------------------
Document 2
--------------------------------------------------
('study, you will need to have an IV tube (catheter) available through which '
 'to give the study drug.\n'
 'If for some reason you do not have an IV catheter available during the time '
 'period in w

In [138]:
# total_tokens=0
# for risk in risks:
#     total_tokens+= tiktoken_len(risk.page_content)
# print(f"Tokens in context: {total_tokens}")
# for i, doc in enumerate(risks, 1):
#     print(f"Document {i}")
#     print("-" * 50)
#     pprint(doc.page_content)
#     print("-" * 50)

In [139]:
# benefits = protocol_retriever.get_relevant_documents("Benefits of study")
# for i, doc in enumerate(benefits, 1):
#     print(f"Document {i}")
#     print("-" * 50)
#     pprint(doc.page_content)
#     print("-" * 50)