In [1]:
#!pip install -q transformers langchain-huggingface langchain_experimental

In [6]:
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from docx import Document
from langchain_docling import DoclingLoader
from langchain_ollama.llms import OllamaLLM
from langchain_docling.loader import ExportType

def load_doc_documents(file_path):
    # document_loader = Docx2txtLoader(file_path)
    document_loader = DoclingLoader(file_path,export_type=ExportType.MARKDOWN)
    return document_loader.load()

In [2]:
from pydantic import BaseModel, Field
from typing import Optional, List
from langchain.output_parsers import PydanticOutputParser

class ClinicalTrialDocument(BaseModel):
    ParticipantInformation:str=Field(None,description="Reason for participation, how participation can help.")
    Study:str=Field(None,description="goal of the study, Sponsor for the study")
    DrugUnderStudy:str=Field(None,description="Drug name, how it works, risks involved")
    TrialOverview:str=Field(None,description="The trial's objectives, treatment or intervention being tested, key outcomes being measured, medical care provided, trial period (calendar days)")
    TreatmentDosage:str=Field(None,description="The treatment regimen being tested, including dosage, frequency, and duration of treatment")
    SafetyAdverseEvents:str=Field(None,description="Any adverse events (AEs) and serious adverse events (SAEs) reported, their frequency and severity, and how they were managed")
    EfficacyResults:str=Field(None,description="Key findings on the efficacy of the treatment, including comparison to placebo or standard of care, and any statistical analysis performed")
    EthicalConsiderations:str=Field(None,description="Ethical guidelines followed, informed consent procedures, and any other ethical concerns addressed in the trial")
    StatisticalMethods:str=Field(None,description="The statistical techniques used for data analysis, including any major statistical tests and how outcomes were measured")
    ConclusionRecommendations:str=Field(None,description="The final conclusions of the trial, including whether the treatment was successful, any recommendations for future research, and any potential next steps (e.g., regulatory approval)")
    RegulatoryCompliance:str=Field(None,description="Any regulatory approvals or compliance measures mentioned, such as adherence to FDA or EMA guidelines")
    TrialPhasesMilestones:str=Field(None,description="The phases of the trial (I, II, III, IV), key milestones, and the number of participants in each phase")
    ParticipantExperience:str=Field(None,description="Any insights into the participant experience, including trial procedures, expectations, and feedback from participants (if available)")

pydantic_parser=PydanticOutputParser(pydantic_object=ClinicalTrialDocument)
format_instructions = pydantic_parser.get_format_instructions()

In [3]:
print(format_instructions)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"ParticipantInformation": {"default": null, "description": "Reason for participation, how participation can help.", "title": "Participantinformation", "type": "string"}, "Study": {"default": null, "description": "goal of the study, Sponsor for the study", "title": "Study", "type": "string"}, "DrugUnderStudy": {"default": null, "description": "Drug name, how it works, risks involved", "title": "Drugunderstudy", "type": "string"}, "TrialOverview": {"default": null, "description": "The trial's objectives, treatment or intervention

In [7]:
LANGUAGE_MODEL = OllamaLLM(model="llama3.2", params={"temperature": 0.01, "seed": 42, "top_k": 1})

LANGUAGE_MODEL=OllamaLLM(model="deepseek-r1")

In [None]:
# def summarize_with_llm(context):
#     """Summarizes it with an LLM."""
    
#     prompt = f"""
#     You are an expert in medical context understanding with deep knowledge of clinical trials. Your task is to extract and summarize all relevant and granular details from the given clinical trial document context passed to you, while maintaining precision, depth, and completeness.

#     Instructions:
#     Retrieve all key facts, numerical data, contextual insights, implicit relationships, metadata, and supporting references from the context. Ensure no critical information is omitted, even if seemingly minor. However, limit finer details regarding dosing and schedules.

#     Summarize the clinical trial context in a structured format, covering the following aspects:
#         - **Participant Information**: Reason for participation, how participation can help.
#         - **Study**: goal of the study, Sponsor for the study. 
#         - **Drug Under Study**: Drug name, how it works, risks involved.
#         - **Trial Overview**: The trial's objectives, treatment or intervention being tested, key outcomes being measured, medical care provided, trial period (calendar days).  
#         - **Treatment and Dosage**: The treatment regimen being tested, including dosage, frequency, and duration of treatment.  
#         - **Safety and Adverse Events**: Any adverse events (AEs) and serious adverse events (SAEs) reported, their frequency and severity, and how they were managed.  
#         - **Efficacy Results**: Key findings on the efficacy of the treatment, including comparison to placebo or standard of care, and any statistical analysis performed.  
#         - **Ethical Considerations**: Ethical guidelines followed, informed consent procedures, and any other ethical concerns addressed in the trial.  
#         - **Statistical Methods**: The statistical techniques used for data analysis, including any major statistical tests and how outcomes were measured.  
#         - **Conclusion and Recommendations**: The final conclusions of the trial, including whether the treatment was successful, any recommendations for future research, and any potential next steps (e.g., regulatory approval).  
#         - **Regulatory Compliance**: Any regulatory approvals or compliance measures mentioned, such as adherence to FDA or EMA guidelines.  
#         - **Trial Phases and Milestones**: The phases of the trial (I, II, III, IV), key milestones, and the number of participants in each phase.  
#         - **Patient/Participant Experience**: Any insights into the participant experience, including trial procedures, expectations, and feedback from participants (if available).

#     Tone: 
#         - Present the summary with a structured and professional tone, suitable for medical research and regulatory review.
#         Output under each section should be clear, detailed, and organized, ensuring a high level of accuracy in medical document analysis.

#     Output Format : {format_instructions}   
    
#     context: {context}
#     """
#     #print(context)
#     response = LANGUAGE_MODEL.invoke(prompt)
    
#     return response

In [9]:
# Example Usage
docx_path = "./HRP-503 - SAMPLE Biomedical Protocol.docx"  # Replace with actual file path
text_data = load_doc_documents(docx_path)
#print(text_data[0])

In [17]:
# summary = summarize_with_llm(text_data)
# print(summary)

In [None]:
# from langchain_core.prompts import PromptTemplate
# # Prompt template
# prompt = PromptTemplate.from_template("""
# You are an expert in medical context understanding with deep knowledge of clinical trials. Your task is to extract and summarize all relevant and granular details from the given clinical trial document context passed to you, while maintaining precision, depth, and completeness.

# Instructions:
#     - Retrieve all key facts, numerical data, contextual insights, implicit relationships, metadata, and supporting references from the context. 
#     - Ensure no critical information is omitted, even if seemingly minor. However, limit finer details regarding dosing and schedules.

# Summarize the clinical trial context in a structured format, covering the following aspects:
#     - **Participant Information**: Reason for participation, how participation can help.
#     - **Study**: goal of the study, Sponsor for the study. 
#     - **Drug Under Study**: Drug name, how it works, risks involved.
#     - **Trial Overview**: The trial's objectives, treatment or intervention being tested, key outcomes being measured, medical care provided, trial period (calendar days).  
#     - **Treatment and Dosage**: The treatment regimen being tested, including dosage, frequency, and duration of treatment.  
#     - **Safety and Adverse Events**: Any adverse events (AEs) and serious adverse events (SAEs) reported, their frequency and severity, and how they were managed.  
#     - **Efficacy Results**: Key findings on the efficacy of the treatment, including comparison to placebo or standard of care, and any statistical analysis performed.  
#     - **Ethical Considerations**: Ethical guidelines followed, informed consent procedures, and any other ethical concerns addressed in the trial.  
#     - **Statistical Methods**: The statistical techniques used for data analysis, including any major statistical tests and how outcomes were measured.  
#     - **Conclusion and Recommendations**: The final conclusions of the trial, including whether the treatment was successful, any recommendations for future research, and any potential next steps (e.g., regulatory approval).  
#     - **Regulatory Compliance**: Any regulatory approvals or compliance measures mentioned, such as adherence to FDA or EMA guidelines.  
#     - **Trial Phases and Milestones**: The phases of the trial (I, II, III, IV), key milestones, and the number of participants in each phase.  
#     - **Patient/Participant Experience**: Any insights into the participant experience, including trial procedures, expectations, and feedback from participants (if available).

# Tone: 
#     - Present the summary with a structured and professional tone, suitable for medical research and regulatory review.
#     Output under each section should be clear, detailed, and organized, ensuring a high level of accuracy in medical document analysis.
# context: {document_text}
# """,
# partial_variables={"format_instructions": format_instructions})

In [None]:
user_question="what is the goal of the study in the given context?"    

prompt = f"""
    You are an expert in medical context understanding with deep knowledge of clinical trials given the context.Your task is to respond to the questions asked by user from the given clinical trial document context passed to you.

    Instructions:
        - You must limit your understading to the document itself, do not hallucinate. If the question is out of the context repsond with i do not know.

    context: {text_data[0]}

    Question : {user_question}
"""

response = LANGUAGE_MODEL.invoke(prompt)

In [None]:
print(response)

In [11]:
print(prompt)


    You are an expert in medical context understanding with deep knowledge of clinical trials given the context.Your task is to respond to the questions asked by user from the given clinical trial document context passed to you.

    Instructions:
        - You must limit your understading to the document itself, do not hallucinate. If the question is out of the context repsond with i do not know.

    context: page_content='PROTOCOL TITLE:

Rapid Elimination Procedure of Teriflunomide

PRINCIPAL INVESTIGATOR:

Alice Roberts, M.D.

USF Department of Neurology

813-974-5555

ARoberts@usf.edu

VERSION NUMBER/DATE:

Version 1, 9/17/2019

REVISION HISTORY

| Revision #   | Version Date   | Summary of Changes   | Consent Change?   |
|--------------|----------------|----------------------|-------------------|
|              |                |                      |                   |
|              |                |                      |                   |
|              |              

In [None]:
print(response)

The text you provided appears to be a sample biomedical protocol, likely used as a template or example in medical research. Based on the content, it seems that the primary goal of the study is to investigate the pharmacokinetics and safety of a new medication.

However, the specific goal of the study is not explicitly stated in the text. It appears to be an experimental design involving multiple phases, including prescreening, administration of the investigational drug, follow-up assessments, and potential withdrawals due to adverse events or other reasons.

If I had to infer the primary goal of the study based on the content, it might be something like:

"To evaluate the safety and efficacy of a new medication in healthy subjects, assessing its pharmacokinetic profile and identifying any potential side effects or adverse reactions."

Please note that this is an educated guess, and without further context or information from the researchers, it's difficult to provide a more specific an

Another way to query with support for pydantic parser

In [None]:
from langchain_core.prompts import PromptTemplate
# Prompt template
prompt = PromptTemplate.from_template("""
You are an expert in medical context understanding with deep knowledge of clinical trials.Your task is to respond to the questions asked by user from the given clinical trial document context passed to you.

Instructions:
    - Retrieve all key facts, numerical data, contextual insights, implicit relationships, metadata, and supporting references from the context. 
    - Ensure no critical information is omitted, even if seemingly minor. However, limit finer details regarding dosing and schedules.
                                      
context: {document_text}
Question : {user_question}
""",
partial_variables={"format_instructions": format_instructions})

In [None]:
chain = prompt | LANGUAGE_MODEL
answer = chain.invoke({"document_text": text_data[0],"user_question":"what is the goal of the study?"})
print(answer)