In [None]:
# !pip install langchain
# !pip install pypdf
# !pip install yt_dlp
# !pip install pydub
# !pip install chromadb
# !pip install langchain_openai
# !pip install sentence_transformers

In [1]:
import os

import shutil

In [2]:
# Load dependencies
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline, HuggingFaceHub
from transformers import AutoTokenizer, pipeline, logging
from openai import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [3]:
# Load the protocol and cut it up into chunks
loader = PyPDFLoader("TITRE-protocol.pdf")
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
splits = text_splitter.split_documents(pages)
print(len(splits))

156


In [4]:
# Embed the chunks and store in Chromadb
model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True}
embedding_function = HuggingFaceBgeEmbeddings(model_name=model_name,
                                              encode_kwargs=encode_kwargs)
persist_directory = 'docs/chroma/'
# if os.path.exists("docs/chroma/") and os.path.isdir(persist_directory):
#     shutil.rmtree("docs/")
#     print(f"The directory {persist_directory} has been deleted.")
# else:
#     print(f"The directory {persist_directory} did not exist.")
# persist_directory = 'docs/chroma/'
vectordb = Chroma.from_documents(documents=splits,
                                 embedding=embedding_function,
                                 persist_directory=persist_directory)


  embedding_function = HuggingFaceBgeEmbeddings(model_name=model_name,


In [25]:
# Set up my local LLM, which is running on the M3 laptop.
llm = OpenAI(base_url="http://192.168.1.157:1234/v1/", api_key="not-needed")
# llm = OpenAI(base_url="http://192.168.1.157:11434", api_key="not-needed")

In [13]:
def getRagData(query:str, k=5):
    rag_return = vectordb.similarity_search(query, k=k)
    prompt = ""
    for r in rag_return:
        prompt = prompt + r.page_content
    prompt = query + "\n" + prompt
    return prompt

In [22]:
def askQuestion(query: str, k=5):
    prompt = getRagData(query,k)
    completion = llm.chat.completions.create(
        model="local-model", # this field is currently unused
        messages=[  
            {"role": "system", "content": "You are an expert helpful assistant."},   
            {"role": "user", "content": prompt}
        ], 
        temperature=0.01
    )
    print(completion.choices[0].message.content)

In [32]:
import requests
import json

def askQuestionOllama(query: str, k=5, model="deepseek-r1:8b"):
    prompt = getRagData(query, k)
    
    # Ollama API endpoint
    url = "http://192.168.1.157:11434/api/generate"
    
    # Prepare the payload according to Ollama API
    payload = {
        "model": model,
        "prompt": prompt,
        "system": "You are an expert helpful assistant.",
        "stream": False,
        "temperature": 0.01
    }
    
    headers = {"Content-Type": "application/json"}
    
    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload))
        response.raise_for_status()
        
        result = response.json()
        
        # Ollama returns the response in the "response" field
        if "response" in result:
            print(result["response"])
            return result["response"]
        else:
            print(f"Unexpected response structure: {result}")
            return None
    except Exception as e:
        print(f"Error during Ollama API call: {e}")
        import traceback
        traceback.print_exc()
        return None

In [9]:
def askQuestion(query: str, k=5):
    prompt = getRagData(query, k)
    try:
        completion = llm.chat.completions.create(
            model="meta-llama-3.1-8b-instruct",
            messages=[
                {"role": "system", "content": "You are an expert helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.01
        )
        print(completion.choices[0].message.content)
    except Exception as e:
        print(f"Error during API call: {e}")

In [28]:
askQuestion("Describe the inclusion and exclusion criteria of the study.",15)

The inclusion and exclusion criteria of the study are as follows:

**Inclusion Criteria (all must be met):**

1. Age < 6 years at ECMO cannulation
2. Veno-arterial (VA) mode of ECMO
3. First ECMO run during the index hospitalization

**Exclusion Criteria (any single criterion met at time of screening excludes the patient):**

1. Gestationally-corrected age < 37 weeks
2. Veno-venous (VV) mode of ECMO
3. Patients initially started on VV-ECMO and then transitioned to VA ECMO
4. ECMO used for procedural support (ECMO deployed and decannulated in procedural area with no ICU ECMO care) or ECMO duration expected to be < 24 hours
5. Limitation of care in-place or being discussed
6. Congenital bleeding disorders
7. Hemoglobinopathies
8. Primary Residence outside the country of enrollment
9. Concurrent participation in a separate interventional trial that has the potential to impact neurodevelopmental status of the patient

These criteria are used to select participants for the study, which aims

In [20]:
askQuestion("Please write a detailed checklist for starting up and activating a clinical site for the study.")


**Clinical Site Activation Checklist**

**Pre-Activation (Before Site Visit)**

1. **Review Study Protocol**: Ensure understanding of the study protocol, including inclusion and exclusion criteria, study procedures, and informed consent process.
2. **Verify IRB/REB Approval**: Confirm that the study protocol and any amendments have been approved by a properly constituted independent IRB or REB.
3. **Obtain Site-Specific Information**: Gather information about the site, including the number of patients, available resources, and any site-specific procedures.
4. **Review Regulatory Documentation**: Ensure that all necessary regulatory documents, such as informed consent forms and trial eligibility forms, are complete and accurate.
5. **Coordinate with DCC PI and/or Trial Co-Chair**: Schedule a site visit with the DCC PI and/or trial co-chair to discuss site-specific details and ensure a smooth activation process.

**Site Visit and Activation**

1. **Meet with Local Study Team**: Meet with

In [24]:
!curl -v http://192.168.1.157:11434

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


*   Trying 192.168.1.157:11434...
* Connected to 192.168.1.157 (192.168.1.157) port 11434
> GET / HTTP/1.1
> Host: 192.168.1.157:11434
> User-Agent: curl/8.7.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 200 OK
< Content-Type: text/plain; charset=utf-8
< Date: Mon, 17 Mar 2025 15:25:07 GMT
< Content-Length: 17
< 
* Connection #0 to host 192.168.1.157 left intact
Ollama is running

In [11]:
import requests
import json

url = "http://192.168.1.157:1234/v1/chat/completions"
payload = {
    "model": "meta-llama-3.1-8b-instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello, how are you?"}
    ],
    "temperature": 0.01
}
headers = {"Content-Type": "application/json"}

try:
    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=5)
    response.raise_for_status()
    print(response.json())
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

{'id': 'chatcmpl-qmpncabz5plju7yhf4fjcn', 'object': 'chat.completion', 'created': 1742224676, 'model': 'mlx-community/meta-llama-3.1-8b-instruct', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': "I'm just a computer program, so I don't have feelings, but I'm functioning properly and ready to help you with any questions or tasks you may have. How can I assist you today?"}}], 'usage': {'prompt_tokens': 47, 'completion_tokens': 88, 'total_tokens': 88}, 'stats': {}, 'system_fingerprint': 'mlx-community/meta-llama-3.1-8b-instruct'}


In [34]:
askQuestionOllama("Please write a descriptive summary of the study, including the background, significance, and reasons for doing the study.")

<think>
Okay, so I need to help the user write a descriptive summary of their study. They provided a chunk of text that seems repetitive and includes some specific details about informed consent and secondary analyses. Hmm, maybe they included too much information by accident?

Alright, first, I should break down what they're asking for. They want a background, significance, and reasons for the study. So I need to figure out what the study is about. The text mentions IRB/REB approval, informed consent forms, and modifications for local regulations. It also talks about PedsQL and secondary analyses like regression modeling and genetic findings.

Wait, the content they provided seems to be part of a protocol or ethics application, not the actual study summary. They might have pasted that by mistake. So my task is to create a concise summary based on what's provided. Let me parse through it again.

They mention participants needing an informed consent form, which is approved by IRB/REB an

"<think>\nOkay, so I need to help the user write a descriptive summary of their study. They provided a chunk of text that seems repetitive and includes some specific details about informed consent and secondary analyses. Hmm, maybe they included too much information by accident?\n\nAlright, first, I should break down what they're asking for. They want a background, significance, and reasons for the study. So I need to figure out what the study is about. The text mentions IRB/REB approval, informed consent forms, and modifications for local regulations. It also talks about PedsQL and secondary analyses like regression modeling and genetic findings.\n\nWait, the content they provided seems to be part of a protocol or ethics application, not the actual study summary. They might have pasted that by mistake. So my task is to create a concise summary based on what's provided. Let me parse through it again.\n\nThey mention participants needing an informed consent form, which is approved by IR

In [None]:
from openai import OpenAI
import logging

# Enable debugging
logging.basicConfig(level=logging.DEBUG)

# Initialize the OpenAI client
llm = OpenAI(base_url="http://192.168.1.157:1234/v1", api_key="not-needed")

def getRagData(query: str, k=5):
    rag_return = vectordb.similarity_search(query, k=k)  # Assuming vectordb is defined
    prompt = ""
    for r in rag_return:
        prompt += r.page_content
    prompt = query + "\n" + prompt
    return prompt

def askQuestion(query: str, k=5):
    prompt = getRagData(query, k)
    try:
        completion = llm.chat.completions.create(
            model="meta-llama-3.1-8b-instruct",
            messages=[
                {"role": "system", "content": "You are an expert helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.01
        )
        print(completion.choices[0].message.content)
    except Exception as e:
        print(f"Error during API call: {e}")

# Run the function
askQuestion("Describe the inclusion and exclusion criteria of the study.", 15)

In [None]:
askQuestion("What are the primary and secondary endpoints of the study. What are the main hypotheses?")


In [None]:
askQuestion("What are the risks and benefits of participating in the study?")


In [None]:
askQuestion("What are the potential adverse events and how are these defined? ")


In [39]:
askQuestionOllama("List all the visits in the study")

<think>
Okay, so I need to figure out what the user is asking for based on the query they provided. The initial part of their message seems repetitive and includes mentions of study visits, specifically Visits 6 and 7 at 6 and 9 months post-randomization. They also mention using PCPC, POPC scales, and FSS scores during these visits, either in-person or via medical records if data collection isn't feasible.

Then, the user pasted a section about participant selection and withdrawal, listing inclusion and exclusion criteria, recruitment methods, and reasons for early withdrawal. It's clear that this is part of a study protocol, possibly in clinical research.

The user ended their query with "Please list all the visits in the study." Given the initial repetition, I think they might have made a mistake or were testing something. But focusing on their main request: listing all visits.

Looking at the text, there are mentions of Visit 6 and Visit 7. However, the text is repetitive, so it's p

'<think>\nOkay, so I need to figure out what the user is asking for based on the query they provided. The initial part of their message seems repetitive and includes mentions of study visits, specifically Visits 6 and 7 at 6 and 9 months post-randomization. They also mention using PCPC, POPC scales, and FSS scores during these visits, either in-person or via medical records if data collection isn\'t feasible.\n\nThen, the user pasted a section about participant selection and withdrawal, listing inclusion and exclusion criteria, recruitment methods, and reasons for early withdrawal. It\'s clear that this is part of a study protocol, possibly in clinical research.\n\nThe user ended their query with "Please list all the visits in the study." Given the initial repetition, I think they might have made a mistake or were testing something. But focusing on their main request: listing all visits.\n\nLooking at the text, there are mentions of Visit 6 and Visit 7. However, the text is repetitive,

In [38]:
askQuestion("List all the visits in the study")

Based on the provided information, there are two visits mentioned:

1. Visit 6: 6 month post-randomization follow-up (Timing 6 ± 1 month)
2. Visit 7: 9 month post-randomization follow-up (Timing 9 ± 1 month)

Both visits will include the collection of data via a scripted form or in-person, depending on the situation, and will include the following:

- PCPC (Pain Catastrophizing Scale) and POPC (Pain Outcome Profile) scales
- FSS (Fear-Avoidance Beliefs Questionnaire) scores

If it's not feasible to collect these data via the scripted form, the information will be obtained from medical records.
