### Create a conda environment (or a venv, if you prefer) to work in. There are many ways to do this; for example:
`conda create --name <name_of_env> python==3.11`

#### When you run the next cell, you will be prompted to install the `ipykernel` package; do that first.

In [1]:
# Install the core packages for LangChain, Ollama, and related components.
# `-U` upgrades packages that are already installed, and no versions are pinned here,
# so re-running this cell at a later date may install newer releases.
%pip install -U langchain langchain-ollama langchain-community langchain-huggingface faiss-cpu huggingface_hub pandas python-dotenv

Note: you may need to restart the kernel to use updated packages.


#### Environment Setup and Imports

In [5]:
# Import necessary libraries
import os
import pandas as pd
import json
import logging
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import time
import atexit

# Read variables from a local .env file into the environment
load_dotenv()

# Configure logging once for the notebook and create the shared logger
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Generation settings that are handed to the Ollama query helper
generation_kwargs = dict(max_tokens=500, temperature=0.5)

#### Initialization of Vector Stores and Embeddings (Create vector storage that will be drawn from for RAG)

In [6]:
# Install remaining packages (tqdm, tenacity and tiktoken are imported further down in this cell)
%pip install tqdm tenacity tiktoken

import pandas as pd
import os
from dotenv import load_dotenv
import traceback
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
from huggingface_hub import login
import tiktoken

# Load variables from the .env file; add your Hugging Face token there as HF_TOKEN
load_dotenv()

# Read the Hugging Face access token from the environment.
# NOTE(review): the token is only read here; nothing visible in this cell passes it
# to `login` — confirm whether authentication is actually required for the dataset.
hf_access_token = os.getenv('HF_TOKEN')

# Alternative dataset (disabled):
#PATH_TO_DATA = "hf://datasets/zhengyun21/PMC-Patients/PMC-Patients.csv"
PATH_TO_DATA = "hf://datasets/ncbi/Open-Patients/Open-Patients.jsonl"
BATCH_SIZE = 10  # Number of chunks submitted to the embedding model per batch; adjust based on your needs
PATH_TO_VECTORDB = "./db/faiss_index"  # Directory the FAISS index is saved to
TOKEN_LIMIT = 125  # Maximum number of tokens per chunk

# Tokenizer used only to count and split tokens when chunking text.
# NOTE(review): "cl100k_base" is a tiktoken encoding rather than a LLaMA or embedding-model
# tokenizer, so chunk sizes are presumably approximate for other models — confirm, and
# replace with the tokenizer of the model you actually use if exact limits matter.
tokenizer = tiktoken.get_encoding("cl100k_base")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [3]:
def chunk_text(text, token_limit):
    """
    Split a text into consecutive chunks of at most `token_limit` tokens.

    The text is encoded with the module-level `tokenizer`, the resulting token
    list is cut into fixed-size windows, and each window is decoded back into a
    string. Chunks are returned in their original order; only the last chunk
    may be shorter than `token_limit`.

    Args:
        text (str): The text to be chunked. An empty string or None yields no chunks.
        token_limit (int): Maximum number of tokens per chunk. Must be at least 1.

    Returns:
        list[str]: The chunks, each within the token limit.

    Raises:
        ValueError: If `token_limit` is smaller than 1. Previously 0 failed with an
            opaque range() error and negative values silently produced no chunks.
    """
    if token_limit < 1:
        raise ValueError(f"token_limit must be a positive integer, got {token_limit!r}")
    if not text:
        # Nothing to tokenize; avoids passing None to the tokenizer.
        return []

    tokens = tokenizer.encode(text)

    # NOTE(review): windows are cut on token boundaries, not word or character
    # boundaries, so a chunk may start or end mid-word.
    return [
        tokenizer.decode(tokens[start:start + token_limit])
        for start in range(0, len(tokens), token_limit)
    ]

def create_documents(row):
    """
    Turn one dataframe row into one Document per text chunk.

    The row's 'description' is split with `chunk_text` using TOKEN_LIMIT, so a
    long description produces several Documents that all carry the same `_id`.

    Args:
        row: A row of the dataframe (with _id and description fields).

    Returns:
        List of Documents, one for each chunk.
    """
    documents = []
    for piece in chunk_text(row['description'], TOKEN_LIMIT):
        # Each Document gets its own metadata dict holding the source row's _id
        documents.append(Document(page_content=piece, metadata={"_id": row['_id']}))
    return documents

@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type(Exception),
    # Re-raise the last underlying exception instead of tenacity's RetryError,
    # so callers see (and can report) the real failure.
    reraise=True
)
def embed_with_backoff(embeddings, texts):
    """
    Embed a list of texts, retrying on failure with randomized exponential backoff.

    Any exception triggers a retry; up to 6 attempts are made, waiting a random
    exponentially growing interval (between 1 and 60 seconds) between attempts.

    Args:
        embeddings: Embedding model object exposing `embed_documents(texts)`.
        texts (list[str]): The texts to embed.

    Returns:
        Whatever `embeddings.embed_documents(texts)` returns.

    Raises:
        Exception: The exception raised by the final attempt, once all attempts fail.
    """
    return embeddings.embed_documents(texts)

def process_batch(batch, embeddings):
    """
    Embed one batch of texts, reporting failure instead of raising.

    Args:
        batch (list[str]): Texts to embed.
        embeddings: Embedding model passed through to `embed_with_backoff`.

    Returns:
        The embeddings returned by `embed_with_backoff`, or None if it raised
        (the error message is printed).
    """
    try:
        vectors = embed_with_backoff(embeddings, batch)
    except Exception as exc:
        print(f"Error embedding batch: {str(exc)}")
        vectors = None
    return vectors

# Build vector database
def _embed_in_batches(documents, embeddings, batch_size):
    """Embed documents in parallel batches and return aligned (documents, vectors) lists.

    Results are stored by batch index, so each vector stays paired with the
    document it was computed from even though batches finish in arbitrary order.
    Batches whose embedding failed (process_batch returned None) are dropped as a whole.
    """
    batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
    results = [None] * len(batches)

    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(process_batch, [doc.page_content for doc in batch], embeddings): index
            for index, batch in enumerate(batches)
        }
        # as_completed yields futures in completion order (good for the progress bar),
        # so the submission index is used to put each result back in its slot.
        for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="Processing batches"):
            results[future_to_index[future]] = future.result()

    valid_docs = []
    valid_embeddings = []
    for batch, vectors in zip(batches, results):
        if vectors is None:
            continue
        valid_docs.extend(batch)
        valid_embeddings.extend(vectors)
    return valid_docs, valid_embeddings


def main():
    """
    Build the FAISS vector database from the patient dataset and save it locally.

    Steps: load the JSON-lines dataset from PATH_TO_DATA, keep the first 10 rows,
    chunk each row into Documents, embed the chunks in batches of BATCH_SIZE, build
    a FAISS store from the (text, embedding) pairs and save it to PATH_TO_VECTORDB.
    Nothing is rebuilt if PATH_TO_VECTORDB already exists.

    Returns:
        tuple: `(vector_store, retriever)` when a new index was built and saved.
        `(None, None)` when the index already exists, when no document could be
        embedded, or when an error occurred (the error and traceback are printed).
    """
    if os.path.exists(PATH_TO_VECTORDB):
        print('Vector database already exists')
        return None, None

    try:
        print('Loading data')
        # Loading JSON data instead of CSV
        df = pd.read_json(PATH_TO_DATA, lines=True)  # Assuming the JSON is in "lines" format

        # Limiting to the first 10 rows to keep the example fast
        df = df.head(10)

        total_rows = len(df)

        print('Creating documents')
        # Generate documents, including chunking where necessary
        documents = []
        for _, row in tqdm(df.iterrows(), total=total_rows, desc="Creating documents"):
            documents.extend(create_documents(row))

        print('Building vector database...')
        # Initialize Hugging Face embeddings with the same model used in evaluate_matches.py
        embeddings = HuggingFaceEmbeddings(
            model_name="mixedbread-ai/mxbai-embed-large-v1",
            model_kwargs={"trust_remote_code": True}
        )

        # Batch processing for embeddings; documents whose batch failed are left out
        valid_docs, valid_embeddings = _embed_in_batches(documents, embeddings, BATCH_SIZE)

        print(f"Successfully embedded {len(valid_docs)} out of {len(documents)} documents")

        if not valid_docs:
            # FAISS cannot build an index from zero vectors; report it explicitly.
            print('No documents could be embedded; vector database was not created')
            return None, None

        # Create the FAISS vector store with Hugging Face embeddings
        vector_store = FAISS.from_embeddings(
            text_embeddings=list(zip([doc.page_content for doc in valid_docs], valid_embeddings)),
            embedding=embeddings,
            metadatas=[doc.metadata for doc in valid_docs]
        )

        retriever = vector_store.as_retriever()

        print('Saving vector database locally...')
        vector_store.save_local(PATH_TO_VECTORDB)

        print('Done')
        return vector_store, retriever
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Traceback:")
        print(traceback.format_exc())
        return None, None

# Build the index when this cell is executed. The guard evaluates to true when the
# cell runs in the notebook (the captured output shows main() ran), and keeps main()
# from running automatically if this code is imported as a module instead.
if __name__ == "__main__":
    main()


Loading data
Creating documents


Creating documents: 100%|██████████| 10/10 [00:00<00:00, 3282.70it/s]

Building vector database...



Processing batches: 100%|██████████| 2/2 [00:01<00:00,  1.71it/s]


Successfully embedded 12 out of 12 documents
Saving vector database locally...
Done


#### Or load the PMC-Patients dataset pre-indexed by Brady (this is your vector store: the indexed documents/text that RAG draws from)

In [11]:
# Initialize embeddings and FAISS vector store
# Same on-disk location that the index-building cell saves to.
PATH_TO_VECTORDB = "./db/faiss_index"

# Same embedding model name as the one used when the index was built;
# presumably required so that query vectors are comparable to the stored ones — confirm.
embeddings = HuggingFaceEmbeddings(
    model_name="mixedbread-ai/mxbai-embed-large-v1",
    model_kwargs={"trust_remote_code": True}
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled data
# stored alongside the index, which can execute arbitrary code. Only load index
# directories that you built yourself or that come from a trusted source.
vector_store = FAISS.load_local(PATH_TO_VECTORDB, embeddings, allow_dangerous_deserialization=True)


2024-10-18 00:54:06,379 - INFO - Use pytorch device_name: mps
2024-10-18 00:54:06,380 - INFO - Load pretrained SentenceTransformer: mixedbread-ai/mxbai-embed-large-v1


#### Query Ollama (Invoking LLM)

In [8]:
def query_ollama(prompt, generation_kwargs, output_format="text"):
    """
    Sends a prompt to the Ollama model using langchain-ollama integration.

    Args:
        prompt (str): The generated prompt to send to the model. It is passed to the
            model verbatim, so it may safely contain `{` and `}` characters.
        generation_kwargs (dict): Generation settings forwarded to the OllamaLLM
            constructor. `max_tokens` is translated to Ollama's `num_predict`;
            a `model` entry overrides the default model. Other keys are forwarded
            unchanged and must be valid OllamaLLM fields. May be None or empty.
        output_format (str): The expected format of the output, either "text" or "json".

    Returns:
        str: The raw model response when `output_format` is "text" (or anything
        other than "json").
        dict: The parsed JSON when `output_format` is "json", or a dict with an
        "error" key if the response is not valid JSON.
    """
    # Copy so the caller's dict is never mutated.
    model_options = dict(generation_kwargs or {})
    if "max_tokens" in model_options:
        # Ollama names the output-length limit `num_predict`.
        max_tokens = model_options.pop("max_tokens")
        model_options.setdefault("num_predict", max_tokens)
    model_options.setdefault("model", "llama3.2:3b")

    model = OllamaLLM(**model_options)

    # Use a fixed template with a single variable. Building the template from the
    # prompt itself would make any braces in patient text look like template
    # variables and fail at invoke time.
    chat_prompt = ChatPromptTemplate.from_template("{question}")
    chain = chat_prompt | model

    response = chain.invoke({"question": prompt})

    if output_format == "json":
        try:
            result = json.loads(response)
            return result
        except json.JSONDecodeError:
            logger.error("Failed to decode JSON response from model.")
            return {
                "error": "Error in generating response or unexpected response format."
            }
    else:
        return response

#### Functions for Generation (Instructions for the LLM)

In [9]:
def generate_summary(article_text, query_ollama, generation_kwargs):
    """
    Ask the model for a one-paragraph summary of a patient case.

    Args:
        article_text (str): The patient case text to summarize.
        query_ollama (callable): Function called as
            `query_ollama(prompt, generation_kwargs, output_format="text")`.
        generation_kwargs (dict): Generation settings passed through to `query_ollama`.

    Returns:
        Whatever `query_ollama` returns for the summary prompt.
    """
    # Leading/trailing whitespace of the whole prompt is stripped before sending
    summary_request = f"""
    Summarize the following patient case in a single paragraph, focusing on relevant phenotypes, diagnoses, and other attributes of the patient.
    The summary should be concise and retain important scientific or clinical terminology.
    ### Article Text:
    {article_text}
    """.strip()
    return query_ollama(summary_request, generation_kwargs, output_format="text")


def generate_potential_causes(patient_case, article_summaries):
    """
    Build the prompt asking the model for potential causes of a patient's symptoms.

    Args:
        patient_case (str): The user-provided patient case.
        article_summaries (iterable of str): Similar cases used as numbered examples.

    Returns:
        str: The prompt text containing the patient case followed by the examples.
    """
    # Number the reference cases starting at 1 and separate them with blank lines
    example_sections = []
    for number, summary in enumerate(article_summaries, start=1):
        example_sections.append(f"### Example Case {number}:\n{summary}")
    examples = "\n\n".join(example_sections)

    return f"""
    A user has provided a detailed patient's case with specific symptoms and medical history.
    Below are some examples of similar patient cases from medical literature.
    
    Your task is to identify potential causes for the symptoms described in the user's case, using the examples as references.

    ### User-Provided Patient Case:
    {patient_case}
    
    {examples}

    Please provide:
    - A list of potential causes based on the information.
    - Brief explanations for each cause.
    """


In [13]:
def ask_me_potential_causes(patient_narrative, max_retries=10, delay=2, k=4):
    """
    Retrieve similar cases for a patient narrative and ask the model for potential causes.

    On each attempt the function searches `vector_store` for the `k` most similar
    chunks, writes them to related_patient_cases.txt, builds a prompt with
    `generate_potential_causes`, queries the model through `query_ollama`, and
    writes the answer to potential_causes.txt. Any exception causes a retry.

    Args:
        patient_narrative (str): Free-text description of the patient case.
        max_retries (int): Maximum number of attempts before giving up.
        delay (float): Seconds to wait between attempts.
        k (int): Number of similar chunks to retrieve.

    Returns:
        str: The model's answer, or the message
        "Max retries reached, could not complete the request." if every attempt failed.
    """
    retries = 0
    while retries < max_retries:
        try:
            # NOTE(review): this logs the full patient narrative; confirm that is
            # acceptable for the data being used.
            logger.info(f"Patient Case: {patient_narrative}")
            # Retrieve similar cases from FAISS vector store
            docs = vector_store.similarity_search_with_score(patient_narrative, k=k)
            article_summaries = [doc.page_content for doc, _score in docs]

            # Save related cases in human-readable format to a .txt file.
            # UTF-8 is set explicitly so non-ASCII clinical text does not fail on
            # platforms whose default encoding is narrower.
            output_path_cases_txt = "related_patient_cases.txt"
            with open(output_path_cases_txt, 'w', encoding='utf-8') as txt_file:
                txt_file.write("Related Patient Cases (Double-Spaced):\n\n")
                for i, summary in enumerate(article_summaries):
                    txt_file.write(f"Case {i+1}:\n{summary}\n\n")
            logger.info(f"Related patient cases saved to {output_path_cases_txt} in human-readable format.")

            # Generate a prompt to find potential causes
            prompt = generate_potential_causes(patient_narrative, article_summaries)
            potential_causes = query_ollama(prompt, generation_kwargs, output_format="text")

            # Save potential causes to a .txt file
            output_path_causes_txt = "potential_causes.txt"
            with open(output_path_causes_txt, 'w', encoding='utf-8') as txt_file:
                txt_file.write("Potential Causes for Patient Case:\n\n")
                txt_file.write(potential_causes)
            logger.info(f"Potential causes saved to {output_path_causes_txt}.")

            return potential_causes

        except Exception as e:
            retries += 1
            logger.error(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            # Only wait when another attempt will actually follow.
            if retries < max_retries:
                time.sleep(delay)

    logger.warning("Max retries reached, could not complete the request.")
    return "Max retries reached, could not complete the request."


# Example usage: run the full retrieve-and-ask pipeline for one sample narrative.
if __name__ == "__main__":
    patient_narrative = "64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c"

    # `result` holds the model's answer, or the max-retries failure message string.
    result = ask_me_potential_causes(patient_narrative)
    # NOTE(review): this success message is logged unconditionally, even when
    # ask_me_potential_causes returned its max-retries failure message.
    logger.info("Potential causes identified and saved.")



2024-10-18 00:55:27,957 - INFO - Patient Case: 64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c
2024-10-18 00:55:29,065 - INFO - Related patient cases saved to related_patient_cases.txt in human-readable format.
2024-10-18 00:55:31,443 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2024-10-18 00:55:40,726 - INFO - Potential causes saved to potential_causes.txt.
2024-10-18 00:55:40,726 - INFO - Potential causes identified and saved.
