# Creating a Disease Profile

## Configuration

In [6]:
import google.generativeai as gemini
from anthropic import Anthropic
import os

# "claude" or "gemini"
generator = "claude"
reviewer = "claude"

# Disease name that is interpolated into profile_prompt.
disease = "hyperthyroidism"
#examples = ["hyperthyroidism_disease_profile_4.txt"]

# API keys are read from the environment; os.getenv returns None when a variable is unset.
claude_api_key = os.getenv('ANTHROPIC_API_KEY')
gemini_api_key = os.getenv('GEMINI_API_KEY')

gemini.configure(api_key=gemini_api_key)
gemini_version = "models/gemini-1.5-pro"      # "models/gemini-1.5-flash" "models/gemini-1.5-pro"
gemini_max_output_tokens = 8192

claude = Anthropic(api_key = claude_api_key)
claude_version = "claude-3-5-sonnet-20240620"  # "claude-3-opus-20240229"   "claude-3-5-sonnet-20240620" "claude-3-sonnet-20240229" "claude-3-haiku-20240307"
claude_max_output_tokens = 4096

# When True, message_claude prints the outgoing message list before calling the API.
debug = False

## Prompt

In [7]:
# Prompt sent to the generator model. It is an f-string, so the current value of
# `disease` is substituted when this cell runs; re-run the cell after changing `disease`.
profile_prompt = f"""
Please create a comprehensive disease profile for {disease} . Output the result as a numbered list of facts about the disease covering the following aspects, where applicable:

* Prevalence and incidence in the general population and specific subgroups (e.g., by age, gender, ethnicity)
* Risk factors with associated increased risk percentages
* Etiology, including common causes and their relative frequencies
* Pathophysiology
* Symptoms and clinical presentation, including probability of each symptom
* Diagnostic criteria and tests, with sensitivity and specificity data
* Differential diagnoses
* Screening recommendations and prevention strategies, if applicable
* Treatment options, including: a. First-line treatments b. Alternative treatments c. Success rates for each treatment d. Criteria for selecting different treatment approaches e. Common side effects and their frequencies
* Monitoring and follow-up protocols
* Short-term and long-term outcomes
* Complication rates and types
* Recurrence or progression rates
* Survival rates, if applicable
* Quality of life impacts

Please provide quantitative data whenever possible, such as percentages, probabilities, or specific numeric ranges. Include relevant medical test results with their normal and abnormal ranges. If certain information is not typically applicable or available for this disease, you may omit those sections.
Aim for a comprehensive profile of 30-60 numbered points.
"""

## Utilities

In [9]:
import io, threading, time, re, json
import pandas as pd
from json_repair import repair_json

def upload_file(file_path):
    """Return the full text content of the file at ``file_path``.

    The file is first read with the platform's default text encoding. If that
    raises ``UnicodeDecodeError``, it is read again explicitly as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    try:
        with open(file_path, 'r') as handle:
            return handle.read()
    except UnicodeDecodeError:
        # Default encoding could not decode the file; fall through to UTF-8.
        pass
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def heartbeat(stop_event, start_time):
    """Print the elapsed time about once per second until ``stop_event`` is set.

    Intended to run in a background thread while a blocking call is in flight.

    Args:
        stop_event: ``threading.Event`` that signals the loop to finish.
        start_time: Reference timestamp from ``time.time()``; elapsed seconds
            are measured against it.
    """
    while not stop_event.is_set():
        elapsed_time = int(time.time() - start_time)
        # "\r" keeps the counter on one line by overwriting the previous value.
        print(f"{elapsed_time} seconds", end="\r", flush=True)
        # Wait on the event instead of sleeping so that setting it wakes the
        # loop immediately rather than after the rest of a one-second sleep.
        stop_event.wait(1)
    # Print final time
    elapsed_time = int(time.time() - start_time)
    print(f"Total time taken: {elapsed_time} seconds")
    
def is_valid_json(text):
    """Return True when ``text`` parses as JSON with ``json.loads``, else False.

    Args:
        text: Candidate JSON document (normally a ``str``).

    Returns:
        ``True`` if ``json.loads(text)`` succeeds; ``False`` if it fails because
        the text is malformed, is not a string/bytes value, or is nested too
        deeply to parse.
    """
    try:
        json.loads(text)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text
        # input such as None. Anything else (e.g. KeyboardInterrupt) is not a
        # validity verdict and is allowed to propagate.
        return False
    return True
    
# Gemini keeps the conversation history inside a chat session object, and every
# message is sent to one specific chat session.
def message_gemini(prompt, chat_session=None):
    """Send ``prompt`` to Gemini and return ``(result, chat_session)``.

    Args:
        prompt: Message to send.
        chat_session: Existing chat session to continue. When ``None``, a new
            session is started on ``gemini_version`` with the configured
            output-token limit and a JSON response MIME type.

    Returns:
        Tuple of the value returned by ``send_message`` and the chat session
        that was used, so the caller can pass it back in for the next turn.
    """
    if chat_session is None:
        generation_config = {
            "max_output_tokens": gemini_max_output_tokens,
            "response_mime_type": "application/json",
        }
        chat_session = gemini.GenerativeModel(
            model_name=gemini_version,
            generation_config=generation_config,
        ).start_chat()

    # Show a running timer in a background thread while the request blocks.
    started_at = time.time()
    finished = threading.Event()
    ticker = threading.Thread(target=heartbeat, args=(finished, started_at))
    ticker.start()
    try:
        result = chat_session.send_message(prompt)
    finally:
        # Always stop the timer thread, even if the request raised.
        finished.set()
        ticker.join()
    return result, chat_session

# Claude keeps chat history as a list of messages alternating between 'user' and 'assistant'.
# The optional assistant_prompt is the beginning of the output that Claude will extend.
def message_claude(user_prompt, assistant_prompt=None, messages=None):
    """Send ``user_prompt`` to Claude and return ``(response, messages)``.

    Args:
        user_prompt: Text of the new user turn.
        assistant_prompt: Optional start of the assistant's answer. It is sent
            as a trailing assistant message for this request only and is
            removed from the history afterwards.
        messages: Existing history list. It is extended in place with the new
            user turn and the assistant reply. A new list is created when
            ``None``.

    Returns:
        Tuple of the API response object and the history list, which ends with
        the new user turn followed by the assistant reply.
    """
    if messages is None:
        messages = []
    messages.append({"role": "user", "content": user_prompt})
    if assistant_prompt is not None:
        messages.append({"role": "assistant", "content": assistant_prompt})
    if debug:
        print("About to message_claude with these messages:")
        # NOTE(review): print_claude_messages is not defined in the visible part
        # of this notebook; confirm it exists before enabling debug.
        print_claude_messages(messages)
    start_time = time.time()
    stop_event = threading.Event()
    heartbeat_thread = threading.Thread(target=heartbeat, args=(stop_event, start_time))
    heartbeat_thread.start()
    try:
        response = claude.messages.create(model=claude_version, max_tokens=claude_max_output_tokens, messages=messages)
    finally:
        stop_event.set()  # Signal the heartbeat to stop
        heartbeat_thread.join()  # Wait for the heartbeat thread to finish
        if assistant_prompt is not None:
            # Remove the temporary prefill in place (not via a sliced copy) so a
            # caller-supplied list never keeps it and stays the same object as
            # the returned history.
            messages.pop()
    # NOTE(review): only the returned text is recorded; if the API returns just
    # the continuation of assistant_prompt, the prefill text itself is not part
    # of the stored assistant turn. Confirm this is intended.
    messages.append({"role": response.role, "content": response.content[0].text})
    return response, messages

In [10]:
# message_claude returns a (response, messages) tuple, so `response` holds the whole tuple here.
response = message_claude(profile_prompt)

Total time taken: 24 seconds


In [12]:
# Element 0 of the tuple is the API response; print the text of its first content block.
print(response[0].content[0].text)

Here's a comprehensive disease profile for acute lymphoblastic leukemia (ALL):

1. The overall incidence of ALL is approximately 1.7 per 100,000 individuals annually in the United States.

2. ALL is the most common childhood cancer, accounting for about 25% of all cancers in children under 15 years old.

3. The incidence of ALL has a bimodal age distribution, with peaks in children aged 2-5 years and adults over 50 years.

4. Males are slightly more affected than females, with a male-to-female ratio of 1.3:1.

5. Hispanic children have the highest incidence rate (43 cases per million), followed by non-Hispanic whites (36 cases per million), and African Americans (15 cases per million).

6. Risk factors include genetic syndromes (e.g., Down syndrome, 20-fold increased risk), exposure to ionizing radiation (2-4 fold increased risk), and certain chemotherapies (2-10 fold increased risk).

7. The etiology of ALL is largely unknown, but it involves genetic alterations in lymphoid progenitor