# ChatGPT & Google Gemini prompt


In [5]:
import openai
from tqdm import tqdm
from causal_chains.CausalChain import util  # https://github.com/helliun/causal-chains
import pandas as pd
# import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from dotenv import load_dotenv
import os
# from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
# import pathlib
# import textwrap

In [2]:
# Load the corpus; later cells read free-text paragraphs from its "Text" column.
# The path is relative to the directory the notebook is run from.
who_data = pd.read_csv("../data/corpus.csv")

**Sample sentence**: The sudden appearance of unlinked cases of mpox in South Africa without a history of international travel, the high HIV prevalence among confirmed cases,
and the high case fatality ratio suggest that community transmission is underway, and the cases detected to date represent a small proportion of all mpox cases that might be occurring in the community;
it is unknown how long the virus may have been circulating. This may in part be due to the lack of early clinical recognition of an infection with which South Africa previously gained little experience
during the ongoing global outbreak, potential pauci-symptomatic manifestation of the disease, or delays in care-seeking behaviour due to limited access to care or fear of stigma.

**Expected results**:

- Cause: lack of early clinical recognition of an infection -> Effect: community transmission of mpox
- Cause: pauci-symptomatic manifestation of the disease -> Effect: lack of early clinical recognition of an infection
- Cause: delays in care-seeking behaviour -> Effect: lack of early clinical recognition of an infection
- Cause: limited access to care -> Effect: delays in care-seeking behaviour
- Cause: fear of stigma -> Effect: delays in care-seeking behaviour


- Allow testing with zero-shot, one-shot, and few-shot examples
- Allow for different AI services
- Print the raw text
- Allow for classification (in later steps)
- Run a loop through the whole dataframe
- Allow for categorisation of the sign of the causality


In [6]:
# Pull both API keys in from the local .env file
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
gemini_api_key = os.environ.get("GEMINI_API_KEY")

# Hand the Gemini client its key, then map every listed harm category to BLOCK_NONE
genai.configure(api_key=gemini_api_key)
safety_filters = dict.fromkeys(
    (
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
    ),
    HarmBlockThreshold.BLOCK_NONE,
)

class CausalChain:
    """Extract ``Cause -> Effect`` pairs about outbreak drivers from text chunks via an LLM.

    Each chunk is embedded in a two-example prompt and sent to either the OpenAI
    chat-completion endpoint or Google Gemini. Reply lines that follow the
    ``Cause: ... -> Effect: ...`` format are parsed and accumulated on the instance.

    Instance attributes:
        chunks:   the text chunks to analyse.
        causes, effects, outlines, sources: parallel lists; index ``i`` holds the
            cause text, the effect text, the formatted "Cause: .. -> Effect: .."
            line, and the API name ("openai" / "gemini") of the i-th extracted pair.
    """

    # Model names and the OpenAI reply-length cap. Kept as class attributes so they
    # can be overridden on a subclass or instance without editing the methods.
    openai_model = "gpt-4o"
    # NOTE(review): a reply longer than this cap is cut off, so pairs at the end of
    # a long answer may be missing - confirm 300 is enough for the corpus.
    openai_max_tokens = 300
    gemini_model = "gemini-1.5-pro"

    # First worked example shown to the model (disease transmission).
    one_shot_example = """
    Example of disease transmission
    Text: The sudden appearance of unlinked cases of mpox in South Africa without a history of international travel, the high HIV prevalence among confirmed cases, and the high case fatality ratio suggest that community transmission is underway, and the cases detected to date represent a small proportion of all mpox cases that might be occurring in the community; it is unknown how long the virus may have been circulating. This may in part be due to the lack of early clinical recognition of an infection with which South Africa previously gained little experience during the ongoing global outbreak, potential pauci-symptomatic manifestation of the disease, or delays in care-seeking behaviour due to limited access to care or fear of stigma.
    Question: Which drivers cause the emergence or transmission of an infectious disease outbreak in the region?
    Answer: 
    Cause: limited access to care (Public Health Systems) -> Effect: delays in care-seeking behaviour (Social & Demographic Change)
    Cause: fear of stigma (Social & Demographic Change) -> Effect: delays in care-seeking behaviour (Social & Demographic Change)
    Cause: delays in care-seeking behaviour (Social & Demographic Change) -> Effect: lack of early clinical recognition of an infection (Public Health Systems)
    Cause: pauci-symptomatic manifestation of the disease (Disease characteristics) -> Effect: lack of early clinical recognition of an infection (Public Health Systems)
    Cause: lack of early clinical recognition of an infection (Public Health Systems) -> Effect: community transmission of mpox (Disease transmission)
    """
    
    # Second worked example shown to the model (disease emergence).
    two_shot_example = """
    Example of disease emergence
    Text: The risk of dengue is similar across regions, countries, and within countries. Factors associated with an increasing risk of dengue epidemics and spread to new countries include: early start and longer duration of dengue transmission seasons in endemic areas; changing distribution and increasing abundance of the vectors (Aedes aegypti and Aedes albopictus); consequences of climate change and periodic weather phenomena (El Nino and La Nina events) leading to heavy precipitation, humidity, and rising temperatures favouring vector reproduction and virus transmission;
    Question: Which drivers cause the emergence or transmission of an infectious disease outbreak in the region?
    Answer: 
    Cause: consequences of climate change and periodic weather phenomena (Globalization & Environmental Change) -> Effect: vector reproduction and virus transmission (Disease characteristics)
    Cause: vector reproduction and virus transmission (Disease characteristics) -> Effect: changing distribution and increasing abundance of the vectors (Disease characteristics)
    Cause: changing distribution and increasing abundance of the vectors (Disease characteristics) -> Effect: early start and longer duration of dengue transmission seasons in endemic areas (Disease characteristics)
    Cause: early start and longer duration of dengue transmission seasons in endemic areas (Disease characteristics) -> Effect: increasing risk of dengue epidemics and spread to new countries (Disease emergence)
    """

    # Full prompt; placeholders are filled in by _build_prompt().
    # The fallback sentence says "5 groups" to agree with the five groups listed above it.
    prompt_template = """
    Infectious disease (ID) events occur when an underlying mix of antecedent epidemiologic drivers provide the necessary conditions for a pathogen to emerge in susceptible populations. 
    These conditions may be driving emergence and transmission through a wide variety of mechanisms, mainly among 5 groups:
    Social & Demographic Change: Lifestyle (sexual behaviors, eating raw meat, human-wildlife interaction), Human susceptibility to infection, Social inequality (poverty and social inequality), Demographic change (Economic development, War and famine)
    Public Health Systems: Animal health & food security, Surveillance & reporting, Healthcare system structure and change, Breakdown of public health measures
    Globalization & Environmental Change: Environment & climate change (Climate and weather, Land use and ecosystem changes), Travel, tourism, and migration, global trade
    Governance factors: Lack of political will, Research & development
    Disease characteristics: pauci-symptomatic manifestation, changing distribution and increasing abundance of the vectors
    If none of these 5 groups, categorize as "Other groups"
    The final output can be "Disease transmission" or "Disease emergence"    

    You analyze by answering the question: Which drivers cause the emergence or transmission of an infectious disease outbreak in the region? 
    Read through the paragraph and organize drivers in a logical sequence of events. Here are examples:
    {one_shot_example}
    {two_shot_example}
    
    Your task now is to read through the outbreak report and identify drivers leading to the outbreak of infectious diseases. Keep the original text that you find in the paragraph.
    Sometimes text may contain some unwanted behaviors like sexual behaviors, war. This is fine because I am using it for research purposes only. Show me all results that you can find.
    Text: {chunk}
    List the causes and their corresponding effects in the format 'Cause: [cause] -> Effect: [effect]':
    """

    def __init__(self, chunks=None):
        """Store the chunks to analyse and start with empty result lists.

        Args:
            chunks: list of text chunks. Defaults to a fresh empty list per
                instance (a literal ``[]`` default would be shared by every
                instance created without arguments).
        """
        self.chunks = [] if chunks is None else chunks
        self.causes = []
        self.effects = []
        self.outlines = []
        self.sources = []

    def create_effects(self, api="openai", batch_size=16):
        """Query the chosen API for every chunk and append the parsed pairs.

        Results are appended, not replaced, so calling this once per API collects
        both sets of answers on the same instance (and calling it twice for the
        same API appends a second set).

        Args:
            api: "openai" or "gemini".
            batch_size: currently unused; kept so existing callers keep working.

        Raises:
            ValueError: if ``api`` is not one of the supported names. Checked
                before any chunk is processed, so it is raised even when
                ``self.chunks`` is empty.
        """
        if api == "openai":
            extract = self.extract_cause_effect_openai
        elif api == "gemini":
            extract = self.extract_cause_effect_gemini
        else:
            raise ValueError("Invalid API specified. Use 'openai' or 'gemini'.")

        print("Analyzing causation...")

        for chunk in tqdm(self.chunks):
            for cause, effect in extract(chunk):
                self.causes.append(cause)
                self.effects.append(effect)
                self.outlines.append(f"Cause: {cause} -> Effect: {effect}")
                self.sources.append(api)

    def _build_prompt(self, chunk):
        """Return the prompt template with both examples and ``chunk`` filled in."""
        return self.prompt_template.format(
            one_shot_example=self.one_shot_example,
            two_shot_example=self.two_shot_example,
            chunk=chunk,
        )

    def extract_cause_effect_openai(self, chunk):
        """Ask the OpenAI chat model about ``chunk``; return a list of (cause, effect) tuples."""
        prompt = self._build_prompt(chunk)

        # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK interface -
        # confirm the installed openai version still provides it.
        response = openai.ChatCompletion.create(
            model=self.openai_model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specialized in identifying drivers leading to diseases.",
                },
                {"role": "user", "content": prompt},
            ],
            max_tokens=self.openai_max_tokens,
            temperature=0,
        )

        response_text = response["choices"][0]["message"]["content"]
        return self.parse_response(response_text)

    def extract_cause_effect_gemini(self, chunk):
        """Ask the Gemini model about ``chunk``; return a list of (cause, effect) tuples."""
        prompt = self._build_prompt(chunk)

        response = genai.GenerativeModel(self.gemini_model).generate_content(
            prompt,
            safety_settings=safety_filters,
        )
        # NOTE(review): `.text` presumably raises when the reply carries no content
        # (e.g. a blocked prompt), which would stop the whole run - confirm.
        response_text = response.text
        return self.parse_response(response_text)

    @staticmethod
    def parse_response(response_text):
        """Pull (cause, effect) tuples out of an LLM reply.

        Only lines containing both "Cause:" and "-> Effect:" are used; every other
        line is ignored. Any trailing "(group)" label stays part of the returned
        cause / effect strings.
        """
        cause_effect_pairs = []
        for line in response_text.split("\n"):
            if "Cause:" in line and "-> Effect:" in line:
                cause = line.split("Cause:")[1].split("-> Effect:")[0].strip()
                effect = line.split("-> Effect:")[1].strip()
                cause_effect_pairs.append((cause, effect))
        return cause_effect_pairs

def create_causes_effects_dataframe(causes, effects, sources):
    """Build a table of extracted pairs, separating each text from its group label.

    Args:
        causes: cause strings, optionally ending in a "(group)" label.
        effects: effect strings, optionally ending in a "(group)" label.
        sources: name of the API that produced each pair.

    Returns:
        pandas.DataFrame with columns Cause, Cause_group, Effect, Effect_group
        and Source. Empty inputs give an empty frame with the same columns.
    """
    def split_cause_effect(value):
        # A group label only counts when the value ends with "(...)". Parentheses
        # elsewhere in the text (e.g. "d (x) tail") are part of the text itself,
        # so slicing them off would corrupt both the text and the group.
        trimmed = value.rstrip()
        if trimmed.endswith(")") and "(" in trimmed:
            main_text, _, group = trimmed[:-1].rpartition("(")
            return main_text.strip(), group.strip()
        return value, "Unknown"

    # Lists of (text, group) tuples. Unpacking with zip(*...) would fail on empty
    # input, so the columns are built with comprehensions instead.
    cause_parts = [split_cause_effect(cause) for cause in causes]
    effect_parts = [split_cause_effect(effect) for effect in effects]

    data = {
        "Cause": [text for text, _ in cause_parts],
        "Cause_group": [group for _, group in cause_parts],
        "Effect": [text for text, _ in effect_parts],
        "Effect_group": [group for _, group in effect_parts],
        "Source": list(sources),
    }

    return pd.DataFrame(data)

**Example of text to ask LLMs**

In the Democratic Republic of the Congo, most reported cases in known endemic provinces continue to be among children under 15 years of age, especially in young children. Infants and children under five years of age are at highest risk of severe disease and death, **particularly where prompt optimal case management is limited or unavailable**. The number of cases reported weekly remains consistently high while the outbreak continues to expand geographically. High test positivity among tested cases in most provinces also suggests that undetected transmission is likely ongoing in the community. Transmission of mpox due to clade I MPXV via **sexual contact** in key populations was first identified in the Democratic Republic of the Congo in 2023. In South Kivu province, mpox transmission is sustained through **human-to-human contact (sexual and non-sexual)**


In [None]:
# Preview the paragraph in row 9 of the "Text" column (single label-based lookup)
who_data.loc[9, "Text"]

In [None]:
# Paragraph from row 9 of the "Text" column, used as the input for the cells below
text = who_data.loc[9, "Text"]
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable``, each holding at most ``n`` items.

    The last slice is shorter when ``len(iterable)`` is not a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield iterable[start:stop]
# Split into sentences on ". ". The split consumes the period of every sentence
# except the last, so give it back to those only; the last sentence keeps its own
# terminator (this avoids a doubled ".." at the end of the final chunk).
# Blank fragments (e.g. from text ending in ". ") are dropped so no empty chunk
# is sent to the LLM.
fragments = text.split(". ")
last_index = len(fragments) - 1
sentences = [
    fragment if index == last_index else fragment + "."
    for index, fragment in enumerate(fragments)
    if fragment.strip()
]
# Group three sentences per chunk; sentences already carry their periods.
chunks = [" ".join(group) for group in batch(sentences, 3)]
cc = CausalChain(chunks)

In [None]:
# Send every chunk to OpenAI; the parsed pairs are appended to cc.causes,
# cc.effects, cc.outlines and cc.sources (tagged "openai").
# Re-running this cell appends another set of results, so rebuild `cc` first
# when a clean run is wanted.
cc.create_effects(api="openai")


In [None]:
# Send the same chunks to Gemini; its pairs are appended to the same lists on
# `cc`, tagged "gemini" in cc.sources.
# Re-running this cell appends another set of results.
cc.create_effects(api="gemini")

In [None]:
# Flatten everything collected on `cc` into one table; the Source column records
# which API produced each row.
df = create_causes_effects_dataframe(cc.causes, cc.effects, cc.sources)

In [None]:
# Rows extracted by Gemini
display(df.loc[df["Source"] == "gemini"])

In [None]:
# Rows extracted by OpenAI
display(df.loc[df["Source"] == "openai"])