# Causality mining using OpenAI API


## Loading packages and data

In [1]:
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
# from sentence_transformers import SentenceTransformer, util
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.metrics.pairwise import cosine_distances
from IPython.display import display, Markdown
from dotenv import load_dotenv
import os
import re

In [3]:
# Read API key from the .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
who_data = pd.read_csv("../data/corpus.csv")
who_data_epi = who_data[who_data["InformationType"] == "Epidemiology"]
who_data_assessment = who_data[who_data["InformationType"] == "Assessment"]
who_data_epi_and_assessment = who_data[who_data["InformationType"].isin(["Epidemiology", "Assessment"])]

In [2]:
#import 3.Clustering Drivers.xlsx, sheet "Sheet1"
driver_cat = pd.read_excel("../data/3. Clustering Drivers.xlsx", sheet_name="V2_Peter")

## Function defnition

### Prompt Generation

In [4]:
class PromptDesigner:
    def __init__(self):
        # Store different parts of the prompt as class attributes
        self.persona_task_description = """
        You are an epidemiologist tasked with identifying sentences or phrases from outbreak reports that describe the drivers or contributors to the emergence or transmission of emerging pests and pathogens.
        """

        self.domain_localization = """
        Here is the definition of DPSIR (Drivers, Pressure, State, Impacts, and Responses) framework, where it shows how drivers are associated with the emergence of disease.
        Drivers: underlying socio-economic, environmental, or ecological forces that create conditions favourable for how a disease emerges, spreads or sustains transmission in human, animals or plants.
        Pressure: human anthropogenic activities that are mainly responsible for the chances of spillover events and the transmission of pests and pathogens.
        State: the current circulation of pests and pathogens, represented as either new case detected, an endemic, an epidemic or a pandemic.
        Impacts: the effects caused by pests and pathogens on individuals, communities' socio-economic, and political.
        Responses: the actions and interventions taken by governments to manage the occurrence of drivers and pressures, and to control the spread of pests and pathogens and to mitigate the impacts.
        """

        self.causality_definition = """
        Causality definition: In the reports, causality can take two forms. The first form is "Intra-sentence causality", where the “cause” and the “effect” lie in a single sentence, while in "Inter-sentence causality", the “cause” and the “effect” lie in different sentences.
        """
        
        self.extraction_guide = """
        Input text: The sudden appearance of unlinked cases of mpox in South Africa without a history of international travel, the high HIV prevalence among confirmed cases, and the high case fatality ratio suggest that community transmission is underway, and the cases detected to date represent a small proportion of all mpox cases that might be occurring in the community; it is unknown how long the virus may have been circulating. This may in part be due to the lack of early clinical recognition of an infection with which South Africa previously gained little experience during the ongoing global outbreak, potential pauci-symptomatic manifestation of the disease, or delays in care-seeking behaviour due to limited access to care or fear of stigma.
        
        Expected output
        1. Raw text with marked causes and effects
        The sudden appearance of unlinked cases of mpox in South Africa without a history of international travel, the high HIV prevalence among confirmed cases, and the high case fatality ratio suggest that (E1) community transmission (E1) is underway, and the cases detected to date represent a small proportion of all mpox cases that might be occurring in the community; it is unknown how long the virus may have been circulating. This may in part be due to the (C1) lack of early clinical recognition of an infection (C1) with which South Africa previously gained little experience during the ongoing global outbreak, potential (C1) pauci-symptomatic manifestation of the disease (C1), or (C1, E2) delays in care-seeking behavior (C1, E2) due to (C2) limited access to care (C2) or (C2) fear of stigma (C2).
       
        2. Extracted causes and effects
        C1: lack of early clinical recognition of an infection -> E1: community transmission 
        C1: pauci-symptomatic manifestation of the disease -> E1: community transmission 
        C1: delays in care-seeking behavior -> E1: community transmission 
        C2: limited access to care -> E2: delays in care-seeking behaviour
        C2: fear of stigma -> E2: delays in care-seeking behaviour delays in care-seeking behaviour  
        """

        self.few_shot_examples = """
        Below are some examples how causality can be reported in different forms:
        - Single cause, single effect (Type 1)

        Example 1: (C1) High population density and mobility in urban areas (C1) have facilitated (E1) the rapid spread of the virus (E1)". 

        Example 2: There is (C1) no vaccine for Influenza A(H1N1)v infection currently licensed for use in humans (C1). Seasonal influenza vaccines against human influenza viruses are generally not expected to protect people from (E1) infection with influenza viruses (E1) that normally circulate in pigs, but they can reduce severity.


        - Single cause, multiple effects (Type 2)

        Example 3: Several countries including Cameroon, Ethiopia, Haiti, Lebanon, Nigeria (north-east of the country), Pakistan, Somalia, Syria and the Democratic Republic of Congo (eastern part of the country) are in the midst of complex (C1) humanitarian crises (C1) with (E1) fragile health systems (E1), (E1) inadequate access to clean water and sanitation (E1) and have (E1) insufficient capacity to respond to the outbreaks (E1)

        - Multiple causes, single effect (Type 3)
        Example 4: Moreover, (C1) a low index of suspicion (C1), (C1) socio-cultural norms (C1), (C1) community resistance (C1), (C1) limited community knowledge regarding anthrax transmission (C1), (C1) high levels of poverty (C1) and (C1) food insecurity (C1), (C1) a shortage of available vaccines and laboratory reagents (C1), (C1) inadequate carcass disposal (C1) and (C1) decontamination practices (C1) significantly contribute to hampering (E1) the containment of the anthrax outbreak (E1).

        Example 5:
        The (E1) risk at the national level (E1) is assessed as 'High' due to the following:
        + In other parts of Timor-Leste (C1) health workers have limited knowledge dog bite and scratch case management (C1) including PEP and RIG administration
        + (C2) Insufficient stock of human rabies vaccines (C2) in the government health facilities.

        - Multiple causes, multiple effects (Type 4) - Chain of causalities
        The text may describe a chain of causality, where one effect becomes then the cause of another effect. To describe the chain, you should number the causes and effects. For example, cause 1 (C1) -> effect 1 (E1), but since effect 1 is also cause of effect 2, you should do cause 1 (C1) -> effect 1 (E1, C2) -> effect 2 (E2). 

        Example 6: (E2) The risk of insufficient control capacities (E2) is considered high in Zambia due to (C1) concurrent public health emergencies in the country (cholera, measles, COVID-19) (C1) that limit the country’s human and (E1, C2) financial capacities to respond to the current anthrax outbreak adequately (E1, C2).

        Example 7: (C1) Surveillance systems specifically targeting endemic transmission of chikungunya or Zika are weak or non-existent (C1) -> (E1, C2) Misdiagnosis between diseases  & Skewed surveillance (E1, C2) -> (E2, C3) Misinform policy decisions (E2, C3) -> (E3)reduced accuracy on the estimation of the true burden of each diseases (E3), poor risk assessments (E3), and non optimal clinical management and resource allocation (E3). 

        Example 8: (C1) Changes in the predominant circulating serotype (C1) -> (E1, C2) increase the population risk of subsequent exposure to a heterologous DENV serotype (E1, C2), -> (E2) which increases the risk of higher rates of severe dengue and deaths (E2).

        """

        self.negative_cases = """
        Irrelevant causality (negative cases): Some sentences contain causal relationships, but the effect may not be related to the disease transmission or emergence. Avoid classifying those causal relationships.

        Example 1 (no causality): Because these viruses continue to be detected in swine populations worldwide, further human cases following direct or indirect contact with infected swine can be expected.

        Example 2 (no relevant causality): There is some (E1) pressure on the healthcare capacity (E1) due to the (C1) very high number of admissions for dengue (C1); (C1) high vector density (C1); and an (C1) anticipated prolonged monsoon (C1). 

        Example 3 (no relevant causality): (C1) MVD is a highly virulent disease (C1) that can cause (E1) haemorrhagic fever (E1) and is clinically similar to Ebola virus disease.

        """

        self.mechanism_of_causality = """
        When the text describes/list possible mechanisms behind the cause of transmission or emergence, tag them with (M). A mechanism of causality describes the specific interactions between the pathogen, host, and environment that causes the transmission / emergence. They often describe interactions at the physiological level. 

        Example 1: The global outbreak 2022 — 2024 has shown that (C1) sexual contact (C1) enables faster and more efficient (E1) spread of the virus (E1) from one person  to another due to (M1) direct contact of mucous membranes between people (M1), (M1) contact with multiple partners (M1), (M1) a possibly shorter incubation period on average (M1), and (M1) a longer infectious period for immunocompromised individuals (M1).

        """

        self.sign_of_causality = """
        For each cause-effect relationship, indicate whether each cause (C) is positive (C+) or negative (C-) and each effect (E) is positive (E+) or negative (E-). 
        Use the list of positive and negative sign words provided to help determine the sign of each cause and effect. Be mindful of sentences with negations (e.g., “does not improve”), which reverses polarity. 
        Positive sign words: increase, facilitate, support, improve, expand, promote, enable, enhance, accelerate, advance, grow, boost, strengthen, benefit, contribute, progress, initiate, develop, elevate, stimulate, alleviate, optimize, revitalize. 
        Negative sign words: limit, decrease, reduce, hamper, hinder, restrict, suppress, impair, inhibit, undermine, challenge, disrupt, lack, insufficient, incomplete, challenge, deficit, obstacle, barrier, diminish, shortage, scarcity, obstruct, worsen, decline. 

        Example 1: “(C1-) a lack of timely access to diagnostics in many areas (C1-), (C1-) incomplete epidemiological investigations (C1-), (C1-) challenges in contact tracing and extensive but inconclusive animal investigations (C1-) continue to hamper rapid response (E1-)”

        Example 2: Moreover, (C1-) a low index of suspicion (C1-), (C1) socio-cultural norms (C1), (C1) community resistance (C1), (C1-) limited community knowledge regarding anthrax transmission (C1-), (C1+) high levels of poverty (C1+) and (C1) food insecurity (C1), (C1-) a shortage of available vaccines and laboratory reagents (C1-), (C1-) inadequate carcass disposal (C1-) and (C1) decontamination practices (C1) significantly contribute to hampering (E1-) the containment of the anthrax outbreak (E1-).
        """

    def generate_prompt(self, include_persona=False, include_domain=False, include_causality=False, include_guidance = False, include_examples=False, include_negative=False, include_mechanism=False, include_sign=False):
        """
        Dynamically generate a prompt based on the specified parts.
        """
        # Start with an empty prompt
        prompt = ""

        # Append parts based on the arguments provided
        if include_persona:
            prompt += self.persona_task_description + "\n"
        
        if include_domain:
            prompt += self.domain_localization + "\n"
        if include_causality:
            prompt += self.causality_definition + "\n"
        if include_guidance:
            prompt += self.extraction_guide + "\n"
        if include_examples:
            prompt += self.few_shot_examples + "\n"
        if include_negative:
            prompt += self.negative_cases + "\n"
        if include_mechanism:
            prompt += self.mechanism_of_causality + "\n"
        if include_sign:
            prompt += self.sign_of_causality + "\n"

        return prompt

### Causality Extraction

In [7]:
# Function to split text into chunks
def batch(iterable, n=1):
    """Utility function to batch sentences into chunks."""
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

class CausalChain:
    def __init__(self, dataframe, prompt_designer=None):
        self.dataframe = dataframe
        self.outlines = []  # Store a list of dictionaries to represent complex relationships
        self.prompt_designer = prompt_designer if prompt_designer else PromptDesigner()
        self.processed_chunks = set()  # Track processed chunks to avoid repetition

    def create_effects(self, batch_size=16, prompt_parts={}):
        print("Extracting causal relationships from text...")

        with open("api_responses.md", "w", encoding="utf-8") as file:
            for index, row in tqdm(self.dataframe.iterrows(), total=self.dataframe.shape[0]):
                text = row['Text']
                don_id = row['DonID_standardized']

                # Split text into sentences and then into chunks of 3 sentences
                sentences = text.split(". ")
                chunks = [". ".join(a) + "." for a in batch(sentences, 3)]

                for chunk in chunks:
                    # Skip if the chunk has already been processed
                    if (don_id, chunk) in self.processed_chunks:
                        continue

                    cause_effect_pairs, raw_texts, causality_types, response_text = self.extract_cause_effect_openai(chunk, prompt_parts, don_id)

                    # Mark the chunk as processed
                    self.processed_chunks.add((don_id, chunk))

                    # Write the response to the file immediately after receiving it
                    file.write(f"\n\n## API Response for Article ID {don_id}:\n\n{response_text}\n\n")

                    if not cause_effect_pairs and not raw_texts:
                        self.outlines.append({
                            "DonId": don_id,
                            "Cause": None,
                            "Effect": None,
                            "Causality_Type": "No relevant causality",
                            "Raw_Text": chunk
                        })
                        print(f"No cause-effect pairs found for chunk: {chunk}")

                    for pair, raw_text, causality_type in zip(cause_effect_pairs, raw_texts, causality_types):
                        cause, effect = pair
                        # Remove markers like "E1:" from the effect
                        effect = effect.split(":", 1)[-1].strip() if effect and ":" in effect else effect
                        self.outlines.append({
                            "DonId": don_id,
                            "Cause": cause,
                            "Effect": effect,
                            "Causality_Type": causality_type,
                            "Raw_Text": raw_text
                        })

        # Print the raw texts, causes, effects, and types of causality
        if self.outlines:
            for outline in self.outlines:
                print(f"DonId: {outline['DonId']}")
                print(f"Raw Text: {outline['Raw_Text']}")
                print(f"Cause: {outline['Cause']}")
                print(f"Effect: {outline['Effect']}")
                print(f"Causality Type: {outline['Causality_Type']}")
                print("\n")
        else:
            print("No cause-effect pairs found in the entire dataset.")

    def extract_cause_effect_openai(self, chunk, prompt_parts={}, don_id=None):
        # Use the PromptDesigner to generate the customized prompt
        prompt = self.prompt_designer.generate_prompt(**prompt_parts)

        # Append the text chunk to the prompt and provide a clear format for the response
        full_prompt = f"""{prompt}

        Input text: {chunk}

        Expected output format:
        1. Raw text with marked causes and effects:
        [Provide the input text with marked causes and effects]

        2. Extracted causes and effects:
        C1: [cause] -> E1: [effect], Causality type: [T1/T2/...]
        C2: [cause] -> E2: [effect], Causality type: [T1/T2/...]
        ...
        """

        # Call the OpenAI API
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": full_prompt}],
            max_tokens=2048,
            temperature=0,
        )

        response_text = response.choices[0].message.content
        print(f"API Response for Article ID {don_id}: {response_text}")  # Print the API response for debugging
        return self.parse_response(response_text) + (response_text,)

    @staticmethod
    def parse_response(response_text):
        cause_effect_pairs = []
        raw_texts = []
        causality_types = []  # Store the causality types

        if not response_text.strip():
            print("Empty response received from API.")
            return cause_effect_pairs, raw_texts, causality_types

        # Parse the response based on the expected output format
        lines = response_text.split("\n")
        raw_text_section = False
        extracted_pairs_section = False
        raw_text = ""

        for line in lines:
            line = line.strip()

            if line.startswith("1. Raw text with marked causes and effects"):
                raw_text_section = True
                extracted_pairs_section = False
                raw_text = ""  # Reset raw text for each new section
                continue

            if line.startswith("2. Extracted causes and effects"):
                raw_text_section = False
                extracted_pairs_section = True
                continue

            if raw_text_section and line:
                raw_text += line + " "

            if extracted_pairs_section:
                if line.startswith("C") and "->" in line:
                    try:
                        cause = line.split(":")[1].split("->")[0].strip()
                        effect = line.split("->")[1].split(", Causality type:")[0].strip()
                        causality_type = line.split("Causality type:")[1].strip()
                        #! Remove markers like "E1:" from the effect
                        effect = effect.split(":", 1)[-1].strip() if effect and ":" in effect else effect
                        cause_effect_pairs.append((cause, effect))
                        raw_texts.append(raw_text.strip())
                        causality_types.append(causality_type)
                    except IndexError:
                        print(f"Malformed line: {line}")
                #! Process No causality line
                # elif line.lower().startswith("no relevant causality") or line.lower().startswith("there are no relevant causes"):
                elif "->" not in line:
                    cause_effect_pairs.append((None, None))
                    raw_texts.append(raw_text.strip())
                    causality_types.append("No causality")

        return cause_effect_pairs, raw_texts, causality_types

def create_causes_effects_dataframe(outlines):
    # Create a DataFrame from the outlines list
    dataframe = pd.DataFrame(outlines)

    # Ensure all expected columns are present, fill with 'Unknown' if missing
    for column in ["DonId", "Cause", "Effect", "Causality_Type", "Raw_Text"]:
        if column not in dataframe:
            dataframe[column] = "Unknown"

    return dataframe


## Experimenting the model

In [6]:
prompt_designer = PromptDesigner()

prompt_parts = {
    "include_persona": True,
    "include_domain": True,
    "include_causality": True,
    "include_guidance": True,
    "include_examples": True,
    "include_negative": True,
    "include_mechanism": False,
    "include_sign": False,
}

In [7]:
example_data = who_data_assessment.iloc[0:30]


# Create a CausalChain instance with the dataset
causal_chain = CausalChain(dataframe=example_data, prompt_designer=prompt_designer)

# Generate effects based on the chunks of text
causal_chain.create_effects(prompt_parts=prompt_parts)

Extracting causal relationships from text...


  0%|                                                   | 0/30 [00:00<?, ?it/s]

API Response for Article ID 2024-DON540: 1. Raw text with marked causes and effects:
Marburg virus disease (MVD) is caused by the same family of viruses (Filoviridae) that causes Ebola virus disease. MVD is an epidemic-prone disease associated with (C1) high CFR (24-88%) (C1). In the early course of the disease, (C2) MVD is challenging to distinguish from other infectious diseases such as malaria, typhoid fever, shigellosis, meningitis and other viral haemorrhagic fevers (C2).

2. Extracted causes and effects:
C1: high CFR (24-88%) -> E1: MVD is an epidemic-prone disease, Causality type: T1
C2: MVD is challenging to distinguish from other infectious diseases -> E2: [No relevant effect related to disease transmission or emergence], Causality type: [Not applicable]
API Response for Article ID 2024-DON540: 1. Raw text with marked causes and effects:
Epidemiologic features can help differentiate between viral hemorrhagic fevers (including history of exposure to bats, caves, or mining) and 

  3%|█▍                                         | 1/30 [00:11<05:34, 11.54s/it]

API Response for Article ID 2024-DON540: 1. Raw text with marked causes and effects:
In addition, there are ongoing (C1) public health measures (C1) in place, including (E1) active surveillance in facilities and communities (E1), (E1) testing suspected cases (E1), (E1) isolation and treatment of cases (E1), and (E1) contact tracing (E1).

2. Extracted causes and effects:
C1: public health measures -> E1: active surveillance in facilities and communities, testing suspected cases, isolation and treatment of cases, and contact tracing, Causality type: T2
API Response for Article ID 2024-DON538: 1. Raw text with marked causes and effects:
Although no cases of WNV have been documented in birds or horses in the country, it is possible that the virus is circulating in these populations undetected. Despite this, the overall impact on public health remains limited at this stage, as there is currently only one recorded human case and appropriate public health response measures have been implemen

  7%|██▊                                        | 2/30 [00:15<03:11,  6.84s/it]

API Response for Article ID 2024-DON538: 1. Raw text with marked causes and effects:
The input text does not contain any relevant causality related to the emergence or transmission of pests and pathogens.

2. Extracted causes and effects:
No relevant causality detected.
API Response for Article ID 2024-DON536: 1. Raw text with marked causes and effects:
Since the first report of MERS-CoV in the Kingdom of Saudi Arabia (KSA) in 2012 until now, human infections have been reported in 27 countries, spanning all six WHO regions. The majority of MERS-CoV cases (2205; 84%), have been reported in KSA, including this newly reported case. The notification of this case does not change the overall risk assessment. The new case reported is believed to have acquired (E1) MERS-CoV infection (E1) locally within KSA.

2. Extracted causes and effects:
C1: locally within KSA -> E1: MERS-CoV infection, Causality type: T1
API Response for Article ID 2024-DON536: 1. Raw text with marked causes and effects:


 10%|████▎                                      | 3/30 [00:28<04:29,  9.99s/it]

API Response for Article ID 2024-DON536: 1. Raw text with marked causes and effects:
Human-to-human transmission of MERS-CoV may occur if there are (C1) delays in identifying the infection (C1), particularly in countries that are not well-acquainted with the disease, as well as (C1) slow triage of suspected cases (C1) and (C1) delays in the implementation of standard infection prevention and control measures (C1). WHO continues to monitor the epidemiological situation and conducts risk assessments based on the latest available information.

2. Extracted causes and effects:
C1: delays in identifying the infection -> E1: Human-to-human transmission of MERS-CoV, Causality type: T3
C1: slow triage of suspected cases -> E1: Human-to-human transmission of MERS-CoV, Causality type: T3
C1: delays in the implementation of standard infection prevention and control measures -> E1: Human-to-human transmission of MERS-CoV, Causality type: T3
API Response for Article ID 2024-DON537: 1. Raw text with

 13%|█████▋                                     | 4/30 [00:42<04:52, 11.26s/it]

API Response for Article ID 2024-DON537: 1. Raw text with marked causes and effects:
&nbsp;WHO has provided guidance to the Ministry of Health on how to manage cases. WHO assesses the risk of this outbreak as very high at the national level, high at the regional level, and low at the global level.&nbsp; Investigations are ongoing to determine the full extent of the outbreak and this risk assessment will be updated as more information is received.&nbsp;&nbsp;.

2. Extracted causes and effects:
There are no relevant causes and effects related to the emergence or transmission of pests and pathogens in the provided text.
API Response for Article ID 2024-DON534: 1. Raw text with marked causes and effects:
This is the first human case of infection with a zoonotic influenza virus notified by Ghana. Laboratory testing confirmed the virus as an influenza A(H9N2) virus. The majority of human infections with A(H9N2) viruses occur due to (C1) contact with infected poultry or environments that have

 17%|███████▏                                   | 5/30 [00:49<04:07,  9.91s/it]

API Response for Article ID 2024-DON534: 1. Raw text with marked causes and effects:
   However, if this occurs, further (E1) community-level spread (E1) is considered unlikely.

2. Extracted causes and effects:
   No relevant causality related to disease transmission or emergence is present in the text.
API Response for Article ID 2024-DON532: 1. Raw text with marked causes and effects:
Human infections with swine-origin influenza viruses have been reported in recent years from many countries. Most human cases with influenza A(H1N1)v virus infection result from (C1) exposure to swine influenza viruses through direct contact with infected swine (C1) or (C1) indirectly through contaminated environments (C1). However, a few cases have been reported without an apparent source of exposure to swine in the weeks prior to illness onset.

2. Extracted causes and effects:
C1: exposure to swine influenza viruses through direct contact with infected swine or indirectly through contaminated enviro

 20%|████████▌                                  | 6/30 [00:57<03:41,  9.22s/it]

API Response for Article ID 2024-DON532: 1. Raw text with marked causes and effects:
According to the information available thus far, no further human cases of infection with A(H1N1)v viruses associated with this case have been detected. Based on the available information, WHO assesses the current (E1) risk to the general population posed by this virus to be low (E1). Further virus characterization is ongoing. The (E1) risk assessment (E1) will be reviewed should further epidemiological or virological information become available.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is identified in the input text.
API Response for Article ID 2024-DON533: 1. Raw text with marked causes and effects:
From 2003 to 20 August 2024, a total of 903 human cases of infection of influenza A(H5N1) have been reported globally to WHO from 24 countries, including this case. Almost all cases of human infection with avian influenza A(H

 23%|██████████                                 | 7/30 [01:09<03:56, 10.28s/it]

API Response for Article ID 2024-DON533: 1. Raw text with marked causes and effects:
The list of such CVVs is available on the WHO website, at the reference below. In addition, the genetic and antigenic characterization of contemporary zoonotic influenza viruses is published here.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON531: 1. Raw text with marked causes and effects:
Regardless of geographic area, epidemiological context, gender identity or sexual behaviour, individual-level risk is largely dependent on (C1) individual factors such as potential exposures and immunity status (C1). This case represents the first ever report of mpox due to clade I MPXV outside of the African region. Further sporadic cases may be expected, whether among travelers from endemic areas / countries or appearing through (E1) community transmission without any travel l

 27%|███████████▍                               | 8/30 [01:16<03:21,  9.17s/it]

API Response for Article ID 2024-DON531: 1. Raw text with marked causes and effects:
To date, this appears to be an isolated case for which one close contact is under monitoring.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON530: 1. Raw text with marked causes and effects:
In the Region of the Americas, outbreaks of Oropouche virus disease have occurred mainly in the Amazon region during the last ten years. With geographical limitations, OROV causing persistent endemicity and periodic outbreaks are reported in both rural and urban communities in Brazil, the Plurinational State of Bolivia, Cuba, Colombia, Ecuador, French Guiana, Panama, Peru, and Trinidad and Tobago. The ongoing outbreak highlights the need to strengthen epidemiological and entomological surveillance and to reinforce preventive measures in the population. This is crucial due to the 

 30%|████████████▉                              | 9/30 [01:24<03:03,  8.75s/it]

API Response for Article ID 2024-DON530: I'm sorry, but it seems that the input text is missing. Could you please provide the text you would like me to analyze for causality related to the emergence or transmission of pests and pathogens?
No cause-effect pairs found for chunk: .
API Response for Article ID 2024-DON529: 1. Raw text with marked causes and effects:
Although previous outbreaks have been reported in India, this outbreak is considered the largest in the past 20 years. While authorities are making efforts to control the transmission of CHPV, further transmission of CHPV is possible in the coming weeks, considering the (C1) favorable conditions for vector populations during the monsoon season in affected areas (C1). CHPV infection causes a (E1) rapid symptom onset (E1) and a (E1) high case-fatality ratio (56-75%) (E1). There is no specific treatment or vaccine available, and management is symptomatic; timely referral of suspected AES cases to designated facilities can improve 

 33%|██████████████                            | 10/30 [01:30<02:35,  7.79s/it]

API Response for Article ID 2024-DON529: 1. Raw text with marked causes and effects:
   WHO assessed the risk as moderate at the national level based on above considerations. The risk assessment will be reviewed as the situation of the outbreak evolves.

2. Extracted causes and effects:
   No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON528: 1. Raw text with marked causes and effects:
The current expansion of mpox in the African continent is unprecedented. At least four countries have identified cases for the first time and others, such as Côte d'Ivoire, are reporting re-emerging outbreaks. The modes of transmission in these countries are not fully described yet and are likely to include (C1) exclusive human-to-human transmission (C1). Clade I mpox is being identified for the first time outside of the countries that had been previously affected.

2. Extracted causes and effects:
C1: 

 37%|███████████████▍                          | 11/30 [01:52<03:49, 12.10s/it]

API Response for Article ID 2024-DON528: 1. Raw text with marked causes and effects:
Countries outside of Africa that seemed to have achieved control of human-to-human transmission continue to detect sporadic cases and outbreaks, and an unprecedented (C1) increase of cases and reporting countries (C1) has been observed in the African Region, especially in the Democratic Republic of the Congo, increasing the (E1) risk of further transmission in the region and the whole world (E1).

2. Extracted causes and effects:
C1: increase of cases and reporting countries -> E1: risk of further transmission in the region and the whole world, Causality type: T1
API Response for Article ID 2024-DON527: 1. Raw text with marked causes and effects:
Globally, there is no systematic surveillance that allows for the routine identification and information collection of hvKp strains. Identification of hvKp is challenging given that it is determined by (C1) available laboratory capacity to perform genomic sequ

 40%|████████████████▊                         | 12/30 [02:16<04:45, 15.85s/it]

API Response for Article ID 2024-DON527: 1. Raw text with marked causes and effects:
With the concurrence of (C1) hypervirulence and antibiotic resistance (C1), it is expected that there will be an (E1) increased risk of spread of these strains at both the community and hospital levels (E1). As with other resistance mechanisms, the (E2) risk of spread (E2) could increase due to (C2) high movements of people (within and between countries and regions) (C2). There are very limited antimicrobial treatment options for the carbapenem-resistant hvKp isolates and these strains have the capacity to generate outbreaks. The (C3) high conjugation capacity of the carbapenem-resistant hvKp (CR-hvKp) (C3) and the potential for further dissemination in clinical settings; hvKp ST23 particularly out-competes other gut bacteria facilitating (E3) colonization and spread (E3). Detection of the emergence of multi-resistant or extensively resistant pathogens requires established resistance laboratory surveil

 43%|██████████████████▏                       | 13/30 [02:22<03:40, 12.98s/it]

API Response for Article ID 2024-DON526: 1. Raw text with marked causes and effects:
This represents the first report of autochthonous dengue cases ever documented in Iran. The confirmation of local dengue transmission in 2024 is thus an atypical yet foreseeable event due to the (C1) presence of the vector in the country (C1) and the (C1) movement of people from endemic areas to Iran (C1). Based on entomological surveillance, to date, (E1) Aedes aegypti and Aedes albopictus are present in the provinces of Baluchistan, Bushehr, Fars, Gilan, Golestan, Hormozgan, Khuzestan, Mazandaran and Sistan (E1). On 16 May 2024, WHO reassessed the global risk of dengue, confirming it to be high and emphasizing that dengue continues to pose a significant public health threat worldwide. The national risk for Iran is also high due to the (C2) presence of the vector in the country (C2), (C2) favorable climate conditions for the competent vector (C2) and the (C2) movement of people from countries experien

 47%|███████████████████▌                      | 14/30 [03:03<05:41, 21.32s/it]

API Response for Article ID 2024-DON525: 1. Raw text with marked causes and effects:
Concurrent (C1) outbreaks of mpox (C1) are occurring in Africa and elsewhere, increasing the (E1) risk of further transmission (E1).

2. Extracted causes and effects:
C1: outbreaks of mpox -> E1: risk of further transmission, Causality type: T1
API Response for Article ID 2024-DON522: 1. Raw text with marked causes and effects:
In the Democratic Republic of the Congo, most reported cases in known endemic provinces continue to be among children under 15 years of age, especially in young children. (C1) Infants and children under five years of age are at highest risk of severe disease and death (C1), particularly where (C2) prompt optimal case management is limited or unavailable (C2). The (E1) number of cases reported weekly remains consistently high (E1) while the (E1) outbreak continues to expand geographically (E1).

2. Extracted causes and effects:
C1: Infants and children under five years of age are

 50%|█████████████████████                     | 15/30 [04:04<08:21, 33.41s/it]

API Response for Article ID 2024-DON522: 1. Raw text with marked causes and effects:
This suggests significant under detection or underreporting of transmission. While the government has activated an emergency response across the country with support from in-country and global partners, resources to respond over such a wide geographic area remain insufficient, and (C1) resource mobilization is slow (C1). Public awareness remains limited, resources are scarce, and (C2) technical as well as financial support is needed (C2) to ensure a robust response at provincial/local, national, and international levels. A concurrent outbreak of mpox is occurring in the Republic of Congo, with cases genetically similar to the MPXV strain circulating in neighbouring endemic provinces of the Democratic Republic of the Congo provinces. A new outbreak of mpox due to clade IIb MPXV linked to the ongoing global outbreak is occurring among key populations in the Republic of South Africa, with to date only cas

 53%|██████████████████████▍                   | 16/30 [04:23<06:46, 29.02s/it]

API Response for Article ID 2024-DON524: 1. Raw text with marked causes and effects:
If needed, the risk assessment will be reviewed should further epidemiological or virological information, including information on (C1) A(H5N2) viruses detected in local animal populations (C1), become available.

2. Extracted causes and effects:
C1: A(H5N2) viruses detected in local animal populations -> E1: risk assessment will be reviewed, Causality type: T1
API Response for Article ID 2024-DON523: 1. Raw text with marked causes and effects:
Most human cases of infection with avian influenza A(H9N2) viruses are exposed to the virus through (C1) contact with infected poultry or contaminated environments (C1). Human infection tends to result in (E1) mild clinical illness (E1). However, globally, there have been some (E2) hospitalized cases (E2) and (E2) two fatal cases (E2) reported in the past.

2. Extracted causes and effects:
C1: contact with infected poultry or contaminated environments -> E1: mi

 57%|███████████████████████▊                  | 17/30 [04:36<05:14, 24.15s/it]

API Response for Article ID 2024-DON523: 1. Raw text with marked causes and effects:
However, the risk assessment will be reviewed should further epidemiological or virological information become available. International travellers from affected regions may present with infections either during their travels or after arrival in other countries. Even if this were to occur, further (E1) community-level spread (E1) is considered unlikely as this virus has not acquired the (C1) ability to transmit easily among humans (C1).

2. Extracted causes and effects:
C1: ability to transmit easily among humans -> E1: community-level spread, Causality type: T1
API Response for Article ID 2024-DON521: 1. Raw text with marked causes and effects:
This is the first detection of the disease in the country, therefore, the (C1) population is likely highly susceptible (C1) and there is a significant (E1) risk of additional case detection (E1). To date, there is no evidence of human-to-human Oropouche virus tr

 60%|█████████████████████████▏                | 18/30 [04:43<03:49, 19.11s/it]

API Response for Article ID 2024-DON521: 1. Raw text with marked causes and effects:
The virus is endemic in many South American countries, in both rural and urban communities. Outbreaks are periodically reported in Brazil, Bolivia, Colombia, Ecuador, French Guiana, Panama, Peru, and Trinidad and Tobago. There is a (E1) risk of the disease spreading internationally (E1) as (C1) Cuba is an international tourist destination (C1) and the (C1) putative vector is widely distributed in the Americas region (C1). Additionally, there are currently other countries with (C2) active OROV circulation (C2).

2. Extracted causes and effects:
C1: Cuba is an international tourist destination and the putative vector is widely distributed in the Americas region -> E1: risk of the disease spreading internationally, Causality type: T3
C2: active OROV circulation -> E1: risk of the disease spreading internationally, Causality type: T3
API Response for Article ID 2024-DON519: 1. Raw text with marked causes a

 63%|██████████████████████████▌               | 19/30 [04:50<02:48, 15.33s/it]

API Response for Article ID 2024-DON519: 1. Raw text with marked causes and effects:
These A(H5N1) influenza viruses, belonging to different genetic groups, do not easily infect humans, and human-to-human transmission thus far appears unusual. As the (C1) virus continues to circulate in poultry, particularly in rural areas (C1), the (E1) potential for further sporadic human cases (E1) remains. Currently, available epidemiological and virological evidence suggests that A(H5) viruses have not acquired the ability of sustained transmission among humans, thus, the likelihood of human-to-human spread is low. Based on available information, WHO assesses the current risk to the general population posed by this virus as low. The risk assessment will be reviewed if additional virological and epidemiological information becomes available.

2. Extracted causes and effects:
C1: virus continues to circulate in poultry, particularly in rural areas -> E1: potential for further sporadic human cases, C

 67%|████████████████████████████              | 20/30 [05:00<02:16, 13.66s/it]

API Response for Article ID 2024-DON520: 1. Raw text with marked causes and effects:
Close analysis of the epidemiological situation, further characterization of the most recent viruses (in human and birds) and serological investigations are critical to assess associated risks and to adjust risk management measures in a timely manner. Based on the available information, WHO assesses the current risk to the general population posed by this virus to be low. If needed, the risk assessment will be reviewed should further epidemiological or virological information, including information on A(H5N2) viruses detected in local animal populations, become available.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is identified in the input text.
API Response for Article ID 2024-DON518: 1. Raw text with marked causes and effects:
Dengue is a mosquito-borne viral disease caused by the dengue virus, with the potential to cause a

 70%|█████████████████████████████▍            | 21/30 [05:24<02:32, 16.96s/it]

API Response for Article ID 2024-DON518: 1. Raw text with marked causes and effects:
The (E1) implementation of policies aimed at improving conditions related to the risk of transmission (E1), such as urban planning, water and sanitation provision, solid waste management, housing improvement, etc. (C1) Lack of engagement and mobilization of local communities in vector control activities (C1). WHO assessed the global risk of dengue as high on 30 November 2023, and subsequently assigned a WHO internal emergency response grade of G3 at the global level on 1 December 2023. Given the current scale of the dengue outbreaks, the potential risk of further international spread and the complexity of factors impacting transmission, the overall risk at the global level is still assessed as high and thus dengue remains a global threat to public health.

2. Extracted causes and effects:
C1: Lack of engagement and mobilization of local communities in vector control activities -> E1: implementation of 

 73%|██████████████████████████████▊           | 22/30 [05:31<01:50, 13.75s/it]

API Response for Article ID 2024-DON516: 1. Raw text with marked causes and effects:
WHO continues to monitor the epidemiological situation and conducts risk assessments based on the latest available information.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON517: 1. Raw text with marked causes and effects:
The outbreak is occurring in the Ouaddai province which has been heavily affected by an (C1) influx of refugees and returnees fleeing armed conflict in neighbouring Sudan since April 2023 (C1). The majority of hepatitis E cases have been reported from the Adr&eacute; health district hosting three refugee camps with an average of 50&nbsp;000 refugees per camp and a temporary refugee site housing approximately 170&nbsp;000 refugees (UNHCR estimate). The (E1) risk at the national level (E1) is assessed as high due to the (C2) continuous population m

 77%|████████████████████████████████▏         | 23/30 [05:39<01:25, 12.28s/it]

API Response for Article ID 2024-DON517: 1. Raw text with marked causes and effects:
At the global level, the risk is considered low.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON514: 1. Raw text with marked causes and effects:
Most human cases of infection with avian influenza A(H9N2) viruses are exposed to the virus through (C1) contact with infected poultry or contaminated environments (C1). Human infection tends to result in (E1) mild clinical illness (E1). Further human cases can be expected since the (C2) virus continues to be detected in poultry populations (C2).

2. Extracted causes and effects:
C1: contact with infected poultry or contaminated environments -> E1: mild clinical illness, Causality type: T1
C2: virus continues to be detected in poultry populations -> E2: further human cases can be expected, Causality type: T1
API Response fo

 80%|█████████████████████████████████▌        | 24/30 [05:47<01:05, 10.85s/it]

API Response for Article ID 2024-DON514: 1. Raw text with marked causes and effects:
If this were to occur, further (E1) community-level spread (E1) is considered unlikely as this virus has not acquired the (C1) ability to transmit easily among humans (C1).

2. Extracted causes and effects:
C1: ability to transmit easily among humans -> E1: community-level spread, Causality type: T1
API Response for Article ID 2024-DON513: 1. Raw text with marked causes and effects:
The (E1) risk at the national level (E1) is assessed as ‘High’ due to the following: The country was previously classified as “rabies free” and has now reported the first confirmed human case. As such, (C1) experience and awareness of community and health care workers on rabies are likely limited (C1). Oecusse, the municipality where the current case was bitten and reported from, is an enclave of Timor-Leste located within East Nusa Tenggara province (NTT) in Indonesia where (C2) rabies is endemic in both dogs and humans (C

 83%|███████████████████████████████████       | 25/30 [06:04<01:03, 12.63s/it]

API Response for Article ID 2024-DON513: 1. Raw text with marked causes and effects:
Available data indicates only the current one fatal case of rabies in Timor-Leste, with no links to international travel, tourism or international gatherings.

2. Extracted causes and effects:
No relevant causality related to the emergence or transmission of pests and pathogens is present in the input text.
API Response for Article ID 2024-DON512: 1. Raw text with marked causes and effects:
This human case was reportedly exposed to dairy cattle in Texas, where HPAI A (H5N1) has recently been confirmed in dairy herds. From 2003 to 1 April 2024, a total of 889 cases and 463 deaths (CFR 52%) caused by influenza A(H5N1) virus have been reported worldwide from 23 countries. The most recently reported case in humans prior to the current case, was in March 2024 in Viet Nam. The human case in Texas is the fourth reported in the region of the Americas, the most recent prior case having been reported in Chile in

 87%|████████████████████████████████████▍     | 26/30 [06:15<00:48, 12.14s/it]

API Response for Article ID 2024-DON512: 1. Raw text with marked causes and effects:
Close analysis of the epidemiological situation, further characterization of the most recent viruses (from human cases and animal) and comprehensive investigations around human cases are critical to assess associated risk and to adjust risk management measures in a timely manner. If needed, the risk assessment will be reviewed should further epidemiological or virological information become available.

2. Extracted causes and effects:
There are no relevant causes and effects related to the emergence or transmission of pests and pathogens in the provided text. The text primarily discusses the importance of analysis and investigation for risk assessment and management, which does not directly describe drivers or contributors to disease emergence or transmission.
API Response for Article ID 2024-DON511: 1. Raw text with marked causes and effects:
This is the first human infection with an avian influenza A

 90%|█████████████████████████████████████▊    | 27/30 [06:23<00:32, 10.84s/it]

API Response for Article ID 2024-DON511: 1. Raw text with marked causes and effects:
As the (C1) virus continues to circulate in poultry, particularly in rural areas of Viet Nam (C1), the (E1) potential for further sporadic human cases (E1) remains. Currently, available epidemiological and virological evidence suggests that A(H5) viruses have not acquired the ability of sustained transmission among humans; thus, the likelihood of human-to-human spread is low. Based on available information, WHO assesses the risk to the general population posed by this virus as low. The risk assessment will be reviewed if additional virological and epidemiological information becomes available.

2. Extracted causes and effects:
C1: virus continues to circulate in poultry, particularly in rural areas of Viet Nam -> E1: potential for further sporadic human cases, Causality type: T1
API Response for Article ID 2024-DON510: 1. Raw text with marked causes and effects:
As part of the ongoing efforts to monito

 93%|███████████████████████████████████████▏  | 28/30 [06:40<00:28, 14.30s/it]

API Response for Article ID 2024-DON510: 1. Raw text with marked causes and effects:
Furthermore, (C1) public health and medical personnel are overburdened, managing multiple parallel outbreaks alongside other health emergencies (C1). (C2) Socio-economic factors, high levels of poverty and limited resource allocation (C2) contribute to the (E1) challenge of controlling outbreaks effectively (E1). While the global risk remains low, active surveillance is required due to the potential for onward transmission through (C3) viremic travellers (C3) and the (C3) presence of the competent vector in neighbouring regions (C3). While progress has been made in controlling outbreaks, ongoing challenges and vulnerabilities underscore the need for sustained and coordinated efforts to protect public health. The (E2) impact on public health (E2) will persist until the ongoing outbreaks are controlled, vaccination coverage is high and immunity gaps in the population closed. The (C4) importation of cases




AttributeError: 'float' object has no attribute 'split'

In [10]:
# Create a DataFrame with the causes, effects, and other related information
result_df = create_causes_effects_dataframe(causal_chain.outlines)

# Display the resulting DataFrame
display(result_df)

Unnamed: 0,DonId,Cause,Effect,Causality_Type,Raw_Text
0,2024-DON540,high CFR (24-88%),MVD is an epidemic-prone disease,T1,Marburg virus disease (MVD) is caused by the s...
1,2024-DON540,MVD is challenging to distinguish from other i...,[No relevant effect related to disease transmi...,[Not applicable],Marburg virus disease (MVD) is caused by the s...
2,2024-DON540,Healthcare-associated infections (also known a...,further spread,T1,Epidemiologic features can help differentiate ...
3,2024-DON540,,,No causality,The importance of screening all persons enteri...
4,2024-DON540,contact with the body fluids of a sick patient...,MVD transmission,T1,"However, based on the evolution of the outbrea..."
...,...,...,...,...,...
207,2024-DON510,"food insecurity, security constraints, and com...",compounded challenges in response,T1,As part of the ongoing efforts to monitor and ...
208,2024-DON510,public health and medical personnel are overbu...,challenge of controlling outbreaks effectively,T1,"Furthermore, (C1) public health and medical pe..."
209,2024-DON510,"Socio-economic factors, high levels of poverty...",challenge of controlling outbreaks effectively,T3,"Furthermore, (C1) public health and medical pe..."
210,2024-DON510,viremic travellers and the presence of the com...,impact on public health,T3,"Furthermore, (C1) public health and medical pe..."


In [12]:
# Export data to csv
result_df.to_csv('result_df_31 Oct.csv', index=False)

## Drivers Categorization

In [13]:
# Import result_df_31 Oct.csv
result_df = pd.read_csv('../data/result_df_31 Oct.csv')

In [11]:
# Function to get a summary for each cause-effect pair
def get_summary(text):
    prompt = (
        f"Analyze the following text to identify common categories related to drivers of infectious diseases. "
        f"Avoid mentioning specific diseases or too specific terms. "
        f"Summarize the text into two words only. For example:\n"
        f"- Text: 'transmission of ebola' -> Summary: 'disease transmission'\n"
        f"- Text: 'COVID-19 infection' -> Summary: 'disease transmission'\n"
        f"Please summarize the following: '{text}'"
    )
    
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0
    )
    return response.choices[0].message.content

# Apply function to each row in cause and effect columns
result_df['Cause_category'] = result_df['Cause'].apply(get_summary)
result_df['Effect_category'] = result_df['Effect'].apply(get_summary)


                                                 Cause           Cause_category                                             Effect       Effect_category
0                                    high CFR (24-88%)           mortality rate                   MVD is an epidemic-prone disease          disease risk
1    MVD is challenging to distinguish from other i...  disease differentiation  [No relevant effect related to disease transmi...             No impact
2    Healthcare-associated infections (also known a...      hospital infections                                     further spread        disease spread
3                                                 None               No context                                               None            No context
4    contact with the body fluids of a sick patient...     disease transmission                                   MVD transmission  disease transmission
..                                                 ...                      ...   

### Using predefined list to categorize drivers

In [5]:
# Sort column "Consolidated Name" and remove duplicates in that column, write in method chaining and save to new object
driver_cat_rm_dups = (
    driver_cat
    .rename(columns={"Peter's name": "Category"})
    .assign(Category=lambda df: df['Category'].ffill()) # fill the data of column "Category" by the value above it for missing values
    .drop_duplicates(subset=["Category", "Consolidated Name"])
) 

predefined_driver_category = (
    driver_cat_rm_dups
    .groupby("Category")["Consolidated Name"]
    .apply(lambda x: x.dropna().unique().tolist())
    .to_dict()
)

# Print the result
print(predefined_driver_category)

{'Build infrastructure': ['technological', 'hunting technology', 'road infrastructure', 'transport infrastructure', 'road-building', 'dam building', 'housing'], 'Climate/Weather': ['seasonality', 'climate change', 'flood', 'storms', 'hurricanes', 'temperature shift (cold to hotter)', 'temperature', 'water stress', 'weather conditions', 'natural disasters', 'precipitation', 'humidity', 'wind speed', 'evapotranspiration', 'daytime length', 'vapor pressure', 'radiation', 'hydroclimatic', 'balance', 'climatic', 'climate', 'weather', 'deforestation', 'drought', 'rainy season', 'tsunami', 'hurricane', 'typhoon', 'earthquake', 'stagnant pools', 'heat wave'], 'Conflict': ['war', 'conflict', 'political unrest', 'substandard living conditions due to conflict', 'refugee camp conditions', 'harm intent', 'socioeconomic impact of conflict'], 'Disease': ['coinfection', 'human susceptibility to infection', 'immune deficiency from an underlying disease', 'scenario', 'comorbidity', 'immunosuppression'],

In [11]:
def format_prompt(text, category_dict):
    # Format the dictionary into a string for the prompt
    category_examples = "\n".join(
        f"- {category}: {', '.join(consolidated_names[:50])}..."  
        for category, consolidated_names in category_dict.items()
    )
    
    # Construct the prompt
    prompt = (
        f"Analyze the following text and map it to a predefined category from the list below. "
        f"Return the output in this exact format:\n"
        f"consolidate_name: [name], category: [category]\n\n"
        f"Categories and examples:\n{category_examples}\n\n"
        f"Example mappings:\n"
        f"- Text: 'Socio-economic factors, high levels of poverty' -> consolidate_name: socioeconomic, category: Economy\n"
        f"- Text: 'favorable conditions for vector populations during the monsoon season in affected areas' -> consolidate_name: climate, category: Climate/Weather\n"
        f"- Text: 'Lack of laboratory capacity' -> consolidate_name: infrastructure, category: Build infrastructure\n\n"
        f"In case you cannot match the orginal text with any of the consolidated names, but the original text is about diseases transmission process\n\n"
        f"Summarize the text into two words only for the consolidate_name. Avoid mentioning specific diseases or too specific terms. For example:\n"
        f"- Text: 'contact with infected poultry or environments that have been contaminated' -> consolidate_name: 'poultry exposure', category: Disease transmission\n"
        f"- Text: 'close contact with A(H5N1)-infected live or dead birds or mammals' -> consolidate_name: 'animal exposure', category: Disease transmission\n"
        f"If none of the above applies, please return consolidate_name: 'Undefined', category: Undefined\n\n"
        f"Now, analyze the following text:\n"
        f"'{text}'\n"
        f"Provide your answer in this format: consolidate_name: [name], category: [category]"
    )
    return prompt

def get_summary_with_prelist(text, category_dict):
    # Format the prompt with the dictionary
    prompt = format_prompt(text, category_dict)
    
    # Send the prompt to OpenAI
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0
    )
    response_text = response.choices[0].message.content
    print(f"API Response: {response_text}")  # Debugging line
    
    # Extract consolidate_name and category
    try:
        if response_text.startswith("consolidate_name:") and ", category:" in response_text:
            consolidate_name, category = map(str.strip, response_text.split(", category:"))
            consolidate_name = consolidate_name.replace("consolidate_name:", "").strip()
            category = category.replace("'", "").strip()
            return consolidate_name, category
        else:
            return None, None
    except Exception as e:
        print(f"Error processing response: {e}")
        return None, None


def categorize_text(row, column_name, category_dict):
    """
    Categorize the text from the specified column using OpenAI and the category dictionary.
    """
    consolidate_name, category = get_summary_with_prelist(row[column_name], category_dict)
    return pd.Series({f'{column_name}_consolidate_name': consolidate_name, f'{column_name}_category_new': category})



In [14]:
result_df = result_df.apply(
    lambda row: pd.concat([
        pd.Series(row),  # Keep the original row data
        categorize_text(row, 'Cause', predefined_driver_category),  # Add Cause-related columns
        categorize_text(row, 'Effect', predefined_driver_category)  # Add Effect-related columns
    ]),
    axis=1
)

API Response: consolidate_name: Undefined, category: Undefined
API Response: consolidate_name: epidemic-prone disease, category: Disease
API Response: consolidate_name: disease differentiation, category: Disease
API Response: consolidate_name: Undefined, category: Undefined
API Response: consolidate_name: nosocomial infections, category: Disease
API Response: consolidate_name: Undefined, category: Undefined
API Response: consolidate_name: Undefined, category: Undefined
API Response: consolidate_name: Undefined, category: Undefined
API Response: consolidate_name: patient exposure, category: Disease transmission
API Response: consolidate_name: MVD transmission, category: Disease transmission
API Response: consolidate_name: fluid exposure, category: Disease transmission
API Response: consolidate_name: MVD transmission, category: Disease transmission
API Response: consolidate_name: public health measures, category: Health system
API Response: consolidate_name: public health measures, categ

In [16]:
(result_df
 .filter(['DonId', 'Cause', 'Cause_consolidate_name', 'Cause_category_new', 'Effect', 'Effect_consolidate_name', 'Effect_category_new']))

Unnamed: 0,DonId,Cause,Cause_consolidate_name,Cause_category_new,Effect,Effect_consolidate_name,Effect_category_new
0,2024-DON540,high CFR (24-88%),Undefined,Undefined,MVD is an epidemic-prone disease,epidemic-prone disease,Disease
1,2024-DON540,MVD is challenging to distinguish from other i...,disease differentiation,Disease,[No relevant effect related to disease transmi...,Undefined,Undefined
2,2024-DON540,Healthcare-associated infections (also known a...,nosocomial infections,Disease,further spread,Undefined,Undefined
3,2024-DON540,,Undefined,Undefined,,Undefined,Undefined
4,2024-DON540,contact with the body fluids of a sick patient...,patient exposure,Disease transmission,MVD transmission,MVD transmission,Disease transmission
...,...,...,...,...,...,...,...
207,2024-DON510,"food insecurity, security constraints, and com...",food insecurity,Food Security,compounded challenges in response,Undefined,Undefined
208,2024-DON510,public health and medical personnel are overbu...,health system,Health system,challenge of controlling outbreaks effectively,outbreak control,Disease transmission
209,2024-DON510,"Socio-economic factors, high levels of poverty...",socioeconomic,Economy,challenge of controlling outbreaks effectively,outbreak control,Public Policy
210,2024-DON510,viremic travellers and the presence of the com...,vector exposure,Disease transmission,impact on public health,public health,Health system


In [17]:
result_df.to_csv('../data/result_df_19 Nov.csv', index=False)

# NLP algorithms

- Sentence Embeddings with Clustering: Create a vector capturing semantic meaning, then using clustering algorithms like K-Mean, Hierarchical Clustering, DBSCAN
- Topic Modeling: Latent Dirichlet Allocation (LDA) or Non-Negative Matrix Factorization (NMF) -> typically used for longer texts
- Text Similarity using Cosine Similarity and Clustering
- Self-Supervised Clustering with Transformers: BERTopic uses transformer embeddings with dimensionality reduction and topic representation, enabling a more dynamic clustering approach suitable for nuanced or dense datasets

In [28]:
cosine_test = pd.read_csv('result_df_31 Oct.csv')

# Create a list of unique categories
unique_drivers_categories = list(set(cosine_test['Cause_category'].tolist() + cosine_test['Effect_category'].tolist()))

In [29]:
# Load the transformer model
model = SentenceTransformer('all-mpnet-base-v2')

# Encode the unique categories
category_embeddings = model.encode(unique_drivers_categories, convert_to_tensor=True)

# Generate embeddings for each unique category
category_embeddings = model.encode(unique_drivers_categories, convert_to_tensor=True)

# Calculate the cosine similarity matrix
cosine_similarity_matrix = util.pytorch_cos_sim(category_embeddings, category_embeddings)

In [31]:
# Convert cosine similarities to distances for clustering
cosine_distance_matrix = 1 - cosine_similarity_matrix.cpu().numpy()

# Apply Agglomerative Clustering
clustering_model = AgglomerativeClustering(
    metric='precomputed',
    linkage='average',
    n_clusters=5  # Choose the number of clusters or use distance_threshold
)
cluster_labels = clustering_model.fit_predict(cosine_distance_matrix)

In [34]:
# Create a DataFrame for unique categories and their cluster labels
clustered_categories = pd.DataFrame({
    'category': unique_drivers_categories,
    'cluster': cluster_labels
})

# Display grouped categories
display(clustered_categories.sort_values(by='cluster'))

Unnamed: 0,category,cluster
19,hidden cases,0
36,research gaps,0
95,case frequency,0
33,funding challenges,0
89,detection challenges,0
...,...,...
76,disease management,4
77,crisis factors,4
79,virus transmission,4
67,disease interactions,4


In [35]:
# Merge cluster labels for Cause and Effect categories back
cosine_test = cosine_test.merge(clustered_categories, left_on='Cause_category', right_on='category', how='left').rename(columns={'cluster': 'Cause_cluster'}).drop(columns=['category'])
cosine_test = cosine_test.merge(clustered_categories, left_on='Effect_category', right_on='category', how='left').rename(columns={'cluster': 'Effect_cluster'}).drop(columns=['category'])

# Display result with cluster labels
display(cosine_test[['Cause_category', 'Cause_cluster', 'Effect_category', 'Effect_cluster']])


Unnamed: 0,Cause_category,Cause_cluster,Cause_cluster.1,Effect_category,Effect_cluster,Effect_cluster.1
0,mortality rate,4,4,disease risk,4,4
1,disease differentiation,4,4,No impact,3,3
2,hospital infections,4,4,disease spread,4,4
3,No context,3,3,No context,3,3
4,disease transmission,4,4,disease transmission,4,4
...,...,...,...,...,...,...
207,crisis factors,4,4,response challenges,0,0
208,healthcare strain,4,4,outbreak management,4,4
209,social determinants,4,4,outbreak management,4,4
210,travel transmission,1,1,health impact,4,4


In [None]:
cosine_test.to_csv('cosine_test_31 Oct.csv', index=False)

In [25]:
# Map embeddings for Cause and Effect categories
cosine_test['Cause_embedding'] = cosine_test['Cause_category'].apply(lambda x: category_embedding_dict[x])
cosine_test['Effect_embedding'] = cosine_test['Effect_category'].apply(lambda x: category_embedding_dict[x])

# Calculate cosine similarity between each Cause and Effect embedding
cosine_test['cosine_similarity'] = cosine_test.apply(
    lambda row: util.pytorch_cos_sim(row['Cause_embedding'], row['Effect_embedding']).item(),
    axis=1
)

print(cosine_test[['Cause_category', 'Effect_category', 'cosine_similarity']])


              Cause_category       Effect_category  cosine_similarity
0             mortality rate          disease risk           0.431920
1    disease differentiation             No impact           0.048635
2        hospital infections        disease spread           0.467465
3                 No context            No context           1.000000
4       disease transmission  disease transmission           1.000000
..                       ...                   ...                ...
207           crisis factors   response challenges           0.291704
208        healthcare strain   outbreak management           0.297092
209      social determinants   outbreak management           0.073511
210      travel transmission         health impact           0.040067
211            global spread          risk factors           0.089944

[212 rows x 3 columns]


In [None]:
# Grouping by DonId and calculating the percentage of "No causality"
def calculate_no_relevant_causality_percentage(df):
    grouped = df.groupby('DonId')['Causality_Type'].apply(lambda x: (x == 'No causality').mean() * 100)
    return grouped

# Example usage
# Assuming `df` is your dataframe created from the outlines
grouped_percentage = calculate_no_relevant_causality_percentage(result_df)
print(grouped_percentage)