In [None]:
# Homework 2 Part 2 (due 7/07/2024)
# Paul Chirkov. Partners: Daniel Duan, Divik Verma.

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review 
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease. 

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.

### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6:
Create a minimal interface in which your bot asks a users for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you have used in last week's homework.)

In [1]:

import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Load the datasets
data1 = pd.read_csv('data1.txt', sep='\t')  # Contains diseases and their occurrences
data2 = pd.read_csv('data2.txt', sep='\t')  # Contains symptoms and their occurrences
data3 = pd.read_csv('data3.txt', sep='\t')  # Contains co-occurrences of diseases and symptoms


def clean_data3(df):
    if 'MeSH Symptom Term' in df.columns and 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df.dropna(subset=['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence'])
        df['PubMed occurrence'] = pd.to_numeric(df['PubMed occurrence'], errors='coerce')
        #Set to 450 so takes less time to run.
        df = df[df['PubMed occurrence'] >= 450]
    return df


data3 = clean_data3(data3)


unique_symptoms = set(data3["MeSH Symptom Term"].unique())
unique_diseases = set(data3["MeSH Disease Term"].unique())







  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,PubMed occurrence,TFIDF score
668,Fever,Bacterial Infections,651,402.928399
2420,Fever,Neutropenia,1236,765.006914
5952,Body Weight,"Kidney Failure, Chronic",636,218.140271
6173,Body Weight,Hypertension,3054,1047.484885
6404,Body Weight,Diabetes Mellitus,1100,377.286632


In [2]:
data1.head()

Unnamed: 0,MeSH Disease Term,PubMed occurrence
0,Breast Neoplasms,122226
1,Hypertension,107294
2,Coronary Artery Disease,82819
3,Lung Neoplasms,78009
4,Myocardial Infarction,75945


In [3]:
data2.head()

Unnamed: 0,MeSH Symptom Term,PubMed occurrence
0,Body Weight,147857
1,Pain,103168
2,Obesity,100301
3,Anoxia,47351
4,Mental Retardation,43883


In [4]:
data3.head()

Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,PubMed occurrence,TFIDF score
668,Fever,Bacterial Infections,651,402.928399
2420,Fever,Neutropenia,1236,765.006914
5952,Body Weight,"Kidney Failure, Chronic",636,218.140271
6173,Body Weight,Hypertension,3054,1047.484885
6404,Body Weight,Diabetes Mellitus,1100,377.286632


# Getting total counts

In [2]:
total_disease_occurrences = data1['PubMed occurrence'].sum()
#data1['Prior_Probability'] = data1['PubMed occurrence'] / total_disease_occurrences
total_symptom_occurrences = data2['PubMed occurrence'].sum()
total_interaction_occurrences=data3['PubMed occurrence'].sum()

# Creating the Bayesian Net

In [31]:

# Looping throught the co-occurences and adding them to the bayesian net. 

#ADD (DISEASE) to each disease. (SYMPTOM) for each symptom


structure = []
symptoms_list = []
disease_nodes=[]

for index, row in data3.iterrows():
    #if(row['MeSH Symptom Term'] != row['MeSH Disease Term']):
        #maybe check if symptom not in the disease list.
        #if row['MeSH Disease Term'] not in unique_symptoms:
        
    symptoms_list.append(row['MeSH Symptom Term']+"(symptom)")
    disease_nodes.append(row['MeSH Disease Term'] + "(disease)")
                
            #Adding edges from disease to symptom
    structure.append((row['MeSH Disease Term'] + "(disease)", row['MeSH Symptom Term'] + "(symptom)"))
        
# Create the Bayesian Network
model = BayesianNetwork(structure)

# Initializing Prior Probabilities

In [32]:
# Create and add CPDs for diseases. ADD PRIOR PROBABILITES
for disease in disease_nodes:
   
    
    prior_prob = data1.loc[data1["MeSH Disease Term"] == disease[:-9], "PubMed occurrence"].values/ total_disease_occurrences
    prior_prob = float(prior_prob)
    
    value=[[1 - prior_prob], [prior_prob]]
        
    cpd_disease = TabularCPD(variable=disease, variable_card=2,
                            values=value)
    model.add_cpds(cpd_disease)



  prior_prob = float(prior_prob)


In [33]:
import itertools
import numpy as np

def grabFromDF(df, value, iv_index=0, dv_index=1):
    '''
    From a 2-column data frame, look for the row in which the value is of 
    the first column is `value`. Return the corresponding value of the second 
    column.
    '''
    sub_df = df[df[df.columns[iv_index]] == value][df.columns[dv_index]]
    if len(sub_df):
        return sub_df.iloc[0]
    else:
        return 0

def grabFromDF2(df, value1, value2, iv1_index=0, iv2_index=1, dv_index=2):
    '''
    From a 3-column data frame, look for the row in which the value is of 
    the first column is `value1` AND the value of the second column is 
    `value2`. Return the corresponding value of the third column.
    '''
    # select variables
    var1 = df.columns[iv1_index]
    var2 = df.columns[iv2_index]
    var3 = df.columns[dv_index]

    # select sub-dataframes in which these variables have the desired values
    sub_df = df[df[var1] == value1]
    subsub_df = sub_df[sub_df[var2] == value2]

    # return dependent variable
    if len(subsub_df):
        return subsub_df[var3].iloc[0]
    else:
        return 0

def CPT2x2(disease_occ, symptom_occ, interaction_occ, 
           total_disease_occ, total_symptom_occ, total_interaction_occ):
    '''
    Set the 2x2 CPTs for a disease-symptom pair based on the occurrences 
    of the disease, symptom, their cooccurrences and the total occurrences 
    of disease, symptoms, and interactions in the data set.
    '''
    # probability of disease
    p_disease = disease_occ / total_disease_occ
    # joint probability of symptom and disease occurring
    p_joint = interaction_occ / total_interaction_occ

    # conditional probability of symptom occurrence given disease
    pTT = (p_joint / p_disease) if p_joint > 0.0 else 0.0
    # conditional probability of symptom non-occurrence given disease
    pFT = 1 - pTT

    # conditional probability of symptom occurrence given disease absence
    pTF = (symptom_occ - interaction_occ) / total_symptom_occ
    # conditional probability of symptom non-occurrence given disease absence
    pFF = 1 - pTF

    return [pFF, pTF, pFT, pTT]



In [34]:
# Define CPTs for symptom nodes
CPTs_symptoms = []
#i0, symptom in enumerate(list(set(symptoms2)))
for symptom in symptoms_list:
    # get all parent nodes
    parents = list(model.predecessors(symptom))
    print(len(parents), end=' ')

    # collect 2x2 CPTs for each parent
    little_cpts = []

    for disease in parents:
        # occurrence of the selected disease
        disease_occurrence = grabFromDF(data1, disease)
        # occurrence of the selected symptom
        symptom_occurrence = grabFromDF(data2, symptom)
        # occurrence of interaction
        interaction_occurrence = grabFromDF2(data3, symptom, disease)
        
        little_cpt = CPT2x2(disease_occurrence, symptom_occurrence, 
                            interaction_occurrence, total_disease_occurrences, 
                            total_symptom_occurrences, total_interaction_occurrences)
        # add 2x2-CPT to list of 2x2-CPTs
        little_cpts += [little_cpt]

    # For the purpose of this exercise, we are going to assume that the 
    # occurrence of one disease is always independent of the occurrence of 
    # another disease (i.e., no comorbidities, naive Bayes).
    rowT = [] # row of probabilities where symptom == True
    for bool_combo in itertools.product([0, 1], repeat=len(parents)):
        cond_probs = [little_cpts[i][2+b] for i, b in enumerate(bool_combo)]
        rowT += [np.prod(cond_probs)]
    
    rowF = [1-val for val in rowT] # row of probs where symptom == False

    cpt = TabularCPD(variable=symptom, variable_card=2, values=[rowF, rowT], 
                     evidence=parents, evidence_card=[2 for _ in parents])
    CPTs_symptoms += [cpt]
    
    
#for cpd in little_cpts:
  #  model.add_cpds(cpd)
    
for cpd in CPTs_symptoms:
    model.add_cpds(cpd)
    

2 2 12 12 12 12 12 12 12 12 12 12 12 12 4 4 4 4 1 1 3 3 3 1 12 12 12 12 12 12 12 12 12 12 12 12 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 3 3 3 1 1 1 1 2 2 1 2 2 1 5 5 5 5 5 1 6 6 6 6 6 6 1 1 1 3 3 3 1 1 1 1 2 2 1 4 4 4 4 1 1 1 1 12 12 12 12 12 12 12 12 12 12 12 12 2 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 3 3 3 1 1 2 2 2 2 1 1 4 4 4 4 1 1 1 3 3 3 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 4 4 4 4 2 2 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 4 4 4 4 7 7 7 7 7 7 7 1 6 6 6 6 6 6 1 1 2 2 1 5 5 5 5 5 1 1 1 1 1 1 4 4 4 4 1 1 2 2 4 4 4 4 1 1 1 1 2 



2 1 1 1 1 1 1 1 1 1 1 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 1 1 2 2 2 2 

# Validating the model

In [35]:
model.check_model()

True

# Inference and the user interface

In [46]:
def main_interface():
    
    maximum_list = {}
    
    
    print("Welcome to the Disease Prediction Bot!")
    print("Please enter the symptoms you are experiencing, separated by commas.")
    user_input = input("Symptoms: ")
    
    
    observed_symptoms = [symptom.capitalize() + "(symptom)" for symptom in user_input.split(",")]
    observed_evidence = {}

    for symp in observed_symptoms:
        observed_evidence[symp] = 1
    
    #do inference
    inference = VariableElimination(model)

    for disease in disease_nodes:
        
        prob_dis = inference.query(variables=[disease], evidence=observed_evidence)
        
        
        maximum_list[disease] = prob_dis.values[1]


    #Top 5 possible disease
    most_likely = sorted(maximum_list.items(), key=lambda x: x[1], reverse=True)[:5]
    
    print(f"\nThe 5 most likely diseases based on your symptoms are:\n")
    for tuple in most_likely:
        print(f"{tuple[0]} with probability: {round(tuple[1]* 100, 2)} percent. \n")


main_interface()


Welcome to the Disease Prediction Bot!
Please enter the symptoms you are experiencing, separated by commas.
Symptoms: fever

The 5 most likely diseases based on your symptoms are:

Hypertension(disease) with probability: 1.05 percent. 

Coronary Artery Disease(disease) with probability: 0.81 percent. 

Myocardial Infarction(disease) with probability: 0.74 percent. 

Coronary Disease(disease) with probability: 0.63 percent. 

Asthma(disease) with probability: 0.62 percent. 

