In [None]:
# Homework 2 Part 2 (due 7/07/2024)
# Paul Chirkov

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review 
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease. 

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.

### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6:
Create a minimal interface in which your bot asks a users for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you have used in last week's homework.)

In [120]:

import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Load the datasets
data1 = pd.read_csv('data1.txt', sep='\t')  # Contains diseases and their occurrences
data2 = pd.read_csv('data2.txt', sep='\t')  # Contains symptoms and their occurrences
data3 = pd.read_csv('data3.txt', sep='\t')  # Contains co-occurrences of diseases and symptoms


def clean_data1(df):
    if 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df[df['PubMed occurrence'] > 500]
    return df

# Function to clean data2
def clean_data2(df):
    if 'MeSH Symptom Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df[df['PubMed occurrence'] > 500]
    return df

def clean_data3(df):
    if 'MeSH Symptom Term' in df.columns and 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df.dropna(subset=['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence'])
        df['PubMed occurrence'] = pd.to_numeric(df['PubMed occurrence'], errors='coerce')
        df = df[df['PubMed occurrence'] >= 500]
    return df

data1 = clean_data1(data1)
data2 = clean_data2(data2)
data3 = clean_data3(data3)

# Display the datasets to understand their structure

unique_symptoms = set(data3["MeSH Symptom Term"].unique())
unique_diseases = set(data3["MeSH Disease Term"].unique())
data3.head()
#print(unique_diseases)






Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,PubMed occurrence,TFIDF score
668,Fever,Bacterial Infections,651,402.928399
2420,Fever,Neutropenia,1236,765.006914
5952,Body Weight,"Kidney Failure, Chronic",636,218.140271
6173,Body Weight,Hypertension,3054,1047.484885
6404,Body Weight,Diabetes Mellitus,1100,377.286632


In [86]:
data1.head()

Unnamed: 0,MeSH Disease Term,PubMed occurrence
0,Breast Neoplasms,122226
1,Hypertension,107294
2,Coronary Artery Disease,82819
3,Lung Neoplasms,78009
4,Myocardial Infarction,75945


In [87]:
data2.head()

Unnamed: 0,MeSH Symptom Term,PubMed occurrence
0,Body Weight,147857
1,Pain,103168
2,Obesity,100301
3,Anoxia,47351
4,Mental Retardation,43883


In [88]:
data3.head()

Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,PubMed occurrence,TFIDF score
668,Fever,Bacterial Infections,651,402.928399
2420,Fever,Neutropenia,1236,765.006914
5952,Body Weight,"Kidney Failure, Chronic",636,218.140271
6173,Body Weight,Hypertension,3054,1047.484885
6404,Body Weight,Diabetes Mellitus,1100,377.286632


# The Prior Probabilities

In [121]:
total_occurrences = data1['PubMed occurrence'].sum()
data1['Prior_Probability'] = data1['PubMed occurrence'] / total_occurrences

# Creating the Bayesian Net

In [162]:

# Looping throught the co-occurences and adding them to the bayesian net. 

structure = []
symptoms_list = []
for index, row in data3.iterrows():
    if(row['MeSH Symptom Term'] != row['MeSH Disease Term']):
        if row['MeSH Disease Term'] not in unique_symptoms:
            symptoms_list.append(row['MeSH Symptom Term'])
            
            #Adding edges from disease to symptom
            structure.append((row['MeSH Disease Term'], row['MeSH Symptom Term']))
        
        
structure.append(('Idiopathic', 'Fatigue'))
structure.append(('Idiopathic', 'Obesity'))


# Create the Bayesian Network
model = BayesianNetwork(structure)

# The Disease CPDS

In [163]:
# Create and add CPDs for diseases. ADD PRIOR PROBABILITES
for index, row in data1.iterrows():
    disease = row['MeSH Disease Term']
    
    #make sure that the disease is in the bayesian net.
    if(disease not in model.nodes()):
        continue
    
    prior_prob = row['Prior_Probability']
    
    cpd_disease = TabularCPD(variable=disease, variable_card=2,
                             values=[[1 - prior_prob], [prior_prob]])
    model.add_cpds(cpd_disease)
    

# Add an "Idiopathic" disease to account for unexplained symptoms
# Assuming low prior for idiopathic disease
cpd_idiopathic = TabularCPD(variable='Idiopathic', variable_card=2,
                            values=[[0.95], [0.05]])
model.add_cpds(cpd_idiopathic)


cpdFever = TabularCPD(variable='Fever', variable_card=2,
                            values=[[0.95], [0.05]])



In [182]:
# GIVEN YOU HAVE A DISEASE GET CPD OF SYMPTOM. ADD CONDITIONAL CPTS.
idiopathic_occurrences = {}

#symptoms = data3['MeSH Symptom Term'].unique()
unique_diseases = data3['MeSH Disease Term'].unique()

cpds=[]

for symptom in symptoms_list:

     # This will hold the probabilities for P(Symptom=0) and P(Symptom=1)
    dis_list = []
    
    for disease in unique_diseases:
        values = [[], []]
        co_occurrences = data3[(data3['MeSH Disease Term'] == disease) & (data3['MeSH Symptom Term'] == symptom)]['PubMed occurrence'].sum()
        total_disease_occurrences = data3[data3['MeSH Disease Term'] == disease]['PubMed occurrence'].sum()
        #print(total_disease_occurrences)
        
        # Calculate P(Symptom | Disease)
        if total_disease_occurrences > 0:
            prob_symptom_given_disease = co_occurrences / total_disease_occurrences
        else:
            prob_symptom_given_disease = 0.0
        
        values[0].append(1 - prob_symptom_given_disease)
        values[1].append(prob_symptom_given_disease)
        
        #Prior of not having symptom
        prior_symp_prob = data2.loc[data2['MeSH Symptom Term'] == symptom, 'PubMed occurrence'].values[0] / total_occurrences
        
        
        values[0].append(1 - prior_symp_prob)
        values[1].append(prior_symp_prob)
        
        cpd_symptom = TabularCPD(variable=symptom, evidence=[disease], variable_card=2, values=values, evidence_card=[2])
    

ValueError: Variable names cannot be same

In [159]:
model.check_model()

ValueError: No CPD associated with Fever

# Inference and the user interface

In [None]:
def main_interface():
    
    maximum_list = {}
    
    
    print("Welcome to the Disease Prediction Bot!")
    print("Please enter the symptoms you are experiencing, separated by commas.")
    user_input = input("Symptoms: ")
    
    
    observed_symptoms = [symptom.strip().lower() for symptom in user_input.split(",")]
    observed_evidence = {}

    for symp in observed_symptoms:
        observed_evidence[symp] = 1
    
    #Make dictionary with observed 
    inference = VariableElimination(model)

    for disease in unique_diseases:
        
        prob_dis = inference.query(variables=[disease], evidence=observed_evidence)
        maximum_list[disease] = prob_dis

    sorted_dict = dict(sorted(my_dict.items(), key=lambda item: item[1], reverse=True))

    top_5 = list(sorted_dict.items())[:5]
    
    most_likely_disease, likelihood = get_most_likely_disease(observed_symptoms, diseases, merged_data)
    print(f"The 5 most likely diseases based on your symptoms are: {top_5}")


main_interface()


Welcome to the Disease Prediction Bot!
Please enter the symptoms you are experiencing, separated by commas.
