In [None]:
# Homework 2 Part 2 (due 7/07/2024)

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease.

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.

### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6:
Create a minimal interface in which your bot asks a users for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you have used in last week's homework.)

In [1]:
#import libraries
!pip install pgmpy
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from collections import defaultdict
import networkx as nx


#from google.colab import drive
#drive.mount('/content/drive')

#import dataset
path1 = '/content/drive/My Drive/data1.txt'
path2 = '/content/drive/My Drive/data2.txt'
path3 = '/content/drive/My Drive/data3.txt'

# Load the datasets
data1 = pd.read_csv(path1, delimiter='\t')
data2 = pd.read_csv(path2, delimiter='\t')
data3 = pd.read_csv(path3, delimiter='\t')


Collecting pgmpy
  Downloading pgmpy-0.1.25-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->pgmpy)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->pgmpy)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->pgmpy)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->pgmpy)
  Usi

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/data1.txt'

In [None]:
import pandas as pd

# Function to clean data1
def clean_data1(df):
    if 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        # Keep rows with 'PubMed occurrence' > 300
        df = df[df['PubMed occurrence'] > 300]
    return df

# Function to clean data2
def clean_data2(df):
    if 'MeSH Symptom Term' in df.columns and 'PubMed occurrence' in df.columns:
        # Keep rows with 'PubMed occurrence' > 300
        df = df[df['PubMed occurrence'] > 300]
    return df

def clean_data3(df):
    # Check if necessary columns exist before processing
    if 'MeSH Symptom Term' in df.columns and 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        # Drop rows with missing values in these key columns
        df = df.dropna(subset=['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence'])

        # Convert 'PubMed occurrence' to numeric and filter for occurrences >= 300
        df['PubMed occurrence'] = pd.to_numeric(df['PubMed occurrence'], errors='coerce')
        df = df[df['PubMed occurrence'] >= 300]

    return df

# Apply cleaning functions
data1 = clean_data1(data1)
data2 = clean_data2(data2)
data3 = clean_data3(data3)

# Display the cleaned data to verify
print(data1.head())
print(data2.head())
print(data3.head())


In [None]:
import pandas as pd
import networkx as nx
from pgmpy.models import BayesianNetwork

# Function to clean data1
def clean_data1(df):
    if 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df[df['PubMed occurrence'] > 300]
    return df

# Function to clean data2
def clean_data2(df):
    if 'MeSH Symptom Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df[df['PubMed occurrence'] > 300]
    return df

def clean_data3(df):
    if 'MeSH Symptom Term' in df.columns and 'MeSH Disease Term' in df.columns and 'PubMed occurrence' in df.columns:
        df = df.dropna(subset=['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence'])
        df['PubMed occurrence'] = pd.to_numeric(df['PubMed occurrence'], errors='coerce')
        df = df[df['PubMed occurrence'] >= 300]
    return df

# Load datasets
data1 = pd.read_csv('/content/drive/My Drive/data1.txt', delimiter='\t')
data2 = pd.read_csv('/content/drive/My Drive/data2.txt', delimiter='\t')
data3 = pd.read_csv('/content/drive/My Drive/data3.txt', delimiter='\t')

# Apply cleaning functions
data1 = clean_data1(data1)
data2 = clean_data2(data2)
data3 = clean_data3(data3)

# Display the cleaned data to verify
print(data1.head())
print(data2.head())
print(data3.head())

# Define the Bayesian Network structure
model = BayesianNetwork()

# Add nodes (filtered diseases and symptoms)
diseases = data1['MeSH Disease Term'].unique()
symptoms = data2['MeSH Symptom Term'].unique()

model.add_nodes_from(diseases)
model.add_nodes_from(symptoms)

# Function to check if there is a path from start to end
def is_reachable(graph, start, end):
    try:
        return nx.has_path(graph, start, end)
    except nx.NetworkXError:
        return False

# Add edges from diseases to symptoms, avoiding self-loops and ensuring both nodes exist in the model
existing_edges = set()
for _, row in data3.iterrows():
    disease = row['MeSH Disease Term']
    symptom = row['MeSH Symptom Term']

    # Ensure both nodes exist in the model
    if disease not in model.nodes() or symptom not in model.nodes():
        continue

    # Avoid self-loops
    if disease == symptom:
        continue

    # Check if adding this edge would form a loop
    if disease != symptom and (symptom, disease) not in existing_edges:
        # Check for existing path from symptom to disease to avoid cycles
        if not is_reachable(model, symptom, disease):
            model.add_edge(disease, symptom)
            existing_edges.add((disease, symptom))

# Print the nodes and edges in the Bayesian Network
print("Nodes in the Bayesian Network:")
print(model.nodes())

print("Edges in the Bayesian Network:")
print(model.edges())


         MeSH Disease Term  PubMed occurrence
0         Breast Neoplasms             122226
1             Hypertension             107294
2  Coronary Artery Disease              82819
3           Lung Neoplasms              78009
4    Myocardial Infarction              75945
    MeSH Symptom Term  PubMed occurrence
0         Body Weight             147857
1                Pain             103168
2             Obesity             100301
3              Anoxia              47351
4  Mental Retardation              43883
     MeSH Symptom Term     MeSH Disease Term  PubMed occurrence  TFIDF score
668              Fever  Bacterial Infections                651   402.928399
837              Fever             Infection                397   245.718240
869              Fever                Sepsis                326   201.773668
2420             Fever           Neutropenia               1236   765.006914
4996       Body Weight      Breast Neoplasms                373   127.934467
Nodes in the Bay

In [None]:
merged_data = pd.merge(data3, data1[['MeSH Disease Term', 'PubMed occurrence']],
                       left_on='MeSH Disease Term', right_on='MeSH Disease Term', suffixes=('', '_Disease'))

# Merge with data2 to get the symptom occurrence
merged_data = pd.merge(merged_data, data2[['MeSH Symptom Term', 'PubMed occurrence']],
                       left_on='MeSH Symptom Term', right_on='MeSH Symptom Term', suffixes=('', '_Symptom'))

# Calculate the conditional probability P(Symptom|Disease)
merged_data['P(Symptom|Disease)'] = merged_data['PubMed occurrence'] / merged_data['PubMed occurrence_Disease']

# Display the results
print(merged_data[['MeSH Disease Term', 'MeSH Symptom Term', 'P(Symptom|Disease)']])


               MeSH Disease Term            MeSH Symptom Term  \
0           Bacterial Infections                        Fever   
1                      Infection                        Fever   
2                         Sepsis                        Fever   
3                    Neutropenia                        Fever   
4           Bacterial Infections                     Diarrhea   
..                           ...                          ...   
528                     Oliguria                     Oliguria   
529                     Polyuria                     Polyuria   
530               Hemoglobinuria               Hemoglobinuria   
531  Urinary Bladder, Overactive  Urinary Bladder, Overactive   
532            Ovarian Neoplasms                     Virilism   

     P(Symptom|Disease)  
0              0.023215  
1              0.030087  
2              0.015753  
3              0.187956  
4              0.015726  
..                  ...  
528            1.000000  
529        

In [None]:
# Function to compute the conditional probability for a specific pair
def compute_conditional_probability(disease, symptom, df):
    subset = df[(df['MeSH Disease Term'] == disease) & (df['MeSH Symptom Term'] == symptom)]
    if not subset.empty:
        cooccurrence = subset['PubMed occurrence'].values[0]
        disease_occurrence = subset['PubMed occurrence_Disease'].values[0]
        return cooccurrence / disease_occurrence
    else:
        return None

# Compute and print the conditional probabilities for all disease-symptom pairs
for disease in diseases:
    for symptom in symptoms:
        conditional_prob = compute_conditional_probability(disease, symptom, merged_data)
        if conditional_prob is not None:
            print(f"The conditional probability P({symptom}|{disease}) is {conditional_prob:.2f}")

The conditional probability P(Body Weight|Breast Neoplasms) is 0.00
The conditional probability P(Pain|Breast Neoplasms) is 0.00
The conditional probability P(Obesity|Breast Neoplasms) is 0.00
The conditional probability P(Fatigue|Breast Neoplasms) is 0.00
The conditional probability P(Nausea|Breast Neoplasms) is 0.00
The conditional probability P(Body Weight|Hypertension) is 0.03
The conditional probability P(Obesity|Hypertension) is 0.03
The conditional probability P(Angina Pectoris|Hypertension) is 0.00
The conditional probability P(Birth Weight|Hypertension) is 0.00
The conditional probability P(Weight Loss|Hypertension) is 0.00
The conditional probability P(Proteinuria|Hypertension) is 0.01
The conditional probability P(Albuminuria|Hypertension) is 0.01
The conditional probability P(Body Weight|Coronary Artery Disease) is 0.01
The conditional probability P(Pain|Coronary Artery Disease) is 0.00
The conditional probability P(Obesity|Coronary Artery Disease) is 0.01
The conditional p

In [None]:
# Function to validate the probabilities
def validate_probabilities(df):
    invalid_probs = df[(df['P(Symptom|Disease)'] < 0) | (df['P(Symptom|Disease)'] > 1)]
    if not invalid_probs.empty:
        print("Invalid probabilities found:")
        print(invalid_probs)
    else:
        print("All probabilities are within the valid range [0, 1].")

    for disease in df['MeSH Disease Term'].unique():
        total_prob = df[df['MeSH Disease Term'] == disease]['P(Symptom|Disease)'].sum()
        if total_prob > 1:
            print(f"Warning: Sum of probabilities for disease {disease} exceeds 1. Total: {total_prob}")

# Compute and print the conditional probabilities for all disease-symptom pairs
results = []
for disease in diseases:
    for symptom in symptoms:
        conditional_prob = compute_conditional_probability(disease, symptom, merged_data)
        if conditional_prob is not None:
            results.append({'MeSH Disease Term': disease, 'MeSH Symptom Term': symptom, 'P(Symptom|Disease)': conditional_prob})

results_df = pd.DataFrame(results)

# Validate the probabilities
validate_probabilities(results_df)

# Print the valid results
print(results_df)

All probabilities are within the valid range [0, 1].
  MeSH Disease Term MeSH Symptom Term  P(Symptom|Disease)
0           Obesity           Obesity            0.994512


In [None]:
import pandas as pd

# Assuming the previous data cleaning and merging steps are done and we have `merged_data` ready

# Function to compute the conditional probability for a specific pair
def compute_conditional_probability(disease, symptom, df):
    subset = df[(df['MeSH Disease Term'] == disease) & (df['MeSH Symptom Term'] == symptom)]
    if not subset.empty:
        cooccurrence = subset['PubMed occurrence'].values[0]
        disease_occurrence = subset['PubMed occurrence_Disease'].values[0]
        return cooccurrence / disease_occurrence
    else:
        return None

# Function to calculate the likelihood of a disease given a list of symptoms
def calculate_disease_likelihood(symptoms, diseases, df):
    likelihoods = {disease: 1.0 for disease in diseases}
    for disease in diseases:
        for symptom in symptoms:
            prob = compute_conditional_probability(disease, symptom, df)
            if prob is not None:
                likelihoods[disease] *= prob
            else:
                likelihoods[disease] *= 0.01  # Small value for missing probabilities
    return likelihoods

# Function to get the most likely disease based on symptoms
def get_most_likely_disease(symptoms, diseases, df):
    likelihoods = calculate_disease_likelihood(symptoms, diseases, df)
    most_likely_disease = max(likelihoods, key=likelihoods.get)
    return most_likely_disease, likelihoods[most_likely_disease]

# Interface for user input
def main_interface():
    print("Welcome to the Disease Prediction Bot!")
    print("Please enter the symptoms you are experiencing, separated by commas.")
    user_input = input("Symptoms: ")
    observed_symptoms = [symptom.strip().lower() for symptom in user_input.split(",")]

    most_likely_disease, likelihood = get_most_likely_disease(observed_symptoms, diseases, merged_data)
    print(f"The most likely disease based on your symptoms is: {most_likely_disease}")
    print(f"Likelihood: {likelihood:.2f}")

# Get the list of diseases and symptoms
diseases = data1['MeSH Disease Term'].unique()
symptoms = data2['MeSH Symptom Term'].unique()

main_interface()


Welcome to the Disease Prediction Bot!
Please enter the symptoms you are experiencing, separated by commas.
Symptoms: pain
The most likely disease based on your symptoms is: Breast Neoplasms
Likelihood: 0.01


In [None]:
import pandas as pd
import networkx as nx
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

# Load the datasets
path1 = '/content/drive/My Drive/data1.txt'
path2 = '/content/drive/My Drive/data2.txt'
path3 = '/content/drive/My Drive/data3.txt'

data1 = pd.read_csv(path1, delimiter='\t')
data2 = pd.read_csv(path2, delimiter='\t')
data3 = pd.read_csv(path3, delimiter='\t')

# Constructing the Bayesian Network
directed_graph = nx.DiGraph()

# Collect potential edges
potential_edges = []

for index, row in data3.iterrows():
    disease = row['MeSH Disease Term'].strip().lower()
    symptom = row['MeSH Symptom Term'].strip().lower()
    if disease != symptom:
        potential_edges.append((disease, symptom))

# Add edges and check for DAG
for edge in potential_edges:
    directed_graph.add_edge(*edge)
    if not nx.is_directed_acyclic_graph(directed_graph):
        directed_graph.remove_edge(*edge)

bn_model = BayesianNetwork(directed_graph.edges())

# Adding CPDs to the Bayesian Network based on its structure
for node in bn_model.nodes():
    parents = list(directed_graph.predecessors(node))
    variable_card = 2  # Assuming binary variables

    if parents:
        # Setup a basic CPD for nodes with parents
        evidence_card = [2] * len(parents)
        values = [[0.8, 0.2], [0.2, 0.8]]  # Simplified example
        cpd = TabularCPD(variable=node, variable_card=variable_card, values=values,
                         evidence=parents, evidence_card=evidence_card)
    else:
        # Setup a default CPD for nodes without parents
        cpd = TabularCPD(variable=node, variable_card=variable_card, values=[[0.5], [0.5]])

    bn_model.add_cpds(cpd)

# Check the model's validity after all CPDs have been added
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())


ValueError: values must be of shape (2, 0). Got shape: (2, 2)

In [None]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
import networkx as nx

# Assuming the data has been loaded into data1, data2, and data3 DataFrames
# Loading the datasets
path1 = '/content/drive/My Drive/data1.txt'
path2 = '/content/drive/My Drive/data2.txt'
path3 = '/content/drive/My Drive/data3.txt'

data1 = pd.read_csv(path1, delimiter='\t')
data2 = pd.read_csv(path2, delimiter='\t')
data3 = pd.read_csv(path3, delimiter='\t')

# Constructing the Bayesian Network
directed_graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['MeSH Disease Term'].strip().lower()
    symptom = row['MeSH Symptom Term'].strip().lower()
    if disease != symptom:
        if not directed_graph.has_edge(disease, symptom):
            directed_graph.add_edge(disease, symptom)
            if not nx.is_directed_acyclic_graph(directed_graph):
                directed_graph.remove_edge(disease, symptom)

bn_model = BayesianNetwork(directed_graph.edges())

# Adding CPDs to the Bayesian Network based on its structure
for node in bn_model.nodes():
    parents = list(directed_graph.predecessors(node))
    variable_card = 2  # Assuming binary variables

    if parents:
        # Setup a basic CPD for nodes with parents
        evidence_card = [2] * len(parents)
        values = [[0.8, 0.2], [0.2, 0.8]]  # Assuming binary states and one parent for simplicity
        cpd = TabularCPD(variable=node, variable_card=variable_card, values=values,
                         evidence=parents, evidence_card=evidence_card)
    else:
        # Setup a default CPD for nodes without parents
        cpd = TabularCPD(variable=node, variable_card=variable_card, values=[[0.5], [0.5]])

    bn_model.add_cpds(cpd)

# Check the model's validity after all CPDs have been added
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())


KeyboardInterrupt: 

In [None]:
network_structure = []
directed_graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['MeSH Disease Term'].strip().lower()
    symptom = row['MeSH Symptom Term'].strip().lower()
    if disease != symptom:
        if not directed_graph.has_edge(disease, symptom):
            directed_graph.add_edge(disease, symptom)
            if not nx.is_directed_acyclic_graph(directed_graph):
                directed_graph.remove_edge(disease, symptom)

bn_model = BayesianNetwork(directed_graph.edges())

# Incrementally add CPDs and check the model
for node in bn_model.nodes():
    parents = list(directed_graph.predecessors(node))
    if parents:
        # Simplified CPD for demonstration: Binary states, reduced complexity
        values = [[0.8, 0.2], [0.2, 0.8]]  # Assuming only one parent for simplicity
        cpd = TabularCPD(variable=node, variable_card=2, values=values, evidence=parents, evidence_card=[2])
    else:
        # Nodes without parents: Assume equal probability for states
        cpd = TabularCPD(variable=node, variable_card=2, values=[[0.5], [0.5]])
    bn_model.add_cpds(cpd)

    # Check model incrementally to catch issues early
    if not bn_model.check_model():
        print(f"Problem found after adding CPD for {node}.")
        break

if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())


ValueError: No CPD associated with fever

In [None]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
import networkx as nx

# Assuming your data is already loaded into data3
# data3 = pd.read_csv('/path/to/your/data3.csv')  # Uncomment and set path as needed

# Constructing the Bayesian Network structure, ensuring no cycles
network_structure = []
directed_graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['MeSH Disease Term'].strip().lower()  # Corrected column name
    symptom = row['MeSH Symptom Term'].strip().lower()  # Corrected column name
    if disease != symptom:  # Ensure disease is not added as its own symptom
        directed_graph.add_edge(disease, symptom)
        if not nx.is_directed_acyclic_graph(directed_graph):
            directed_graph.remove_edge(disease, symptom)

bn_model = BayesianNetwork(directed_graph.edges())

# Adding CPDs to the Bayesian Network based on its structure
for node in bn_model.nodes():
    # Check if the node is a symptom and has diseases as parents
    parents = list(directed_graph.predecessors(node))
    if parents:  # Node is a symptom and has disease parents
        cpd_values = [[0.95 if i % 2 == 0 else 0.05 for i in range(2**len(parents))],
                      [0.05 if i % 2 == 0 else 0.95 for i in range(2**len(parents))]]
        cpd = TabularCPD(variable=node, variable_card=2, values=cpd_values,
                         evidence=parents, evidence_card=[2] * len(parents))
    else:  # Node is a disease or no parents, assign a default prior
        cpd = TabularCPD(variable=node, variable_card=2, values=[[0.5], [0.5]])

    bn_model.add_cpds(cpd)

# Validate the model
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())


In [None]:
# Constructing the Bayesian Network structure, ensuring no cycles
network_structure = []
directed_graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['MESH Disease Term'].strip().lower()
    symptom = row['PubMed occurrence'].strip().lower()
    if disease != symptom:
        directed_graph.add_edge(disease, symptom)
        if not nx.is_directed_acyclic_graph(directed_graph):
            directed_graph.remove_edge(disease, symptom)

bn_model = BayesianNetwork(directed_graph.edges())

# Adding CPDs to the Bayesian Network based on its structure
for node in bn_model.nodes():
    # Check if the node is a symptom and has diseases as parents
    parents = list(directed_graph.predecessors(node))
    if parents:  # Node is a symptom and has disease parents
        cpd_values = [[0.95, 0.05] + [0.95, 0.05] * (2 ** len(parents) - 1),  # Generic example values, adjust as needed
                      [0.05, 0.95] + [0.05, 0.95] * (2 ** len(parents) - 1)]
        cpd = TabularCPD(variable=node, variable_card=2, values=cpd_values,
                         evidence=parents, evidence_card=[2] * len(parents))
    else:  # Node is a disease or no parents, assign a default prior
        cpd = TabularCPD(variable=node, variable_card=2, values=[[0.99], [0.01]])

    bn_model.add_cpds(cpd)

# Validate the model
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())

KeyError: 'MESH Disease Term'

In [None]:
bn_structure = []
graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['Disease'].strip().lower()
    symptom = row['Symptom'].strip().lower()
    if disease != symptom:
        # Add edge and check for cycles
        graph.add_edge(disease, symptom)
        if not nx.is_directed_acyclic_graph(graph):
            graph.remove_edge(disease, symptom)

bn_model = BayesianNetwork(graph.edges())

# Adding CPDs
for node in graph.nodes():
    # Find parents of the node within the graph, which indicates dependencies
    parents = list(graph.predecessors(node))
    if not parents:
        # Assuming no parents means it's a root node (disease)
        cpd = TabularCPD(variable=node, variable_card=2, values=[[0.5], [0.5]])
    else:
        # Create a CPD for a symptom with its diseases as evidence
        evidence_card = [2] * len(parents)
        cpd = TabularCPD(variable=node, variable_card=2,
                         values=[[0.95, 0.2], [0.05, 0.8]],
                         evidence=parents, evidence_card=evidence_card)
    bn_model.add_cpds(cpd)

# Validate the model
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition or CPDs.")



ValueError: values must be of shape (2, 16). Got shape: (2, 2)

In [None]:
network_structure = []
directed_graph = nx.DiGraph()

for index, row in data3.iterrows():
    disease = row['Disease'].strip().lower()
    symptom = row['Symptom'].strip().lower()
    if disease != symptom:
        # Temporarily add edge to check for cycles
        directed_graph.add_edge(disease, symptom)
        if nx.is_directed_acyclic_graph(directed_graph):
            network_structure.append((disease, symptom))
        else:
            # If a cycle is created, remove the edge
            directed_graph.remove_edge(disease, symptom)

# Create the Bayesian Network with the filtered structure
bn_model = BayesianNetwork(network_structure)

# Check nodes and edges to confirm no cycles
print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())

# Assuming filtered_structure is properly set as before and bn_model is created
for disease, symptom in filtered_structure:
    cpd_disease = TabularCPD(variable=disease, variable_card=2, values=[[0.99], [0.01]])
    cpd_symptom_given_disease = TabularCPD(variable=symptom, variable_card=2,
                                           values=[[0.95, 0.2],  # P(Symptom=No | Disease=No), P(Symptom=No | Disease=Yes)
                                                   [0.05, 0.8]], # P(Symptom=Yes | Disease=No), P(Symptom=Yes | Disease=Yes)
                                           evidence=[disease], evidence_card=[2])
    bn_model.add_cpds(cpd_disease, cpd_symptom_given_disease)

# Ensure all nodes have CPDs
for node in bn_model.nodes():
    if not bn_model.get_cpds(node):
        # Assign a default CPD (uniform distribution as an example)
        uniform_cpd = TabularCPD(variable=node, variable_card=2, values=[[0.5], [0.5]])
        bn_model.add_cpds(uniform_cpd)

# Check if the model is valid
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem with the network definition.")

# Validate CPDs
if bn_model.check_model():
    print("CPDs added successfully and the model is valid.")
else:
    print("There was a problem adding CPDs or the model is not valid.")


KeyError: 'Disease'

In [None]:
network_structure = [(row['Disease'].strip().lower(), row['Symptom'].strip().lower())
                     for index, row in data3.iterrows()
                     if row['Disease'].strip().lower() != row['Symptom'].strip().lower()]

# Debug: Check what's in network_structure before filtering
print("Network Structure Pre-filter: ", network_structure)

# Create a directed graph to check for cycles
directed_graph = nx.DiGraph()
directed_graph.add_edges_from(network_structure)

# Check for cycles in the graph
try:
    cycle = nx.find_cycle(directed_graph, orientation='original')
    print("Cycle Found: ", cycle)
except nx.NetworkXNoCycle:
    print("No cycles found.")

# Since no cycles, let's proceed with the original structure
bn_model = BayesianNetwork(network_structure)

# Check nodes and edges
print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())

KeyError: 'Disease'

In [None]:
network_structure = [(row['Disease'].strip().lower(), row['Symptom'].strip().lower())
                     for index, row in data3.iterrows()
                     if row['Disease'].strip().lower() != row['Symptom'].strip().lower()]

def would_form_cycle(edges, test_edge):
    directed_graph = nx.DiGraph()
    directed_graph.add_edges_from(edges)
    directed_graph.add_edge(*test_edge)
    try:
        nx.find_cycle(directed_graph, orientation='original')
        return True
    except nx.NetworkXNoCycle:
        return False

# Filter out edges that would form cycles
filtered_structure = [edge for edge in network_structure if not would_form_cycle(network_structure, edge)]

# Create the Bayesian Network with the filtered structure
bn_model = BayesianNetwork(filtered_structure)

# Adding CPDs
for disease, symptom in filtered_structure:
    cpd_disease = TabularCPD(variable=disease, variable_card=2, values=[[0.99], [0.01]])
    cpd_symptom_given_disease = TabularCPD(variable=symptom, variable_card=2,
                                           values=[[0.95, 0.2], [0.05, 0.8]],
                                           evidence=[disease], evidence_card=[2])
    bn_model.add_cpds(cpd_disease, cpd_symptom_given_disease)

# Validate the model
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("There was a problem adding CPDs or the model is not valid.")

# Display nodes and edges
print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())

KeyError: 'Disease'

In [None]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
import networkx as nx

# Assuming data3 is loaded and contains 'Disease' and 'Symptom' columns correctly set
network_structure = [(row['Disease'].strip().lower(), row['Symptom'].strip().lower())
                     for index, row in data3.iterrows()
                     if row['Disease'].strip().lower() != row['Symptom'].strip().lower()]

def would_form_cycle(edges, test_edge):
    directed_graph = nx.DiGraph()
    directed_graph.add_edges_from(edges)
    directed_graph.add_edge(*test_edge)
    try:
        nx.find_cycle(directed_graph, orientation='original')
        return True
    except nx.NetworkXNoCycle:
        return False

# Filter out edges that would form cycles
filtered_structure = [edge for edge in network_structure if not would_form_cycle(network_structure, edge)]

# Create the Bayesian Network with the filtered structure
bn_model = BayesianNetwork(filtered_structure)

# Adding CPDs based on simple probabilities (these should ideally be based on data)
for disease, symptom in filtered_structure:
    cpd_disease = TabularCPD(variable=disease, variable_card=2, values=[[0.99], [0.01]])
    cpd_symptom_given_disease = TabularCPD(variable=symptom, variable_card=2,
                                           values=[[0.95, 0.2],  # P(Symptom=No | Disease=No), P(Symptom=No | Disease=Yes)
                                                   [0.05, 0.8]], # P(Symptom=Yes | Disease=No), P(Symptom=Yes | Disease=Yes)
                                           evidence=[disease], evidence_card=[2])
    bn_model.add_cpds(cpd_disease, cpd_symptom_given_disease)

# Check if the model is valid after adding CPDs
if bn_model.check_model():
    print("CPDs added successfully and the model is valid.")
else:
    print("There was a problem adding CPDs or the model is not valid.")

print("Nodes in the Bayesian Network:", bn_model.nodes())
print("Edges in the Bayesian Network:", bn_model.edges())


CPDs added successfully and the model is valid.
Nodes in the Bayesian Network: []
Edges in the Bayesian Network: []


In [None]:
#Create network

import pandas as pd
from pgmpy.models import BayesianNetwork

#network_structure = []
#for index, row in data3.iterrows():
    #disease = row['Disease'].strip().lower()
    #symptom = row['Symptom'].strip().lower()
    #if disease != symptom:  # This check ensures no self-loops
        #network_structure.append((disease, symptom))

from pgmpy.models import BayesianNetwork
import networkx as nx

# Assuming network_structure contains all intended edges
network_structure = [(row['Disease'].strip().lower(), row['Symptom'].strip().lower()) for index, row in data3.iterrows() if row['Disease'].strip().lower() != row['Symptom'].strip().lower()]
def would_form_cycle(edges, test_edge):
    directed_graph = nx.DiGraph()
    directed_graph.add_edges_from(edges)
    directed_graph.add_edge(*test_edge)
    try:
        # Check for cycles in the graph
        cycle = nx.find_cycle(directed_graph, orientation='original')
        return True
    except nx.NetworkXNoCycle:
        return False

# Filter out edges that would form cycles
filtered_structure = [edge for edge in network_structure if not would_form_cycle(network_structure, edge)]

# Create the Bayesian Network
bn_model = BayesianNetwork(filtered_structure)

# Check if the model is valid
if bn_model.check_model():
    print("The Bayesian network is correctly defined and valid.")
else:
    print("The Bayesian network has errors.")

# Create the Bayesian Network
bn_model = BayesianNetwork(network_structure)

# Check if there are still errors
try:
    bn_model.check_model()
    print("The Bayesian network is correctly defined and valid.")
except Exception as e:
    print(f"Error in the Bayesian network: {str(e)}")

for disease, symptom in network_structure:
    cpd_disease = TabularCPD(variable=disease, variable_card=2, values=[[0.99], [0.01]])
    cpd_symptom_given_disease = TabularCPD(variable=symptom, variable_card=2,
                                           values=[[0.95, 0.2],  # P(Symptom=No | Disease=No), P(Symptom=No | Disease=Yes)
                                                   [0.05, 0.8]], # P(Symptom=Yes | Disease=No), P(Symptom=Yes | Disease=Yes)
                                           evidence=[disease], evidence_card=[2])

    bn_model.add_cpds(cpd_disease, cpd_symptom_given_disease)

# Check if the model is valid after adding CPDs
if bn_model.check_model():
    print("CPDs added successfully and the model is valid.")
else:
    print("There was a problem adding CPDs or the model is not valid.")


The Bayesian network is correctly defined and valid.


NetworkXError: Input is not a valid edge list

In [None]:
#handle the data

if 'MeSH Disease Term' in data1.columns:
    data1.drop(columns=['MeSH Disease Term'], inplace=True)

data1.rename(columns={'Occurences': 'Occurrences'}, inplace=True)  # Correcting spelling mistake

data1.drop(columns=['TFIDF score', 'PubMed occurrence'], inplace=True, errors='ignore')  # 'errors' parameter to avoid KeyError if column doesn't exist

if 'MeSH Disease Term' in data2.columns:
    data2.drop(columns=['MeSH Disease Term'], inplace=True)
data2.rename(columns={'Occurences': 'Occurrences'}, inplace=True)  # Correcting spelling mistake

data2.drop(columns=['TFIDF score', 'PubMed occurrence'], inplace=True, errors='ignore')  # 'errors' parameter to avoid KeyError if column doesn't exist

if 'MeSH Disease Term' in data3.columns:
    data3.drop(columns=['MeSH Disease Term'], inplace=True)

data3.rename(columns={'Occurences': 'Occurrences'}, inplace=True)  # Correcting spelling mistake

data3.drop(columns=['TFIDF score', 'PubMed occurrence'], inplace=True, errors='ignore')  # 'errors' parameter to avoid KeyError if column doesn't exist

In [None]:
#handle the data

all_data = pd.concat([data1, data2, data3], ignore_index=True)

print(all_data.head())
all_data['MeSH Disease Term'] = all_data['MeSH Disease Term'].str.strip().str.lower()
diseases_over_300 = all_data[all_data['PubMed occurrence'] > 300]

network_structure = set()
for index, row in diseases_over_300.iterrows():
    disease = row['MeSH Disease Term']
    symptom = row['PubMed occurrence']

    if pd.notna(disease) and pd.notna(symptom) and disease != symptom:
        network_structure.add((disease, symptom.lower().strip()))


cleaned_network_structure = set()

for disease, symptom in network_structure:
    if pd.notna(disease) and pd.notna(symptom) and disease != symptom:
        cleaned_network_structure.add((disease, symptom))

all_diseases_in_data = set(all_data['MeSH Disease Term'].unique())
diseases_in_network = set(disease for disease, symptom in cleaned_network_structure)

# Diseases in the network not in the data (missing CPDs potential)
missing_cpd_diseases = diseases_in_network - all_diseases_in_data
print(missing_cpd_diseases)



                   Disease  Occurrences MeSH Symptom Term
0         Breast Neoplasms     122226.0               NaN
1             Hypertension     107294.0               NaN
2  Coronary Artery Disease      82819.0               NaN
3           Lung Neoplasms      78009.0               NaN
4    Myocardial Infarction      75945.0               NaN


KeyError: 'MeSH Disease Term'

In [None]:
#filter only for the instances with over 300
diseases_over_300 = all_data[all_data['PubMed occurrence'] > 300]

network_structure = set()  # Use a set to automatically handle duplicates

# Iterate through diseases to associate symptoms
current_disease = None
for index, row in diseases_over_300.iterrows():
    if row['PubMed occurrence'] > 3000:
        current_disease = row['MeSH Disease Term']
    elif current_disease and pd.notna(row['MeSH Symptom Term']):
        symptom = row['MeSH Symptom Term']
        if current_disease != symptom:  # Ensure no self-loops
            # Add (Disease, Symptom) tuple to the network structure
            network_structure.add((current_disease, symptom))

# Convert the set back to a list to create the Bayesian Network
network_structure = list(network_structure)

# Create the Bayesian Network with the structure determined from data
model = BayesianNetwork(network_structure)

# Validate the model
if model.check_model():
    print("The Bayesian model is correctly defined.")
else:
    print("The model has errors.")




ValueError: No CPD associated with paraplegia

In [None]:
# Assuming you have a way to get occurrence data for each disease
disease_occurrences = diseases_over_300[diseases_over_300['PubMed occurrence'] > 3000].set_index('MeSH Disease Term')['PubMed occurrence']
total_occurrences = disease_occurrences.sum()

# Define priors for diseases
for disease in disease_occurrences.index:
    prior_probability = disease_occurrences[disease] / total_occurrences
    cpd_disease = TabularCPD(variable=disease, variable_card=2,
                             values=[[1 - prior_probability], [prior_probability]])
    model.add_cpds(cpd_disease)

# Example for defining symptom probabilities based on disease presence
for (disease, symptom) in network_structure:
    # You need data here to specify these probabilities
    cpd_symptom = TabularCPD(variable=symptom, variable_card=2,
                             values=[[0.95, 0.2],  # Probability of not having symptom given disease absence or presence
                                     [0.05, 0.8]], # Probability of having symptom given disease absence or presence
                             evidence=[disease], evidence_card=[2])
    model.add_cpds(cpd_symptom)

try:
    assert model.check_model()
    print("The Bayesian model is correctly defined.")
except ValueError as e:
    print(f"Model validation failed: {e}")


ValueError: ('CPD defined on variable not in the model', <TabularCPD representing P(Breast Neoplasms:2) at 0x78c5fc39b880>)

In [None]:
# Calculate total occurrences of diseases
total_disease_occurrences = diseases_over_300[diseases_over_300['PubMed occurrence'] > 3000]['PubMed occurrence'].sum()

# Initialize dictionary for priors
disease_priors = {}

# Calculate priors for each disease
for index, row in diseases_over_300.iterrows():
    if row['PubMed occurrence'] > 3000:
        disease = row['MeSH Disease Term']
        disease_priors[disease] = row['PubMed occurrence'] / total_disease_occurrences

# Convert priors to TabularCPD objects for pgmpy
from pgmpy.factors.discrete import TabularCPD
disease_cpds = []
for disease, prior in disease_priors.items():
    cpd = TabularCPD(variable=disease, variable_card=2, values=[[1-prior], [prior]])
    disease_cpds.append(cpd)
    model.add_cpds(cpd)


ValueError: ('CPD defined on variable not in the model', <TabularCPD representing P(Breast Neoplasms:2) at 0x78c5fc399f90>)