Importing the necessary libraries.

In [1]:
import pandas as pd
import numpy as np

Loading the one-hot encoded dataset.

In [20]:
# Location of the preprocessed table, relative to the notebook's working directory.
DATASET_PATH = 'dataset/processed_data.csv'

# Each row is one disease entry: a 'Disease' label followed by 0/1 symptom columns.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,diabetes,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,depression mental,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,depressive disorder,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,coronary arteriosclerosis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,ileus,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
144,adhesion,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
145,delusion,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
146,affect labile,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Defining the official list of symptoms from column names.

In [21]:
# Every column except the 'Disease' label is treated as a symptom name.
valid_symptoms = list(df.columns)
del valid_symptoms[valid_symptoms.index('Disease')]
valid_symptoms

["Heberden's node",
 "Murphy's sign",
 "Stahli's line",
 'abdomen acute',
 'abdominal bloating',
 'abdominal tenderness',
 'abnormal sensation',
 'abnormally hard consistency',
 'abortion',
 'abscess bacterial',
 'absences finding',
 'achalasia',
 'ache',
 'adverse effect',
 'adverse reaction',
 'agitation',
 'air fluid level',
 'alcohol binge episode',
 'alcoholic withdrawal symptoms',
 'ambidexterity',
 'angina pectoris',
 'anorexia',
 'anosmia',
 'aphagia',
 'apyrexial',
 'arthralgia',
 'ascites',
 'asterixis',
 'asthenia',
 'asymptomatic',
 'ataxia',
 'atypia',
 'aura',
 'awakening early',
 'barking cough',
 'bedridden',
 'behavior hyperactive',
 'behavior showing increased motor activity',
 'blackout',
 'blanch',
 'bleeding of vagina',
 'bowel sounds decreased',
 'bradycardia',
 'bradykinesia',
 'breakthrough pain',
 'breath sounds decreased',
 'breath-holding spell',
 'breech presentation',
 'bruit',
 'burning sensation',
 'cachexia',
 'cardiomegaly',
 'cardiovascular event',
 'c

Getting the list of diseases from each entry.

In [27]:
# df['Disease'] is already a Series, so it can be converted to a list directly.
diseases = df['Disease'].tolist()
diseases

['hypertensive disease',
 'diabetes',
 'depression mental',
 'depressive disorder',
 'coronary arteriosclerosis',
 'coronary heart disease',
 'pneumonia',
 'failure heart congestive',
 'accident cerebrovascular',
 'asthma',
 'myocardial infarction',
 'hypercholesterolemia',
 'infection',
 'infection urinary tract',
 'anemia',
 'chronic obstructive airway disease',
 'dementia',
 'insufficiency renal',
 'confusion',
 'degenerative polyarthritis',
 'hypothyroidism',
 'anxiety state',
 'malignant neoplasms',
 'primary malignant neoplasm',
 'acquired immuno-deficiency syndrome',
 'HIV',
 'hiv infections',
 'cellulitis',
 'gastroesophageal reflux disease',
 'septicemia',
 'systemic infection',
 'sepsis (invertebrate)',
 'deep vein thrombosis',
 'dehydration',
 'neoplasm',
 'embolism pulmonary',
 'epilepsy',
 'cardiomyopathy',
 'chronic kidney failure',
 'carcinoma',
 'hepatitis C',
 'peripheral vascular disease',
 'psychotic disorder',
 'hyperlipidemia',
 'bipolar disorder',
 'obesity',
 'is

Defining a function that ranks every disease by the number of symptoms it shares with a given symptom list.

In [103]:
def predict(symptoms, data=None, known_symptoms=None):
  """Rank diseases by how many of the given symptoms each one lists.

  The input frame is never modified: the match counts are computed into a
  separate result frame, so repeated calls (and re-running earlier cells)
  always see the original columns.

  Args:
    symptoms: Iterable of symptom names (a single string is treated as one
      symptom). Unrecognised names are ignored and duplicates count once.
    data: One-hot encoded DataFrame with a 'Disease' column and one 0/1
      column per symptom. Defaults to the notebook-level `df`.
    known_symptoms: Collection of accepted symptom names. Defaults to the
      notebook-level `valid_symptoms` when `data` is omitted, otherwise to
      every column of `data` except 'Disease'.

  Returns:
    A list of {'Disease': ..., 'Matching Symptoms': ...} records, one per
    row of the frame, sorted by descending match count. Rows with equal
    counts keep their original relative order.
  """
  use_notebook_frame = data is None
  source_df = df if use_notebook_frame else data

  if known_symptoms is None:
    if use_notebook_frame:
      known_symptoms = valid_symptoms
    else:
      known_symptoms = [col for col in source_df.columns if col != 'Disease']
  accepted = set(known_symptoms)

  # A bare string would otherwise be iterated character by character.
  if isinstance(symptoms, str):
    symptoms = [symptoms]

  # dict.fromkeys drops duplicates while preserving order, so a symptom that
  # is passed twice is not selected (and therefore counted) twice.
  matched = [
      name for name in dict.fromkeys(symptoms)
      if name in accepted and name in source_df.columns
  ]

  # Row-wise sum over the selected 0/1 columns = number of matching symptoms.
  match_counts = source_df[matched].sum(axis=1)

  # Build a fresh two-column frame instead of adding a column to the input.
  ranked = pd.DataFrame({
      'Disease': source_df['Disease'],
      'Matching Symptoms': match_counts,
  }).sort_values('Matching Symptoms', ascending=False, kind='mergesort')  # stable sort keeps tie order deterministic

  return ranked.to_dict('records')

Checking the prediction.

In [107]:
# Known disease together with the symptoms it should be recognised from.
# NOTE(review): "jugular", "venous" and "distention" look like a single symptom
# split into three tokens; predict() silently ignores any name that is not in
# valid_symptoms -- confirm these are the intended entries.
sample = {
    "disease": "failure heart congestive",
    "symptoms": [
        "cough", "dyspnea", "jugular", "venous", "distention",
        "orthopnea", "rale", "shortness of breath", "wheezing",
    ],
}

symptoms = sample["symptoms"]
prediction = predict(symptoms)

# The check passes when the top-ranked disease is the expected one.
print("Correct" if sample["disease"] == prediction[0]["Disease"] else "Wrong")


Correct
