In [4]:
import requests
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from collections import Counter
import pandas as pd
import pickle
import operator
from sklearn.model_selection import cross_val_score
from numpy import mean

# Utility Functions
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

def synonyms(term):
    """Fetch synonyms of the input term from WordNet."""
    synonyms = set()
    for syn in wordnet.synsets(term):
        synonyms.update(syn.lemma_names())
    return synonyms

# Load Data and Model
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")  # Disease combination
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv")  # Individual Disease

X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

with open('lr_model.pkl', 'rb') as file:
    lr = pickle.load(file)

dataset_symptoms = list(X.columns)

# User Input
def preprocess_symptoms(input_symptoms):
    """Preprocess user-input symptoms."""
    processed = []
    for sym in input_symptoms:
        sym = sym.strip().replace('-', ' ').replace("'", '')
        sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
        processed.append(sym)
    return processed

def expand_symptoms(symptoms):
    """Expand symptoms using synonyms."""
    expanded = []
    for user_sym in symptoms:
        user_sym_tokens = user_sym.split()
        expanded_set = set()
        for comb in range(1, len(user_sym_tokens) + 1):
            for subset in combinations(user_sym_tokens, comb):
                expanded_set.update(synonyms(' '.join(subset)))
        expanded_set.add(user_sym)
        expanded.append(' '.join(expanded_set).replace('_', ' '))
    return expanded

# Match Symptoms to Dataset
def match_symptoms(user_symptoms):
    """Find matching symptoms from the dataset."""
    found = set()
    for data_sym in dataset_symptoms:
        data_sym_tokens = data_sym.split()
        for user_sym in user_symptoms:
            match_count = sum(1 for token in data_sym_tokens if token in user_sym.split())
            if match_count / len(data_sym_tokens) > 0.5:
                found.add(data_sym)
    return list(found)

# Co-occurrence-Based Symptom Suggestion
def suggest_cooccurring_symptoms(selected_symptoms):
    """Suggest additional symptoms based on co-occurrence."""
    counter_list = []
    dis_list = set()
    for sym in selected_symptoms:
        dis_list.update(set(df_norm[df_norm[sym] == 1]['label_dis']))

    for dis in dis_list:
        row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()[0][1:]
        for idx, val in enumerate(row):
            if val != 0 and dataset_symptoms[idx] not in selected_symptoms:
                counter_list.append(dataset_symptoms[idx])

    dict_symp = dict(Counter(counter_list))
    return sorted(dict_symp.items(), key=operator.itemgetter(1), reverse=True)

# Predict Disease
def predict_disease(final_symptoms):
    """Predict diseases based on selected symptoms."""
    sample_x = [0] * len(dataset_symptoms)
    for sym in final_symptoms:
        sample_x[dataset_symptoms.index(sym)] = 1

    prediction = lr.predict_proba([sample_x])
    k = 10
    diseases = sorted(set(Y['label_dis']))
    topk = prediction[0].argsort()[-k:][::-1]
    
    topk_dict = {}
    for t in topk:
        match_sym = set()
        row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()[0][1:]
        for idx, val in enumerate(row):
            if val != 0:
                match_sym.add(dataset_symptoms[idx])
        prob = (len(match_sym.intersection(set(final_symptoms))) + 1) / (len(set(final_symptoms)) + 1)
        scores = cross_val_score(lr, X, Y, cv=2)
        prob *= mean(scores)
        topk_dict[t] = prob

    return {diseases[key]: round(topk_dict[key] * 100, 2) for key in sorted(topk_dict, key=topk_dict.get, reverse=True)}

# Main Workflow
user_symptoms = input("Please enter symptoms separated by commas:").lower().split(',')
processed_user_symptoms = preprocess_symptoms(user_symptoms)
expanded_user_symptoms = expand_symptoms(processed_user_symptoms)

print("\nMatching Symptoms...")
found_symptoms = match_symptoms(expanded_user_symptoms)
print("Top matching symptoms:")
for idx, symp in enumerate(found_symptoms):
    print(f"{idx}: {symp}")

selected_indices = input("\nSelect relevant symptoms (space-separated indices):").split()
final_symptoms = [found_symptoms[int(idx)] for idx in selected_indices]

cooccurring_symptoms = suggest_cooccurring_symptoms(final_symptoms)
for idx, (symp, count) in enumerate(cooccurring_symptoms[:10]):
    print(f"{idx}: {symp} ({count})")

predicted_diseases = predict_disease(final_symptoms)
print("\nPredicted Diseases:")
for disease, probability in predicted_diseases.items():
    print(f"{disease}: {probability}%")

Please enter symptoms separated by commas: Joint pain, Stiffness or reduced range of motion (how far you can move a joint), Swelling (inflammation),Skin discoloration,Tenderness or sensitivity to touch around a joint, A feeling of heat or warmth near your joints



Matching Symptoms...
Top matching symptoms:
0: lump bump neck
1: back
2: high body temperature
3: muscle joint pain
4: painful
5: neck
6: swelling
7: trouble sensation
8: painful swollen joint
9: joint bone pain
10: multiple painful joint
11: decreased range motion
12: redness
13: stiffness



Select relevant symptoms (space-separated indices): 3 4 9 10 11 13


0: fever (5)
1: headache (3)
2: maculopapular rash (2)
3: redness (1)
4: swelling (1)
5: bad breath (1)
6: bleeding gum (1)
7: loose teeth (1)
8: red (1)
9: swollen (1)

Predicted Diseases:
Arthritis: 50.32%
Osteoarthritis: 50.32%
Influenza: 25.16%
Bleeding Gums: 25.16%
Dengue: 25.16%
Zika virus disease: 25.16%
Rheumatic fever: 25.16%
Yaws: 25.16%
Impetigo: 25.16%
Chikungunya Fever: 25.16%


In [6]:
import requests
import warnings
from tqdm import tqdm
import time
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from collections import Counter
import pandas as pd
import pickle
import operator
from sklearn.model_selection import cross_val_score
from numpy import mean

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=DataConversionWarning)

# Utility Functions
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

def synonyms(term):
    """Fetch synonyms of the input term from WordNet."""
    synonyms = set()
    for syn in wordnet.synsets(term):
        synonyms.update(syn.lemma_names())
    return synonyms

# Load Data and Model
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")  # Disease combination
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv")  # Individual Disease

X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

with open('lr_model.pkl', 'rb') as file:
    lr = pickle.load(file)

dataset_symptoms = list(X.columns)

# User Input
def preprocess_symptoms(input_symptoms):
    """Preprocess user-input symptoms."""
    print("Processing symptoms...")
    time.sleep(1)  # Simulate processing delay
    processed = []
    for sym in tqdm(input_symptoms, desc="Preprocessing symptoms"):
        sym = sym.strip().replace('-', ' ').replace("'", '')
        sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
        processed.append(sym)
    return processed

def expand_symptoms(symptoms):
    """Expand symptoms using synonyms."""
    expanded = []
    print("Expanding symptoms...")
    time.sleep(1)  # Simulate processing delay
    for user_sym in tqdm(symptoms, desc="Expanding symptoms"):
        user_sym_tokens = user_sym.split()
        expanded_set = set()
        for comb in range(1, len(user_sym_tokens) + 1):
            for subset in combinations(user_sym_tokens, comb):
                expanded_set.update(synonyms(' '.join(subset)))
        expanded_set.add(user_sym)
        expanded.append(' '.join(expanded_set).replace('_', ' '))
    return expanded

# Match Symptoms to Dataset
def match_symptoms(user_symptoms):
    """Find matching symptoms from the dataset."""
    print("Matching symptoms from the dataset...")
    time.sleep(1)
    found = set()
    for data_sym in tqdm(dataset_symptoms, desc="Matching symptoms"):
        data_sym_tokens = data_sym.split()
        for user_sym in user_symptoms:
            match_count = sum(1 for token in data_sym_tokens if token in user_sym.split())
            if match_count / len(data_sym_tokens) > 0.5:
                found.add(data_sym)
    return list(found)

# Co-occurrence-Based Symptom Suggestion
def suggest_cooccurring_symptoms(selected_symptoms):
    """Suggest additional symptoms based on co-occurrence."""
    print("Suggesting co-occurring symptoms...")
    time.sleep(1)
    counter_list = []
    dis_list = set()
    for sym in selected_symptoms:
        dis_list.update(set(df_norm[df_norm[sym] == 1]['label_dis']))

    for dis in dis_list:
        row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()[0][1:]
        for idx, val in enumerate(row):
            if val != 0 and dataset_symptoms[idx] not in selected_symptoms:
                counter_list.append(dataset_symptoms[idx])

    dict_symp = dict(Counter(counter_list))
    return sorted(dict_symp.items(), key=operator.itemgetter(1), reverse=True)

# Predict Disease
def predict_disease(final_symptoms):
    """Predict diseases based on selected symptoms."""
    print("Predicting diseases...")
    time.sleep(1)
    sample_x = [0] * len(dataset_symptoms)
    for sym in final_symptoms:
        sample_x[dataset_symptoms.index(sym)] = 1

    prediction = lr.predict_proba([sample_x])
    k = 10
    diseases = sorted(set(Y['label_dis']))
    topk = prediction[0].argsort()[-k:][::-1]
    
    topk_dict = {}
    for t in topk:
        match_sym = set()
        row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()[0][1:]
        for idx, val in enumerate(row):
            if val != 0:
                match_sym.add(dataset_symptoms[idx])
        prob = (len(match_sym.intersection(set(final_symptoms))) + 1) / (len(set(final_symptoms)) + 1)
        scores = cross_val_score(lr, X, Y, cv=2)
        prob *= mean(scores)
        topk_dict[t] = prob

    return {diseases[key]: round(topk_dict[key] * 100, 2) for key in sorted(topk_dict, key=topk_dict.get, reverse=True)}

# Main Workflow
user_symptoms = input("Please enter symptoms separated by commas:").lower().split(',')
processed_user_symptoms = preprocess_symptoms(user_symptoms)
expanded_user_symptoms = expand_symptoms(processed_user_symptoms)

found_symptoms = match_symptoms(expanded_user_symptoms)
print("\nTop matching symptoms:")
for idx, symp in enumerate(found_symptoms):
    print(f"{idx}: {symp}")

selected_indices = input("\nSelect relevant symptoms (space-separated indices):").split()
final_symptoms = [found_symptoms[int(idx)] for idx in selected_indices]

cooccurring_symptoms = suggest_cooccurring_symptoms(final_symptoms)
print("\nSuggested co-occurring symptoms:")
for idx, (symp, count) in enumerate(cooccurring_symptoms[:10]):
    print(f"{idx}: {symp} ({count})")

predicted_diseases = predict_disease(final_symptoms)
print("\nPredicted Diseases:")
for disease, probability in predicted_diseases.items():
    print(f"{disease}: {probability}%")


Please enter symptoms separated by commas:  Dark urine,clay-colored stools, Diarrhea, Feeling tired, Fever, Joint pain,Loss of appetite,Nausea, stomach pain, vomiting,Yellow skin


Processing symptoms...


Preprocessing symptoms: 100%|███████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]


Expanding symptoms...


Expanding symptoms: 100%|███████████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]


Matching symptoms from the dataset...


Matching symptoms: 100%|███████████████████████████████████████████████████████| 489/489 [00:00<00:00, 15102.53it/s]



Top matching symptoms:
0: diarrhoea
1: trouble sensation
2: vomiting
3: dark urine
4: feeling tired
5: red
6: diarrhea
7: fever
8: stomach pain
9: neck
10: yellowish skin crust
11: painful swollen joint
12: loss appetite
13: joint bone pain
14: multiple painful joint
15: blue
16: yellow skin
17: muscle joint pain
18: painful
19: yellowish skin
20: feeling tired time
21: nausea
22: fatigue



Select relevant symptoms (space-separated indices): 3 4 6 17 


Suggesting co-occurring symptoms...

Suggested co-occurring symptoms:
0: fever (12)
1: testicular pain (9)
2: vomiting (6)
3: muscle weakness (5)
4: shortness breath (5)
5: nausea (4)
6: chest pain (3)
7: headache (3)
8: unintended weight loss (2)
9: maculopapular rash (2)
Predicting diseases...

Predicted Diseases:
Influenza: 52.84%
Hepatitis A: 52.84%
Thalassaemia: 52.84%
Hyperthyroidism: 35.22%
Dengue: 35.22%
Myocardial Infarction (Heart Attack): 35.22%
Scurvy: 35.22%
Lupus erythematosus: 35.22%
Lymphoma: 35.22%
Hepatitis B: 35.22%


In [22]:
Y

Unnamed: 0,label_dis
0,Abscess
1,Abscess
2,Abscess
3,Abscess
4,Abscess
...,...
8830,papilloedema
8831,papilloedema
8832,papilloedema
8833,papilloedema


In [2]:
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from nltk.corpus import wordnet 
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from time import time
from collections import Counter
import operator
from xgboost import XGBClassifier
import math
from Treatment import diseaseDetail
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
warnings.simplefilter("ignore")

In [6]:
# utlities for pre-processing
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv") # Disease combination
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv") # Individual Disease

X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

import pickle
with open('lr_model.pkl', 'rb') as file:
    lr = pickle.load(file)
dataset_symptoms = list(X.columns)

# Taking symptoms from user as input 
user_symptoms = str(input("Please enter symptoms separated by comma(,):\n")).lower().split(',')
# Preprocessing the input symptoms
processed_user_symptoms=[]
for sym in user_symptoms:
    sym=sym.strip()
    sym=sym.replace('-',' ')
    sym=sym.replace("'",'')
    sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
    processed_user_symptoms.append(sym)

# Taking each user symptom and finding all its synonyms and appending it to the pre-processed symptom string
print(".........Processing all of that symtomps...........")
for user_sym in processed_user_symptoms:
    user_sym = user_sym.split()
    str_sym = set()
    for comb in range(1, len(user_sym)+1):
        for subset in combinations(user_sym, comb):
            subset=' '.join(subset)
            subset = synonyms(subset) 
            str_sym.update(subset)
    str_sym.add(' '.join(user_sym))
    user_symptoms.append(' '.join(str_sym).replace('_',' '))
# query expansion performed by joining synonyms found for each symptoms initially entered
# print("After query expansion done by using the symptoms entered")
# print(user_symptoms)
print("Processed")
found_symptoms = set()
for idx, data_sym in enumerate(dataset_symptoms):
    data_sym_split=data_sym.split()
    for user_sym in user_symptoms:
        count=0
        for symp in data_sym_split:
            if symp in user_sym.split():
                count+=1
        if count/len(data_sym_split)>0.5:
            found_symptoms.add(data_sym)
found_symptoms = list(found_symptoms)
# Print all found symptoms
print("Top matching symptoms from your search!")
for idx, symp in enumerate(found_symptoms):
    print(idx,":",symp)
    
# Show the related symptoms found in the dataset and ask user to select among them
select_list = input("\nPlease select the relevant symptoms. Enter indices (separated-space):\n").split()

# Find other relevant symptoms from the dataset based on user symptoms based on the highest co-occurance with the
# ones that is input by the user
dis_list = set()
final_symp = [] 
counter_list = []
for idx in select_list:
    symp=found_symptoms[int(idx)]
    final_symp.append(symp)
    dis_list.update(set(df_norm[df_norm[symp]==1]['label_dis']))
   
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)
    for idx,val in enumerate(row[0]):
        if val!=0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])
dict_symp = dict(Counter(counter_list))
dict_symp_tup = sorted(dict_symp.items(), key=operator.itemgetter(1),reverse=True)   
print(dict_symp_tup)
# Iteratively, suggest top co-occuring symptoms to the user and ask to select the ones applicable 
found_symptoms=[]
count=0
for tup in dict_symp_tup:
    count+=1
    found_symptoms.append(tup[0])
    if count%5==0 or count==len(dict_symp_tup):
        print("\nCommon co-occuring symptoms:")
        for idx,ele in enumerate(found_symptoms):
            print(idx,":",ele)
        select_list = input("Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:\n").lower().split();
        if select_list[0]=='no':
            break
        if select_list[0]=='-1':
            found_symptoms = [] 
            continue
        for idx in select_list:
            final_symp.append(found_symptoms[int(idx)])
        found_symptoms = [] 
# Create query vector based on symptoms selected by the user
print("\nFinal list of Symptoms that will be used for prediction:")
sample_x = [0 for x in range(0,len(dataset_symptoms))]
for val in final_symp:
    print(val)
    sample_x[dataset_symptoms.index(val)]=1
prediction = lr.predict_proba([sample_x])
k = 10
diseases = list(set(Y['label_dis']))
diseases.sort()
topk = prediction[0].argsort()[-k:][::-1]
print(f"\nTop {k} diseases predicted based on symptoms")
topk_dict = {}
# Show top 10 highly probable disease to the user.
for idx,t in  enumerate(topk):
    match_sym=set()
    row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
    row[0].pop(0)

    for idx,val in enumerate(row[0]):
        if val!=0:
            match_sym.add(dataset_symptoms[idx])
    prob = (len(match_sym.intersection(set(final_symp)))+1)/(len(set(final_symp))+1)
    scores = cross_val_score(lr, X, Y, cv=2)
    prob *= mean(scores)
    topk_dict[t] = prob
j = 0
topk_index_mapping = {}
topk_sorted = dict(sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True))
for key in topk_sorted:
  prob = topk_sorted[key]*100
  print(str(j) + " Disease name:",diseases[key], "\tProbability:",str(round(prob, 2))+"%")
  topk_index_mapping[j] = key
  j += 1

select = input("\nMore details about the disease? Enter index of disease or '-1' to discontinue and close the system:\n")
if select!='-1':
    dis=diseases[topk_index_mapping[int(select)]]
    print()
    print(diseaseDetail(dis))

Please enter symptoms separated by comma(,):
  Dark urine,clay-colored stools, Diarrhea, Feeling tired, Fever, Joint pain,Loss of appetite,Nausea, stomach pain, vomiting,Yellow skin


.........Processing all of that symtomps...........
Processed
Top matching symptoms from your search!
0 : trouble sensation
1 : diarrhoea
2 : joint bone pain
3 : multiple painful joint
4 : fatigue
5 : yellow skin
6 : nausea
7 : stomach pain
8 : neck
9 : red
10 : yellowish skin
11 : yellowish skin crust
12 : blue
13 : muscle joint pain
14 : vomiting
15 : loss appetite
16 : dark urine
17 : fever
18 : feeling tired
19 : diarrhea
20 : painful
21 : feeling tired time
22 : painful swollen joint



Please select the relevant symptoms. Enter indices (separated-space):
 0 1 3 5 6


[('headache', 5), ('testicular pain', 5), ('fever', 5), ('diarrhea', 3), ('vomiting', 3), ('dizziness', 2), ('jaundice', 2), ('chest pain', 2), ('shortness breath', 2), ('confusion', 1), ('dry damp skin', 1), ('high body temperature', 1), ('red', 1), ('bloating', 1), ('gas', 1), ('blindness one eye', 1), ('double vision', 1), ('muscle weakness', 1), ('trouble coordination', 1), ('eye pain', 1), ('mid dilated pupil', 1), ('redness eye', 1), ('vision loss', 1), ('light sensitivity', 1), ('sensitivity smell', 1), ('sensitivity sound', 1), ('abdominal distention', 1), ('constipation', 1), ('dermatitis herpetiformis', 1), ('malabsorption', 1), ('none non specific', 1), ('unintended weight loss', 1), ('arm', 1), ('back', 1), ('cold sweat', 1), ('feeling faint upon standing', 1), ('feeling tired', 1), ('jaw', 1), ('neck', 1), ('stomach pain', 1), ('chill', 1), ('abscess', 1), ('small blister surrounding swelling', 1), ('muscular pain', 1), ('sore throat', 1), ('vaginal bleeding', 1), ('dark u

Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
 0 3 4



Common co-occuring symptoms:
0 : dizziness
1 : jaundice
2 : chest pain
3 : shortness breath
4 : confusion


Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
 no



Final list of Symptoms that will be used for prediction:
trouble sensation
diarrhoea
multiple painful joint
yellow skin
nausea
headache
diarrhea
vomiting

Top 10 diseases predicted based on symptoms
0 Disease name: Anthrax 	Probability: 39.14%
1 Disease name: Hepatitis A 	Probability: 39.14%
2 Disease name: Crimean Congo haemorrhagic fever (CCHF) 	Probability: 39.14%
3 Disease name: Ebola 	Probability: 29.35%
4 Disease name: Yellow Fever 	Probability: 29.35%
5 Disease name: lactose intolerance 	Probability: 29.35%
6 Disease name: Heat-Related Illnesses and Heat waves 	Probability: 29.35%
7 Disease name: Migraine 	Probability: 29.35%
8 Disease name: Dehydration 	Probability: 29.35%
9 Disease name: Calculi 	Probability: 29.35%



More details about the disease? Enter index of disease or '-1' to discontinue and close the system:
 0



Anthrax
Specialty -  Infectious disease 
Symptoms -    
Skin form : small blister with surrounding swelling   
Inhalational form : fever, chest pain, shortness of breath   
Intestinal form : nausea, vomiting, diarrhea, abdominal pain   
Injection form : fever, abscess     
Usual onset -  1 day to 2 months post contact     
Causes -   Bacillus anthracis      
Risk factors -  Working with animals, travelers, postal workers, military personnel     
Diagnostic method -  Based on antibodies or toxin in the blood, microbial culture     
Prevention -  Anthrax vaccination, antibiotics     
Treatment -  Antibiotics, antitoxin     
Prognosis -  20–80% die without treatment     
Frequency -  >,2,000 cases per year     



In [15]:
Y.value_counts()

label_dis                           
Myocardial Infarction (Heart Attack)    2047
Polycystic ovary syndrome (PCOS)         511
Anthrax                                  511
Porphyria                                255
Rabies                                   255
                                        ... 
Neoplasm                                   1
Burns                                      1
Fibroids                                   1
Taeniasis/cysticercosis                    1
Hypotonia                                  1
Name: count, Length: 261, dtype: int64