In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from collections import Counter

In [3]:
pd.set_option('display.max_colwidth', 100)

In [4]:
dis_sym_data = pd.read_csv("/content/Original_Dataset.csv")

In [5]:
dis_sym_data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Unnamed: 18,Unnamed: 19
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,,


In [6]:
dis_sym_data.shape

(4920, 20)

In [7]:
columns_to_check = []
for col in dis_sym_data.columns:
    if col != 'Disease':
        columns_to_check.append(col)

In [8]:
symptoms = dis_sym_data.iloc[:, 1:].values.flatten()
symptoms = list(set(symptoms))

In [9]:
for symptom in symptoms:
    dis_sym_data[symptom] = dis_sym_data.iloc[:, 1:].apply(lambda row: int(symptom in row.values), axis=1)

dis_sym_data_v1 = dis_sym_data.drop(columns=columns_to_check)

In [10]:
dis_sym_data_v1 = dis_sym_data_v1.loc[:, dis_sym_data_v1.columns.notna()]

In [11]:
dis_sym_data_v1.shape

(4920, 133)

In [12]:
dis_sym_data_v1.columns = dis_sym_data_v1.columns.str.strip()

In [13]:
dis_sym_data_v1.columns

Index(['Disease', 'sunken_eyes', 'indigestion', 'distention_of_abdomen',
       'nodal_skin_eruptions', 'dischromic _patches',
       'receiving_blood_transfusion', 'stiff_neck', 'anxiety',
       'extra_marital_contacts',
       ...
       'patches_in_throat', 'red_sore_around_nose', 'acute_liver_failure',
       'dizziness', 'obesity', 'throat_irritation', 'prominent_veins_on_calf',
       'itching', 'phlegm', 'fluid_overload'],
      dtype='object', length=133)

In [14]:
var_mod = ['Disease']
le = LabelEncoder()
for i in var_mod:
    dis_sym_data_v1[i] = le.fit_transform(dis_sym_data_v1[i])

In [15]:
X = dis_sym_data_v1.drop(columns="Disease")
y = dis_sym_data_v1['Disease']

In [16]:
def class_algo(model,independent,dependent):
    model.fit(independent,dependent)
    pred = model.predict(independent)
    accuracy = metrics.accuracy_score(pred,dependent)
    print(model_name,'Accuracy : %s' % '{0:.3%}'.format(accuracy))


In [17]:
algorithms = {'Logistic Regression':
              {"model": LogisticRegression()},

              'Decision Tree':
              {"model": tree.DecisionTreeClassifier()},

              'Random Forest':
              {"model": RandomForestClassifier()},

              'SVM':
              {"model": svm.SVC(probability=True)},

              'NaiveBayes' :
              {"model": GaussianNB()},

              'K-Nearest Neighbors' :
              {"model": KNeighborsClassifier()},
             }

In [18]:
for model_name, values in algorithms.items():
    class_algo(values["model"],X,y)

Logistic Regression Accuracy : 100.000%
Decision Tree Accuracy : 100.000%
Random Forest Accuracy : 100.000%
SVM Accuracy : 100.000%
NaiveBayes Accuracy : 100.000%
K-Nearest Neighbors Accuracy : 100.000%


In [19]:
doc_data = pd.read_csv("/content/Doctor_Versus_Disease.csv", encoding='latin1', names=['Disease','Specialist','time_slot'])

In [20]:
doc_data.tail(5)

Unnamed: 0,Disease,Specialist,time_slot
36,Bronchial Asthma,Pulmonologist,7:00 PM
37,Pneumonia,Pulmonologist,7:10 PM
38,Osteoarthristis,Rheumatologists,7:20 PM
39,Arthritis,Rheumatologists,7:40 PM
40,Tuberculosis,Tuberculosis,8:00 PM


In [21]:
doc_data['Specialist'] = np.where((doc_data['Disease'] == 'Tuberculosis'),'Pulmonologist', doc_data['Specialist'])

In [22]:
doc_data.tail(5)

Unnamed: 0,Disease,Specialist,time_slot
36,Bronchial Asthma,Pulmonologist,7:00 PM
37,Pneumonia,Pulmonologist,7:10 PM
38,Osteoarthristis,Rheumatologists,7:20 PM
39,Arthritis,Rheumatologists,7:40 PM
40,Tuberculosis,Pulmonologist,8:00 PM


In [23]:
des_data = pd.read_csv("/content/Disease_Description.csv")

In [24]:
des_data.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury caused by taking medication. ADRs may occur followin...
1,Malaria,An infectious disease caused by protozoan parasites from the Plasmodium family that can be trans...
2,Allergy,An allergy is an immune system response to a foreign substance that's not typically harmful to y...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroid or low thyroid, is a disorder of the endocrine s..."
4,Psoriasis,"Psoriasis is a common skin disorder that forms thick, red, bumpy patches covered with silvery sc..."


In [25]:
test_col = []
for col in dis_sym_data_v1.columns:
    if col != 'Disease':
        test_col.append(col)


test_data = {}
symptoms = []
predicted = []
def test_input():
    symptoms.clear()
    predicted.clear()
    num_inputs = int(input("Enter the number of symptoms you have: "))
    for i in range(num_inputs):
        user_input = input("Enter Symptoms #{}: ".format(i+1))
        symptoms.append(user_input)
    print("Symptoms you have:", symptoms)
    for column in test_col:
        test_data[column] = 1 if column in symptoms else 0
    test_df = pd.DataFrame(test_data, index=[0])
    print("Predicting Disease based on 6 ML algorithms...")
    for model_name, values in algorithms.items():
        predict_disease = values["model"].predict(test_df)
        predict_disease = le.inverse_transform(predict_disease)
        predicted.extend(predict_disease)
    disease_counts = Counter(predicted)
    percentage_per_disease = {disease: (count / 6) * 100 for disease, count in disease_counts.items()}
    result_df = pd.DataFrame({"Disease": list(percentage_per_disease.keys()),
                               "Chances": list(percentage_per_disease.values())})
    result_df = result_df.merge(doc_data, on='Disease', how='left')
    result_df = result_df.merge(des_data, on='Disease', how='left')
    return result_df

In [26]:
test_input()

Enter the number of symptoms you have: 2
Enter Symptoms #1: headache
Enter Symptoms #2: cough
Symptoms you have: ['headache', 'cough']
Predicting Disease based on 6 ML algorithms...


Unnamed: 0,Disease,Chances,Specialist,time_slot,Description
0,Paralysis (brain hemorrhage),83.333333,Neurologist,4:15 PM,"Intracerebral hemorrhage (ICH) is when blood suddenly bursts into brain tissue, causing damage t..."
1,GERD,16.666667,Gastroenterologist,12:00 PM,"Gastroesophageal reflux disease, or GERD, is a digestive disorder that affects the lower esophag..."
