In [2]:
import pandas as pd
import numpy as np
import os
import csv

### Data Preprocessing and data Scraping

Data:
http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

In [3]:
#Copy the table from this link and paste it into a csv file named dataset_uncleaned.csv
from collections import defaultdict
#A defaultdict works exactly like a normal dict, but it is initialized with a function (“default factory”) that takes no arguments and provides the default value for a nonexistent key.
disease_list=[]

def return_list(disease):
    disease_list=[]
    match = disease.replace('^','_').split('_')
    ctr = 1
    for group in match:
        if(ctr%2==0):
            disease_list.append(group)
        ctr=ctr+1
    return disease_list

with open('dataset_uncleaned.csv') as csvfile:
    reader=csv.reader(csvfile)
    disease=''
    weight=0
    disease_list=[]
    dict_wt={}
    dict_=defaultdict(list)
    for row in reader:
        #print(row) 
        if row[0]!="\xc2\xa0" and row[0]!="":
            #"\xc2\xa0" is called Non-breaking space It is a kind of invisible control character in UTF-8 encodings
            disease = row[0]
            disease_list = return_list(disease)
            weight = row[1]

        if row[2]!="\xc2\xa0" and row[2]!="":
            symptom_list = return_list(row[2])

            for d in disease_list:
                for s in symptom_list:
                    dict_[d].append(s)
                dict_wt[d] = weight

In [4]:
#making a dataset with clean values

with open("dataset_clean.csv","w") as csvfile:
    writer=csv.writer(csvfile)
    for key,values in dict_.items():
        for v in values:
            key = str.encode(key).decode('utf-8')
            writer.writerow([key,v,dict_wt[key]])
    

In [5]:
columns = ['Source','Target','Weight']

In [6]:
data = pd.read_csv("dataset_clean.csv",names=columns, encoding ="ISO-8859-1")

In [7]:
data.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


In [8]:
data.to_csv("dataset_clean.csv",index=False)

In [9]:
slist = []
dlist = []
with open("nodetable.csv","w") as csvfile:
    writer = csv.writer(csvfile)

    for key,values in dict_.items():
        for v in values:
            if v not in slist:
                writer.writerow([v,v,"symptom"])
                slist.append(v)
        if key not in dlist:
            writer.writerow([key,key,"disease"])
            dlist.append(key)

In [10]:
nt_columns = ['Id','Label','Attribute']

In [11]:
nt_data = pd.read_csv("nodetable.csv",names=nt_columns, encoding ="ISO-8859-1",)

In [12]:
nt_data.head()

Unnamed: 0,Id,Label,Attribute
0,pain chest,pain chest,symptom
1,shortness of breath,shortness of breath,symptom
2,dizziness,dizziness,symptom
3,asthenia,asthenia,symptom
4,fall,fall,symptom


In [13]:
nt_data.to_csv("nodetable.csv",index=False)

In [14]:
data = pd.read_csv("dataset_clean.csv", encoding ="ISO-8859-1")

In [15]:
data.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


### Number of diseases and symptoms in our dataset

In [16]:
#disease
len(data['Source'].unique())

149

In [17]:
#symptoms
len(data['Target'].unique())

405

In [18]:
df = pd.DataFrame(data)

In [19]:
df_1 = pd.get_dummies(df.Target)

In [20]:
df_1.head()

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


In [22]:
df_s = df['Source']

In [23]:
df_pivoted = pd.concat([df_s,df_1], axis=1)

In [24]:
df_pivoted.drop_duplicates(keep='first',inplace=True)

In [25]:
df_pivoted.head()

Unnamed: 0,Source,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
len(df_pivoted)

2116

In [27]:
cols = df_pivoted.columns

In [28]:
cols = cols[1:]

In [29]:
df_pivoted = df_pivoted.groupby('Source').sum()
df_pivoted = df_pivoted.reset_index()
df_pivoted.head()

Unnamed: 0,Source,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,PneumocystisÂ cariniiÂ pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accidentÂ cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquiredÂ immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_pivoted.to_csv("df_pivoted_final, dataset1.csv")

In [31]:
X = df_pivoted[cols]
y = df_pivoted['Source']

In [32]:
#Trying out our classifier to learn diseases from the symptoms

In [33]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [36]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb = mnb.fit(X_train, y_train)

In [37]:
mnb.score(X_test, y_test)

0.0

It can't work on unseen data because it has never seen that disease before. Also, there is only one point for each disease and hence no point for this. So we need to train the model entirely. Then what will we test it on? Missing data? Say given one symptom what is the disease? This is again multilabel classification. We can work symptom on symptom. What exactly is differential diagnosis, we need to replicate that.

In [38]:
X.head(2)

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
y.head(5)

0                     Alzheimer's disease
1                                     HIV
2        PneumocystisÂ cariniiÂ pneumonia
3               accidentÂ cerebrovascular
4    acquiredÂ immuno-deficiency syndrome
Name: Source, dtype: object

In [41]:
mnb_tot = MultinomialNB()
mnb_tot = mnb_tot.fit(X, y)

In [42]:
mnb_tot.score(X, y)

0.8993288590604027

In [43]:
disease_pred = mnb_tot.predict(X)

In [44]:
disease_real = y.values

In [45]:
#prediction wrong
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0} Actual:{1}'.format(disease_pred[i], disease_real[i]))

Pred: HIV Actual:acquiredÂ immuno-deficiency syndrome
Pred: biliary calculus Actual:cholelithiasis
Pred: coronary arteriosclerosis Actual:coronary heart disease
Pred: depression mental Actual:depressive disorder
Pred: HIV Actual:hiv infections
Pred: carcinoma breast Actual:malignant neoplasm of breast
Pred: carcinoma of lung Actual:malignant neoplasm of lung
Pred: carcinoma prostate Actual:malignant neoplasm of prostate
Pred: carcinoma colon Actual:malignant tumor of colon
Pred: candidiasis Actual:oralcandidiasis
Pred: effusion pericardial Actual:pericardial effusion body substance
Pred: malignant neoplasms Actual:primary malignant neoplasm
Pred: sepsis (invertebrate) Actual:septicemia
Pred: sepsis (invertebrate) Actual:systemic infection
Pred: tonic-clonic epilepsy Actual:tonic-clonic seizures


### Taking  decision tree

In [46]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [47]:
#print ("DecisionTree")
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X,y)
print ("Acurracy: ", clf_dt.score(X,y))

Acurracy:  0.8993288590604027


In [48]:
from sklearn import tree 
from sklearn.tree import export_graphviz

export_graphviz(dt, 
                out_file='DOT-files/tree.dot', 
                feature_names=cols)

Running the following command we can get the decision tree image.

```dot -Tpng tree.dot -o tree.png```

In [None]:
from IPython.display import Image
Image(filename='tree.png')

### According to the plotted decision tree, `Jugular venous distention` is the attribute symptom that has the highest gini score of 0.9846. Thus this symptom would play a major role in predicting diseases.

In [49]:
#Dataset 1
df_pivoted.head()

Unnamed: 0,Source,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,PneumocystisÂ cariniiÂ pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accidentÂ cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquiredÂ immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
data = pd.read_csv("Training.csv")

In [51]:
data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [52]:
data.columns

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)

In [53]:
len(data.columns)

133

In [54]:
len(data['prognosis'].unique())

41

In [None]:
#41 more common diseases

In [55]:
df = pd.DataFrame(data)

In [56]:
df.head()
df.drop_duplicates(keep='first',inplace=True)

In [57]:
df = df.groupby('prognosis').sum()
df= df.reset_index()

In [58]:
df.head()

Unnamed: 0,prognosis,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AIDS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acne,0,4,0,0,0,0,0,0,0,...,4,4,4,0,0,0,0,0,0,0
3,Alcoholic hepatitis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Allergy,0,0,0,4,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#2 dataset df and df_pivoted

In [None]:
df.head()

In [None]:
df_pivoted.head()

In [59]:
sym1=cols

In [60]:
sym1

Index(['Heberden's node', 'Murphy's sign', 'Stahli's line', 'abdomen acute',
       'abdominal bloating', 'abdominal tenderness', 'abnormal sensation',
       'abnormally hard consistency', 'abortion', 'abscess bacterial',
       ...
       'vision blurred', 'vomiting', 'weepiness', 'weight gain', 'welt',
       'wheelchair bound', 'wheezing', 'withdraw', 'worry', 'yellow sputum'],
      dtype='object', length=404)

In [61]:
symptoms=[]
for i in sym1:
    symptoms.append(i)

In [62]:
sym2=df.columns

In [63]:
for i in sym2:
    symptoms.append(i)

In [64]:
len(symptoms)

537

### Total 537 symptoms to chose from

### Predict from here

In [84]:
feature_dict = {}
for i,f in enumerate(sym1):
    feature_dict[f] = i

### Change Symptom in here

In [78]:
#Give value here
a=feature_dict['weight gain']
#of eyes


In [79]:
a

397

In [80]:
sample_x = [i/a if i ==a else i*0 for i in range(len(cols))]
#This means predicting the disease where the only symptom is redness_of_eyes.

In [81]:
sample_x = np.array(sample_x).reshape(1,len(sample_x))

In [82]:
dt.predict(sample_x)

array(['overload fluid'], dtype=object)

Hence it has 100% confidence that the disease would be Overload fluid. The prediction would improve once we take more symptoms as input.