In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

<div style = "color: #923333; font-size: 30px">
    Reading Dataset
</div>

In [2]:
data = pd.read_csv("dis_symp.csv", engine='python') 
data

Unnamed: 0,ï»¿Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1960,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1961,,,UMLS:C0871754_frail
1962,,,UMLS:C0015967_fever
1963,UMLS:C0011127_decubitus ulcer,20.0,UMLS:C0232257_systolic murmur


In [3]:
data = data.rename(columns={'ï»¿Disease': 'Disease'})

In [4]:
data.describe()

Unnamed: 0,Count of Disease Occurrence
count,143.0
mean,265.664336
std,399.594829
min,14.0
25%,86.5
50%,140.0
75%,274.5
max,3363.0


<div style = "color: #923333; font-size: 30px">
    Cleaning dataset and fill the nan value
</div>

In [5]:
data_set = data.copy();
data_set["Disease"] = data_set["Disease"].fillna(method='ffill')
data_set["Count of Disease Occurrence"] = data_set["Count of Disease Occurrence"].fillna(method='ffill')

In [6]:
data_set.dropna()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1960,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1961,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0871754_frail
1962,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0015967_fever
1963,UMLS:C0011127_decubitus ulcer,20.0,UMLS:C0232257_systolic murmur


In [7]:
data_set[data_set["Disease"] == "UMLS:C0020538_hypertensive disease"]

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
5,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0039070_syncope
6,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0042571_vertigo
7,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0038990_sweat^UMLS:C0700590_sweating inc...
8,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0030252_palpitation
9,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0027497_nausea


<div style = "color: #923333; font-size: 20px">
    Groupby data and sum the similar data
</div>

In [8]:
data_set = data_set.groupby(by=["Disease", "Symptom"], as_index=False).aggregate(pd.np.sum)

In [9]:
data_set[data_set["Disease"] == "UMLS:C0020538_hypertensive disease"]

Unnamed: 0,Disease,Symptom,Count of Disease Occurrence
859,UMLS:C0020538_hypertensive disease,UMLS:C0002962_angina pectoris,5513.0
860,UMLS:C0020538_hypertensive disease,UMLS:C0004093_asthenia,5513.0
861,UMLS:C0020538_hypertensive disease,UMLS:C0008031_pain chest,5513.0
862,UMLS:C0020538_hypertensive disease,UMLS:C0012833_dizziness,5513.0
863,UMLS:C0020538_hypertensive disease,UMLS:C0015967_fever,2150.0
864,UMLS:C0020538_hypertensive disease,UMLS:C0027497_nausea,5513.0
865,UMLS:C0020538_hypertensive disease,UMLS:C0030252_palpitation,3363.0
866,UMLS:C0020538_hypertensive disease,UMLS:C0038990_sweat^UMLS:C0700590_sweating inc...,5513.0
867,UMLS:C0020538_hypertensive disease,UMLS:C0039070_syncope,5513.0
868,UMLS:C0020538_hypertensive disease,UMLS:C0042571_vertigo,5513.0


<div style = "color: #923333; font-size: 20px">
    Smoothing data and estimate prior distribution and liklihood of each symptom in dataset
</div>

In [10]:
Disease = data_set["Disease"].drop_duplicates()
clean_Symptom = data_set["Symptom"].drop_duplicates()
d = []
s = []
c = []
for i in range(53734):
    c.append(0)

In [11]:
for i,v in Disease.iteritems():
    for j,value in clean_Symptom.iteritems():
        s1 = v
        d.append(s1)
        s1 = value
        s.append(s1)

In [12]:
d_s = pd.DataFrame(list(zip(d, s, c)), columns =['Disease', 'Symptom', 'Count of Disease Occurrence'])
data_smooth = pd.concat([data_set, d_s])
data_smooth = data_smooth.drop_duplicates(subset=['Disease', 'Symptom'])
data_smooth["Count of Disease Occurrence"] += 1

In [13]:
data_smooth

Unnamed: 0,Disease,Symptom,Count of Disease Occurrence
0,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0008033_pleuritic pain,351.0
1,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0010200_cough,351.0
2,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0011991_diarrhea,351.0
3,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0015967_fever,351.0
4,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0026827_muscle hypotonia^UMLS:C0241938_h...,351.0
...,...,...,...
53729,UMLS:C1623038_cirrhosis,UMLS:C0000727_abdomen acute,1.0
53730,UMLS:C1623038_cirrhosis,UMLS:C0425491_catching breath,1.0
53731,UMLS:C1623038_cirrhosis,UMLS:C0740844_air fluid level,1.0
53732,UMLS:C1623038_cirrhosis,UMLS:C0232894_pneumatouria,1.0


In [14]:
summ = data_smooth.groupby("Disease").transform('sum')
summ = summ["Count of Disease Occurrence"]
data_smooth["sum"] = summ
data_smooth["probability"] = data_smooth["Count of Disease Occurrence"] / data_smooth["sum"]
data_smooth

Unnamed: 0,Disease,Symptom,Count of Disease Occurrence,sum,probability
0,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0008033_pleuritic pain,351.0,5301,0.0662139
1,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0010200_cough,351.0,5301,0.0662139
2,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0011991_diarrhea,351.0,5301,0.0662139
3,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0015967_fever,351.0,5301,0.0662139
4,UMLS:C0001175_acquiredÂ immuno-deficiency synd...,UMLS:C0026827_muscle hypotonia^UMLS:C0241938_h...,351.0,5301,0.0662139
...,...,...,...,...,...
53729,UMLS:C1623038_cirrhosis,UMLS:C0000727_abdomen acute,1.0,3017,0.000331455
53730,UMLS:C1623038_cirrhosis,UMLS:C0425491_catching breath,1.0,3017,0.000331455
53731,UMLS:C1623038_cirrhosis,UMLS:C0740844_air fluid level,1.0,3017,0.000331455
53732,UMLS:C1623038_cirrhosis,UMLS:C0232894_pneumatouria,1.0,3017,0.000331455


<div style = "color: #923333; font-size: 20px">
    Build find function to guess the disease from some symptom by doctors
</div>

In [15]:
def find():
    ele = [];
    symptom = {"Disease":[], "Symptom":[], "Count of Disease Occurrence":[], "sum":[], "probability":[]};
    symptom = pd.DataFrame(symptom)
    
    n = int(input("Enter number of Symptom : "))
    
    for i in range(0, n): 
        ele.append(str(input("Enter Symptom " + str(i+1))))
    
    for i in range(0, n):
        symptom = pd.concat([symptom, data_smooth[data_smooth["Symptom"] == ele[i]]]).groupby(by = ["Disease"], as_index=False)["probability"].aggregate(pd.np.prod)
        
    symptom = symptom.nlargest(10,'probability');
    return symptom

In [16]:
Expected_diseases = find()
Expected_diseases

Enter number of Symptom : 5
Enter Symptom 1UMLS:C0008033_pleuritic pain
Enter Symptom 2UMLS:C0011991_diarrhea
Enter Symptom 3UMLS:C0012833_dizziness
Enter Symptom 4UMLS:C0008031_pain chest
Enter Symptom 5UMLS:C0438716_pressure chest


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


Unnamed: 0,Disease,probability
51,UMLS:C0019112_hemorrhoids,4.888003e-13
118,UMLS:C0149931_migraine disorders,1.593089e-13
62,UMLS:C0020538_hypertensive disease,1.512638e-13
11,UMLS:C0004610_bacteremia,1.498571e-13
61,UMLS:C0020473_hyperlipidemia,1.425535e-13
69,UMLS:C0022116_ischemia,1.241565e-13
70,UMLS:C0022658_kidney disease,1.08968e-13
109,UMLS:C0039239_tachycardia sinus,5.239262e-14
25,UMLS:C0008350_cholelithiasis^UMLS:C0242216_bil...,4.379432e-14
78,UMLS:C0025202_melanoma,3.782023e-14


<div style = "color: #923333; font-size: 20px">
    Build symptom function to guess some symptoms from disease from fin function
</div>

In [17]:
Expected_diseases = Expected_diseases.head(3)
def symptoms():
    symptom = {"Disease":[], "Symptom":[], "Count of Disease Occurrence":[], "sum":[], "probability":[]};
    symptom = pd.DataFrame(symptom)
    for i,j in Expected_diseases["Disease"].iteritems():
        x = data_smooth[data_smooth["Disease"] == j]
        x = x.nlargest(3,'Count of Disease Occurrence');
        symptom = pd.concat([symptom, x])
    return symptom

In [18]:
symptoms()

Unnamed: 0,Disease,Symptom,Count of Disease Occurrence,sum,probability
716,UMLS:C0019112_hemorrhoids,UMLS:C0009024_clonus,81.0,1681,0.0481856
717,UMLS:C0019112_hemorrhoids,UMLS:C0010200_cough,81.0,1681,0.0481856
718,UMLS:C0019112_hemorrhoids,UMLS:C0011991_diarrhea,81.0,1681,0.0481856
1629,UMLS:C0149931_migraine disorders,UMLS:C0002416_ambidexterity,62.0,828,0.0748792
1630,UMLS:C0149931_migraine disorders,UMLS:C0012833_dizziness,62.0,828,0.0748792
1631,UMLS:C0149931_migraine disorders,UMLS:C0015967_fever,62.0,828,0.0748792
859,UMLS:C0020538_hypertensive disease,UMLS:C0002962_angina pectoris,5514.0,64407,0.0856118
860,UMLS:C0020538_hypertensive disease,UMLS:C0004093_asthenia,5514.0,64407,0.0856118
861,UMLS:C0020538_hypertensive disease,UMLS:C0008031_pain chest,5514.0,64407,0.0856118
