In [1]:
import pandas as pd
import numpy as np
import re
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB

import math



In [2]:
# Load the hand-built disease/symptom table; the first CSV row is the header.
db_table = pd.read_csv(
    'selfmade-db.csv',
    encoding='utf-8',
    index_col=None,
    header=0,
)

In [3]:
def isValid(cui):
    """Return True when ``cui`` is exactly a ``C`` followed by seven digits.

    Args:
        cui: Any value; it is converted with ``str()`` before checking.

    Returns:
        bool: True for strings such as ``"C0001234"``, False otherwise.
    """
    # fullmatch, not match: match() only anchors the start of the string, so
    # over-long values such as "C12345678" or "C1234567abc" were accepted.
    return re.fullmatch(r"C\d{7}", str(cui)) is not None

def cuiToNumber(cui):
    """Return the numeric part of a CUI string without its zero padding.

    Args:
        cui: CUI string such as ``"C0001230"``.

    Returns:
        str: The digits after the ``C`` prefix with leading zeros removed
        (``"1230"`` for the example above); an empty string if nothing but
        the prefix and zeros is present.
    """
    # lstrip, not strip: strip("0") also removed significant trailing zeros,
    # turning "C0001230" into "123".
    return cui.lstrip("C").lstrip("0")

def convertCUI(cui):
    """Normalise a value to the ``C`` + seven-digit CUI form.

    Args:
        cui: A CUI, or the bare number of one; converted with ``str()``.

    Returns:
        str: ``cui`` unchanged when ``isValid`` accepts it, otherwise the
        value zero-padded to seven characters and prefixed with ``C``.
    """
    cui = str(cui)
    if isValid(cui):
        return cui
    # Drop an existing prefix before padding; otherwise "C123" was padded
    # around its own prefix and came out as "C000C123".
    digits = cui[1:] if cui.startswith("C") else cui
    return "C" + digits.zfill(7)

def clean(the_string):
    """Return ``the_string`` as text.

    Args:
        the_string: A ``str``, UTF-8 encoded ``bytes``, or any other value.

    Returns:
        str: The decoded text for ``bytes``, ``str(the_string)`` otherwise.
    """
    # The previous str(x.encode('utf-8')) returns the *repr* of the bytes
    # object on Python 3 ("b'fever'"), not the text itself.
    if isinstance(the_string, bytes):
        return the_string.decode('utf-8')
    return str(the_string)

In [4]:
# Bring both CUI columns of the hand-built table into the "C" + 7 digit form.
db_table['Disease'] = db_table['Disease'].apply(convertCUI)
db_table['Symptom'] = db_table['Symptom'].apply(convertCUI)

# Writes the normalised table back over the CSV it was loaded from.
db_table.to_csv("selfmade-db.csv",index=False)

# NOTE(review): the rows of this second file are merged without passing
# through convertCUI - confirm its CUIs are already normalised.
df_foreign = pd.read_csv('disease-symptom-other.csv', encoding='utf-8', index_col=None, header=0)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original index labels kept, columns not sorted).
db_table = pd.concat([db_table, df_foreign])

db_table.to_csv("disease-symptom-merged-edit.csv",index=False)

In [5]:
# Drop rows with a missing Disease or Symptom *before* casting: astype(str)
# turns NaN into the literal string 'nan', which dropna() can no longer see.
db_table = db_table.dropna(subset=['Symptom', 'Disease']).copy()

for column in ('Disease', 'Symptom'):
    # Assign the result back instead of calling replace(..., inplace=True) on
    # a column selection; that is chained assignment and is not guaranteed to
    # write through to db_table.
    db_table[column] = db_table[column].astype(str).replace('', np.nan)

# Empty strings were mapped to NaN above, so this removes them as well.
db_table = db_table.dropna(subset=['Symptom', 'Disease'])

df = pd.DataFrame(db_table)

# One indicator column per distinct symptom value, row-aligned with 'Disease'.
df_1 = pd.get_dummies(df.Symptom)
df_s = df['Disease']
df_pivoted = pd.concat([df_s,df_1], axis=1)
# Keep a repeated (disease, symptom) row only once.
df_pivoted = df_pivoted.drop_duplicates(keep='first')

In [6]:
# Symptom column labels: everything after the leading 'Disease' column.
cols = df_pivoted.columns[1:]

# Collapse to one row per disease by summing its symptom indicator columns,
# then turn 'Disease' back into a regular column.
df_pivoted = df_pivoted.groupby('Disease').sum().reset_index()

df_pivoted.to_csv('all_pivoted_edit.csv', index=False)

In [7]:
# Split the pivot into features and target; column 0 is 'Disease'.
cols = df_pivoted.columns[1:]
y = df_pivoted['Disease']  # target: one disease label per row
x = df_pivoted[cols]       # features: the per-symptom columns
x.to_csv("all_x_edit.csv", index=False)

In [8]:
# Fit a multinomial naive Bayes model on the symptom matrix.
clf = MultinomialNB().fit(x, y)

# Accuracy measured on the same rows the model was fitted on.
clf.score(x, y)

0.9683698296836983

In [9]:
# True labels and the model's predictions for the rows it was fitted on.
disease_real = y.values

disease_pred = clf.predict(x)

In [10]:
# Print every row whose predicted disease differs from the recorded one.
for predicted, actual in zip(disease_pred, disease_real):
    if predicted == actual:
        continue
    print ('Pred: {0} Actual:{1}'.format(predicted, actual))

Pred: C0006840 Actual:C0006849
Pred: C0001175 Actual:C0019682
Pred: C0001175 Actual:C0019693
Pred: C0036262 Actual:C0041976
Pred: C0036690 Actual:C0243026
Pred: C0014549 Actual:C0494475
Pred: C0039621 Actual:C0549346
Pred: C0006142 Actual:C0678222
Pred: C0242379 Actual:C0684249
Pred: C0007102 Actual:C0699790
Pred: C0036690 Actual:C1090821
Pred: C0031039 Actual:C1253937
Pred: C0020428 Actual:C1384514


In [11]:
# Persist the fitted classifier with pickle protocol 2.
# NOTE(review): joblib is imported from sklearn.externals at the top of the
# notebook; newer scikit-learn releases dropped that module - confirm the
# pinned version.
joblib.dump(value=clf, filename='all_mnb.pkl', protocol=2)

# Read back the feature matrix that was written to disk earlier.
data = pd.read_csv(filepath_or_buffer="all_x_edit.csv")

In [12]:
df = pd.DataFrame(data)
features = df.columns  # column labels of the feature matrix (= symptoms)
# All feature labels as one comma-separated string.
features_raw = ','.join(str(label) for label in features)

In [13]:
# Map each symptom (feature label) to its column position.
feature_dict = {symptom: position for position, symptom in enumerate(features)}

In [14]:
def findFeatures(disease):
    """Return the symptoms listed for ``disease`` in the global ``db_table``.

    Args:
        disease: Value compared against the 'Disease' column.

    Returns:
        numpy.ndarray: The matching 'Symptom' values cast to ``str``.
    """
    matches = db_table['Disease'] == disease
    symptoms = db_table.loc[matches, "Symptom"]
    return symptoms.values.astype(str)

In [15]:
# All-zero template with one slot per feature column.
# np.int was only an alias of the builtin int and was removed in NumPy 1.24;
# dtype=int yields the same dtype.
sample = np.zeros((len(features),), dtype=int)
sample_list = sample.tolist()

In [16]:
# Lookup table used by the find_* helpers to translate between names and CUIs.
sym_dictionary = pd.read_csv(
    'full_dictionary.csv',
    encoding='utf-8',
    index_col=None,
    header=0,
)

In [17]:
def find_symtom_cui(value_list, sym_dictionary):
    """Translate symptom names into their CUIs.

    Args:
        value_list: Iterable of symptom names.
        sym_dictionary: DataFrame with 'Symptom' and 'Symptom_CUI' columns.

    Returns:
        list: One CUI per entry of ``value_list``, in the same order.

    Raises:
        ValueError: If a name does not occur in the 'Symptom' column.
    """
    # Build the name -> CUI table once instead of running a linear
    # list.index() scan per symptom. setdefault keeps the first row for a
    # duplicated name, which is the row list.index() used to return.
    cui_by_symptom = {}
    for name, cui in zip(sym_dictionary["Symptom"].tolist(),
                         sym_dictionary["Symptom_CUI"].tolist()):
        cui_by_symptom.setdefault(name, cui)

    search_cui = []
    for s in value_list:
        if s not in cui_by_symptom:
            raise ValueError("{!r} is not in the symptom dictionary".format(s))
        search_cui.append(cui_by_symptom[s])
    return search_cui

In [18]:
def find_symptom(input_sym_cui, sym_dictionary):
    """Translate symptom CUIs into symptom names.

    Args:
        input_sym_cui: Iterable of symptom CUIs.
        sym_dictionary: DataFrame with 'Symptom' and 'Symptom_CUI' columns.

    Returns:
        list: One symptom name per entry of ``input_sym_cui``, same order.

    Raises:
        ValueError: If a CUI does not occur in the 'Symptom_CUI' column.
    """
    # One pass to build CUI -> name; setdefault keeps the first row for a
    # duplicated CUI, matching what list.index() returned before.
    symptom_by_cui = {}
    for name, cui in zip(sym_dictionary["Symptom"].tolist(),
                         sym_dictionary["Symptom_CUI"].tolist()):
        symptom_by_cui.setdefault(cui, name)

    search_sym = []
    for s in input_sym_cui:
        if s not in symptom_by_cui:
            raise ValueError("{!r} is not in the symptom dictionary".format(s))
        search_sym.append(symptom_by_cui[s])
    return search_sym

In [19]:
def find_disease(input_disease_cui, sym_dictionary):
    """Translate disease CUIs into disease names.

    Args:
        input_disease_cui: Iterable of disease CUIs.
        sym_dictionary: DataFrame with 'Disease_UMLS' and 'Disease_CUI'
            columns.

    Returns:
        list: One 'Disease_UMLS' value per input CUI, in the same order.

    Raises:
        ValueError: If a CUI does not occur in the 'Disease_CUI' column.
    """
    # One pass to build CUI -> name; setdefault keeps the first row for a
    # duplicated CUI, matching what list.index() returned before.
    disease_by_cui = {}
    for name, cui in zip(sym_dictionary["Disease_UMLS"].tolist(),
                         sym_dictionary["Disease_CUI"].tolist()):
        disease_by_cui.setdefault(cui, name)

    search_di = []
    for s in input_disease_cui:
        if s not in disease_by_cui:
            raise ValueError("{!r} is not in the disease dictionary".format(s))
        search_di.append(disease_by_cui[s])
    return search_di

In [20]:
def symptom_disease(value_list, sym_dictionary, sample_list):
    """Return the class probabilities for a set of symptom names.

    Args:
        value_list: Iterable of symptom names present in ``sym_dictionary``.
        sym_dictionary: DataFrame passed on to ``find_symtom_cui``.
        sample_list: Template feature vector (one slot per feature); it is
            read but never modified.

    Returns:
        The first row of ``clf.predict_proba`` for the built sample, i.e.
        one probability per entry of ``clf.classes_``.

    Raises:
        KeyError: If a symptom CUI is not a key of the global
            ``feature_dict``.
    """
    search_cui = find_symtom_cui(value_list, sym_dictionary)

    # Work on a copy: writing into the caller's list made the flags of every
    # earlier query stick, so later predictions silently included them.
    flags = list(sample_list)
    for s in search_cui:
        flags[feature_dict[s]] = 1

    sample = np.array(flags).reshape(1, len(flags))

    predict_results = clf.predict_proba(sample)[0]

    return predict_results

In [21]:
def fill_disease_list(sym_dictionary, clf):
    """Return the disease name of every class of ``clf``.

    Args:
        sym_dictionary: DataFrame passed on to ``find_disease``.
        clf: Fitted classifier exposing ``classes_`` (disease CUIs).

    Returns:
        list: Disease names in the order of ``clf.classes_``.
    """
    # The unused 'Disease_UMLS' column copy and the commented-out drafts were
    # removed; find_disease already reads the columns it needs.
    return find_disease(clf.classes_, sym_dictionary)

In [22]:
def fill_feature_list(clf):
    """Return, for every class of ``clf``, the names of its known symptoms.

    Uses the global ``sym_dictionary`` for the CUI-to-name translation.

    Args:
        clf: Fitted classifier exposing ``classes_``.

    Returns:
        list: One list of symptom names per entry of ``clf.classes_``.
    """
    return [
        find_symptom(findFeatures(disease_cui), sym_dictionary)
        for disease_cui in clf.classes_
    ]

In [23]:
def convert_string(str_):
    """Reduce a value to lowercase ASCII letters and spaces.

    Args:
        str_: Any value; it is converted with ``str()`` first.

    Returns:
        str: The lowercased text with every character other than ``a``-``z``
        and the space removed, and without leading or trailing spaces.
    """
    lowered = str(str_).lower()
    kept = ''.join(ch for ch in lowered if 'a' <= ch <= 'z' or ch == ' ')
    # str.strip() returns a new string; the old code discarded that result,
    # so surrounding spaces were never actually removed.
    return kept.strip()

In [24]:
def sym_dict(s):
    """Count how often each letter and the space occur in ``s``.

    Args:
        s: Value passed through ``convert_string`` before counting.

    Returns:
        dict: 27 entries keyed 'a'..'z' then ' ' (in that insertion order),
        each holding the number of occurrences.
    """
    counts = dict.fromkeys('abcdefghijklmnopqrstuvwxyz ', 0)
    for ch in convert_string(s):
        counts[ch] += 1
    return counts

In [24]:
# Disabled sym_dict variant kept as a bare string literal (never executed):
# it adds 1 + the character's position instead of 1 for each occurrence.
'''def sym_dict(s):
    my_dict = {'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,'h':0,'i':0, 'j':0, 'k':0, 'l':0, 'm':0, 'n':0, 'o':0, 'p':0, 'q':0, 'r':0, 's':0, 't':0, 'u':0, 'v':0,'w':0, 'x':0,'y':0, 'z':0,' ':0}
    s = convert_string(s)
    for i,p in enumerate(s):
        my_dict[p] = my_dict[p] + 1 + i
    return my_dict'''

In [25]:
# Disabled sym_dict variant kept as a bare string literal (never executed):
# it records presence (1) for each character rather than a count.
'''def sym_dict(s):
    my_dict = {'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,'h':0,'i':0, 'j':0, 'k':0, 'l':0, 'm':0, 'n':0, 'o':0, 'p':0, 'q':0, 'r':0, 's':0, 't':0, 'u':0, 'v':0,'w':0, 'x':0,'y':0, 'z':0,' ':0}
    s = convert_string(s)
    for p in s:
        my_dict[p] = 1
    return my_dict'''

"def sym_dict(s):\n    my_dict = {'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,'h':0,'i':0, 'j':0, 'k':0, 'l':0, 'm':0, 'n':0, 'o':0, 'p':0, 'q':0, 'r':0, 's':0, 't':0, 'u':0, 'v':0,'w':0, 'x':0,'y':0, 'z':0,' ':0}\n    s = convert_string(s)\n    for p in s:\n        my_dict[p] = 1\n    return my_dict"

In [25]:
# Build one 27-slot character-count vector per symptom name.
frequency = pd.read_csv('frequency.csv', encoding='utf-8', index_col=None, header=0)
frequency_symptom_list = frequency["Symptoms"].tolist()

frequency_list = [list(sym_dict(name).values()) for name in frequency_symptom_list]

# Store each vector next to its symptom and write the table back to the
# same file it was read from.
frequency['frequency'] = frequency_list
frequency.to_csv("frequency.csv", index=False)


In [26]:
# Disabled cell kept as a bare string literal (never executed): it would wrap
# frequency_list in a one-column DataFrame and write it to frequency_col2.csv.
'''from pandas.core.frame import DataFrame
c={"frequency" : frequency_list}
data=DataFrame(c)
data.to_csv("frequency_col2.csv",index=False)'''

'from pandas.core.frame import DataFrame\nc={"frequency" : frequency_list}\ndata=DataFrame(c)\ndata.to_csv("frequency_col2.csv",index=False)'

In [27]:
# Disabled cell kept as a bare string literal (never executed): draft of a
# per-symptom character-frequency pivot built from 'symptom dictionary.csv'.
'''#make a frequency pivot
symp_dict = pd.read_csv('symptom dictionary.csv', encoding='utf-8', index_col=None, header=0)
symptom_dict_list = symp_dict["Symptoms"].tolist()
for s in symptom_dict_list:
    s = convert_string(s)
    for p in s:
        symp_dict[s,p] += 1'''

'#make a frequency pivot\nsymp_dict = pd.read_csv(\'symptom dictionary.csv\', encoding=\'utf-8\', index_col=None, header=0)\nsymptom_dict_list = symp_dict["Symptoms"].tolist()\nfor s in symptom_dict_list:\n    s = convert_string(s)\n    for p in s:\n        symp_dict[s,p] += 1'

In [26]:
def get_value_dict(a_value):
    """Return the character-count vector of ``a_value``.

    Args:
        a_value: Value handed to ``sym_dict``.

    Returns:
        list: The values of ``sym_dict(a_value)`` in the dict's key order.
    """
    return list(sym_dict(a_value).values())

In [27]:
def calculate_similarity(a_value,frequency_list):
    """Rank the symptoms of frequency.csv by cosine similarity to ``a_value``.

    Args:
        a_value: Free-text symptom; turned into a character-count vector
            with ``get_value_dict``.
        frequency_list: One character-count vector per row of frequency.csv,
            in the same row order.

    Returns:
        pandas.DataFrame: The ten most similar rows of frequency.csv, best
        first, with an added 'similarity' column holding the score formatted
        to three decimals (as a string, as before).

    Raises:
        ValueError: If ``frequency_list`` does not have one entry per row of
            frequency.csv (raised by the column assignment).
    """
    frequency = pd.read_csv('frequency.csv', encoding='utf-8', index_col=None, header=0)

    query = get_value_dict(a_value)
    query_norm = math.sqrt(sum(q * q for q in query))

    scores = []
    for s in frequency_list:
        # Compare over the query's slots only (27 for sym_dict vectors).
        candidate = s[:len(query)]
        dot = sum(q * c for q, c in zip(query, candidate))
        denominator = query_norm * math.sqrt(sum(c * c for c in candidate))
        # A text without letters or spaces gives an all-zero vector; score it
        # 0 instead of raising ZeroDivisionError.
        scores.append(float(dot) / denominator if denominator else 0.0)

    # Rank on the full-precision scores (stable sort, so ties keep file
    # order), then format; ranking the 3-decimal strings discarded precision.
    frequency['similarity'] = scores
    top = frequency.sort_values(by="similarity", ascending=False, kind="mergesort")[0:10].copy()
    top['similarity'] = top['similarity'].map("{:.3f}".format)
    return top

In [28]:
def symptom_match(s, sym_dictionary,frequency_list):
    """Return ``s`` if it is a known symptom, else its closest known match.

    Args:
        s: Symptom text to look up.
        sym_dictionary: DataFrame with a 'Symptom' column.
        frequency_list: Vectors forwarded to ``calculate_similarity``.

    Returns:
        ``s`` itself when it occurs in the 'Symptom' column, otherwise the
        top-ranked 'Symptoms' entry returned by ``calculate_similarity``.
    """
    known_symptoms = sym_dictionary["Symptom"].tolist()
    if s not in known_symptoms:
        ranked = calculate_similarity(s, frequency_list)
        return ranked['Symptoms'].tolist()[0]
    return s

In [29]:
# Disabled helper kept as a bare string literal (never executed): it wraps a
# single value in a one-element list.
'''def add_value_to_list(value):
    value_list_set = []
    value_list_set.append(value)
    return value_list_set'''

In [30]:
# Disabled usage example for add_value_to_list, kept as a bare string literal
# (never executed); the output shown below the cell is presumably left over
# from a run made before it was disabled.
'''value = 'Headache'
add_value_to_list(value)'''

['Headache']

In [33]:
def symptom_check(value_list, sym_dictionary):
    """Tell whether every entered symptom is a known symptom name.

    Args:
        value_list: List of symptom names to check.
        sym_dictionary: DataFrame with a 'Symptom' column.

    Returns:
        bool: True only if ``value_list`` is non-empty and all of its entries
        occur in the 'Symptom' column; False otherwise.
    """
    known_symptoms = set(sym_dictionary["Symptom"].tolist())
    # The old loop returned on its first iteration, so only the first entry
    # was ever inspected. An empty list stays falsy, as before.
    return len(value_list) > 0 and all(s in known_symptoms for s in value_list)

In [34]:
def fix_the_input(input_list):
    """Map every entered symptom to a known symptom name.

    Uses the global ``sym_dictionary`` and ``frequency_list``.

    Example input: ['insomnia','general discmofort','headache','red blothies']

    Args:
        input_list: Iterable of symptom texts; it is not modified.

    Returns:
        list: A new list holding ``symptom_match`` of each entry, in order.
    """
    # Build a new list: overwriting the caller's list in place destroyed the
    # text the user actually typed, which the GUI displays next to the
    # corrected version.
    return [symptom_match(s, sym_dictionary, frequency_list) for s in input_list]

In [35]:
def make_table(value_list, sym_dictionary, sample_list, clf):
    """Build the disease ranking for a list of symptoms.

    Args:
        value_list: Symptom names to predict from.
        sym_dictionary: Name/CUI lookup DataFrame.
        sample_list: Template feature vector passed to ``symptom_disease``.
        clf: Fitted classifier.

    Returns:
        pandas.DataFrame: Columns 'disease', 'probability' (prediction
        result multiplied by 100) and 'symptoms', sorted by descending
        probability.
    """
    diseases = fill_disease_list(sym_dictionary, clf)
    probabilities = symptom_disease(value_list, sym_dictionary, sample_list)
    known_symptoms = fill_feature_list(clf)

    # Assemble the results as a table, one row per classifier class.
    ranking = pd.DataFrame({
        'disease': diseases,
        'probability': probabilities*100,
        'symptoms': known_symptoms,
    })
    return ranking.sort_values(by="probability", ascending=False)

In [43]:
'''import PySimpleGUI as sg

sg.theme('Dark Blue 3')   # Add a touch of color

    
layout = [[sg.Text('Symptoms:'), sg.InputText(size=(40,))],
     [sg.Button('Find Disease', key='submit')],
     [sg.Text('', key='disease 1', size=(100,5))],
     [sg.Text('', key='disease 2', size=(100,5))],
     [sg.Text('', key='disease 3', size=(100,5))],     
     [sg.Quit('quit', key='exit')]]
    
# Create the Window
window = sg.Window('Disease System', layout, grab_anywhere=True)

while True:
    event, value = window.Read()
    value_0 = value[0]
    value_str = str(value_0)
    value_list = value_str.split(",")
    
    if event == 'submit':
        #check if there are mistake in the input
        if symptom_check(value_list,sym_dictionary):
            the_table = make_table(value_list, sym_dictionary, sample_list, clf)
        else:
            #match the input symptoms with the exiting symptoms in the dict
            value_list_fix = fix_the_input(value_list)
            window.Element('disease 1').Update('The symptoms does not match!', text_color='red')
            window.Element('disease 2').Update('I consider your input as:', text_color='white')
            window.Element('disease 3').Update(value_list_fix, text_color='white')
            the_table = make_table(value_list_fix, sym_dictionary, sample_list, clf)

        print(the_table)
        the_table[0:101].to_csv("output disease.csv",index=False)

    elif event == 'exit':
        break
window.Close()'''

                                    disease  probability  \
116                             hemorrhoids     0.821514   
32                            bronchiektase     0.436448   
35                         bronchopneumonie     0.436448   
41                    schilddruesenkarzinom     0.435177   
369  schlafapnoesyndrom der oberen luftwege     0.433910   
..                                      ...          ...   
160                  chronic kidney failure     0.200751   
176        systemischer lupus erythematodes     0.200751   
143            hypoglycemia / hypoglycaemia     0.200182   
49          gluten - sensitive enteropathie     0.198489   
147                          hypothyroidism     0.196814   

                                              symptoms  
116  [haemorrhage, bowel sounds decreased, decrease...  
32                          [cough, lungenabszess nnb]  
35                                      [cough, fever]  
41                [schluckbeschwerden, dyspnea, cou

In [45]:
import PySimpleGUI as sg

sg.theme('Dark Blue 3')   # Add a touch of color


layout = [[sg.Text('Symptoms:'), sg.InputText(size=(40,))],
     [sg.Button('Find Disease', key='submit')],[sg.Button('Clear Symptoms', key='clear')],
     [sg.Text('', key='disease 1', size=(100,5))],
     [sg.Text('', key='disease 2', size=(100,5))],
     [sg.Text('', key='disease 3', size=(100,5))],
     [sg.Text('', key='disease 4', size=(100,5))],
     [sg.Quit('quit', key='exit')]]

# Create the Window
window = sg.Window('Disease System', layout, grab_anywhere=True)

# Symptoms accepted so far (already matched against the dictionary).
value_list = []
# Start with an empty result table so 'clear' works before the first submit.
the_table = pd.DataFrame(columns=['disease', 'probability', 'symptoms'])

try:
    while True:
        event, value = window.Read()

        # Closing the window yields event None and no values, so decide
        # whether to leave before touching `value`.
        if event is None or event == 'exit':
            break

        if event == 'submit':
            # Split the typed text itself. The old str(list).split(",") round
            # trip produced tokens polluted with brackets and quotes such as
            # "['headache']", which could never match a known symptom.
            typed = [part.strip() for part in str(value[0]).split(",")]
            entered = value_list + [part for part in typed if part]

            # symptom_match leaves known names untouched, so comparing the
            # result with the input tells whether anything had to be corrected.
            value_list_fix = fix_the_input(list(entered))

            window.Element('disease 1').Update(', '.join(map(str, entered)), text_color='red')
            if value_list_fix == entered:
                # Remove correction hints left over from an earlier submit.
                for key in ('disease 2', 'disease 3', 'disease 4'):
                    window.Element(key).Update('')
            else:
                window.Element('disease 2').Update('The symptoms does not match!', text_color='red')
                window.Element('disease 3').Update('I consider your input as:', text_color='white')
                window.Element('disease 4').Update(', '.join(map(str, value_list_fix)), text_color='white')

            value_list = value_list_fix
            # Pass a copy of the all-zero template so flags set for one query
            # cannot leak into the next one.
            the_table = make_table(value_list, sym_dictionary, list(sample_list), clf)

            print(the_table)
            the_table[0:101].to_csv("output disease.csv",index=False)

        elif event == 'clear':
            # Forget all entered symptoms and empty the result table.
            value_list = []
            for key in ('disease 2', 'disease 3', 'disease 4'):
                window.Element(key).Update('')
            window.Element('disease 1').Update('The symptoms in the list has been cleaned!', text_color='red')
            the_table = the_table.drop(index=the_table.index)
finally:
    # Close the window even if a lookup or prediction raised.
    window.Close()

                        disease  probability  \
116                 hemorrhoids     2.960497   
345            pontiac - fieber     1.670517   
150                   infection     1.627563   
246              pyelonephritis     1.599610   
164  pneumonie durch legionella     1.599610   
..                          ...          ...   
200       myocardial infarction     0.090168   
262               schizophrenia     0.090168   
28             bipolar disorder     0.088640   
109               failure heart     0.088640   
147              hypothyroidism     0.081418   

                                              symptoms  
116  [haemorrhage, bowel sounds decreased, decrease...  
345  [schnupfen, cough, throat sore, fever, verwirr...  
150  [fever, erythema, decreased translucency, hepa...  
246  [fever, pain, urgency of micturition, hematuri...  
164  [pleuritis, fever, rhinitis, laryngitis, heada...  
..                                                 ...  
200  [pain chest, st seg

In [None]:
# Show the three top-ranked rows of the result table in the GUI text fields.
# NOTE(review): the GUI cell above ends with window.Close(), so when the
# notebook is run top to bottom these updates target a closed window -
# confirm whether this belongs inside the event loop.
disease1, disease2, disease3 = (the_table[rank:rank + 1] for rank in range(3))
for key, top_row in (('disease 1', disease1),
                     ('disease 2', disease2),
                     ('disease 3', disease3)):
    window.Element(key).Update(top_row, text_color='white')