In [51]:
import pandas as pd
import numpy as np
import re
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB

In [41]:
# Load the self-made disease/symptom table; the first CSV row is the header
# and no column is used as the index.
db_table = pd.read_csv('selfmade-db.csv', encoding='utf-8', index_col=None, header=0)

In [42]:
def isValid(cui):
    """Tell whether the text form of ``cui`` starts with ``C`` plus seven digits.

    The value is converted with ``str`` before matching. The pattern is only
    anchored at the start, so extra characters after the seventh digit are
    accepted.

    Returns:
        bool: True on a match, False otherwise.
    """
    return re.match("C\\d{7}", str(cui)) is not None

def cuiToNumber(cui):
    """Return the numeric part of a CUI without its ``C`` prefix and leading zeros.

    Only the left-hand side is stripped, so zeros that belong to the number
    itself are kept (``"C0001200"`` -> ``"1200"``).

    Args:
        cui: CUI value; converted with ``str`` before processing.

    Returns:
        str: the remaining digits. An all-zero CUI yields an empty string.
    """
    return str(cui).lstrip("C").lstrip("0")

def convertCUI(cui):
    """Return ``cui`` as text in ``C`` + seven-digit form.

    Values that already pass ``isValid`` are returned unchanged (as ``str``).
    Anything else is left-padded with zeros to seven characters and prefixed
    with ``C``.
    """
    text = str(cui)
    return text if isValid(text) else "C" + text.zfill(7)

def clean(the_string):
    """Return ``the_string`` as a text string.

    Encoding to UTF-8 and wrapping the result in ``str`` yields the bytes
    repr (``"b'...'"``) on Python 3, so text is returned as-is instead.

    Args:
        the_string: ``str``, ``bytes``/``bytearray`` (decoded as UTF-8), or any
            other object (converted with ``str``).

    Returns:
        str: the textual value.
    """
    if isinstance(the_string, (bytes, bytearray)):
        return bytes(the_string).decode('utf-8')
    return str(the_string)

In [43]:
# Normalise both identifier columns to the "C" + seven-digit form
# (values that already match are left unchanged by convertCUI).
db_table['Disease'] = db_table['Disease'].apply(convertCUI)
db_table['Symptom'] = db_table['Symptom'].apply(convertCUI)

In [44]:
# Write the normalised CUIs back to disk.
# NOTE(review): this overwrites the very file the table was loaded from.
db_table.to_csv("selfmade-db.csv",index=False)

df_foreign = pd.read_csv('disease-symptom-other.csv', encoding='utf-8', index_col=None, header=0)

# Stack the external rows under the self-made ones. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the equivalent.
# NOTE(review): re-running this cell stacks the external rows again.
db_table = pd.concat([db_table, df_foreign])

db_table.to_csv("disease-symptom-merged-edit.csv",index=False)

In [45]:
# Treat empty strings as missing and drop incomplete rows BEFORE casting to
# str: astype(str) turns NaN into the literal text 'nan', which dropna no
# longer recognises as missing. Each step returns a new frame instead of using
# inplace=True on a column selection, which may act on a temporary copy.
db_table = db_table.replace({'Disease': {'': np.nan}, 'Symptom': {'': np.nan}})
db_table = db_table.dropna(subset=['Symptom', 'Disease'])
db_table = db_table.astype({'Disease': str, 'Symptom': str})

df = pd.DataFrame(db_table)

# One indicator column per distinct symptom, placed next to the disease label
# of the same row; identical (disease, symptom) rows are then removed.
df_1 = pd.get_dummies(df.Symptom)
df_s = df['Disease']
df_pivoted = pd.concat([df_s,df_1], axis=1)
df_pivoted = df_pivoted.drop_duplicates(keep='first')

In [46]:
# Every column after the leading 'Disease' label column.
cols = df_pivoted.columns[1:]

In [47]:
# Collapse to one row per disease by summing its indicator rows, then turn the
# 'Disease' group key back into an ordinary column.
df_pivoted = df_pivoted.groupby('Disease').sum().reset_index()

In [48]:
# Save the per-disease symptom table (row index is not written).
df_pivoted.to_csv('all_pivoted_edit.csv', index=False)

In [49]:
# Split the pivoted table into features and labels.
cols = df_pivoted.columns[1:]  # skip the leading 'Disease' column
x = df_pivoted.loc[:, cols]  # symptom columns, one row per disease
y = df_pivoted.loc[:, 'Disease']  # diseases
x.to_csv("all_x_edit.csv", index=False)

In [13]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [12]:
#clf = DecisionTreeClassifier(random_state = 0)
#clf = clf.fit(x_train, y_train)

#clf.score(x_test, y_test)

In [52]:
# Fit Gaussian naive Bayes on the complete table (no hold-out split is made).
clf = GaussianNB().fit(x, y)

# Accuracy on the same rows the model was fitted on, not a generalisation estimate.
clf.score(x, y)

0.9683698296836983

In [53]:
# Predicted label for every row of the feature matrix the model was fitted on.
disease_pred = clf.predict(x)

# Matching true labels as a plain array, for positional comparison.
disease_real = y.values

In [54]:
# Report each row whose predicted label differs from its actual label.
for predicted, actual in zip(disease_pred, disease_real):
    if predicted != actual:
        print ('Pred: {0} Actual:{1}'.format(predicted, actual))

Pred: C0006840 Actual:C0006849
Pred: C0001175 Actual:C0019682
Pred: C0001175 Actual:C0019693
Pred: C0036262 Actual:C0041976
Pred: C0036690 Actual:C0243026
Pred: C0014549 Actual:C0494475
Pred: C0039621 Actual:C0549346
Pred: C0006142 Actual:C0678222
Pred: C0242379 Actual:C0684249
Pred: C0007102 Actual:C0699790
Pred: C0036690 Actual:C1090821
Pred: C0031039 Actual:C1253937
Pred: C0020428 Actual:C1384514


In [55]:
# Persist the fitted classifier using pickle protocol 2.
# NOTE(review): the file name says "mnb" although clf is a GaussianNB instance.
joblib.dump(clf,'all_mnb.pkl', protocol=2)

# Reload the feature matrix that was written to all_x_edit.csv earlier.
data = pd.read_csv("all_x_edit.csv")

In [56]:
df = pd.DataFrame(data)
features = df.columns # = symptoms
# Comma-separated list of all feature (column) names.
features_raw = ','.join(str(name) for name in features)

In [57]:
# convert feature array into dict of symptom: index
# Map every symptom column name to its position in the feature vector.
feature_dict = {name: position for position, name in enumerate(features)}

In [58]:
def findFeatures(disease):
    """Return the ``Symptom`` values of all ``db_table`` rows for ``disease``.

    Reads the module-level ``db_table``; the result is a NumPy array of
    strings (empty when no row matches).
    """
    matches = db_table['Disease'] == disease
    symptoms = db_table.loc[matches, "Symptom"]
    return symptoms.values.astype(str)

In [59]:
# All-zero feature vector (no symptom set), one entry per feature column.
# The np.int alias was removed in NumPy 1.24; the builtin int selects the same
# default integer dtype.
sample = np.zeros((len(features),), dtype=int)
sample_list = sample.tolist()

In [60]:
# Load the name <-> CUI dictionary; later cells read its Symptom, Symptom_CUI,
# Disease_UMLS and Disease_CUI columns.
sym_dictionary = pd.read_csv('full_dictionary.csv', encoding='utf-8', index_col=None, header=0)

In [61]:
# Display the loaded dictionary for inspection.
sym_dictionary

Unnamed: 0.1,Unnamed: 0,Disease_UMLS,Disease_CUI,备注（英语）,Symptom,Symptom_CUI,备注
0,,acquired immuno-deficiency syndrome,C0001175,,abdomen acute,C0000727,
1,171.0,akromegalie,C0001206,acromegalic,akutes abdomen,C0000727,印尼语
2,301.0,akute pankreatitis,C0001339,acute pancreatitis,distended abdomen,C0000731,
3,,adenocarcinoma,C0001418,,pain abdominal,C0000737,
4,,adhesion,C0001511,,abdominal schmerzen,C0000737,德语
...,...,...,...,...,...,...,...
1136,,,,,voellegefuehl,C3714614,
1137,,,,,malabsorption,C3714745,
1138,,,,,unruhe,C3887611,
1139,,,,,affektive ambivalenz,C4049320,


In [62]:
def find_symtom_cui(value_list, sym_dictionary):
    """Translate symptom names into CUIs using ``sym_dictionary``.

    Args:
        value_list: iterable of symptom names, matched exactly against the
            ``Symptom`` column.
        sym_dictionary: DataFrame with ``Symptom`` and ``Symptom_CUI`` columns.

    Returns:
        list: one CUI per entry of ``value_list``, in the same order. When a
        name occurs on several rows, the first row wins.

    Raises:
        ValueError: if a name is not present in the ``Symptom`` column.
    """
    # Build the lookup once instead of scanning the whole column per symptom.
    cui_by_symptom = {}
    for name, cui in zip(sym_dictionary["Symptom"].tolist(),
                         sym_dictionary["Symptom_CUI"].tolist()):
        cui_by_symptom.setdefault(name, cui)  # keep the first occurrence

    search_cui = []
    for s in value_list:
        if s not in cui_by_symptom:
            raise ValueError("unknown symptom: {!r}".format(s))
        search_cui.append(cui_by_symptom[s])
    return search_cui

In [277]:
# input_symptoms = ['abdomen acute','malabsorption', 'unruhe']
# search_cui = find_symtom_cui(input_symptoms, sym_dictionary)
# search_cui

['C0000727', 'C3714745', 'C3887611']

In [63]:
def find_symptom(input_sym_cui, sym_dictionary):
    """Translate symptom CUIs into symptom names using ``sym_dictionary``.

    Args:
        input_sym_cui: iterable of symptom CUIs, matched exactly against the
            ``Symptom_CUI`` column.
        sym_dictionary: DataFrame with ``Symptom`` and ``Symptom_CUI`` columns.

    Returns:
        list: one symptom name per CUI, in input order. When a CUI occurs on
        several rows, the name from the first row is used.

    Raises:
        ValueError: if a CUI is not present in the ``Symptom_CUI`` column.
    """
    # Build the lookup once instead of scanning the whole column per CUI.
    symptom_by_cui = {}
    for cui, name in zip(sym_dictionary["Symptom_CUI"].tolist(),
                         sym_dictionary["Symptom"].tolist()):
        symptom_by_cui.setdefault(cui, name)  # keep the first occurrence

    search_sym = []
    for s in input_sym_cui:
        if s not in symptom_by_cui:
            raise ValueError("unknown symptom CUI: {!r}".format(s))
        search_sym.append(symptom_by_cui[s])
    return search_sym

In [250]:
'''input_sym_cui = ['C0232257', 'C0871754']
search_sym = find_symptom(input_sym_cui, sym_dictionary)
search_sym'''

['systolic murmur', 'frail']

In [64]:
def find_disease(input_disease_cui, sym_dictionary):
    """Translate disease CUIs into disease names using ``sym_dictionary``.

    Args:
        input_disease_cui: iterable of disease CUIs, matched exactly against
            the ``Disease_CUI`` column.
        sym_dictionary: DataFrame with ``Disease_UMLS`` and ``Disease_CUI``
            columns.

    Returns:
        list: one disease name per CUI, in input order. When a CUI occurs on
        several rows, the name from the first row is used.

    Raises:
        ValueError: if a CUI is not present in the ``Disease_CUI`` column.
    """
    # Build the lookup once instead of scanning the whole column per CUI.
    disease_by_cui = {}
    for cui, name in zip(sym_dictionary["Disease_CUI"].tolist(),
                         sym_dictionary["Disease_UMLS"].tolist()):
        disease_by_cui.setdefault(cui, name)  # keep the first occurrence

    search_di = []
    for s in input_disease_cui:
        if s not in disease_by_cui:
            raise ValueError("unknown disease CUI: {!r}".format(s))
        search_di.append(disease_by_cui[s])
    return search_di

In [50]:
# for s in search_cui:
    # sample_list[feature_dict[s]] = 1

In [280]:
# sample_list = np.array(sample_list).reshape(1,len(sample_list))

# predict_results = clf.predict_proba(sample_list)[0]

In [281]:
# gets a dictionary of {'class_name': probability}
# prob_per_class_dictionary = dict(zip(clf.classes_, predict_results))
# disease_list = sym_dictionary["Disease_UMLS"].tolist()
# prob_per_class_dictionary = dict(zip(disease_list, predict_results))

In [282]:
# gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
#results_ordered_by_probability = map(lambda x: {"disease": x[0],"prop": x[1] * 100, "sy": findFeatures(x[0])}, sorted(zip(clf.classes_, predict_results), key=lambda x: x[1], reverse=True))

In [283]:
'''disease_list_1 = find_disease(clf.classes_, sym_dictionary)
feature_list = []
for s in clf.classes_:
    feature_list.append(find_symptom(findFeatures(s), sym_dictionary))'''


In [284]:
# table_1 = pd.DataFrame({'disease': disease_list_1,'probability': predict_results *100, 'symptoms': feature_list})
# table_2 = table_1.sort_values(by="probability" , ascending=False)

Unnamed: 0,disease,probability,symptoms
360,ischialgie,50.0,[paraesthesien und dysaesthesien]
276,tetanie,50.0,[paraesthesien und dysaesthesien]
270,subarachnoidalblutung,0.0,"[headache, meningismus, bewusstseinsstoerungen..."
280,thyreotoxische krise,0.0,"[sinustachykardie, fever / hyperthermie, flush..."
279,thrombocytopaenia,0.0,"[ecchymoses, monocytosis, posterior rhinorrhea..."
...,...,...,...
134,hyperkalzaemie,0.0,"[nausea, vomiting, constipation, nephrolithias..."
133,hyperbilirubinemia,0.0,"[cyanosis, tachypnea, para 1, bradycardia, bre..."
132,hyperaldosteronismus,0.0,"[hypertonie, hypokalemia, metabolische alkalose]"
131,hydrops fetalis,0.0,"[ascites, pleuraerguss, polyhydramnion]"


In [65]:
def symptom_disease(value_list, sym_dictionary, sample_list):
    """Return the classifier's class probabilities for a set of symptom names.

    Uses the module-level ``feature_dict`` and ``clf``.

    Args:
        value_list: symptom names to look up in ``sym_dictionary``.
        sym_dictionary: DataFrame passed through to ``find_symtom_cui``.
        sample_list: template feature vector (one entry per feature). It is
            copied, never modified.

    Returns:
        The first (only) row of ``clf.predict_proba`` for the built sample,
        i.e. one probability per entry of ``clf.classes_``.

    Raises:
        KeyError: if a symptom's CUI is not a key of ``feature_dict``.
    """
    search_cui = find_symtom_cui(value_list, sym_dictionary)

    # Work on a copy: the caller passes one shared template, and setting flags
    # on it directly would carry this query's symptoms over into the next call.
    sample = list(sample_list)
    for s in search_cui:
        sample[feature_dict[s]] = 1

    # predict_proba expects a 2-D array: a single row with all features.
    sample_row = np.array(sample).reshape(1, len(sample))

    predict_results = clf.predict_proba(sample_row)[0]

    return predict_results

In [66]:
def fill_disease_list(sym_dictionary, clf):
    """Return the disease name for every class of ``clf``.

    The names are looked up with ``find_disease`` and come back in the same
    order as ``clf.classes_``.
    """
    return find_disease(clf.classes_, sym_dictionary)

In [67]:
def fill_feature_list(clf):
    """Return, for every class of ``clf``, the list of its symptom names.

    Symptom CUIs come from ``findFeatures`` and are translated with
    ``find_symptom`` using the module-level ``sym_dictionary``. The outer list
    follows the order of ``clf.classes_``.
    """
    return [find_symptom(findFeatures(label), sym_dictionary) for label in clf.classes_]


In [68]:
def symptom_check(value_list, sym_dictionary):
    """Return True only if every entered symptom is known to the dictionary.

    Accepting the input when just one entry is known lets unknown names
    through to the CUI lookup, which then fails; so all entries must match.

    Args:
        value_list: symptom names entered by the user.
        sym_dictionary: DataFrame with a ``Symptom`` column.

    Returns:
        bool: False for an empty ``value_list`` or when any entry is missing
        from the ``Symptom`` column, True otherwise.
    """
    if len(value_list) == 0:
        return False
    known_symptoms = set(sym_dictionary["Symptom"].tolist())
    return all(s in known_symptoms for s in value_list)

In [69]:
# Example symptom input for trying out the helper functions.
value_list = ['fever', 'cough']

In [38]:
def correct_symptom(value_list):
    """Print the last entry of ``value_list`` and return it as a list of characters.

    NOTE(review): the previous body held an unused per-letter lookup table and
    an incomplete assignment that made the cell a syntax error. The intended
    correction logic is not visible, so only the runnable part is kept.
    TODO: implement the actual symptom correction.

    Args:
        value_list: iterable of symptom strings.

    Returns:
        list: the characters of the last entry, or ``[]`` for empty input.
    """
    s_2 = []
    for s in value_list:
        s_2 = list(s)
    print("".join(s_2))

    return s_2
    

In [39]:
# Try the helper on the example input.
correct_symptom(value_list)

cough


['c', 'o', 'u', 'g', 'h']

In [70]:
def make_table(value_list, sym_dictionary, sample_list, clf):
    """Build a table of diseases ranked by predicted probability.

    Combines the disease names (``fill_disease_list``), the class
    probabilities for the entered symptoms (``symptom_disease``) and the
    symptom names per disease (``fill_feature_list``).

    Returns:
        DataFrame with columns ``disease``, ``probability`` (in percent) and
        ``symptoms``, sorted by ``probability`` in descending order.
    """
    names = fill_disease_list(sym_dictionary, clf)
    probabilities = symptom_disease(value_list, sym_dictionary, sample_list)
    symptoms = fill_feature_list(clf)

    # Output the results as a table, highest probability first.
    ranked = pd.DataFrame({
        'disease': names,
        'probability': probabilities * 100,
        'symptoms': symptoms,
    })
    return ranked.sort_values(by="probability", ascending=False)

In [72]:
import PySimpleGUI as sg

sg.theme('Dark Blue 3')   # Add a touch of color

# One input line for comma-separated symptoms, a submit button, three result
# fields for the top-ranked diseases, and a quit button.
layout = [[sg.Text('Symptoms:'), sg.InputText(size=(40,))],
          [sg.Button('Find Disease', key='submit')],
          [sg.Text('', key='disease 1', size=(100,5))],
          [sg.Text('', key='disease 2', size=(100,5))],
          [sg.Text('', key='disease 3', size=(100,5))],
          [sg.Quit('quit', key='exit')]]

# Create the Window
window = sg.Window('Disease System', layout, grab_anywhere=True)

result_keys = ('disease 1', 'disease 2', 'disease 3')

try:
    while True:
        event, value = window.Read()

        # Closing the window yields event None together with no values, so
        # this must be checked before the input field is read.
        if event is None or event == 'exit':
            break
        if event != 'submit':
            continue

        # Split on commas and drop surrounding whitespace / empty entries so
        # that "fever, cough" matches the dictionary entries.
        value_list = [s.strip() for s in str(value[0]).split(",") if s.strip()]

        the_table = None
        if symptom_check(value_list, sym_dictionary):
            try:
                # Pass a copy of the all-zero template so one query cannot
                # leave its symptom flags behind for the next one.
                the_table = make_table(value_list, sym_dictionary, list(sample_list), clf)
            except (KeyError, ValueError):
                # A symptom missing from the dictionary or from the model's
                # features; reported below instead of crashing the loop.
                the_table = None

        if the_table is None:
            window.Element('disease 1').Update('There is a problem!', text_color='red')
            # Clear stale results from a previous successful query.
            window.Element('disease 2').Update('')
            window.Element('disease 3').Update('')
            continue

        # Show the three most probable diseases, one table row per field.
        for rank, key in enumerate(result_keys):
            window.Element(key).Update(the_table[rank:rank + 1], text_color='white')
finally:
    # Release the window even if an unexpected error ends the loop.
    window.Close()


In [32]:
#print (list(results_ordered_by_probability))


In [32]:
#store the predicted probabilities for class 1
# Class probabilities for the all-zero symptom vector. predict_proba expects a
# 2-D (n_samples, n_features) array, so the 1-D sample is reshaped to one row.
y_pred_prob = clf.predict_proba(sample.reshape(1, -1))[0]

In [33]:
# Display the probability computed for each class.
y_pred_prob

array([0.0037594, 0.       , 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594, 0.0037594,
       0.0037594, 0.0037594, 0.0037594, 0.0037594, 