In [2]:
import pandas as pd
import numpy as np
import re
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn import metrics 
import PySimpleGUI as sg



In [3]:
result = pd.read_csv('selfmade-db.csv', encoding='utf-8', index_col=None, header=0)

In [4]:
def isValid(cui):
    cui = str(cui)
    pattern = re.compile("C\\d{7}")
    if not pattern.match(cui):
        return False
    return True

def cuiToNumber(cui):
      return cui.strip("C").strip("0")

def convertCUI(cui):
    cui = str(cui)
    if not isValid(cui):
        return "C" + cui.zfill(7)
    else:
        return cui

def clean(the_string):
    return str(the_string.encode('utf-8'))

In [5]:
result['Disease'] = result['Disease'].apply(convertCUI)
result['Symptom'] = result['Symptom'].apply(convertCUI)

In [6]:
result.to_csv("selfmade-db.csv",index=False)

df_foreign = pd.read_csv('disease-symptom-other.csv', encoding='utf-8', index_col=None, header=0)

result = result.append(df_foreign)

result.to_csv("disease-symptom-merged-edit.csv",index=False)

In [7]:
result['Disease'] = result['Disease'].astype(str)
result['Symptom'] = result['Symptom'].astype(str)

result['Symptom'].replace('', np.nan, inplace=True)
result.dropna(subset=['Symptom'], inplace=True)

result['Disease'].replace('', np.nan, inplace=True)
result.dropna(subset=['Disease'], inplace=True)

df = pd.DataFrame(result)
df.columns

df_1 = pd.get_dummies(df.Symptom)
df_s = df['Disease']
df_pivoted = pd.concat([df_s,df_1], axis=1)
df_pivoted.drop_duplicates(keep='first',inplace=True)

In [8]:
cols = df_pivoted.columns
cols = cols[1:] # skip 'Disease'

In [9]:
df_pivoted = df_pivoted.groupby('Disease').sum()
df_pivoted = df_pivoted.reset_index()

In [10]:
df_pivoted.to_csv('all_pivoted_edit.csv', index=False)

In [11]:
#split labels and features:
cols = df_pivoted.columns
cols = cols[1:] # skip 'title'
x = df_pivoted[cols] # symptom rows
y = df_pivoted['Disease'] # diseases
x.to_csv("all_x_edit.csv", index=False)

In [12]:
def build_a_model(x, y):
    
    #split train and test data:
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    
    #clf = DecisionTreeClassifier(max_depth = 4, random_state = 0)
    #clf = clf.fit(x_train, y_train)
    
    clf_tot = DecisionTreeClassifier(random_state = 0)
    clf_tot = clf_tot.fit(x, y)

    return clf_tot

In [13]:
clf = build_a_model(x, y)
joblib.dump(clf,'all_mdt.pkl', protocol=2)

In [14]:
def evaluate_model(clf,x,y):
    disease_pred = clf.predict(x)
    
    score = clf.score(x, y)
    disease_real = y.values
    
    mismatch_num = 0
    for i in range(0, len(disease_real)):
        if disease_pred[i]!=disease_real[i]:
            print ('Pred: {0} Actual:{1}'.format(disease_pred[i], disease_real[i]))
            mismatch_num +=1
    #print('Total: {0} Mismatch:{1}'.format(len(disease_real), mismatch_num))      
    return disease_pred

In [15]:
disease_pred = evaluate_model(clf,x,y)

Pred: C0006840 Actual:C0006849
Pred: C0001175 Actual:C0019682
Pred: C0001175 Actual:C0019693
Pred: C0036262 Actual:C0041976
Pred: C0036690 Actual:C0243026
Pred: C0014549 Actual:C0494475
Pred: C0039621 Actual:C0549346
Pred: C0006142 Actual:C0678222
Pred: C0242379 Actual:C0684249
Pred: C0007102 Actual:C0699790
Pred: C0036690 Actual:C1090821
Pred: C0031039 Actual:C1253937
Pred: C0020428 Actual:C1384514


In [32]:
data = pd.read_csv("all_x_edit.csv")
    
df = pd.DataFrame(data)
features = df.columns # = symptoms
'''features_raw = [str(features[x]) for x in range(len(features))]
features_raw = ','.join(map(str, features_raw))'''
    
# convert feature array into dict of symptom: index
feature_dict = {}
for i,f in enumerate(features):
    feature_dict[f] = i


In [33]:
def findFeatures(disease):
    return result.loc[result['Disease'] == disease]["Symptom"].values.astype(str)

In [35]:
input_symptoms = ['abdomen acute', 'bauch schmerzen', 'azidose']

In [40]:
def find_disease(input_symptoms, clf, features, feature_dict):

    input_symptoms = list(input_symptoms)
    
    sym_dictionary = pd.read_csv('full_dictionary.csv', encoding='utf-8', index_col=None, header=0)
    
    #create an empty list of sample
    sample = np.zeros((len(features),), dtype=np.int)
    sampe = sample.tolist()
    
    search = []
    symtom_list = sym_dictionary["Symptom"].tolist() 
    symptom_cui_list = sym_dictionary["Symptom_CUI"].tolist() 
    
    #find the position of symptoms in pivot
    for i,s in enumerate(input_symptoms): 
        search.append(symptom_cui_list[symtom_list.index(s)])
    for i,s in enumerate(search):
        sample[feature_dict[s]] = 1
        
    sample_re = np.array(sampe).reshape(1,len(sampe))

    results = clf.predict_proba(sample_re)[0]
    
    # gets a dictionary of {'class_name': probability}
    prob_per_class_dictionary = dict(zip(clf.classes_, results))
    
    # gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
    results_ordered_by_probability = map(lambda x: {"disease": x[0],"prop": x[1] * 100, "sy": findFeatures(x[0])}, sorted(zip(clf.classes_, results), key=lambda x: x[1], reverse=True))
    
    prob_order = list(results_ordered_by_probability)
    
    '''disease_result = []
    disease_list = sym_dictionary["Disease_UMLS"].tolist()
    disease_cui_list = sym_dictionary["Disease_CUI"].tolist()
    for i,s in enumerate(results_ordered_by_probability): 
        disease_result.append(disease_list[disease_cui_list.index(s)])'''
        
    #store the predicted probabilities for class 1
    y_pred_prob = clf.predict_proba(sample)[0]
    
    return prob_order

In [39]:
prob_order = find_disease(input_symptoms, clf, features, feature_dict)
prob_order

ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 1. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [43]:
sg.theme('Dark Blue 3')   # Add a touch of color

    
layout = [[sg.Text('Symptoms:'), sg.InputText(size=(15,))],
     [sg.Button('Find Disease', key='submit')],
     [sg.Text('', key='prop_order', size=(20,2))],
     [sg.Quit('quit', key='exit')]]
    
# Create the Window
window = sg.Window('Disease System', layout, alpha_channel=.8, grab_anywhere=True)


while True:
    event, value = window.Read()
    print(value)
    if event == 'submit':
        prop_order = find_disease(input_symptoms, clf, features, feature_dict)
        if prop_order:
            window.Element('prop_order').Update(prop_order, text_color='black')
        else:
            window.Element('prop_order').Update('There is a problem!', text_color='red')
    elif event == 'exit':
        break
window.Close()
            

{0: 'headache'}


ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 1. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
def menu():
    #sg.Menu()
    menu_def = [['File', ['Open', 'Save', 'Exit', 'Properties']],
                ['Edit', ['Paste', ['Special', 'Normal', ], 'Undo'], ],
                ['Help', 'About...'], ]
    layout = [[sg.Menu(menu_def, tearoff=True)]]
    window = sg.Window('MENU', default_element_size=(40, 5), grab_anywhere=True).Layout(layout)
    event, values = window.Read()

In [None]:
def func(prob_order):
    print(prob_order)