In [131]:
import pandas as pd
import numpy as np
import json
import ast
import re
import pickle
from tqdm import tqdm
from constants import base_path, n_questions
import matplotlib.pyplot as plt
from collections import Counter
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()

In [132]:
# Read the condition catalogue that ships with the dataset.
with open(f"{base_path}\\input\\release_conditions.json") as f:
    disease_dict = json.load(f)

# The top-level keys are used as disease names; their order follows the file.
disease_list = [name for name in disease_dict]

In [133]:
with open(f"{base_path}\\input\\release_evidences.json") as f:
    evidences = json.load(f)

# Codes kept as model features: binary symptoms only (no list of possible
# values) that are not flagged as antecedents.
evidences_list = [
    code
    for code, spec in evidences.items()
    if not (spec["possible-values"] or spec["is_antecedent"])
]

# code -> English question text
evidences_dict = {code: evidences[code]["question_en"] for code in evidences_list}
# English question text -> code (if a text repeats, the last code wins)
evidences_en_to_code = {text: code for code, text in evidences_dict.items()}
evidences_list_en = list(evidences_en_to_code)

# Second name for the SAME dict object: the AGE/SEX entries added below are
# therefore visible through evidences_code_to_en as well.
evidences_code_to_en = evidences_dict
evidences_dict.update({"AGE": "AGE", "SEX": "SEX"})

# Column order expected by the models: demographics first, then the evidences.
feature_columns = ["AGE", "SEX", *evidences_list]

In [134]:
def get_ftr_importance_df(feature_importance_dict):
    """Arrange per-disease symptom relevances as a question x disease table.

    Parameters
    ----------
    feature_importance_dict : dict
        Maps each disease name to a dict whose ``"top10_relevant_symptoms"``
        entry maps English question text to a relevance score.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the module-level ``evidences_list_en`` (index
        named ``"evidence"``) and one column per disease. A question missing
        from a disease's mapping gets 0.
    """
    relevance_by_disease = {
        disease: [
            info["top10_relevant_symptoms"].get(question, 0)
            for question in evidences_list_en
        ]
        for disease, info in feature_importance_dict.items()
    }
    question_index = pd.Index(evidences_list_en, name="evidence")
    return pd.DataFrame(relevance_by_disease, index=question_index)

In [135]:
def data_proc(df, questionnaire):
    """Build model features for the patients in ``df`` under a fixed questionnaire.

    An evidence indicator is set to 1 only when the patient reported the
    evidence AND the evidence is part of ``questionnaire``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw patient rows. Columns read here: ``EVIDENCES`` (a Python list
        literal stored as text), ``SEX`` (``'F'``/``'M'``), ``PATHOLOGY``,
        ``INITIAL_EVIDENCE`` and ``AGE`` (via ``feature_columns``).
        The caller's frame is left untouched.
    questionnaire : iterable of str
        Evidence codes that are asked.

    Returns
    -------
    ftr_df : pandas.DataFrame
        ``feature_columns`` plus ``PATHOLOGY``.
    hit_rate : pandas.Series
        Per patient: asked-and-reported evidences divided by all reported
        binary evidences. NaN/inf when a patient reports no binary evidence.
    questionnaire_df : pandas.DataFrame
        ``binary_evidences_all``, ``binary_evidences`` and ``INITIAL_EVIDENCE``.

    Notes
    -----
    Relies on ``tqdm.pandas()`` having been called (``progress_apply``) and on
    the module-level ``evidences_list`` / ``feature_columns``.
    """
    # Private copy: the original wrote helper columns and the re-coded SEX into
    # the caller's frame, so a second call on the same frame mapped SEX to NaN.
    df = df.copy()

    # Sets give O(1) membership tests; evidence codes are assumed to be strings.
    known_codes = set(evidences_list)
    asked_codes = set(questionnaire)

    # Parse the serialized list once per row instead of once per derived column.
    reported = df["EVIDENCES"].apply(ast.literal_eval)

    df["binary_evidences_all"] = reported.apply(
        lambda codes: [c for c in codes if c in known_codes])
    df["binary_evidences_all_count"] = df["binary_evidences_all"].apply(len)
    df["binary_evidences"] = reported.progress_apply(
        lambda codes: [c for c in codes if c in asked_codes])
    df["binary_evidences_count"] = df["binary_evidences"].apply(len)
    df["hit_rate"] = df["binary_evidences_count"] / df["binary_evidences_all_count"]
    hit_rate = df["hit_rate"]

    # One 0/1 indicator column per binary evidence.
    answered = df["binary_evidences"].apply(set)
    for e in evidences_list:
        df[e] = answered.apply(lambda codes: int(e in codes))

    df["SEX"] = df["SEX"].map({'F': 0, 'M': 1})

    # Explicit copies: later cells add columns to both returned frames, which
    # should not be writes into a slice of this function's working frame.
    ftr_df = df[feature_columns + ["PATHOLOGY"]].copy()
    questionnaire_df = df[["binary_evidences_all", "binary_evidences", "INITIAL_EVIDENCE"]].copy()
    return ftr_df, hit_rate, questionnaire_df

In [136]:
def pred(x, diseases=None):
    """Return the disease(s) with the highest strictly positive probability.

    Parameters
    ----------
    x : sequence of float
        One probability per disease, matched to ``diseases`` by position.
        A pandas Series is accepted; its labels are ignored.
    diseases : sequence of str, optional
        Disease names aligned with ``x``. Defaults to the module-level
        ``disease_list``.

    Returns
    -------
    list
        All names tied for the maximum probability, in ``diseases`` order.
        Empty when no probability is greater than 0.

    Raises
    ------
    IndexError
        If ``x`` has fewer entries than there are disease names.
    """
    names = disease_list if diseases is None else list(diseases)

    # Convert to a plain array once. Integer indexing on a label-indexed
    # Series (x[i]) only worked through pandas' deprecated positional fallback,
    # and building a DataFrame per row made this the slowest cell by far.
    probs = np.asarray(x, dtype=float)
    if probs.shape[0] < len(names):
        raise IndexError(
            f"expected {len(names)} probabilities, got {probs.shape[0]}")
    probs = probs[:len(names)]

    positive = probs > 0  # NaN compares False, so missing scores are ignored
    if not positive.any():
        return []

    # return only top 1 - allows ties
    best = probs[positive].max()
    return [names[i] for i in np.flatnonzero(positive & (probs == best))]

In [137]:
def get_missed_evidence(actual, asked):
    """Return the evidences in ``actual`` that are not in ``asked``.

    Parameters
    ----------
    actual : iterable
        Evidence codes the patient reported.
    asked : iterable
        Evidence codes covered by the questionnaire.

    Returns
    -------
    list
        Unique codes from ``actual`` that are absent from ``asked``, in the
        order they first appear in ``actual``.
    """
    asked_codes = set(asked)
    # Set difference, not symmetric difference: a code that was asked but never
    # reported is not "missed". dict.fromkeys de-duplicates while keeping order,
    # so the result is deterministic (a bare set has arbitrary order).
    return [code for code in dict.fromkeys(actual) if code not in asked_codes]

In [138]:
def validate(x):
    """Tell whether the prediction is exactly the single true pathology.

    Parameters
    ----------
    x : sequence or pandas.Series
        First element is the true pathology, second element is the list of
        predicted diagnoses.

    Returns
    -------
    bool
        True only when the prediction list holds the true pathology and
        nothing else; a tie that includes the right answer counts as a miss.
    """
    # Use .iloc for pandas rows: plain x[0] on a label-indexed Series relies
    # on a deprecated positional fallback.
    if hasattr(x, "iloc"):
        truth, predicted = x.iloc[0], x.iloc[1]
    else:
        truth, predicted = x[0], x[1]
    return [truth] == predicted

## Random Forest

In [139]:
# Load the raw validation patients; data_proc() reads its EVIDENCES, SEX,
# PATHOLOGY and INITIAL_EVIDENCE columns (plus AGE via feature_columns).
diagnosis_df_valid = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

In [140]:
# Per-disease feature importances; get_ftr_importance_df() reads the
# "top10_relevant_symptoms" mapping of every entry.
with open(f"{base_path}\\output\\feature_importance.json") as f:
  feature_importance_dict = json.load(f)
# Uncomment to inspect the raw mapping:
# feature_importance_dict

In [141]:
# Rank questions by their relevance averaged over every disease column. A
# question that is absent from a disease's mapping counts as 0 in the mean
# (zeros are deliberately not masked as NaN here).
feature_importance_df = get_ftr_importance_df(feature_importance_dict)
feature_importance_df["mean_relevance"] = feature_importance_df.mean(axis=1)

# The n_questions highest-ranked questions form the fixed questionnaire.
feature_importance_df_top = (
    feature_importance_df
    .sort_values(by="mean_relevance", ascending=False)
    .head(n_questions)
)
feature_importance_df_top

Unnamed: 0_level_0,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,mean_relevance
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Do you have pain somewhere, related to your reason for consulting?",0.048024,0.075092,0.049367,0.031275,0.028815,0.0,0.023572,0.086531,0.042174,0.129557,...,0.045449,0.043964,0.137135,0.0,0.045028,0.049111,0.020412,0.018093,0.050291,0.086082
Do you have a cough?,0.059795,0.106704,0.051196,0.029649,0.041469,0.047593,0.034843,0.040993,0.047443,0.025968,...,0.079045,0.109685,0.0,0.042054,0.061363,0.053941,0.0,0.03497,0.069687,0.049375
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.0,0.029475,0.017294,0.014129,0.0,0.156626,0.0,0.034248,0.184359,0.0,...,0.0,0.015698,0.013473,0.016532,0.033064,0.018412,0.093787,0.0,0.0,0.034908
Do you have a fever (either felt or measured with a thermometer)?,0.025764,0.066418,0.027025,0.023705,0.027436,0.094198,0.0,0.036879,0.02749,0.0,...,0.0,0.058277,0.175665,0.031826,0.034001,0.033759,0.028436,0.0,0.027511,0.032308
Are you experiencing shortness of breath or difficulty breathing in a significant way?,0.0,0.120256,0.017359,0.0,0.05567,0.0,0.0,0.107961,0.066274,0.0,...,0.054629,0.05363,0.071619,0.046348,0.0,0.0,0.065434,0.027737,0.020826,0.032287
Do you have swelling in one or more areas of your body?,0.025,0.034212,0.0,0.015056,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018441,0.0,0.242545,0.030562,0.025989
Do you have nasal congestion or a clear runny nose?,0.019959,0.056125,0.016219,0.012694,0.025337,0.0,0.0,0.103667,0.015233,0.0,...,0.025575,0.039647,0.12097,0.024041,0.0,0.020356,0.0,0.0,0.021333,0.023066
Do you have pain that is increased when you breathe in deeply?,0.346442,0.0,0.0,0.242905,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182836,0.019937
Do you have a cough that produces colored or more abundant sputum than usual?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027133,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019096
Do you have symptoms that are increased with physical exertion but alleviated with rest?,0.105666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.025487,0.0,0.0,0.112252,0.021531,0.018923


In [142]:
# Translate the top-ranked English questions back to evidence codes,
# keeping the ranking order.
fixed_questionnaire = [
    evidences_en_to_code[question] for question in feature_importance_df_top.index
]
fixed_questionnaire

['E_53',
 'E_201',
 'E_129',
 'E_91',
 'E_66',
 'E_151',
 'E_181',
 'E_220',
 'E_77',
 'E_218',
 'E_155',
 'E_45',
 'E_50',
 'E_214',
 'E_194',
 'E_148',
 'E_217',
 'E_162',
 'E_89',
 'E_103']

In [143]:
# Show the selected questionnaire as English question text.
[evidences_code_to_en[e] for e in fixed_questionnaire]

['Do you have pain somewhere, related to your reason for consulting?',
 'Do you have a cough?',
 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?',
 'Do you have a fever (either felt or measured with a thermometer)?',
 'Are you experiencing shortness of breath or difficulty breathing in a significant way?',
 'Do you have swelling in one or more areas of your body?',
 'Do you have nasal congestion or a clear runny nose?',
 'Do you have pain that is increased when you breathe in deeply?',
 'Do you have a cough that produces colored or more abundant sputum than usual?',
 'Do you have symptoms that are increased with physical exertion but alleviated with rest?',
 'Do you feel your heart is beating fast (racing), irregularly (missing a beat) or do you feel palpitations?',
 'Have you been coughing up blood?',
 'Have you had significantly increased sweating?',
 'Have you noticed a wheezing sound when you exhale?',

In [144]:
# Load one fitted model per disease from its own output folder.
model_dict = {}
for disease in disease_list:
    # File stem: drop every character except letters, digits, spaces, newlines
    # and dots, then replace spaces with underscores. The pattern is a raw
    # string because "\." in a normal string literal is an invalid escape.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_model.pkl', 'rb') as f:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files produced by this project.
        model_dict[disease] = pickle.load(f)

In [145]:
# Dry run on the first five rows to eyeball the three returned objects;
# nothing is assigned here.
data_proc(diagnosis_df_valid.head(5), fixed_questionnaire)

100%|██████████| 5/5 [00:00<00:00, 4169.29it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     1      0      0      0      0      0      0  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      0      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      0      0      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    0.285714
 1    0.444444
 2    0.5555

In [146]:
# Rebinds diagnosis_df_valid from the raw CSV frame to the feature frame
# returned by data_proc (feature_columns + PATHOLOGY). data_proc needs the raw
# EVIDENCES column, so re-running this cell requires re-reading the CSV first.
diagnosis_df_valid, hit_rate, questionnaire_df = data_proc(diagnosis_df_valid, fixed_questionnaire)
# Optional: continue with a reproducible x% sample of the validation dataset.
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

100%|██████████| 132448/132448 [00:04<00:00, 26530.66it/s]


Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [148]:
# Per patient: reported binary evidences that the questionnaire did not cover.
# Columns are read by label; x[0]/x[1] on a row Series relied on pandas'
# deprecated positional fallback.
questionnaire_df["missed_evidence"] = questionnaire_df.progress_apply(
    lambda row: get_missed_evidence(row["binary_evidences_all"], row["binary_evidences"]),
    axis=1,
)
questionnaire_df

100%|██████████| 132448/132448 [00:01<00:00, 91075.31it/s] 


Unnamed: 0,binary_evidences_all,binary_evidences,INITIAL_EVIDENCE,missed_evidence
0,"[E_53, E_76, E_82, E_88, E_89, E_140, E_154]","[E_53, E_89]",E_154,"[E_82, E_76, E_140, E_88, E_154]"
1,"[E_50, E_53, E_66, E_75, E_111, E_155, E_157, ...","[E_50, E_53, E_66, E_155]",E_171,"[E_111, E_157, E_177, E_75, E_171]"
2,"[E_50, E_53, E_88, E_91, E_94, E_129, E_144, E...","[E_50, E_53, E_91, E_129, E_181]",E_53,"[E_161, E_88, E_94, E_144]"
3,"[E_53, E_66, E_76, E_82, E_89, E_179]","[E_53, E_66, E_89]",E_53,"[E_82, E_179, E_76]"
4,"[E_53, E_148, E_210]","[E_53, E_148]",E_53,[E_210]
...,...,...,...,...
132443,"[E_53, E_181, E_201]","[E_53, E_181, E_201]",E_201,[]
132444,"[E_53, E_66, E_67, E_151, E_217]","[E_53, E_66, E_151, E_217]",E_151,[E_67]
132445,"[E_53, E_173, E_201, E_215]","[E_53, E_201]",E_173,"[E_173, E_215]"
132446,"[E_53, E_65, E_91, E_190, E_194]","[E_53, E_91, E_194]",E_91,"[E_190, E_65]"


In [149]:
# Persist the per-patient reported / asked / missed evidence table.
questionnaire_df.to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\fixed_questionnaire_df.csv")

In [150]:
# Flatten the per-patient lists into one long list of missed evidence codes.
missed_evidences = [
    code
    for missed in questionnaire_df["missed_evidence"]
    if missed
    for code in missed
]

In [151]:
# Ten most frequently missed evidences, keyed by their English question text.
missed_evidences_dict = {
    evidences_code_to_en[code]: count
    for code, count in Counter(missed_evidences).most_common(10)
}
missed_evidences_dict

{'Do you have a sore throat?': 12626,
 'Do you have diffuse (widespread) muscle pain?': 12433,
 'Do you feel lightheaded and dizzy or do you feel like you are about to faint?': 12000,
 'Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?': 9638,
 'Have you had diarrhea or an increase in stool frequency?': 7817,
 'Do you feel slightly dizzy or lightheaded?': 7662,
 'Have you recently had a loss of appetite or do you get full more quickly then usually?': 7574,
 'Do you have swollen or painful lymph nodes?': 5589,
 'Have you noticed any new fatigue, generalized and vague discomfort, diffuse (widespread) muscle aches or a change in your general well-being related to your consultation today?': 5533,
 'Have you recently had stools that were black (like coal)?': 5161}

In [152]:
# Histogram of the per-patient hit rate, written to disk.
hist_ax = hit_rate.plot.hist()
hist_ax.set_ylabel("Frequency")
hist_ax.set_title("Hit rate Distribution")
hist_ax.figure.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_fixed_questionnaire.jpg', bbox_inches='tight')
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [153]:
# Score every validation row with each per-disease model and store the second
# predict_proba column under the disease name
# (presumably the positive class - confirm against the model's classes_).
counter = 0
for counter, disease in enumerate(disease_list, start=1):
    rf_model = model_dict[disease]
    diagnosis_df_valid[disease] = rf_model.predict_proba(diagnosis_df_valid[feature_columns])[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [154]:
# Inspect the frame now that one probability column per disease was appended.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
0,55,0,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.02,1.00,0.00,0.97,0.0,0.000000
1,10,0,0,1,0,0,0,1,0,0,...,0.00,0.000000,0.000000,0.00,0.00,0.12,0.00,0.00,0.0,0.284667
2,68,0,1,1,0,1,0,0,0,0,...,0.07,0.000000,0.000000,0.00,0.00,0.00,0.00,0.01,0.0,0.000000
3,13,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.00,0.13,0.01,0.00,0.0,0.070000
4,48,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.00,1.00,0.00,0.07,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.00,0.414463,0.702787,0.00,0.00,0.00,0.00,0.00,0.0,0.000000
132444,57,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.01,0.00,0.00,0.00,1.0,0.190000
132445,52,0,0,1,0,0,0,0,0,0,...,0.00,0.005385,0.000000,0.00,0.01,0.00,0.00,0.01,0.0,0.000000
132446,10,1,1,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.01,0.00,0.00,0.00,0.00,0.0,0.000000


In [155]:
# Row-wise top-1 prediction (ties kept) from the per-disease probability columns.
diagnosis_df_valid["predicted_diagnosis"] = (
    diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
)

100%|██████████| 132448/132448 [04:08<00:00, 533.68it/s]


In [156]:
# Flag the rows whose prediction is exactly the true pathology.
diagnosis_df_valid["is_matched"] = (
    diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]]
    .progress_apply(validate, axis=1)
)

100%|██████████| 132448/132448 [00:01<00:00, 117520.73it/s]


In [157]:
# Inspect the frame with the predicted_diagnosis and is_matched columns added.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.02,1.00,0.00,0.97,0.0,0.000000,[Possible NSTEMI / STEMI],False
1,10,0,0,1,0,0,0,1,0,0,...,0.000000,0.00,0.00,0.12,0.00,0.00,0.0,0.284667,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,0.000000,0.00,0.00,0.00,0.00,0.01,0.0,0.000000,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.00,0.13,0.01,0.00,0.0,0.070000,"[Anemia, SLE]",False
4,48,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.00,1.00,0.00,0.07,0.0,0.000000,[Possible NSTEMI / STEMI],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.702787,0.00,0.00,0.00,0.00,0.00,0.0,0.000000,[Acute otitis media],False
132444,57,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.01,0.00,0.00,0.00,1.0,0.190000,[Acute pulmonary edema],True
132445,52,0,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.01,0.00,0.00,0.01,0.0,0.000000,[Viral pharyngitis],False
132446,10,1,1,1,0,0,0,0,0,0,...,0.000000,0.01,0.00,0.00,0.00,0.00,0.0,0.000000,[Epiglottitis],True


In [158]:
# Attach the per-patient hit rate returned by data_proc
# (the assignment aligns on the row index).
diagnosis_df_valid["hit_rate"] = hit_rate

In [159]:
# Count correctly (True) versus incorrectly (False) diagnosed patients.
diagnosis_df_valid.is_matched.value_counts()

True     80706
False    51742
Name: is_matched, dtype: int64

In [160]:
# Validation summary: match counts keyed by True/False plus derived metrics.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts() omits outcomes that never occur, so a run without any
# misdiagnosis has no False key; treat that as zero errors instead of failing.
acc["error_rate"] = acc.get(False, 0) / len(diagnosis_df_valid)
acc

{True: 80706, False: 51742, 'error_rate': 0.3906589755979705}

In [161]:
# Average share of a patient's reported binary evidences that the
# questionnaire covers.
np.mean(hit_rate)

0.6911363871608647

In [162]:
# Record the overall mean hit rate in the validation summary.
acc["mean_hit_rate"] = np.mean(hit_rate)

In [163]:
# Hit rate restricted to the misdiagnosed patients (is_matched is False).
hit_rate_misdiagnosed = diagnosis_df_valid.loc[
    diagnosis_df_valid["is_matched"] == False, "hit_rate"
]
misdiagnosed_ax = hit_rate_misdiagnosed.plot.hist()
misdiagnosed_ax.set_ylabel("Frequency")
misdiagnosed_ax.set_title("Hit rate Distribution - Misdiagnosed")
misdiagnosed_ax.figure.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed_fixed_questionnaire.jpg', bbox_inches='tight')
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [164]:
# Record the mean hit rate of the misdiagnosed patients in the summary.
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [165]:
# Show the complete validation summary before it is written to disk.
acc

{True: 80706,
 False: 51742,
 'error_rate': 0.3906589755979705,
 'mean_hit_rate': 0.6911363871608647,
 'mean_hit_rate_misdiagnosed': 0.5702004619378319}

In [166]:
# Write the summary as JSON. acc has bool keys (True/False) next to string
# keys; json writes the bool keys as "true"/"false". indent=True is handled as
# the integer 1, i.e. a one-space indent.
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_fixed_questionnaire.json", "w") as outfile: 
    json.dump(acc, outfile, indent=True)

In [167]:
# Export ground truth, prediction, match flag, hit rate and the per-disease
# probabilities for every validation patient.
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_df_all_patients_fixed_questionnaire.csv")

## Logistic Regression

In [177]:
# Re-read the raw validation patients: the name diagnosis_df_valid was rebound
# to a feature frame in the Random Forest section, and data_proc needs the raw
# EVIDENCES column again.
diagnosis_df_valid = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

In [178]:
# Per-disease feature importances of the logistic-regression models; replaces
# the dictionary loaded in the Random Forest section.
with open(f"{base_path}\\output\\feature_importance_logreg.json") as f:
  feature_importance_dict = json.load(f)
# Uncomment to inspect the raw mapping:
# feature_importance_dict

In [179]:
# Rank questions by their relevance averaged over every disease column. A
# question that is absent from a disease's mapping counts as 0 in the mean
# (zeros are deliberately not masked as NaN here).
feature_importance_df = get_ftr_importance_df(feature_importance_dict)
feature_importance_df["mean_relevance"] = feature_importance_df.mean(axis=1)

# The n_questions highest-ranked questions form the fixed questionnaire.
feature_importance_df_top = (
    feature_importance_df
    .sort_values(by="mean_relevance", ascending=False)
    .head(n_questions)
)
feature_importance_df_top

Unnamed: 0_level_0,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,mean_relevance
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Do you have pain somewhere, related to your reason for consulting?",1.650935,3.568194,1.36364,0.881306,1.371733,0.86292,2.121008,5.186258,1.910237,0.0,...,2.024601,2.033464,0.0,1.195002,2.634012,1.386526,1.375371,0.217331,3.17952,1.358655
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.0,0.0,0.0,0.0,0.0,4.273725,0.0,0.0,3.133899,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.973815,0.0,0.0,0.54392
Are you experiencing shortness of breath or difficulty breathing in a significant way?,0.0,0.0,0.613317,0.100892,0.0,0.0,0.370698,0.0,0.0,0.150842,...,0.0,0.0,0.640358,1.520528,0.403824,0.41445,0.0,2.216942,0.892174,0.457555
Do you have a cough?,0.0,0.0,0.0,1.604813,1.79061,0.0,0.0,0.496056,0.0,0.0,...,0.805366,1.30971,0.224546,0.697632,0.0,0.0,0.804146,0.0,0.0,0.34412
Do you have swelling in one or more areas of your body?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.037463,0.0,0.325274
Are you feeling nauseous or do you feel like vomiting?,0.0,0.0,2.78821,0.0,0.0,0.736466,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.116744,0.0,0.577192,0.0,0.0,0.242983
Do you have a fever (either felt or measured with a thermometer)?,0.0,0.0,0.0,0.0,0.0,1.585639,0.0,0.142418,0.0,0.0,...,0.242576,0.0,0.777401,0.0,0.0,0.0,0.0,0.0,0.0,0.229234
Do you have a cough that produces colored or more abundant sputum than usual?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221103
Have you noticed a high pitched sound when breathing in?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217526
Do you have pain that is increased when you breathe in deeply?,3.079008,0.0,0.0,1.595284,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,1.511341,0.200726


In [180]:
# Translate the top-ranked English questions back to evidence codes,
# keeping the ranking order.
fixed_questionnaire = [
    evidences_en_to_code[question] for question in feature_importance_df_top.index
]
fixed_questionnaire

['E_53',
 'E_129',
 'E_66',
 'E_201',
 'E_151',
 'E_148',
 'E_91',
 'E_77',
 'E_194',
 'E_220',
 'E_218',
 'E_50',
 'E_181',
 'E_214',
 'E_45',
 'E_9',
 'E_175',
 'E_155',
 'E_82',
 'E_88']

In [181]:
# Show the selected questionnaire as English question text.
[evidences_code_to_en[e] for e in fixed_questionnaire]

['Do you have pain somewhere, related to your reason for consulting?',
 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?',
 'Are you experiencing shortness of breath or difficulty breathing in a significant way?',
 'Do you have a cough?',
 'Do you have swelling in one or more areas of your body?',
 'Are you feeling nauseous or do you feel like vomiting?',
 'Do you have a fever (either felt or measured with a thermometer)?',
 'Do you have a cough that produces colored or more abundant sputum than usual?',
 'Have you noticed a high pitched sound when breathing in?',
 'Do you have pain that is increased when you breathe in deeply?',
 'Do you have symptoms that are increased with physical exertion but alleviated with rest?',
 'Have you had significantly increased sweating?',
 'Do you have nasal congestion or a clear runny nose?',
 'Have you noticed a wheezing sound when you exhale?',
 'Have you been coughing up

In [182]:
# Load one fitted logistic-regression model per disease from its output folder.
model_dict = {}
for disease in disease_list:
    # File stem: drop every character except letters, digits, spaces, newlines
    # and dots, then replace spaces with underscores. The pattern is a raw
    # string because "\." in a normal string literal is an invalid escape.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_logreg_model.pkl', 'rb') as f:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files produced by this project.
        model_dict[disease] = pickle.load(f)

In [183]:
# Dry run on the first five rows to eyeball the three returned objects;
# nothing is assigned here.
data_proc(diagnosis_df_valid.head(5), fixed_questionnaire)

100%|██████████| 5/5 [00:00<00:00, 4506.13it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     1      0      0      0      0      0      0  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      0      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      0      0      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    0.428571
 1    0.444444
 2    0.6666

In [184]:
# Rebinds diagnosis_df_valid from the raw CSV frame to the feature frame
# returned by data_proc (feature_columns + PATHOLOGY). data_proc needs the raw
# EVIDENCES column, so re-running this cell requires re-reading the CSV first.
diagnosis_df_valid, hit_rate, questionnaire_df = data_proc(diagnosis_df_valid, fixed_questionnaire)
# Optional: continue with a reproducible x% sample of the validation dataset.
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

100%|██████████| 132448/132448 [00:04<00:00, 26684.15it/s]


Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [185]:
# Per patient: reported binary evidences that the questionnaire did not cover.
# Columns are read by label; x[0]/x[1] on a row Series relied on pandas'
# deprecated positional fallback.
questionnaire_df["missed_evidence"] = questionnaire_df.progress_apply(
    lambda row: get_missed_evidence(row["binary_evidences_all"], row["binary_evidences"]),
    axis=1,
)
questionnaire_df

100%|██████████| 132448/132448 [00:01<00:00, 95488.11it/s] 


Unnamed: 0,binary_evidences_all,binary_evidences,INITIAL_EVIDENCE,missed_evidence
0,"[E_53, E_76, E_82, E_88, E_89, E_140, E_154]","[E_53, E_82, E_88]",E_154,"[E_76, E_89, E_140, E_154]"
1,"[E_50, E_53, E_66, E_75, E_111, E_155, E_157, ...","[E_50, E_53, E_66, E_155]",E_171,"[E_111, E_157, E_177, E_75, E_171]"
2,"[E_50, E_53, E_88, E_91, E_94, E_129, E_144, E...","[E_50, E_53, E_88, E_91, E_129, E_181]",E_53,"[E_161, E_94, E_144]"
3,"[E_53, E_66, E_76, E_82, E_89, E_179]","[E_53, E_66, E_82]",E_53,"[E_179, E_76, E_89]"
4,"[E_53, E_148, E_210]","[E_53, E_148]",E_53,[E_210]
...,...,...,...,...
132443,"[E_53, E_181, E_201]","[E_53, E_181, E_201]",E_201,[]
132444,"[E_53, E_66, E_67, E_151, E_217]","[E_53, E_66, E_151]",E_151,"[E_67, E_217]"
132445,"[E_53, E_173, E_201, E_215]","[E_53, E_201]",E_173,"[E_173, E_215]"
132446,"[E_53, E_65, E_91, E_190, E_194]","[E_53, E_91, E_194]",E_91,"[E_190, E_65]"


In [186]:
# Persist the per-patient reported / asked / missed evidence table.
questionnaire_df.to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\fixed_questionnaire_df_logreg.csv")

In [187]:
# Flatten the per-patient lists into one long list of missed evidence codes.
missed_evidences = [
    code
    for missed in questionnaire_df["missed_evidence"]
    if missed
    for code in missed
]

In [188]:
# Ten most frequently missed evidences, keyed by their English question text.
missed_evidences_dict = {
    evidences_code_to_en[code]: count
    for code, count in Counter(missed_evidences).most_common(10)
}
missed_evidences_dict

{'Do you have a sore throat?': 12626,
 'Do you have diffuse (widespread) muscle pain?': 12433,
 'Do you constantly feel fatigued or do you have non-restful sleep?': 12269,
 'Have you had diarrhea or an increase in stool frequency?': 7817,
 'Do you feel slightly dizzy or lightheaded?': 7662,
 'Have you recently had a loss of appetite or do you get full more quickly then usually?': 7574,
 'Are your symptoms worse when lying down and alleviated while sitting up?': 7255,
 'Have you had an involuntary weight loss over the last 3 months?': 6837,
 'Have you recently had stools that were black (like coal)?': 5161,
 'Did you lose consciousness?': 4768}

In [189]:
# Histogram of the per-patient hit rate, written to disk.
hist_ax = hit_rate.plot.hist()
hist_ax.set_ylabel("Frequency")
hist_ax.set_title("Hit rate Distribution")
hist_ax.figure.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_logreg_fixed_questionnaire.jpg', bbox_inches='tight')
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [190]:
# Score every validation row with each per-disease model and store the second
# predict_proba column under the disease name
# (presumably the positive class - confirm against the model's classes_).
counter = 0
for counter, disease in enumerate(disease_list, start=1):
    logreg_model = model_dict[disease]
    diagnosis_df_valid[disease] = logreg_model.predict_proba(diagnosis_df_valid[feature_columns])[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [191]:
# Row-wise top-1 prediction (ties kept) from the per-disease probability columns.
diagnosis_df_valid["predicted_diagnosis"] = (
    diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [04:05<00:00, 538.41it/s]


In [192]:
# Flag the rows whose prediction is exactly the true pathology.
diagnosis_df_valid["is_matched"] = (
    diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]]
    .progress_apply(validate, axis=1)
)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [00:01<00:00, 113085.38it/s]


In [193]:
# Inspect the frame with the predicted_diagnosis and is_matched columns added.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,0,0,0,0,...,4.217325e-03,4.156562e-25,4.732913e-04,1.160692e-03,9.445064e-04,4.464657e-05,4.970032e-07,1.423231e-04,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,2.187272e-05,1.302628e-05,1.123396e-05,4.450931e-01,1.030640e-03,2.612394e-07,1.270043e-04,4.583824e-01,[PSVT],False
2,68,0,1,1,0,1,0,0,0,0,...,3.133114e-08,2.214204e-29,5.637664e-12,2.383754e-07,5.654625e-08,4.620419e-08,1.713709e-06,7.336574e-09,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,8.134955e-04,1.370609e-06,5.241459e-03,3.215578e-02,1.663514e-02,4.529149e-06,4.019650e-05,5.667495e-03,[Anemia],True
4,48,1,0,1,0,0,0,0,0,0,...,1.064213e-02,5.213560e-22,2.050258e-05,9.443614e-01,4.087747e-04,9.384200e-02,4.990391e-07,1.095001e-02,[Possible NSTEMI / STEMI],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,8.255195e-01,6.266019e-12,2.593481e-04,1.940074e-05,3.897018e-04,4.993563e-03,1.110333e-06,3.782573e-05,[Viral pharyngitis],True
132444,57,1,0,1,0,0,0,0,0,0,...,1.772632e-04,3.020682e-25,5.154746e-03,1.525742e-02,5.914479e-03,1.394241e-06,7.689250e-01,3.721269e-03,[Pulmonary embolism],False
132445,52,0,0,1,0,0,0,0,0,0,...,4.443749e-01,1.595500e-23,4.883689e-02,6.889136e-03,1.719871e-03,2.122348e-01,1.071148e-06,4.491520e-03,[Viral pharyngitis],False
132446,10,1,1,1,0,0,0,0,0,0,...,3.183223e-06,3.171350e-05,8.758577e-07,9.003880e-04,1.425431e-04,2.243794e-05,3.437095e-07,2.467303e-03,[Epiglottitis],True


In [194]:
# Attach the per-patient hit rate returned by data_proc
# (the assignment aligns on the row index).
diagnosis_df_valid["hit_rate"] = hit_rate

In [195]:
# Count correctly (True) versus incorrectly (False) diagnosed patients.
diagnosis_df_valid.is_matched.value_counts()

True     85735
False    46713
Name: is_matched, dtype: int64

In [196]:
# Validation summary: match counts keyed by True/False plus derived metrics.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts() omits outcomes that never occur, so a run without any
# misdiagnosis has no False key; treat that as zero errors instead of failing.
acc["error_rate"] = acc.get(False, 0) / len(diagnosis_df_valid)
acc

{True: 85735, False: 46713, 'error_rate': 0.3526893573326891}

In [197]:
# Average share of a patient's reported binary evidences that the
# questionnaire covers.
np.mean(hit_rate)

0.6837862064824721

In [198]:
# Record the overall mean hit rate in the validation summary.
acc["mean_hit_rate"] = np.mean(hit_rate)

In [199]:
# Hit rate restricted to the misdiagnosed patients (is_matched is False).
hit_rate_misdiagnosed = diagnosis_df_valid.loc[
    diagnosis_df_valid["is_matched"] == False, "hit_rate"
]
misdiagnosed_ax = hit_rate_misdiagnosed.plot.hist()
misdiagnosed_ax.set_ylabel("Frequency")
misdiagnosed_ax.set_title("Hit rate Distribution - Misdiagnosed")
# NOTE(review): the file name below is spelled "questionnare", unlike the other
# outputs; left unchanged because other tooling may expect this exact name.
misdiagnosed_ax.figure.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed_logreg_fixed_questionnare.jpg', bbox_inches='tight')
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [200]:
# Mean hit rate among the misdiagnosed patients only.
np.mean(hit_rate_misdiagnosed)

0.5242093139498573

In [201]:
# Record the mean hit rate of the misdiagnosed patients in the summary.
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [202]:
# Write the summary as JSON. acc has bool keys (True/False) next to string
# keys; json writes the bool keys as "true"/"false". indent=True is handled as
# the integer 1, i.e. a one-space indent.
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_logreg_fixed_questionnaire.json", "w") as outfile: 
    json.dump(acc, outfile, indent=True)

In [203]:
# Export ground truth, prediction, match flag, hit rate and the per-disease
# probabilities for every validation patient.
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_logreg_df_all_patients_fixed_questionnaire.csv")