In [1]:
import pandas as pd
import numpy as np
import json
import ast
import re
import pickle
from tqdm import tqdm
from constants import base_path, n_questions
import matplotlib.pyplot as plt
from collections import Counter
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the condition catalogue. JSON text is UTF-8 by specification, so the encoding
# is pinned rather than left to the platform default.
with open(f"{base_path}\\input\\release_conditions.json", encoding="utf-8") as f:
    disease_dict = json.load(f)
# The top-level keys are used as the disease names; their file order fixes the order
# of every loop that iterates `disease_list`.
disease_list = list(disease_dict)

In [3]:
# Build the evidence vocabulary used as model features.
# JSON text is UTF-8 by specification, so the encoding is pinned rather than left to
# the platform default.
with open(f"{base_path}\\input\\release_evidences.json", encoding="utf-8") as f:
    evidences = json.load(f)

evidences_list = []        # retained evidence codes, in file order
evidences_dict = {}        # evidence code -> English question text
evidences_en_to_code = {}  # English question text -> evidence code
for code, spec in evidences.items():
    # only binary symptoms and no antecedents
    if spec["possible-values"] or spec["is_antecedent"]:
        continue
    question_en = spec["question_en"]
    evidences_list.append(code)
    evidences_dict[code] = question_en
    evidences_en_to_code[question_en] = code

# Second name for the *same* dict object, so the AGE/SEX entries added below are
# visible through both names.
evidences_code_to_en = evidences_dict
# Taken before AGE/SEX are added, so it lists question texts only.
evidences_list_en = list(evidences_en_to_code.keys())
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

In [5]:
def data_proc(df, questionnaire):
    """Build model features and questionnaire-coverage statistics for patient rows.

    Works on a copy of ``df``, so the caller's frame is left untouched and the
    function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw patient rows. The columns read are ``EVIDENCES`` (a string holding a
        Python list literal of evidence codes), ``SEX`` (``'F'`` or ``'M'``),
        ``PATHOLOGY``, ``INITIAL_EVIDENCE`` and whatever else is named in the
        module-level ``feature_columns``.
    questionnaire : iterable of str
        Evidence codes covered by the questionnaire.

    Returns
    -------
    ftr_df : pandas.DataFrame
        ``feature_columns`` plus ``PATHOLOGY``. Each evidence column is 1 when the
        patient reports that evidence and it is part of ``questionnaire``, else 0.
        ``SEX`` is encoded as F -> 0, M -> 1 (any other value becomes NaN).
    hit_rate : pandas.Series
        Per patient: number of reported evidences found in ``questionnaire`` divided
        by the number of reported evidences found in ``evidences_list`` (NaN when
        both counts are zero).
    questionnaire_df : pandas.DataFrame
        Columns ``binary_evidences_all``, ``binary_evidences`` and
        ``INITIAL_EVIDENCE``.
    """
    # Never write into the caller's frame: mapping SEX twice would turn it into NaN.
    df = df.copy()
    known_codes = set(evidences_list)
    asked_codes = set(questionnaire)

    # Parse every EVIDENCES literal once; progress_apply needs tqdm.pandas() to have run.
    reported = df["EVIDENCES"].progress_apply(ast.literal_eval)
    df["binary_evidences_all"] = reported.apply(lambda codes: [c for c in codes if c in known_codes])
    df["binary_evidences_all_count"] = df["binary_evidences_all"].apply(len)
    df["binary_evidences"] = reported.apply(lambda codes: [c for c in codes if c in asked_codes])
    df["binary_evidences_count"] = df["binary_evidences"].apply(len)
    df["hit_rate"] = df["binary_evidences_count"] / df["binary_evidences_all_count"]
    hit_rate = df["hit_rate"]

    # One 0/1 indicator column per known evidence code; only evidences that the
    # questionnaire covers can ever be 1.
    hit_sets = [set(codes) for codes in df["binary_evidences"]]
    for code in evidences_list:
        df[code] = [int(code in hits) for hits in hit_sets]
    df["SEX"] = df["SEX"].map({'F': 0, 'M': 1})

    # Independent copies, so callers can add columns without touching this frame.
    ftr_df = df[feature_columns + ["PATHOLOGY"]].copy()
    questionnaire_df = df[["binary_evidences_all", "binary_evidences", "INITIAL_EVIDENCE"]].copy()
    return ftr_df, hit_rate, questionnaire_df

In [6]:
def pred(x, diseases=None):
    """Return the disease(s) with the highest strictly positive probability.

    Parameters
    ----------
    x : sequence or pandas.Series of float
        One probability per disease, read by position in the same order as
        ``diseases``. Values beyond ``len(diseases)`` are ignored.
    diseases : sequence of str, optional
        Disease names matching ``x``. Defaults to the module-level
        ``disease_list``.

    Returns
    -------
    list of str
        Every disease whose probability equals the maximum, in ``diseases`` order
        (ties are all returned). Empty when no probability is greater than zero.
    """
    names = disease_list if diseases is None else list(diseases)
    # Convert to a plain array so values are read by position; integer keys on a
    # label-indexed Series are deprecated in recent pandas.
    probs = np.asarray(x, dtype=float)[: len(names)]
    positive = probs > 0  # NaN compares False and is therefore never a candidate
    if not positive.any():
        return []
    top = probs[positive].max()
    # return only top 1 - allows ties
    return [name for name, prob in zip(names, probs) if prob == top]

In [7]:
def get_missed_evidence(actual, asked):
    """Return the evidences in ``actual`` that do not appear in ``asked``.

    Parameters
    ----------
    actual : iterable of hashable
        Evidence codes the patient actually reports.
    asked : iterable of hashable
        Evidence codes that were covered.

    Returns
    -------
    list
        Codes from ``actual`` that are missing from ``asked``, de-duplicated and in
        their order of first appearance in ``actual``. Codes that are only in
        ``asked`` are not reported: they were not missed.
    """
    asked_codes = set(asked)
    # dict.fromkeys de-duplicates while keeping order, so the result is deterministic.
    return [code for code in dict.fromkeys(actual) if code not in asked_codes]

In [8]:
def validate(x):
    """Tell whether the prediction is exactly the single true pathology.

    Parameters
    ----------
    x : sequence or pandas.Series
        The first element is the true pathology and the second is the list of
        predicted diseases; any further elements are ignored.

    Returns
    -------
    bool
        True only when the prediction list equals ``[pathology]``. An empty list or
        a tie between several diseases counts as a miss.
    """
    # Take the two values by iteration order instead of ``x[0]`` / ``x[1]``: integer
    # keys on a label-indexed Series are deprecated in recent pandas.
    actual, predicted = list(x)[:2]
    return [actual] == predicted

In [9]:
# Load the pickled questionnaire object and the questionnaire embedding table.
# NOTE: unpickling can execute arbitrary code, so both files must come from a trusted source.
# `questionnaire` is not referenced again in the cells shown here (data_proc has a
# parameter of the same name, which is a different, local variable).
with open(f'{base_path}\\output\\questionnaire\\questionnaire.pkl', 'rb') as f:
    questionnaire = pickle.load(f)
# The index of this frame is later looked up in `evidences_en_to_code`, i.e. it holds
# English question texts.
feature_embeddings_df = pd.read_pickle(f'{base_path}\\output\\questionnaire\\questionnaire_embeddings.pkl')

In [10]:
# Raw validation patients. data_proc reads the EVIDENCES, SEX, PATHOLOGY and
# INITIAL_EVIDENCE columns of this frame, plus AGE through `feature_columns`.
diagnosis_df_valid = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

## Random Forest

In [11]:
# Rank the questions by their mean value across the disease columns and keep the
# top `n_questions`.
# An existing 'mean_freq' column is left out of the average, so re-running this cell
# yields the same ranking instead of averaging the previous mean back in.
disease_freq = feature_embeddings_df.drop(columns="mean_freq", errors="ignore")
feature_embeddings_df['mean_freq'] = disease_freq.mean(axis=1)
feature_importance_df_top = feature_embeddings_df.sort_values(by="mean_freq", ascending=False).head(n_questions)
feature_importance_df_top

PATHOLOGY,Acute COPD exacerbation / infection,Acute dystonic reactions,Acute laryngitis,Acute otitis media,Acute pulmonary edema,Acute rhinosinusitis,Allergic sinusitis,Anaphylaxis,Anemia,Atrial fibrillation,...,Scombroid food poisoning,Spontaneous pneumothorax,Spontaneous rib fracture,Stable angina,Tuberculosis,URTI,Unstable angina,Viral pharyngitis,Whooping cough,mean_freq
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Do you have pain somewhere, related to your reason for consulting?",0.0,0.0,0.999378,0.998109,1.0,1.0,0.0,0.999495,0.990329,0.0,...,0.0,1.0,1.0,1.0,0.0,0.996163,1.0,1.0,0.0,0.682113
Are you experiencing shortness of breath or difficulty breathing in a significant way?,0.748825,0.59445,0.0,0.0,0.793406,0.0,0.0,0.601811,0.604204,0.723094,...,0.555651,0.652824,0.430672,0.611062,0.582271,0.0,0.632084,0.0,0.0,0.453581
Do you have a cough?,0.775437,0.0,0.575449,0.524328,0.0,0.774046,0.630691,0.0,0.0,0.0,...,0.0,0.0,0.628676,0.0,0.999877,0.682482,0.0,0.600759,0.0,0.289254
Do you have a fever (either felt or measured with a thermometer)?,0.0,0.0,0.549215,0.564649,0.0,0.35447,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.644198,0.699664,0.0,0.493397,0.0,0.1949
Do you have nasal congestion or a clear runny nose?,0.0,0.0,0.0,0.605703,0.0,0.322286,0.878869,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.806892,0.0,0.685085,0.0,0.154888
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152148
Are you feeling nauseous or do you feel like vomiting?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547226,0.0,0.0,...,0.535797,0.0,0.0,0.0,0.0,0.0,0.55258,0.0,0.0,0.118622
Do you constantly feel fatigued or do you have non-restful sleep?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600789,0.0,...,0.0,0.0,0.0,0.539159,0.0,0.0,0.541753,0.0,0.0,0.092559
Have you had significantly increased sweating?,0.0,0.0,0.0,0.0,0.586182,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.647061,0.527631,0.0,0.0,0.087956
Have you noticed a wheezing sound when you exhale?,0.905215,0.0,0.0,0.0,0.0,0.0,0.0,0.596147,0.0,0.0,...,0.534071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084959


In [12]:
# Translate the top-ranked English question texts back to their evidence codes,
# keeping the ranking order.
fixed_questionnaire = [
    evidences_en_to_code[question_en]
    for question_en in feature_importance_df_top.index
]
fixed_questionnaire

['E_53',
 'E_66',
 'E_201',
 'E_91',
 'E_181',
 'E_129',
 'E_148',
 'E_89',
 'E_50',
 'E_214',
 'E_218',
 'E_155',
 'E_45',
 'E_151',
 'E_220',
 'E_77',
 'E_51',
 'E_82',
 'E_144',
 'E_194']

In [13]:
[evidences_code_to_en[e] for e in fixed_questionnaire]

['Do you have pain somewhere, related to your reason for consulting?',
 'Are you experiencing shortness of breath or difficulty breathing in a significant way?',
 'Do you have a cough?',
 'Do you have a fever (either felt or measured with a thermometer)?',
 'Do you have nasal congestion or a clear runny nose?',
 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?',
 'Are you feeling nauseous or do you feel like vomiting?',
 'Do you constantly feel fatigued or do you have non-restful sleep?',
 'Have you had significantly increased sweating?',
 'Have you noticed a wheezing sound when you exhale?',
 'Do you have symptoms that are increased with physical exertion but alleviated with rest?',
 'Do you feel your heart is beating fast (racing), irregularly (missing a beat) or do you feel palpitations?',
 'Have you been coughing up blood?',
 'Do you have swelling in one or more areas of your body?',
 'Do you have pain 

In [14]:
# Load one pickled model per disease into `model_dict`, keyed by disease name.
# The folder / file stem is the disease name with every character other than ASCII
# letters, digits, space, newline and '.' removed, and spaces turned into underscores.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
model_dict = {}
for disease in disease_list:
    # Raw string, so the regex escapes are not read as (invalid) str escape sequences.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [15]:
data_proc(diagnosis_df_valid.head(5), fixed_questionnaire)

100%|██████████| 5/5 [00:00<00:00, 6619.80it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     1      0      0      0      0      0      0  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      0      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      0      0      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    0.428571
 1    0.444444
 2    0.6666

In [16]:
# Featurise the whole validation set against the fixed questionnaire.
# NOTE: this rebinds `diagnosis_df_valid` from the raw CSV frame to the feature frame
# (feature_columns + PATHOLOGY), which has no EVIDENCES column. Re-running this cell
# therefore requires re-running the CSV load first.
diagnosis_df_valid, hit_rate, questionnaire_df = data_proc(diagnosis_df_valid, fixed_questionnaire)
# sample x% of the validation dataset
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

100%|██████████| 132448/132448 [00:04<00:00, 28061.50it/s]


Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [17]:
# Per patient: the evidences that were reported but not covered by the questionnaire.
# Columns are addressed by label, because integer keys on a label-indexed row are
# deprecated in recent pandas.
questionnaire_df["missed_evidence"] = questionnaire_df.progress_apply(
    lambda row: get_missed_evidence(row["binary_evidences_all"], row["binary_evidences"]),
    axis=1,
)
questionnaire_df

100%|██████████| 132448/132448 [00:01<00:00, 93611.46it/s] 


Unnamed: 0,binary_evidences_all,binary_evidences,INITIAL_EVIDENCE,missed_evidence
0,"[E_53, E_76, E_82, E_88, E_89, E_140, E_154]","[E_53, E_82, E_89]",E_154,"[E_88, E_154, E_76, E_140]"
1,"[E_50, E_53, E_66, E_75, E_111, E_155, E_157, ...","[E_50, E_53, E_66, E_155]",E_171,"[E_75, E_171, E_157, E_111, E_177]"
2,"[E_50, E_53, E_88, E_91, E_94, E_129, E_144, E...","[E_50, E_53, E_91, E_129, E_144, E_181]",E_53,"[E_161, E_94, E_88]"
3,"[E_53, E_66, E_76, E_82, E_89, E_179]","[E_53, E_66, E_82, E_89]",E_53,"[E_76, E_179]"
4,"[E_53, E_148, E_210]","[E_53, E_148]",E_53,[E_210]
...,...,...,...,...
132443,"[E_53, E_181, E_201]","[E_53, E_181, E_201]",E_201,[]
132444,"[E_53, E_66, E_67, E_151, E_217]","[E_53, E_66, E_151]",E_151,"[E_217, E_67]"
132445,"[E_53, E_173, E_201, E_215]","[E_53, E_201]",E_173,"[E_215, E_173]"
132446,"[E_53, E_65, E_91, E_190, E_194]","[E_53, E_91, E_194]",E_91,"[E_190, E_65]"


In [18]:
questionnaire_df.to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\fixed_questionnaire_df.csv")

In [19]:
# Flatten the per-patient missed-evidence lists into a single list of codes;
# empty entries contribute nothing.
missed_evidences = [
    code
    for patient_missed in questionnaire_df["missed_evidence"]
    if patient_missed
    for code in patient_missed
]

In [20]:
# The ten most frequently missed evidences, keyed by their English question text
# and kept in descending order of count.
top_missed = Counter(missed_evidences).most_common(10)
missed_evidences_dict = {evidences_code_to_en[code]: count for code, count in top_missed}
missed_evidences_dict

{'Do you have a sore throat?': 12626,
 'Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?': 9638,
 'Do you feel slightly dizzy or lightheaded?': 7662,
 'Have you recently had a loss of appetite or do you get full more quickly then usually?': 7574,
 'Are your symptoms worse when lying down and alleviated while sitting up?': 7255,
 'Have you had an involuntary weight loss over the last 3 months?': 6837,
 'Do you have swollen or painful lymph nodes?': 5589,
 'Have you noticed any new fatigue, generalized and vague discomfort, diffuse (widespread) muscle aches or a change in your general well-being related to your consultation today?': 5533,
 'Have you recently had stools that were black (like coal)?': 5161,
 'Did you lose consciousness?': 4768}

In [21]:
# Histogram of the per-patient hit rate; saved to disk, then the current figure is cleared.
hit_rate_ax = hit_rate.plot.hist()
hit_rate_ax.set_ylabel("Frequency")
hit_rate_ax.set_title("Hit rate Distribution")
plt.savefig(
    f'{base_path}\\output\\questionnaire\\hit_rate_fixed_questionnaire.jpg',
    bbox_inches='tight',
)
plt.clf()

<Figure size 640x480 with 0 Axes>

In [22]:
# Score every patient with each per-disease model. Column 1 of predict_proba
# (presumably the positive class - confirm against the trained models) is stored in
# a column named after the disease.
for counter, disease in enumerate(disease_list, start=1):
    rf_model = model_dict[disease]
    class_probabilities = rf_model.predict_proba(diagnosis_df_valid[feature_columns])
    diagnosis_df_valid[disease] = class_probabilities[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [23]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
0,55,0,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.01,0.09,0.00,0.11,0.0000,0.000000
1,10,0,0,1,0,0,0,1,0,0,...,0.00,0.000000,0.000000,0.00,0.00,0.12,0.00,0.00,0.0000,0.284667
2,68,0,1,1,0,1,0,0,0,0,...,0.09,0.000000,0.000000,0.00,0.00,0.00,0.00,0.01,0.0000,0.000000
3,13,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.00,0.00,0.00,0.00,0.0000,0.000000
4,48,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.00,1.00,0.00,0.07,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.00,0.414463,0.702787,0.00,0.00,0.00,0.00,0.00,0.0000,0.000000
132444,57,1,0,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.00,0.05,0.00,0.00,0.00,0.9125,0.010000
132445,52,0,0,1,0,0,0,0,0,0,...,0.00,0.005385,0.000000,0.00,0.01,0.00,0.00,0.01,0.0000,0.000000
132446,10,1,1,1,0,0,0,0,0,0,...,0.00,0.000000,0.000000,0.01,0.00,0.00,0.00,0.00,0.0000,0.000000


In [24]:
diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [04:51<00:00, 454.46it/s]


In [25]:
diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [00:01<00:00, 85390.26it/s]


In [26]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.01,0.09,0.00,0.11,0.0000,0.000000,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,0.000000,0.00,0.00,0.12,0.00,0.00,0.0000,0.284667,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,0.000000,0.00,0.00,0.00,0.00,0.01,0.0000,0.000000,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.00,0.00,0.00,0.00,0.0000,0.000000,[Anemia],True
4,48,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.00,1.00,0.00,0.07,0.0000,0.000000,[Possible NSTEMI / STEMI],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.702787,0.00,0.00,0.00,0.00,0.00,0.0000,0.000000,[Acute otitis media],False
132444,57,1,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.05,0.00,0.00,0.00,0.9125,0.010000,[Pulmonary embolism],False
132445,52,0,0,1,0,0,0,0,0,0,...,0.000000,0.00,0.01,0.00,0.00,0.01,0.0000,0.000000,[Viral pharyngitis],False
132446,10,1,1,1,0,0,0,0,0,0,...,0.000000,0.01,0.00,0.00,0.00,0.00,0.0000,0.000000,[Epiglottitis],True


In [27]:
diagnosis_df_valid["hit_rate"] = hit_rate

In [28]:
diagnosis_df_valid.is_matched.value_counts()

True     83953
False    48495
Name: is_matched, dtype: int64

In [29]:
# Counts keyed by the boolean match flag, plus the share of misdiagnosed patients.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts omits a value that never occurs; make sure both outcomes are present
# so the lookup below cannot raise KeyError.
acc.setdefault(True, 0)
acc.setdefault(False, 0)
acc["error_rate"] = acc[False]/len(diagnosis_df_valid)
acc

{True: 83953, False: 48495, 'error_rate': 0.36614369412901665}

In [30]:
np.mean(hit_rate)

0.6999085512887155

In [31]:
acc["mean_hit_rate"] = np.mean(hit_rate)

In [32]:
# Hit-rate distribution restricted to the patients whose prediction did not match;
# saved to disk, then the current figure is cleared.
misdiagnosed_mask = diagnosis_df_valid["is_matched"].eq(False)
hit_rate_misdiagnosed = diagnosis_df_valid.loc[misdiagnosed_mask, "hit_rate"]
misdiagnosed_ax = hit_rate_misdiagnosed.plot.hist()
misdiagnosed_ax.set_ylabel("Frequency")
misdiagnosed_ax.set_title("Hit rate Distribution - Misdiagnosed")
plt.savefig(
    f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed_fixed_questionnaire.jpg',
    bbox_inches='tight',
)
plt.clf()

<Figure size 640x480 with 0 Axes>

In [33]:
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [34]:
acc

{True: 83953,
 False: 48495,
 'error_rate': 0.36614369412901665,
 'mean_hit_rate': 0.6999085512887155,
 'mean_hit_rate_misdiagnosed': 0.5717912499570403}

In [35]:
# Persist the validation metrics computed in the Random Forest section.
# json.dump writes the boolean keys True/False of `acc` as the strings "true"/"false".
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_fixed_questionnaire.json", "w") as outfile: 
    json.dump(acc, outfile, indent=True)

In [36]:
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_df_all_patients_fixed_questionnaire.csv")

## Logistic Regression

In [37]:
# Load one pickled logistic-regression model per disease, keyed by disease name.
# This rebinds `model_dict`, so any models it held before are no longer reachable through it.
# The folder / file stem is the disease name with every character other than ASCII
# letters, digits, space, newline and '.' removed, and spaces turned into underscores.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
model_dict = {}
for disease in disease_list:
    # Raw string, so the regex escapes are not read as (invalid) str escape sequences.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_logreg_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [38]:
# Score every patient with each per-disease logistic-regression model. The result is
# written to the column named after the disease, replacing whatever that column held.
# Column 1 of predict_proba is presumably the positive class - confirm against the models.
for counter, disease in enumerate(disease_list, start=1):
    logreg_model = model_dict[disease]
    class_probabilities = logreg_model.predict_proba(diagnosis_df_valid[feature_columns])
    diagnosis_df_valid[disease] = class_probabilities[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [39]:
diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [06:08<00:00, 359.58it/s]


In [40]:
diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [00:01<00:00, 84443.15it/s]


In [41]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched,hit_rate
0,55,0,0,1,0,0,0,0,0,0,...,5.314201e-25,1.432385e-02,0.093532,7.460464e-04,4.094281e-02,2.117796e-07,1.345403e-05,[Anemia],True,0.428571
1,10,0,0,1,0,0,0,1,0,0,...,1.302628e-05,1.123396e-05,0.445093,1.030640e-03,2.612394e-07,1.270043e-04,4.583824e-01,[PSVT],False,0.444444
2,68,0,1,1,0,1,0,0,0,0,...,1.830912e-29,1.884588e-11,0.000001,5.550768e-08,1.352028e-07,1.717106e-06,5.120305e-08,[Influenza],True,0.666667
3,13,1,0,1,0,0,0,0,0,0,...,1.370581e-06,3.539349e-02,0.102577,2.902183e-03,5.998884e-05,1.670769e-05,5.243869e-05,[Anemia],True,0.666667
4,48,1,0,1,0,0,0,0,0,0,...,5.213560e-22,2.050258e-05,0.944361,4.087747e-04,9.384200e-02,4.990391e-07,1.095001e-02,[Possible NSTEMI / STEMI],False,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,6.266019e-12,2.593481e-04,0.000019,3.897018e-04,4.993563e-03,1.110333e-06,3.782573e-05,[Viral pharyngitis],True,1.000000
132444,57,1,0,1,0,0,0,0,0,0,...,3.020682e-25,5.154746e-03,0.015257,5.914479e-03,1.394241e-06,7.689250e-01,3.721269e-03,[Pulmonary embolism],False,0.600000
132445,52,0,0,1,0,0,0,0,0,0,...,1.595500e-23,4.883689e-02,0.006889,1.719871e-03,2.122348e-01,1.071148e-06,4.491520e-03,[Viral pharyngitis],False,0.500000
132446,10,1,1,1,0,0,0,0,0,0,...,3.171350e-05,8.758577e-07,0.000900,1.425431e-04,2.243794e-05,3.437095e-07,2.467303e-03,[Epiglottitis],True,0.600000


In [42]:
diagnosis_df_valid["hit_rate"] = hit_rate

In [43]:
diagnosis_df_valid.is_matched.value_counts()

True     82932
False    49516
Name: is_matched, dtype: int64

In [44]:
# Counts keyed by the boolean match flag, plus the share of misdiagnosed patients.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts omits a value that never occurs; make sure both outcomes are present
# so the lookup below cannot raise KeyError.
acc.setdefault(True, 0)
acc.setdefault(False, 0)
acc["error_rate"] = acc[False]/len(diagnosis_df_valid)
acc

{True: 82932, False: 49516, 'error_rate': 0.3738523798018845}

In [45]:
np.mean(hit_rate)

0.6999085512887155

In [46]:
acc["mean_hit_rate"] = np.mean(hit_rate)

In [47]:
# Hit-rate distribution restricted to the patients whose prediction did not match;
# saved to disk, then the current figure is cleared.
# NOTE(review): the file name below ends in "questionnare" (no "i"), unlike the other
# output files written by this notebook - confirm whether anything reads this exact name.
misdiagnosed_mask = diagnosis_df_valid["is_matched"].eq(False)
hit_rate_misdiagnosed = diagnosis_df_valid.loc[misdiagnosed_mask, "hit_rate"]
misdiagnosed_ax = hit_rate_misdiagnosed.plot.hist()
misdiagnosed_ax.set_ylabel("Frequency")
misdiagnosed_ax.set_title("Hit rate Distribution - Misdiagnosed")
plt.savefig(
    f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed_logreg_fixed_questionnare.jpg',
    bbox_inches='tight',
)
plt.clf()

<Figure size 640x480 with 0 Axes>

In [48]:
np.mean(hit_rate_misdiagnosed)

0.523393069006103

In [49]:
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [50]:
# Persist the validation metrics computed in the Logistic Regression section.
# json.dump writes the boolean keys True/False of `acc` as the strings "true"/"false".
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_logreg_fixed_questionnaire.json", "w") as outfile: 
    json.dump(acc, outfile, indent=True)

In [51]:
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_logreg_df_all_patients_fixed_questionnaire.csv")