In [1]:
import pandas as pd
import numpy as np
import json
import ast
import re
import pickle
from tqdm import tqdm
from constants import base_path, n_questions
import matplotlib.pyplot as plt
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Raw validation patients, read as CSV (note: the file name has no extension).
# Later cells rebind `diagnosis_df_valid` to the processed feature frame.
diagnosis_df_valid = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

In [3]:
# Load the condition catalogue; only its top-level keys (the disease names) are used below.
with open(f"{base_path}\\input\\release_conditions.json") as f:
  disease_dict = json.load(f)
# The key order of the JSON file fixes the order of `disease_list`, which `pred`
# relies on to map probability positions back to disease names.
disease_list = list(disease_dict.keys())

In [4]:
# Load the evidence catalogue and derive the lookup tables used by the questionnaire.
with open(f"{base_path}\\input\\release_evidences.json") as f:
  evidences = json.load(f)
evidences_list = []        # codes of the evidences kept as model features
evidences_dict = {}        # code -> English question text
evidences_en_to_code = {}  # English question text -> code
for e in evidences.keys():
  # only binary symptoms and no antecedents
  if (not evidences[e]["possible-values"]) and (not evidences[e]["is_antecedent"]):
    evidences_list.append(e)
    evidences_dict[e] = evidences[e]["question_en"]
    evidences_en_to_code[evidences[e]["question_en"]] = e
# NOTE: this is an alias, not a copy -- the "AGE"/"SEX" entries added to
# `evidences_dict` below are therefore visible through `evidences_code_to_en` too.
evidences_code_to_en = evidences_dict
# Taken from the keys of `evidences_en_to_code`, so it holds question texts only
# (no "AGE"/"SEX") and two codes sharing the same question text collapse to one entry.
evidences_list_en = list(evidences_en_to_code.keys())
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
# Column order of the model input: demographics first, then one column per kept evidence code.
feature_columns = ["AGE", "SEX"] + evidences_list

In [5]:
def get_ftr_importance_df(feature_importance_dict):
    """Build an evidence-by-disease table of feature-importance weights.

    Parameters
    ----------
    feature_importance_dict : dict
        Maps each disease name to a dict holding a "top10_relevant_symptoms"
        mapping of English question text -> weight.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the module-level `evidences_list_en` (index named
        "evidence") and one column per disease. A question missing from a
        disease's "top10_relevant_symptoms" gets weight 0.
    """
    table = {"evidence": evidences_list_en}
    for disease, details in feature_importance_dict.items():
        weights = details["top10_relevant_symptoms"]
        table[disease] = [weights.get(question, 0) for question in evidences_list_en]
    return pd.DataFrame(table).set_index("evidence")

In [6]:
def get_next_question(evidences, questionairre, feature_importance_df):
    """Rank the follow-up questions closest to the evidence collected so far.

    The rows of `feature_importance_df` for the given evidences are averaged
    into a centroid, and `questionairre.kneighbors` is queried with it. The
    returned neighbour positions are translated to question texts through the
    module-level `evidences_list_en`.

    Parameters
    ----------
    evidences : list of str
        English question texts already confirmed; each must be an index label
        of `feature_importance_df`.
    questionairre : object
        Anything exposing `kneighbors(X) -> (distances, indices)`
        (presumably a fitted scikit-learn nearest-neighbour model -- confirm).
    feature_importance_df : pandas.DataFrame
        Evidence-by-disease weights, indexed by question text.

    Returns
    -------
    list of str
        Candidate questions in neighbour order, excluding those already in
        `evidences`. Empty when `evidences` is empty.
    """
    if len(evidences) == 0:
        # Without any evidence there is no centroid to query with; the mean of
        # an empty array is NaN and would fail inside kneighbors.
        return []
    centroid = np.array([feature_importance_df.loc[e].values for e in evidences]).mean(axis=0)
    _, indices = questionairre.kneighbors([centroid])
    known = set(evidences)
    # The original wrapped a bare `return` in try/except, which could never
    # trigger; failures above now propagate instead of looking "handled".
    return [evidences_list_en[i] for i in indices[0] if evidences_list_en[i] not in known]

In [7]:
def get_evidences(answers, user_evidences, questionairre, feature_importance_df):
    """Simulate the adaptive questionnaire for one patient.

    Starting from the evidence known up front, the function repeatedly asks
    `get_next_question` for a ranked candidate list and "asks" the candidates
    in order, answering each from the patient's recorded evidences. A "yes"
    is added to the confirmed evidence and triggers a fresh ranking; a "no"
    moves on to the next candidate. Every newly asked question consumes one
    unit of the module-level `n_questions` budget (the initial evidence
    counts as the first question).

    Parameters
    ----------
    answers : iterable of str
        Evidence codes recorded for the patient. Codes that are not in the
        module-level `evidences_list` are ignored.
    user_evidences : iterable of str
        Evidence codes known before any question is asked; each must be a key
        of `evidences_code_to_en`.
    questionairre, feature_importance_df
        Passed through unchanged to `get_next_question`.

    Returns
    -------
    list of str
        Evidence codes of the initial evidence followed by every question that
        was answered "yes", in the order they were confirmed.
    """
    evidences_en = [evidences_code_to_en[e] for e in user_evidences]
    answers_en = {evidences_code_to_en[e] for e in answers if e in evidences_list}
    asked = set(evidences_en)
    question_counter = 1  # counts the initial evidence as q0
    while question_counter < n_questions:
        candidates = get_next_question(evidences_en, questionairre, feature_importance_df)
        confirmed_new = False
        # Iterating the list directly bounds the walk by its length; the
        # original indexed past the end (IndexError) once every candidate had
        # already been asked.
        for question in candidates:
            if question_counter >= n_questions:
                break
            if question in asked:
                continue
            asked.add(question)
            question_counter += 1
            if question in answers_en:
                evidences_en.append(question)
                confirmed_new = True
                break  # re-rank around the enlarged evidence set
        if not confirmed_new:
            # Budget spent, or no unasked candidate left: the evidence set is
            # unchanged, so another ranking would return the same list.
            break
    return [evidences_en_to_code[e] for e in evidences_en]

In [8]:
def data_proc(df, questionairre, feature_importance_df):
    """Turn raw patient rows into the model feature matrix via the simulated questionnaire.

    For every row the `EVIDENCES` string is parsed, the questionnaire is
    simulated with `get_evidences` starting from `INITIAL_EVIDENCE`, and the
    evidences it collected are one-hot encoded over the module-level
    `evidences_list`. `SEX` is mapped to 0 ('F') / 1 ('M').

    Unlike the previous version, `df` itself is not modified, so the function
    can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns AGE, SEX, EVIDENCES (string form of a Python list of
        evidence codes), INITIAL_EVIDENCE and PATHOLOGY.
    questionairre, feature_importance_df
        Passed through unchanged to `get_evidences`.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame with columns `feature_columns + ["PATHOLOGY"]`, and
        the per-row hit rate: evidences collected by the questionnaire divided
        by the number of recorded evidences that are in `evidences_list`.
    """
    known_codes = set(evidences_list)
    # Parse each EVIDENCES string once and reuse it for both the count and the simulation.
    parsed = [ast.literal_eval(raw) for raw in df["EVIDENCES"]]
    all_counts = pd.Series(
        [sum(1 for code in codes if code in known_codes) for codes in parsed],
        index=df.index, dtype="int64",
    )
    # Plain loop with a tqdm bar; rows are accessed by column name rather than
    # by position, which label-indexed Series no longer support reliably.
    collected = [
        get_evidences(codes, [initial], questionairre, feature_importance_df)
        for codes, initial in tqdm(zip(parsed, df["INITIAL_EVIDENCE"]), total=len(df))
    ]
    collected_counts = pd.Series([len(codes) for codes in collected], index=df.index, dtype="int64")
    hit_rate = (collected_counts / all_counts).rename("hit_rate")

    # Build all indicator columns in one frame instead of inserting them one by one.
    collected_sets = [set(codes) for codes in collected]
    indicators = pd.DataFrame(
        {code: [int(code in found) for found in collected_sets] for code in evidences_list},
        index=df.index,
    )
    features = pd.concat(
        [df[["AGE"]], df["SEX"].map({'F': 0, 'M': 1}), indicators, df[["PATHOLOGY"]]],
        axis=1,
    )
    return features[feature_columns + ["PATHOLOGY"]], hit_rate

In [9]:
def pred(x):
    """Return the top-ranked disease(s) for one row of predicted probabilities.

    Parameters
    ----------
    x : sequence of float
        Probabilities positionally aligned with the module-level
        `disease_list` (a pandas row Series, list or array).

    Returns
    -------
    list of str
        Names of the diseases sharing the highest strictly positive
        probability, in `disease_list` order. More than one name is returned
        on a tie; the list is empty when no probability is above zero.
    """
    # Convert to a plain array so lookups are positional; `x[i]` on a Series
    # labelled with disease names relied on a deprecated pandas fallback.
    probabilities = np.asarray(x, dtype=float)
    positive = [(disease, p) for disease, p in zip(disease_list, probabilities) if p > 0]
    if not positive:
        return []
    # return only top 1 - allows ties
    best = max(p for _, p in positive)
    return [disease for disease, p in positive if p == best]

## Random Forest

In [10]:
with open(f"{base_path}\\output\\feature_importance.json") as f:
  feature_importance_dict = json.load(f)
# feature_importance_dict

In [11]:
feature_importance_df = get_ftr_importance_df(feature_importance_dict)
feature_importance_df

Unnamed: 0_level_0,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Do you have a fever (either felt or measured with a thermometer)?,0.025764,0.066418,0.027025,0.023705,0.027436,0.094198,0.000000,0.036879,0.027490,0.000000,...,0.025668,0.000000,0.058277,0.175665,0.031826,0.034001,0.033759,0.028436,0.000000,0.027511
"Do you have pain somewhere, related to your reason for consulting?",0.048024,0.075092,0.049367,0.031275,0.028815,0.000000,0.023572,0.086531,0.042174,0.129557,...,0.000000,0.045449,0.043964,0.137135,0.000000,0.045028,0.049111,0.020412,0.018093,0.050291
Did you lose consciousness?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.000000,0.029475,0.017294,0.014129,0.000000,0.156626,0.000000,0.034248,0.184359,0.000000,...,0.147003,0.000000,0.015698,0.013473,0.016532,0.033064,0.018412,0.093787,0.000000,0.000000
Is your skin much paler than usual?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.135016,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Do you feel like you are detached from your own body or your surroundings?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Do you feel like you are dying or were you afraid that you were about do die?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Do you have greenish or yellowish nasal discharge?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.307416,0.159449,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Have you lost your sense of smell?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.230766,0.324863,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
# Load the pickled questionnaire model built from the random-forest importances.
# `get_next_question` calls `.kneighbors` on it (presumably a fitted nearest-neighbour
# model -- confirm against the notebook that writes this file).
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(f'{base_path}\\output\\questionnaire\\questionairre.pkl', 'rb') as f:
    questionairre = pickle.load(f)

In [13]:
# Load one pickled random-forest classifier per disease into `model_dict`.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
model_dict = {}
for disease in disease_list:
    # Strip everything except letters, digits, spaces, newlines and dots, then
    # use underscores for spaces. Raw string: "\." in a normal string literal is
    # an invalid escape sequence (SyntaxWarning on recent Python versions).
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [14]:
# just for checking
# {c: evidences[c]["is_antecedent"] for c in [i for i in ast.literal_eval(diagnosis_df_valid["EVIDENCES"][0]) if ("@" not in i)]}

In [15]:
data_proc(diagnosis_df_valid.head(5), questionairre, feature_importance_df)

100%|██████████| 5/5 [00:00<00:00, 222.68it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     0      0      0      1      0      0      1  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      0      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      1      1      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    0.857143
 1    1.000000
 2    1.0000

In [16]:
# Run the simulated questionnaire over the full validation set.
# NOTE: this rebinds `diagnosis_df_valid` from the raw patients to the processed
# feature frame (which no longer has an EVIDENCES column), so the cell cannot be
# re-run without reloading the raw CSV first.
diagnosis_df_valid, hit_rate = data_proc(diagnosis_df_valid, questionairre, feature_importance_df)
# sample x% of the validation dataset
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

100%|██████████| 132448/132448 [06:54<00:00, 319.31it/s]


Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [17]:
# Histogram of the per-patient hit rate, saved to disk rather than shown inline.
hit_rate.plot.hist()
plt.ylabel("Frequency")
plt.title("Hit rate Distribution")
plt.savefig(f'{base_path}\\output\\questionnaire\\hit_rate.jpg', bbox_inches='tight')
# Clear the figure so the next plot starts empty; this is why the cell output is an empty Figure.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [18]:
# Score every patient with each per-disease random forest and store the
# probability from column 1 of predict_proba (presumably the positive class --
# confirm against the model's classes_) in a column named after the disease.
counter = 0
for counter, disease in enumerate(disease_list, start=1):
    rf_model = model_dict[disease]
    class_probabilities = rf_model.predict_proba(diagnosis_df_valid[feature_columns])
    diagnosis_df_valid[disease] = class_probabilities[:,1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [19]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
0,55,0,0,0,0,0,1,0,0,1,...,0.0,0.000000,0.000000,0.0,0.01,0.00,0.00,0.02,0.0,0.000
1,10,0,0,1,0,0,0,1,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.003
2,68,0,1,1,0,1,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000
3,13,1,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.06,0.06,0.00,0.0,0.990
4,48,1,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,1.00,0.00,0.07,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.0,0.414463,0.702787,0.0,0.00,0.00,0.00,0.00,0.0,0.000
132444,57,1,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.01,0.00,0.00,0.00,1.0,0.010
132445,52,0,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000
132446,10,1,1,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000


In [20]:
# Row-wise top-1 prediction: `pred` receives the per-disease probability columns
# in `disease_list` order and returns a list of disease names (several on a tie).
diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)

100%|██████████| 132448/132448 [07:36<00:00, 290.20it/s]


In [21]:
def validate(x):
    """Tell whether the prediction is exactly the true pathology.

    Parameters
    ----------
    x : pandas.Series or sequence
        Two entries, in order: the true pathology and the list of predicted
        diseases (as produced by `pred`).

    Returns
    -------
    bool
        True only when the prediction list holds the true pathology and
        nothing else; ties and empty predictions count as mismatches.
    """
    if hasattr(x, "iloc"):
        # Row Series are labelled by column name, so positional access must go
        # through .iloc; plain x[0] relied on a deprecated pandas fallback.
        truth, predicted = x.iloc[0], x.iloc[1]
    else:
        truth, predicted = x[0], x[1]
    return [truth] == predicted

In [22]:
# Flag rows whose prediction list is exactly the true pathology (see `validate`).
diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)

  7%|▋         | 9213/132448 [00:00<00:01, 91460.43it/s]

100%|██████████| 132448/132448 [00:01<00:00, 84826.40it/s]


In [23]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,0,0,0,1,0,0,1,...,0.000000,0.0,0.01,0.00,0.00,0.02,0.0,0.000,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.003,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.06,0.06,0.00,0.0,0.990,[Myocarditis],False
4,48,1,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,1.00,0.00,0.07,0.0,0.000,[Possible NSTEMI / STEMI],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.702787,0.0,0.00,0.00,0.00,0.00,0.0,0.000,[Acute otitis media],False
132444,57,1,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.01,0.00,0.00,0.00,1.0,0.010,[Acute pulmonary edema],True
132445,52,0,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000,[GERD],True
132446,10,1,1,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.00,0.0,0.000,[Epiglottitis],True


In [24]:
diagnosis_df_valid["hit_rate"] = hit_rate

In [25]:
diagnosis_df_valid.is_matched.value_counts()

True     97116
False    35332
Name: is_matched, dtype: int64

In [26]:
# Counts of matched / unmatched predictions, keyed by the booleans True and False.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts omits values that never occur, so fall back to 0 instead of
# raising KeyError when every prediction matched.
acc["error_rate"] = acc.get(False, 0)/len(diagnosis_df_valid)
acc

{True: 97116, False: 35332, 'error_rate': 0.266761294998792}

In [27]:
np.mean(hit_rate)

0.8464285504978833

In [28]:
acc["mean_hit_rate"] = np.mean(hit_rate)

In [29]:
# Hit rate restricted to the misdiagnosed patients (is_matched is False),
# plotted as a histogram and saved to disk.
hit_rate_misdiagnosed = diagnosis_df_valid[diagnosis_df_valid["is_matched"]==False]["hit_rate"]
hit_rate_misdiagnosed.plot.hist()
plt.ylabel("Frequency")
plt.title("Hit rate Distribution - Misdiagnosed")
plt.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed.jpg', bbox_inches='tight')
# Clear the figure so later plots start empty.
plt.clf()

<Figure size 640x480 with 0 Axes>

In [30]:
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [31]:
# Persist the random-forest validation metrics.
# `acc` still has the boolean keys True/False from value_counts; json.dump
# writes them as the strings "true"/"false".
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_questionnaire.json", "w") as outfile: 
    # NOTE(review): indent=True is a bool, which json appears to treat as indent=1 -- confirm this is intended.
    json.dump(acc, outfile, indent=True)

In [32]:
# Export per-patient results for error analysis: truth, prediction, match flag,
# hit rate and the probability column of every disease.
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_df_all_patients_questionnaire.csv")

## Logistic Regression

In [33]:
# Reload the raw validation patients: the Random Forest section rebound
# `diagnosis_df_valid` to the processed feature frame, which `data_proc` cannot consume.
diagnosis_df_valid = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

In [34]:
with open(f"{base_path}\\output\\feature_importance_logreg.json") as f:
  feature_importance_dict = json.load(f)
# feature_importance_dict

In [35]:
feature_importance_df = get_ftr_importance_df(feature_importance_dict)
feature_importance_df

Unnamed: 0_level_0,Spontaneous pneumothorax,Cluster headache,Boerhaave,Spontaneous rib fracture,GERD,HIV (initial infection),Anemia,Viral pharyngitis,Inguinal hernia,Myasthenia gravis,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Do you have a fever (either felt or measured with a thermometer)?,0.000000,0.000000,0.00000,0.000000,0.000000,1.585639,0.000000,0.142418,0.000000,0.0,...,0.000000,0.242576,0.000000,0.777401,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
"Do you have pain somewhere, related to your reason for consulting?",1.650935,3.568194,1.36364,0.881306,1.371733,0.862920,2.121008,5.186258,1.910237,0.0,...,1.026723,2.024601,2.033464,0.000000,1.195002,2.634012,1.386526,1.375371,0.217331,3.17952
Did you lose consciousness?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.000000,0.000000,0.00000,0.000000,0.000000,4.273725,0.000000,0.000000,3.133899,0.0,...,3.567464,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.973815,0.000000,0.00000
Is your skin much paler than usual?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,3.052768,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Do you feel like you are detached from your own body or your surroundings?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
Do you feel like you are dying or were you afraid that you were about do die?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
Do you have greenish or yellowish nasal discharge?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,1.856938,1.590656,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
Have you lost your sense of smell?,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.956432,2.054624,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000


In [36]:
# Load the questionnaire model built from the logistic-regression importances.
# This replaces the random-forest `questionairre` loaded earlier (note the file
# name is spelled "questionnaire" here, unlike the random-forest pickle).
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(f'{base_path}\\output\\questionnaire\\questionnaire_logreg.pkl', 'rb') as f:
    questionairre = pickle.load(f)

In [37]:
# Load one pickled logistic-regression classifier per disease, replacing the
# random-forest models held in `model_dict`.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
model_dict = {}
for disease in disease_list:
    # Strip everything except letters, digits, spaces, newlines and dots, then
    # use underscores for spaces. Raw string: "\." in a normal string literal is
    # an invalid escape sequence (SyntaxWarning on recent Python versions).
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_logreg_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [38]:
data_proc(diagnosis_df_valid.head(5), questionairre, feature_importance_df)

100%|██████████| 5/5 [00:00<00:00, 191.92it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     1      0      0      1      0      0      1  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      1      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      1      1      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    1.000000
 1    1.000000
 2    0.8888

In [39]:
# Run the simulated questionnaire (logistic-regression variant) over the full validation set.
# NOTE: this rebinds `diagnosis_df_valid` to the processed feature frame and
# overwrites the random-forest `hit_rate`; the cell cannot be re-run without
# reloading the raw CSV first.
diagnosis_df_valid, hit_rate = data_proc(diagnosis_df_valid, questionairre, feature_importance_df)
# sample x% of the validation dataset
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

100%|██████████| 132448/132448 [07:56<00:00, 277.96it/s]


Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [40]:
hit_rate.plot.hist()
plt.ylabel("Frequency")
plt.title("Hit rate Distribution")
plt.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_logreg.jpg', bbox_inches='tight')
plt.clf()

<Figure size 640x480 with 0 Axes>

In [41]:
# Score every patient with each per-disease logistic regression and store the
# probability from column 1 of predict_proba (presumably the positive class --
# confirm against the model's classes_) in a column named after the disease.
counter = 0
for counter, disease in enumerate(disease_list, start=1):
    logreg_model = model_dict[disease]
    class_probabilities = logreg_model.predict_proba(diagnosis_df_valid[feature_columns])
    diagnosis_df_valid[disease] = class_probabilities[:,1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [42]:
diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)

100%|██████████| 132448/132448 [04:59<00:00, 441.64it/s]


In [43]:
diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)

100%|██████████| 132448/132448 [00:01<00:00, 82825.35it/s]


In [44]:
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,1,0,0,1,...,2.964278e-05,4.156345e-25,9.971866e-06,5.167661e-07,1.204016e-05,1.259412e-05,1.749385e-07,2.262037e-11,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,5.755309e-06,9.456161e-06,7.512104e-07,1.435395e-06,1.222314e-04,7.887219e-08,9.039293e-05,2.043855e-05,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,3.128740e-11,1.331117e-30,2.137316e-09,3.441881e-04,3.258142e-08,8.664104e-10,1.472785e-06,9.539523e-09,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,4.353086e-04,1.416310e-06,2.048354e-01,9.019023e-01,9.313944e-03,1.009869e-03,4.045544e-05,3.123637e-02,[Stable angina],False
4,48,1,0,1,0,0,0,0,1,0,...,3.077436e-03,5.213556e-22,2.279438e-06,4.176768e-02,1.158820e-04,2.148152e-02,4.861522e-07,1.714267e-04,[Boerhaave],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,4.572903e-01,7.446075e-13,1.360130e-02,4.190084e-03,2.996711e-03,1.346232e-01,1.154691e-06,3.352459e-03,[Viral pharyngitis],True
132444,57,1,0,1,0,0,0,0,0,0,...,1.457259e-05,3.020683e-25,9.024875e-04,1.140563e-04,1.114355e-03,2.156294e-07,9.999477e-01,7.069921e-02,[Acute pulmonary edema],True
132445,52,0,0,1,0,0,0,0,0,0,...,3.697064e-03,1.595498e-23,8.437705e-03,5.051341e-05,4.315803e-04,6.298523e-03,1.031900e-06,1.934437e-07,[GERD],True
132446,10,1,1,1,0,0,0,0,0,0,...,1.627865e-06,2.815239e-05,2.789253e-07,8.064809e-05,4.625051e-05,1.487674e-05,3.269457e-07,1.571412e-04,[Epiglottitis],True


In [45]:
diagnosis_df_valid["hit_rate"] = hit_rate

In [46]:
diagnosis_df_valid.is_matched.value_counts()

True     109388
False     23060
Name: is_matched, dtype: int64

In [47]:
# Counts of matched / unmatched predictions, keyed by the booleans True and False.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# value_counts omits values that never occur, so fall back to 0 instead of
# raising KeyError when every prediction matched.
acc["error_rate"] = acc.get(False, 0)/len(diagnosis_df_valid)
acc

{True: 109388, False: 23060, 'error_rate': 0.1741060642667311}

In [48]:
np.mean(hit_rate)

0.8911475938996174

In [49]:
acc["mean_hit_rate"] = np.mean(hit_rate)

In [50]:
# hit rate of misdiagnosed
hit_rate_misdiagnosed = diagnosis_df_valid[diagnosis_df_valid["is_matched"]==False]["hit_rate"]
hit_rate_misdiagnosed.plot.hist()
plt.ylabel("Frequency")
plt.title("Hit rate Distribution - Misdiagnosed")
plt.savefig(f'{base_path}\\output\\questionnaire\\hit_rate_misdiagnosed_logreg.jpg', bbox_inches='tight')
plt.clf()

<Figure size 640x480 with 0 Axes>

In [51]:
np.mean(hit_rate_misdiagnosed)

0.6274497640260867

In [52]:
acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)

In [53]:
# Persist the logistic-regression validation metrics.
# `acc` still has the boolean keys True/False from value_counts; json.dump
# writes them as the strings "true"/"false".
with open(f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_logreg_questionnaire.json", "w") as outfile: 
    # NOTE(review): indent=True is a bool, which json appears to treat as indent=1 -- confirm this is intended.
    json.dump(acc, outfile, indent=True)

In [54]:
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched", "hit_rate"] + disease_list].to_csv(f"{base_path}\\output\\error_analysis_questionnaire\\validation_logreg_df_all_patients_questionnaire.csv")