In [1]:
import ast
import json
import pickle
import re
import warnings
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from constants import base_path

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
# Load the validation patient records. The path is built with pathlib so it
# resolves on any OS instead of relying on hard-coded "\\" separators.
diagnosis_df_valid = pd.read_csv(Path(base_path) / "input" / "release_validate_patients")

In [3]:
# Load the condition catalogue; its top-level keys are the disease names.
# JSON is read explicitly as UTF-8 so non-ASCII names do not depend on the
# platform's default encoding.
conditions_path = Path(base_path) / "input" / "release_conditions.json"
with open(conditions_path, encoding="utf-8") as f:
    disease_dict = json.load(f)
disease_list = list(disease_dict.keys())

In [4]:
# Load the evidence catalogue (read explicitly as UTF-8, OS-independent path).
evidences_path = Path(base_path) / "input" / "release_evidences.json"
with open(evidences_path, encoding="utf-8") as f:
    evidences = json.load(f)

# Keep only binary symptoms: entries with no "possible-values" that are not
# antecedents. evidences_dict maps each kept code to its English question.
evidences_dict = {
    code: meta["question_en"]
    for code, meta in evidences.items()
    if not meta["possible-values"] and not meta["is_antecedent"]
}
evidences_list = list(evidences_dict)

# AGE and SEX are features too; they map to themselves as display labels.
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

## Random Forest

In [5]:
# Load one pickled random-forest model per disease, keyed by disease name.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by this project's own training run.
diseases_dir = Path(base_path) / "output" / "diseases"
model_dict = {}
for disease in disease_list:
    # Derive the on-disk name: strip everything except letters, digits, spaces,
    # newlines and dots, then turn spaces into underscores. A raw string avoids
    # the invalid "\." escape of the original pattern (same character class).
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    model_path = diseases_dir / disease_filename / f"{disease_filename}_model.pkl"
    with open(model_path, 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [6]:
def data_proc(df):
    """Turn raw patient records into the numeric feature frame used by the models.

    Args:
        df: DataFrame containing "EVIDENCES" (string repr of a Python list of
            evidence codes), "SEX" ('F'/'M'), "PATHOLOGY" and every non-evidence
            column named in the module-level ``feature_columns``.

    Returns:
        A new, independent DataFrame with columns ``feature_columns + ["PATHOLOGY"]``:
        one 0/1 indicator per code in ``evidences_list`` and SEX encoded as
        F -> 0, M -> 1 (any other value becomes NaN). The input frame is not
        modified.
    """
    # Parse each evidence string into a set for O(1) membership tests. Codes
    # containing "@" are skipped (presumably value-carrying, non-binary
    # evidences -- confirm against the dataset description).
    present_codes = df["EVIDENCES"].apply(
        lambda raw: {code for code in ast.literal_eval(raw) if "@" not in code}
    )

    # Build all indicator columns first and attach them in one concat; adding
    # them to the frame one by one fragments it and mutates the caller's data.
    indicator_columns = {}
    for e in evidences_list:
        indicator_columns[e] = present_codes.apply(lambda codes: 1 if e in codes else 0)
    indicators = pd.DataFrame(indicator_columns, index=df.index)

    enriched = pd.concat(
        [df.drop(columns=evidences_list, errors="ignore"), indicators], axis=1
    )
    enriched["SEX"] = enriched["SEX"].map({'F': 0, 'M': 1})

    # .copy() returns a standalone frame so callers can add columns to it
    # without chained-assignment warnings.
    return enriched[feature_columns + ["PATHOLOGY"]].copy()

In [7]:
# Build the model feature frame. The guard keeps this cell re-runnable:
# data_proc() drops the raw "EVIDENCES" column, so calling it again on the
# already-processed frame would raise a KeyError.
if "EVIDENCES" in diagnosis_df_valid.columns:
    diagnosis_df_valid = data_proc(diagnosis_df_valid)
# Optional: uncomment to work on an x% sample of the validation dataset.
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,55,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Anemia
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,Panic attack
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Anemia
4,48,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Viral pharyngitis
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,Acute pulmonary edema
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [8]:
# Score every patient with each per-disease random-forest model and store the
# result in a column named after the disease.
# The feature slice does not change inside the loop, so take it once.
feature_matrix = diagnosis_df_valid[feature_columns]
for counter, disease in enumerate(disease_list, start=1):
    rf_model = model_dict[disease]
    # Column 1 of predict_proba is kept -- presumably the "has disease" class.
    # NOTE(review): confirm against rf_model.classes_.
    diagnosis_df_valid[disease] = rf_model.predict_proba(feature_matrix)[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [9]:
# Display the validation frame, now including one probability column per disease.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Pneumonia,Acute rhinosinusitis,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis
0,55,0,0,1,0,0,1,0,0,1,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000
1,10,0,0,1,0,0,0,1,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.003
2,68,0,1,1,0,1,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000
3,13,1,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000
4,48,1,0,1,0,0,0,0,1,0,...,0.0,0.000000,0.000000,0.0,0.00,0.09,0.00,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.0,0.414463,0.702787,0.0,0.00,0.00,0.00,0.0,0.0,0.000
132444,57,1,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.01,0.00,0.00,0.0,1.0,0.010
132445,52,0,0,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000
132446,10,1,1,1,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000


In [10]:
def pred(x):
    """Return the top-ranked disease name(s) for one row of probabilities.

    Args:
        x: Per-disease probabilities in the same order as the module-level
            ``disease_list``. May be a pandas Series (accessed by position via
            ``.iloc``) or any integer-indexable sequence.

    Returns:
        List of disease names whose probability is the highest among those
        strictly greater than 0. Ties are all returned, in ``disease_list``
        order. Returns ``[]`` when no probability is positive.
    """
    # Label-indexed Series must be read positionally through .iloc; plain
    # x[i] on such a Series relies on deprecated positional fallback.
    values = x.iloc if hasattr(x, "iloc") else x

    positive = [
        (disease_list[i], values[i])
        for i in range(len(disease_list))
        if values[i] > 0
    ]
    if not positive:
        return []

    # return only top 1 - allows ties
    best = max(probability for _, probability in positive)
    return [disease for disease, probability in positive if probability == best]

In [11]:
# Row by row, reduce the per-disease probabilities to the top-ranked disease name(s).
top_diagnoses = diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
diagnosis_df_valid["predicted_diagnosis"] = top_diagnoses

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [03:41<00:00, 597.10it/s]


In [12]:
def validate(x):
    """Check whether the prediction is exactly the single true pathology.

    Args:
        x: Two-element row: position 0 is the true pathology, position 1 is
            the list of predicted diagnoses. May be a pandas Series (read by
            position via ``.iloc``) or any integer-indexable sequence.

    Returns:
        True only when the predicted list equals ``[true_pathology]``; an empty
        prediction or a tie (more than one name) therefore counts as a miss.
    """
    # .iloc avoids the deprecated positional fallback of x[0] / x[1] on a
    # Series whose index holds string labels.
    values = x.iloc if hasattr(x, "iloc") else x
    return [values[0]] == values[1]

In [13]:
# Flag each row whose predicted diagnosis list is exactly the recorded pathology.
label_and_prediction = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]]
diagnosis_df_valid["is_matched"] = label_and_prediction.progress_apply(validate, axis=1)

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [00:01<00:00, 104622.03it/s]


In [14]:
# Display the frame with the predicted_diagnosis and is_matched columns added.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,1,0,0,1,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.003,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[Anemia],True
4,48,1,0,1,0,0,0,0,1,0,...,0.000000,0.0,0.00,0.09,0.00,0.0,0.0,0.000,[Boerhaave],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0.702787,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[Acute otitis media],False
132444,57,1,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.01,0.00,0.00,0.0,1.0,0.010,[Acute pulmonary edema],True
132445,52,0,0,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[GERD],True
132446,10,1,1,1,0,0,0,0,0,0,...,0.000000,0.0,0.00,0.00,0.00,0.0,0.0,0.000,[Epiglottitis],True


In [15]:
# Count matched (True) vs. unmatched (False) predictions.
diagnosis_df_valid.is_matched.value_counts()

True     119852
False     12596
Name: is_matched, dtype: int64

In [16]:
# Summarise matched / unmatched counts and derive the error rate.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# .get(): value_counts() has no False entry when every prediction matched,
# in which case the error rate is simply 0.
acc["error_rate"] = acc.get(False, 0) / len(diagnosis_df_valid)
acc

{True: 119852, False: 12596, 'error_rate': 0.09510147378593864}

In [17]:
# Persist the random-forest validation metrics. The path is OS-independent and
# the output folder is created on demand so a fresh checkout does not fail.
error_analysis_dir = Path(base_path) / "output" / "error_analysis"
error_analysis_dir.mkdir(parents=True, exist_ok=True)
with open(error_analysis_dir / "validation_metric.json", "w", encoding="utf-8") as outfile:
    # indent=1 is what the former indent=True evaluated to (one space per level).
    json.dump(acc, outfile, indent=1)

In [18]:
# Save labels, predictions and per-disease probabilities for error analysis
# (OS-independent path; the output folder is created if it is missing).
error_analysis_dir = Path(base_path) / "output" / "error_analysis"
error_analysis_dir.mkdir(parents=True, exist_ok=True)
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched"] + disease_list].to_csv(
    error_analysis_dir / "validation_df_all_patients.csv"
)

## Logistic Regression

In [19]:
# Load one pickled logistic-regression model per disease, keyed by disease name.
# This rebinds model_dict, replacing whatever models it held before.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by this project's own training run.
diseases_dir = Path(base_path) / "output" / "diseases"
model_dict = {}
for disease in disease_list:
    # Derive the on-disk name: strip everything except letters, digits, spaces,
    # newlines and dots, then turn spaces into underscores. A raw string avoids
    # the invalid "\." escape of the original pattern (same character class).
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
    model_path = diseases_dir / disease_filename / f"{disease_filename}_logreg_model.pkl"
    with open(model_path, 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [20]:
# Score every patient with each per-disease logistic-regression model. The
# results overwrite the per-disease probability columns of diagnosis_df_valid.
# The feature slice does not change inside the loop, so take it once.
feature_matrix = diagnosis_df_valid[feature_columns]
for counter, disease in enumerate(disease_list, start=1):
    logreg_model = model_dict[disease]
    # Column 1 of predict_proba is kept -- presumably the "has disease" class.
    # NOTE(review): confirm against logreg_model.classes_.
    diagnosis_df_valid[disease] = logreg_model.predict_proba(feature_matrix)[:, 1]
    print(f"done {counter}: {disease}")

done 1: Spontaneous pneumothorax
done 2: Cluster headache
done 3: Boerhaave
done 4: Spontaneous rib fracture
done 5: GERD
done 6: HIV (initial infection)
done 7: Anemia
done 8: Viral pharyngitis
done 9: Inguinal hernia
done 10: Myasthenia gravis
done 11: Whooping cough
done 12: Anaphylaxis
done 13: Epiglottitis
done 14: Guillain-Barré syndrome
done 15: Acute laryngitis
done 16: Croup
done 17: PSVT
done 18: Atrial fibrillation
done 19: Bronchiectasis
done 20: Allergic sinusitis
done 21: Chagas
done 22: Scombroid food poisoning
done 23: Myocarditis
done 24: Larygospasm
done 25: Acute dystonic reactions
done 26: Localized edema
done 27: SLE
done 28: Tuberculosis
done 29: Unstable angina
done 30: Stable angina
done 31: Ebola
done 32: Acute otitis media
done 33: Panic attack
done 34: Bronchospasm / acute asthma exacerbation
done 35: Bronchitis
done 36: Acute COPD exacerbation / infection
done 37: Pulmonary embolism
done 38: URTI
done 39: Influenza
done 40: Pneumonia
done 41: Acute rhinosinu

In [21]:
# Row by row, reduce the per-disease probabilities to the top-ranked disease name(s).
top_diagnoses = diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
diagnosis_df_valid["predicted_diagnosis"] = top_diagnoses

  0%|          | 0/132448 [00:00<?, ?it/s]

100%|██████████| 132448/132448 [03:58<00:00, 554.29it/s]


In [22]:
# Flag each row whose predicted diagnosis list is exactly the recorded pathology.
label_and_prediction = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]]
diagnosis_df_valid["is_matched"] = label_and_prediction.progress_apply(validate, axis=1)

100%|██████████| 132448/132448 [00:01<00:00, 112001.88it/s]


In [23]:
# Display the frame after the probability, predicted_diagnosis and is_matched
# columns were recomputed in the cells above.
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,Chronic rhinosinusitis,Bronchiolitis,Pulmonary neoplasm,Possible NSTEMI / STEMI,Sarcoidosis,Pancreatic neoplasm,Acute pulmonary edema,Pericarditis,predicted_diagnosis,is_matched
0,55,0,0,1,0,0,1,0,0,1,...,2.964278e-05,4.156345e-25,9.971866e-06,5.167661e-07,1.204016e-05,1.259412e-05,1.749385e-07,2.262037e-11,[Anemia],True
1,10,0,0,1,0,0,0,1,0,0,...,5.755309e-06,9.456161e-06,7.512104e-07,1.435395e-06,1.222314e-04,7.887219e-08,9.039293e-05,2.043855e-05,[Panic attack],True
2,68,0,1,1,0,1,0,0,0,0,...,1.756812e-10,1.120161e-29,4.021024e-11,1.587541e-06,4.225942e-09,2.795075e-11,1.416207e-06,1.072775e-10,[Influenza],True
3,13,1,0,1,0,0,0,0,0,0,...,2.853150e-05,1.370537e-06,9.743676e-04,1.436016e-04,4.686427e-04,5.978126e-06,1.461622e-05,1.750324e-08,[Anemia],True
4,48,1,0,1,0,0,0,0,1,0,...,3.077436e-03,5.213556e-22,2.279438e-06,4.176768e-02,1.158820e-04,2.148152e-02,4.861522e-07,1.714267e-04,[Boerhaave],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,8.255195e-01,6.266019e-12,2.593481e-04,1.940074e-05,3.897018e-04,4.993563e-03,1.110333e-06,3.782573e-05,[Viral pharyngitis],True
132444,57,1,0,1,0,0,0,0,0,0,...,1.457259e-05,3.020683e-25,9.024875e-04,1.140563e-04,1.114355e-03,2.156294e-07,9.999477e-01,7.069921e-02,[Acute pulmonary edema],True
132445,52,0,0,1,0,0,0,0,0,0,...,3.697064e-03,1.595498e-23,8.437705e-03,5.051341e-05,4.315803e-04,6.298523e-03,1.031900e-06,1.934437e-07,[GERD],True
132446,10,1,1,1,0,0,0,0,0,0,...,1.627865e-06,2.815239e-05,2.789253e-07,8.064809e-05,4.625051e-05,1.487674e-05,3.269457e-07,1.571412e-04,[Epiglottitis],True


In [24]:
# Count matched (True) vs. unmatched (False) predictions.
diagnosis_df_valid.is_matched.value_counts()

True     125078
False      7370
Name: is_matched, dtype: int64

In [25]:
# Summarise matched / unmatched counts and derive the error rate.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
# .get(): value_counts() has no False entry when every prediction matched,
# in which case the error rate is simply 0.
acc["error_rate"] = acc.get(False, 0) / len(diagnosis_df_valid)
acc

{True: 125078, False: 7370, 'error_rate': 0.055644479342836436}

In [26]:
# Persist the logistic-regression validation metrics. The path is OS-independent
# and the output folder is created on demand so a fresh checkout does not fail.
error_analysis_dir = Path(base_path) / "output" / "error_analysis"
error_analysis_dir.mkdir(parents=True, exist_ok=True)
with open(error_analysis_dir / "validation_metric_logreg.json", "w", encoding="utf-8") as outfile:
    # indent=1 is what the former indent=True evaluated to (one space per level).
    json.dump(acc, outfile, indent=1)

In [27]:
# Save labels, predictions and per-disease probabilities for error analysis
# (OS-independent path; the output folder is created if it is missing).
error_analysis_dir = Path(base_path) / "output" / "error_analysis"
error_analysis_dir.mkdir(parents=True, exist_ok=True)
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched"] + disease_list].to_csv(
    error_analysis_dir / "validation_logreg_df_all_patients.csv"
)