In [1]:
import pandas as pd
import numpy as np
import json
import ast
import re
import os
import pickle
from tqdm import tqdm
from constants import base_path, model_list, pathology_scope, positive_threshold
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()



In [2]:
# Load the patient records used for validation. The file has no extension but is
# parsed as CSV. os.path.join is used instead of hard-coded "\\" separators so the
# path also resolves on non-Windows systems.
diagnosis_df_valid = pd.read_csv(os.path.join(base_path, "input", "release_validate_patients"))

In [3]:
# Load the conditions file; its top-level keys are the disease names.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform default.
with open(os.path.join(base_path, "input", "release_conditions.json"), encoding="utf-8") as f:
    disease_dict = json.load(f)

# A non-empty `pathology_scope` restricts the run to those diseases; otherwise
# every disease in the conditions file is evaluated.
disease_list = pathology_scope if pathology_scope else list(disease_dict)

In [4]:
# Load the evidence catalogue (symptoms and antecedents keyed by evidence code).
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform default.
with open(os.path.join(base_path, "input", "release_evidences.json"), encoding="utf-8") as f:
    evidences = json.load(f)

# only binary symptoms and antecedents: entries whose "possible-values" is empty.
# evidences_dict maps each kept code to its English question text.
evidences_dict = {
    code: spec["question_en"]
    for code, spec in evidences.items()
    if not spec["possible-values"]
}
evidences_list = list(evidences_dict)

# AGE and SEX are features as well; they simply map to their own names.
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

In [5]:
def pred(x):
    """Return the single most probable disease for one row of probabilities.

    Parameters
    ----------
    x : sequence of float (e.g. one DataFrame row)
        Per-disease probabilities ordered like the module-level
        ``disease_list``; entry ``i`` belongs to ``disease_list[i]``.

    Returns
    -------
    list
        ``[disease]`` for the highest probability that is at least
        ``positive_threshold``, or ``[]`` when no disease reaches the
        threshold. Ties for first place are resolved with a seeded random
        draw, so the choice is reproducible.
    """
    # Read values by position. Integer keys on a label-indexed Series (x[i])
    # are deprecated in pandas and will be treated as labels, so materialise
    # the values first instead of indexing the Series directly.
    probabilities = list(x)
    pred_list = [
        {"disease": disease, "probability": probabilities[i]}
        for i, disease in enumerate(disease_list)
        if probabilities[i] >= positive_threshold
    ]
    if not pred_list:
        return []

    pred_df = pd.DataFrame(pred_list).set_index('disease')
    # return only top 1 - allows ties (method='min' gives every tied leader rank 1)
    pred_df['rank'] = pred_df['probability'].rank(method='min', ascending=False)
    pred_df = pred_df.sort_values(by="rank")
    pred_df = pred_df[pred_df["rank"]<=1][["probability"]]
    if pred_df.shape[0] > 1: # in case of tied rankings
        pred_df = pred_df.sample(random_state=1)
    return list(pred_df.index)

In [6]:
def data_proc(df):
    """Build the model input frame from raw patient records.

    The input frame is left untouched; a new, independent frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``EVIDENCES`` (string representation of a list of
        evidence codes), ``SEX`` (``'F'`` / ``'M'``), ``PATHOLOGY`` and any
        column of the module-level ``feature_columns`` that is not an
        evidence code.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_columns + ["PATHOLOGY"]``. Each code in the
        module-level ``evidences_list`` becomes a 0/1 indicator column and
        ``SEX`` is encoded as F -> 0, M -> 1 (any other value becomes NaN).
    """
    def _binary_codes(raw):
        # Codes containing "@" are excluded, matching the binary-only feature set.
        return {code for code in ast.literal_eval(raw) if "@" not in code}

    # Parse each row once; sets make the membership tests below cheap.
    code_sets = [_binary_codes(raw) for raw in df["EVIDENCES"]]

    # Build all indicator columns in one frame instead of inserting them one
    # by one, which fragments the DataFrame.
    indicators = pd.DataFrame(
        {e: [1 if e in codes else 0 for codes in code_sets] for e in evidences_list},
        index=df.index,
    )

    # Indicator columns replace same-named columns already present in df.
    out = df.drop(columns=[c for c in indicators.columns if c in df.columns])
    out = pd.concat([out, indicators], axis=1)
    out["SEX"] = out["SEX"].map({'F': 0, 'M': 1})

    # .copy() so callers can add columns without SettingWithCopyWarning.
    return out[feature_columns + ["PATHOLOGY"]].copy()

In [7]:
# Keep a ground-truth label only when it is one of the evaluated diseases;
# every other label is replaced by the empty string.
in_scope = diagnosis_df_valid["PATHOLOGY"].isin(disease_list)
diagnosis_df_valid["PATHOLOGY"] = diagnosis_df_valid["PATHOLOGY"].where(in_scope, "")
diagnosis_df_valid

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,55,"[['Anemia', 0.25071110167158567], ['Atrial fib...",F,,"['E_7', 'E_24', 'E_26', 'E_53', 'E_54_@_V_180'...",E_154
1,10,"[['Guillain-Barré syndrome', 0.135558991316712...",F,,"['E_16', 'E_29', 'E_50', 'E_53', 'E_54_@_V_182...",E_171
2,68,"[['Influenza', 0.1900250899717378], ['Viral ph...",F,,"['E_50', 'E_53', 'E_54_@_V_183', 'E_54_@_V_198...",E_53
3,13,"[['Anemia', 0.18697604010451876], ['Atrial fib...",M,,"['E_7', 'E_24', 'E_26', 'E_53', 'E_54_@_V_180'...",E_53
4,48,"[['Boerhaave', 1.0]]",M,,"['E_53', 'E_54_@_V_71', 'E_54_@_V_112', 'E_54_...",E_53
...,...,...,...,...,...,...
132443,27,"[['Viral pharyngitis', 0.22702125813983617], [...",M,,"['E_41', 'E_48', 'E_53', 'E_54_@_V_161', 'E_55...",E_201
132444,57,"[['Acute pulmonary edema', 0.12078088376840804...",M,,"['E_5', 'E_53', 'E_54_@_V_154', 'E_54_@_V_183'...",E_151
132445,52,"[['GERD', 0.24494427036287517], ['Bronchitis',...",F,GERD,"['E_53', 'E_54_@_V_112', 'E_54_@_V_161', 'E_54...",E_173
132446,10,"[['Epiglottitis', 0.2969684152571116], ['HIV (...",M,,"['E_53', 'E_54_@_V_179', 'E_54_@_V_192', 'E_55...",E_91


In [8]:
# Replace the raw frame with its model-ready form (feature columns + PATHOLOGY).
# NOTE: the result no longer contains the EVIDENCES column that data_proc reads,
# so this cell cannot be re-run on its own output; reload the CSV first.
diagnosis_df_valid = data_proc(diagnosis_df_valid)
# sample x% of the validation dataset
# diagnosis_df_valid = diagnosis_df_valid.sample(frac=0.01, random_state=1)
diagnosis_df_valid

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_199,E_121,E_120,E_142,E_195,E_183,E_224,E_223,E_5,PATHOLOGY
0,55,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,
1,10,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,
2,68,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,13,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
4,48,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132443,27,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
132444,57,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,
132445,52,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GERD
132446,10,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [9]:
def validate(x):
    """Tell whether a prediction agrees with the ground truth for one row.

    Parameters
    ----------
    x : sequence (e.g. one DataFrame row)
        First entry is the true pathology (empty string when there is none),
        second entry is the list of predicted diseases.

    Returns
    -------
    bool
        With a true pathology: ``True`` only if the prediction is exactly
        that single disease. Without one: ``True`` only if nothing was
        predicted.
    """
    # Read by position. Integer keys on a label-indexed Series (x[0], x[1])
    # are deprecated in pandas, so materialise the values first.
    values = list(x)
    truth, predicted = values[0], values[1]
    if truth:
        return [truth] == predicted
    return not predicted

In [10]:
# Load one pickled model per (model family, disease) into
# model_dict[model_name][disease]. The tree-based families come from
# model_list["tree-based"]; logistic regression is loaded last, as before.
# NOTE(security): pickle.load can execute arbitrary code. Only load model files
# that were produced by a trusted training run.
model_dict = {}
for model_name in list(model_list["tree-based"]) + ["logistic_regression"]:
    model_dict[model_name] = {}
    for disease in disease_list:
        # File stem: drop every character except letters, digits, space, newline
        # and '.', then replace spaces with underscores. A raw string avoids the
        # invalid "\." escape of the non-raw pattern.
        disease_filename = re.sub(r'[^a-zA-Z0-9 \n.]', '', disease).replace(" ", "_")
        model_path = os.path.join(
            base_path, "output", "diseases", disease_filename, model_name,
            f"{disease_filename}_model.pkl",
        )
        with open(model_path, 'rb') as f:
            model_dict[model_name][disease] = pickle.load(f)

## Tree-based models

In [11]:
# For every tree-based model family: score each disease, pick the top prediction
# per patient, compare with the ground truth and persist the metrics and the
# per-patient table under output/error_analysis/<model_name>.
for model_name in model_list["tree-based"]:
    print(f"Evaluating {model_name}...")
    # The feature matrix is identical for every disease, so slice it once.
    features = diagnosis_df_valid[feature_columns]
    for counter, disease in enumerate(disease_list, start=1):
        clf_model = model_dict[model_name][disease]
        # Column 1 of predict_proba, presumably the positive class (confirm
        # against the training labels), rounded to 2 decimals.
        diagnosis_df_valid[disease] = np.round(clf_model.predict_proba(features)[:, 1], 2)
        print(f"done {counter}: {disease}")

    diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
    diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(validate, axis=1)

    # Matched / unmatched row counts; make sure the False bucket exists so the
    # error rate can always be computed.
    acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
    acc.setdefault(False, 0)
    acc["error_rate"] = acc[False]/len(diagnosis_df_valid)
    print(acc)

    # os.path.join keeps the output location valid on non-Windows systems too.
    output_path = os.path.join(base_path, "output", "error_analysis", model_name)
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "validation_metric.json"), "w") as outfile:
        # indent=1 produces the same one-space indentation as the former indent=True.
        json.dump(acc, outfile, indent=1)
    diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched"] + disease_list].to_csv(
        os.path.join(output_path, "validation_df_all_patients.csv")
    )

Evaluating decision_tree...
done 1: Tuberculosis
done 2: GERD
done 3: SLE
done 4: HIV (initial infection)
done 5: Pulmonary neoplasm


100%|██████████| 132448/132448 [00:24<00:00, 5479.72it/s]
100%|██████████| 132448/132448 [00:01<00:00, 125781.55it/s]


{True: 131963, False: 485, 'error_rate': 0.0036618144479342836}
Evaluating random_forest...
done 1: Tuberculosis
done 2: GERD
done 3: SLE
done 4: HIV (initial infection)
done 5: Pulmonary neoplasm


100%|██████████| 132448/132448 [00:27<00:00, 4833.81it/s]
100%|██████████| 132448/132448 [00:01<00:00, 94638.31it/s] 


{True: 132416, False: 32, 'error_rate': 0.00024160425223483932}
Evaluating gradient_boost...
done 1: Tuberculosis
done 2: GERD
done 3: SLE
done 4: HIV (initial infection)
done 5: Pulmonary neoplasm


100%|██████████| 132448/132448 [00:25<00:00, 5286.42it/s]
100%|██████████| 132448/132448 [00:01<00:00, 112055.87it/s]


{True: 132334, False: 114, 'error_rate': 0.0008607151485866151}


## Logistic Regression

In [12]:
# Same evaluation as for the tree-based models, run for logistic regression:
# score each disease, pick the top prediction per patient, compare with the
# ground truth and persist the results under output/error_analysis/<model_name>.
model_name = "logistic_regression"
print(f"Evaluating {model_name}...")
# The feature matrix is identical for every disease, so slice it once.
features = diagnosis_df_valid[feature_columns]
for counter, disease in enumerate(disease_list, start=1):
    clf_model = model_dict[model_name][disease]
    # Column 1 of predict_proba, presumably the positive class (confirm against
    # the training labels), rounded to 2 decimals.
    diagnosis_df_valid[disease] = np.round(clf_model.predict_proba(features)[:, 1], 2)
    print(f"done {counter}: {disease}")

diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(pred, axis=1)
diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(validate, axis=1)

# Matched / unmatched row counts; make sure the False bucket exists so the error
# rate can always be computed.
acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
acc.setdefault(False, 0)
acc["error_rate"] = acc[False]/len(diagnosis_df_valid)
print(acc)

# os.path.join keeps the output location valid on non-Windows systems too.
output_path = os.path.join(base_path, "output", "error_analysis", model_name)
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, "validation_metric.json"), "w") as outfile:
    # indent=1 produces the same one-space indentation as the former indent=True.
    json.dump(acc, outfile, indent=1)
diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis", "is_matched"] + disease_list].to_csv(
    os.path.join(output_path, "validation_df_all_patients.csv")
)

Evaluating logistic_regression...
done 1: Tuberculosis
done 2: GERD
done 3: SLE
done 4: HIV (initial infection)
done 5: Pulmonary neoplasm


100%|██████████| 132448/132448 [00:24<00:00, 5454.69it/s]
100%|██████████| 132448/132448 [00:01<00:00, 129403.23it/s]


{True: 132384, False: 64, 'error_rate': 0.00048320850446967865}
