In [1]:
import pandas as pd
import numpy as np
import json
import ast
import re
import pickle
from tqdm import tqdm
from constants import base_path
import matplotlib.pyplot as plt
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the condition catalogue; its top-level keys are used as the disease names.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform
# default encoding, which is not UTF-8 on every system and would corrupt any
# non-ASCII disease name.
with open(f"{base_path}\\input\\release_conditions.json", encoding="utf-8") as f:
  disease_dict = json.load(f)
disease_list = list(disease_dict.keys())

In [3]:
# Load the evidence catalogue. Decode explicitly as UTF-8 rather than with the
# platform default encoding, so non-ASCII text in the file is read correctly.
with open(f"{base_path}\\input\\release_evidences.json", encoding="utf-8") as f:
  evidences = json.load(f)

evidences_list = []        # evidence codes kept as model features
evidences_dict = {}        # evidence code -> English question
evidences_en_to_code = {}  # English question -> evidence code
for code, spec in evidences.items():
  # only binary symptoms and no antecedents
  if (not spec["possible-values"]) and (not spec["is_antecedent"]):
    evidences_list.append(code)
    evidences_dict[code] = spec["question_en"]
    evidences_en_to_code[spec["question_en"]] = code

# NOTE: this is an alias, not a copy - the AGE/SEX entries added to
# evidences_dict below are therefore visible through evidences_code_to_en too.
evidences_code_to_en = evidences_dict
evidences_list_en = list(evidences_en_to_code.keys())
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

In [4]:
def get_next_question(evidences, questionnaire, feature_embeddings_df):
    """Return candidate follow-up questions ordered as the neighbour search returns them.

    The embeddings of the evidence collected so far are averaged into a centroid,
    the centroid's neighbours are looked up in ``questionnaire``, and every
    neighbour that is not already part of ``evidences`` is returned.

    Parameters
    ----------
    evidences : sequence of str
        Labels of the evidence collected so far; each must be a row label of
        ``feature_embeddings_df``.
    questionnaire : object
        Anything exposing ``kneighbors([vector])`` that returns
        ``(distances, indices)``.
        # assumes the returned indices are positions in ``evidences_list_en`` - TODO confirm
    feature_embeddings_df : pandas.DataFrame
        One embedding row per evidence label.

    Returns
    -------
    list of str
        Candidate questions, possibly empty.
    """
    # Nothing to average: the previous bare try/except suggests an empty list was
    # the intended "no question" result, but it only wrapped the return statement
    # and could never trigger.
    if len(evidences) == 0:
        return []

    known = set(evidences)
    centroid = np.array([feature_embeddings_df.loc[e].values for e in evidences]).mean(axis=0)
    _, indices = questionnaire.kneighbors([centroid])
    return [evidences_list_en[i] for i in indices[0] if evidences_list_en[i] not in known]

In [5]:
def get_evidences(answers, user_evidences, questionnaire, feature_embeddings_df, n_questions):
    """Simulate the questionnaire for one patient and return the evidence it confirms.

    Starting from ``user_evidences``, candidate questions are requested from
    ``get_next_question`` and asked in order. A question is answered "yes" when it
    belongs to ``answers``; a "yes" is added to the collected evidence and triggers
    a fresh candidate list. The simulation stops when ``n_questions`` questions
    have been counted or when no unasked candidate is left.

    Parameters
    ----------
    answers : iterable of str
        Evidence codes the patient actually has; codes outside ``evidences_list``
        are ignored.
    user_evidences : iterable of str
        Evidence codes known before any question is asked.
    questionnaire, feature_embeddings_df
        Forwarded unchanged to ``get_next_question``.
    n_questions : int
        Question budget; the initial evidence counts as the first question.

    Returns
    -------
    list of str
        Evidence codes collected: the initial ones followed by every confirmed one.
    """
    question_counter = 1  # counts initial evidence as q0
    evidences_en = [evidences_code_to_en[e] for e in user_evidences]
    answers_en = {evidences_code_to_en[e] for e in answers if e in evidences_list}
    asked = set(evidences_en)

    while question_counter < n_questions:
        candidates = get_next_question(evidences_en, questionnaire, feature_embeddings_df)
        confirmed = False
        for candidate in candidates:
            if question_counter >= n_questions:
                break
            if candidate in asked:
                continue
            asked.add(candidate)
            question_counter += 1
            if candidate in answers_en:
                # A positive answer moves the centroid, so fetch new candidates.
                evidences_en.append(candidate)
                confirmed = True
                break
        if not confirmed:
            # Budget used up, or every candidate was already asked. The evidence
            # set did not change, so another lookup would return the same list;
            # stop here instead of indexing past the end of it.
            break

    return [evidences_en_to_code[e] for e in evidences_en]

In [6]:
def data_proc(df, questionnaire, feature_embeddings_df, n_questions):
    """Run the simulated questionnaire on every patient and build model features.

    Parameters
    ----------
    df : pandas.DataFrame
        Patient records. The code reads the columns ``EVIDENCES`` (string holding
        a Python list literal of evidence codes), ``INITIAL_EVIDENCE``, ``SEX``
        (``'F'``/``'M'``), ``PATHOLOGY`` and the columns named in
        ``feature_columns``. The frame is not modified.
    questionnaire, feature_embeddings_df
        Forwarded unchanged to ``get_evidences``.
    n_questions : int
        Question budget per patient.

    Returns
    -------
    ftr_df : pandas.DataFrame
        ``feature_columns`` plus ``PATHOLOGY``; each evidence column is 1 when the
        questionnaire collected that evidence, else 0.
    hit_rate : pandas.Series
        Collected binary evidence count divided by the patient's total binary
        evidence count (not guarded against a zero denominator).
    questionnaire_df : pandas.DataFrame
        Full and collected binary evidence lists plus the initial evidence.
    """
    # Work on a private copy so the caller's frame is left untouched and the
    # function can be called repeatedly on the same input.
    df = df.copy()

    df["binary_evidences_all"] = df["EVIDENCES"].apply(
        lambda raw: [d for d in ast.literal_eval(raw) if d in evidences_list])
    df["binary_evidences_all_count"] = df["binary_evidences_all"].apply(len)

    # Address the row by column label: integer keys on a label-indexed Series are
    # a deprecated positional fallback in pandas.
    df["binary_evidences"] = df[["EVIDENCES", "INITIAL_EVIDENCE"]].progress_apply(
        lambda row: get_evidences(
            ast.literal_eval(row["EVIDENCES"]),
            [row["INITIAL_EVIDENCE"]],
            questionnaire,
            feature_embeddings_df,
            n_questions),
        axis=1)
    df["binary_evidences_count"] = df["binary_evidences"].apply(len)
    df["hit_rate"] = df["binary_evidences_count"]/df["binary_evidences_all_count"]
    hit_rate = df["hit_rate"]

    # One 0/1 indicator column per binary evidence code.
    collected = [set(codes) for codes in df["binary_evidences"]]
    for e in evidences_list:
        df[e] = [1 if e in found else 0 for found in collected]

    df["SEX"] = df["SEX"].map({'F': 0, 'M': 1})

    # Return independent frames so callers can add columns to them freely.
    ftr_df = df[feature_columns + ["PATHOLOGY"]].copy()
    questionnaire_df = df[["binary_evidences_all", "binary_evidences", "INITIAL_EVIDENCE"]].copy()
    return ftr_df, hit_rate, questionnaire_df

In [7]:
def pred(x):
    """Return the top-ranked disease(s) for one row of per-disease scores.

    Parameters
    ----------
    x : sequence of float
        One score per disease, read by position.
        # assumes x is ordered like ``disease_list`` - TODO confirm against the caller

    Returns
    -------
    list of str
        Every disease whose score is strictly positive and equal to the highest
        score (ties are all returned, in ``disease_list`` order); ``[]`` when no
        score is positive.
    """
    # Read the scores positionally from a plain array: integer keys on a
    # label-indexed Series are a deprecated fallback in pandas, and building a
    # DataFrame per row just to rank a few numbers is needlessly slow.
    scores = np.asarray(x)
    positive = [(disease_list[i], scores[i]) for i in range(len(disease_list)) if scores[i] > 0]
    if not positive:
        return []
    # return only top 1 - allows ties
    best = max(score for _, score in positive)
    return [disease for disease, score in positive if score == best]

In [8]:
def get_missed_evidence(actual, asked):
    """Return the evidence in ``actual`` that does not appear in ``asked``.

    The previous symmetric difference also returned items that were asked but are
    not in ``actual``, which are not "missed" evidence. When ``asked`` is a subset
    of ``actual`` both definitions give the same items.

    Parameters
    ----------
    actual : iterable of hashable
        Evidence the patient really has.
    asked : iterable of hashable
        Evidence collected by the questionnaire.

    Returns
    -------
    list
        Unique missed items, in order of first appearance in ``actual``.
    """
    asked_set = set(asked)
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return [item for item in dict.fromkeys(actual) if item not in asked_set]

In [9]:
def validate(x):
    """Tell whether the prediction is exactly the single true pathology.

    Parameters
    ----------
    x : sequence
        Two items read by position: the true pathology, then the list of
        predicted diseases.

    Returns
    -------
    bool
        True only when the predicted list contains the true pathology and nothing
        else; a tie that includes the true pathology is therefore a mismatch.
    """
    # Materialise the values first: integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    values = list(x)
    return [values[0]] == values[1]

In [10]:
# Restore the two questionnaire artefacts from disk: the pickled questionnaire
# object and the feature-embedding table.
# NOTE: unpickling executes code stored in the file - only load trusted files.
questionnaire_pkl_path = f'{base_path}\\output\\questionnaire\\questionnaire.pkl'
with open(questionnaire_pkl_path, 'rb') as pkl_file:
    questionnaire = pickle.load(pkl_file)

embeddings_pkl_path = f'{base_path}\\output\\questionnaire\\questionnaire_embeddings.pkl'
feature_embeddings_df = pd.read_pickle(embeddings_pkl_path)

In [11]:
# Validation patients, one row per patient.
validate_patients_path = f"{base_path}\\input\\release_validate_patients"
diagnosis_df_valid = pd.read_csv(validate_patients_path)

In [12]:
# Smoke-test the pipeline on five patients with a budget of 20 questions.
# Pass an independent copy rather than a head() slice, so nothing done to the
# sample can touch, or raise chained-assignment warnings about, diagnosis_df_valid.
data_proc(diagnosis_df_valid.head(5).copy(), questionnaire, feature_embeddings_df, n_questions=20)

100%|██████████| 5/5 [00:00<00:00, 168.53it/s]


(   AGE  SEX  E_91  E_53  E_159  E_129  E_154  E_155  E_210  E_140  ...  E_193  \
 0   55    0     0     1      0      0      1      0      0      1  ...      0   
 1   10    0     0     1      0      0      0      1      0      0  ...      0   
 2   68    0     1     1      0      1      0      0      0      0  ...      0   
 3   13    1     0     1      0      0      0      0      0      0  ...      0   
 4   48    1     0     1      0      0      0      0      0      0  ...      0   
 
    E_168  E_180  E_67  E_171  E_111  E_182  E_103  E_23     PATHOLOGY  
 0      0      0     0      0      0      0      0     0        Anemia  
 1      0      0     0      1      1      0      0     0  Panic attack  
 2      0      0     0      0      0      0      0     0     Influenza  
 3      0      0     0      0      0      0      0     0        Anemia  
 4      0      0     0      0      0      0      0     0     Boerhaave  
 
 [5 rows x 99 columns],
 0    1.000000
 1    1.000000
 2    1.0000

In [13]:
# Simulate the questionnaire for every question budget from 1 to 30 and keep the
# resulting features, hit rates and question logs for the model evaluations below.
questionnaire_dict = {}
n_questions_list = list(range(1, 31))

# The input file does not change between budgets: read it once instead of once
# per iteration, and give every run its own copy so each starts from pristine data.
validate_patients_raw = pd.read_csv(f"{base_path}\\input\\release_validate_patients")

for n_questions in n_questions_list:
    print(f"Asking {n_questions} questions...")
    diagnosis_df_valid, hit_rate, questionnaire_df = data_proc(validate_patients_raw.copy(), questionnaire, feature_embeddings_df, n_questions)
    questionnaire_dict[f"questionnaire_data_{n_questions}"] = {
        "diagnosis_df_valid": diagnosis_df_valid,
        "hit_rate": hit_rate,
        "questionnaire_df": questionnaire_df
    }

Asking 1 questions...


100%|██████████| 132448/132448 [00:07<00:00, 17119.25it/s]


Asking 2 questions...


100%|██████████| 132448/132448 [02:16<00:00, 970.27it/s] 


Asking 3 questions...


100%|██████████| 132448/132448 [02:36<00:00, 844.00it/s]


Asking 4 questions...


100%|██████████| 132448/132448 [04:04<00:00, 540.99it/s]


Asking 5 questions...


100%|██████████| 132448/132448 [03:44<00:00, 589.00it/s]


Asking 6 questions...


100%|██████████| 132448/132448 [04:05<00:00, 538.55it/s] 


Asking 7 questions...


100%|██████████| 132448/132448 [04:32<00:00, 486.63it/s]


Asking 8 questions...


100%|██████████| 132448/132448 [10:01<00:00, 220.19it/s]


Asking 9 questions...


100%|██████████| 132448/132448 [08:25<00:00, 262.17it/s]


Asking 10 questions...


100%|██████████| 132448/132448 [06:55<00:00, 318.44it/s]


Asking 11 questions...


100%|██████████| 132448/132448 [05:05<00:00, 432.85it/s]


Asking 12 questions...


100%|██████████| 132448/132448 [05:21<00:00, 412.02it/s]


Asking 13 questions...


100%|██████████| 132448/132448 [07:38<00:00, 288.99it/s]


Asking 14 questions...


100%|██████████| 132448/132448 [06:00<00:00, 367.15it/s]


Asking 15 questions...


100%|██████████| 132448/132448 [10:55<00:00, 202.10it/s]


Asking 16 questions...


100%|██████████| 132448/132448 [11:33<00:00, 191.09it/s]


Asking 17 questions...


100%|██████████| 132448/132448 [12:03<00:00, 182.97it/s]


Asking 18 questions...


100%|██████████| 132448/132448 [12:07<00:00, 182.07it/s]


Asking 19 questions...


100%|██████████| 132448/132448 [12:02<00:00, 183.30it/s]


Asking 20 questions...


100%|██████████| 132448/132448 [11:51<00:00, 186.14it/s]


Asking 21 questions...


100%|██████████| 132448/132448 [12:40<00:00, 174.12it/s]


Asking 22 questions...


100%|██████████| 132448/132448 [12:47<00:00, 172.51it/s]


Asking 23 questions...


100%|██████████| 132448/132448 [07:44<00:00, 285.02it/s]


Asking 24 questions...


100%|██████████| 132448/132448 [06:44<00:00, 327.79it/s]


Asking 25 questions...


100%|██████████| 132448/132448 [06:40<00:00, 331.01it/s]


Asking 26 questions...


100%|██████████| 132448/132448 [07:17<00:00, 302.42it/s]


Asking 27 questions...


100%|██████████| 132448/132448 [12:52<00:00, 171.41it/s]


Asking 28 questions...


100%|██████████| 132448/132448 [11:26<00:00, 193.00it/s]


Asking 29 questions...


100%|██████████| 132448/132448 [09:52<00:00, 223.38it/s]


Asking 30 questions...


100%|██████████| 132448/132448 [20:02<00:00, 110.17it/s]


In [14]:
# One empty result bucket per model family; each evaluation cell fills its own.
metrics_dict = {
    model_name: {}
    for model_name in ("Random_forest", "Logistic_regression", "Decision_Tree")
}

## Random Forest

In [15]:
# Load one pickled model per disease from the disease's output folder.
# NOTE: unpickling executes code stored in the file - only load trusted files.
model_dict = {}
for disease in disease_list:
    # Folder/file stem: drop every character that is not a letter, digit, space,
    # newline or dot, then turn spaces into underscores. The pattern is a raw
    # string because "\." is an invalid escape sequence in a normal literal.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [16]:
# Evaluate the per-disease models in model_dict for every question budget and
# store the results under metrics_dict["Random_forest"].
for n_questions in n_questions_list:
    # Artefacts stored by the questionnaire simulation for this budget.
    diagnosis_df_valid = questionnaire_dict[f"questionnaire_data_{n_questions}"]["diagnosis_df_valid"]
    hit_rate = questionnaire_dict[f"questionnaire_data_{n_questions}"]["hit_rate"]
    questionnaire_df = questionnaire_dict[f"questionnaire_data_{n_questions}"]["questionnaire_df"]
    print(f"using {n_questions} questions predicting diagnosis...")
    # One model per disease; column index 1 of predict_proba is kept as that
    # disease's score.
    for disease in disease_list:
        rf_model = model_dict[disease]
        diagnosis_df_valid[disease] = rf_model.predict_proba(diagnosis_df_valid[feature_columns])[:,1]
    diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)
    diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)
    diagnosis_df_valid["hit_rate"] = hit_rate
    print("validating...")
    acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
    # value_counts() omits outcomes that never occur, so a run without a single
    # misdiagnosis has no False key: count that as zero errors instead of
    # raising KeyError.
    acc["error_rate"] = acc.get(False, 0)/len(diagnosis_df_valid)
    acc["mean_hit_rate"] = np.mean(hit_rate)
    hit_rate_misdiagnosed = diagnosis_df_valid[diagnosis_df_valid["is_matched"]==False]["hit_rate"]
    acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)
    metrics_dict["Random_forest"][f"n_questions_{n_questions}"] = acc
        


predicting diagnosis...


100%|██████████| 132448/132448 [10:30<00:00, 210.21it/s]
100%|██████████| 132448/132448 [00:02<00:00, 61331.66it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [06:17<00:00, 351.31it/s]
100%|██████████| 132448/132448 [00:03<00:00, 43442.78it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [09:47<00:00, 225.47it/s]
100%|██████████| 132448/132448 [00:02<00:00, 49882.24it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [10:07<00:00, 218.17it/s]
100%|██████████| 132448/132448 [00:02<00:00, 49328.80it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [09:53<00:00, 223.03it/s]
100%|██████████| 132448/132448 [00:02<00:00, 48651.17it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [08:51<00:00, 249.17it/s]
100%|██████████| 132448/132448 [00:02<00:00, 44633.65it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:06<00:00, 431.92it/s]
100%|██████████| 132448/132448 [00:01<00:00, 96214.86it/s] 


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [08:47<00:00, 250.92it/s]
100%|██████████| 132448/132448 [00:01<00:00, 96202.43it/s] 


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [11:50<00:00, 186.41it/s]
100%|██████████| 132448/132448 [00:02<00:00, 44622.04it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [08:18<00:00, 265.84it/s]
100%|██████████| 132448/132448 [00:02<00:00, 58615.09it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:29<00:00, 401.83it/s]
100%|██████████| 132448/132448 [00:01<00:00, 67656.30it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [07:22<00:00, 299.19it/s]
100%|██████████| 132448/132448 [00:01<00:00, 67515.41it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:54<00:00, 373.80it/s]
100%|██████████| 132448/132448 [00:01<00:00, 113371.20it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:32<00:00, 398.24it/s]
100%|██████████| 132448/132448 [00:01<00:00, 86016.72it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:53<00:00, 450.61it/s]
100%|██████████| 132448/132448 [00:01<00:00, 89666.41it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:38<00:00, 475.45it/s]
100%|██████████| 132448/132448 [00:01<00:00, 99763.56it/s] 


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:36<00:00, 478.92it/s]
100%|██████████| 132448/132448 [00:01<00:00, 79069.81it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:05<00:00, 540.26it/s]
100%|██████████| 132448/132448 [00:01<00:00, 112739.48it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:54<00:00, 373.64it/s]
100%|██████████| 132448/132448 [00:01<00:00, 86022.14it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:31<00:00, 488.59it/s]
100%|██████████| 132448/132448 [00:01<00:00, 114613.88it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:24<00:00, 500.23it/s]
100%|██████████| 132448/132448 [00:01<00:00, 108609.71it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [08:11<00:00, 269.39it/s]
100%|██████████| 132448/132448 [00:02<00:00, 56565.78it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [08:44<00:00, 252.32it/s]
100%|██████████| 132448/132448 [00:02<00:00, 52915.14it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [07:11<00:00, 307.25it/s]
100%|██████████| 132448/132448 [00:01<00:00, 72863.00it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:42<00:00, 386.47it/s]
100%|██████████| 132448/132448 [00:01<00:00, 89082.10it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:16<00:00, 418.09it/s]
100%|██████████| 132448/132448 [00:01<00:00, 93153.41it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:01<00:00, 439.39it/s]
100%|██████████| 132448/132448 [00:01<00:00, 93764.26it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [05:24<00:00, 407.84it/s]
100%|██████████| 132448/132448 [00:01<00:00, 126070.62it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:33<00:00, 484.43it/s]
100%|██████████| 132448/132448 [00:01<00:00, 115908.30it/s]


validating...
predicting diagnosis...


100%|██████████| 132448/132448 [04:36<00:00, 479.86it/s]
100%|██████████| 132448/132448 [00:01<00:00, 106680.66it/s]

validating...





In [33]:
# Misdiagnosis rate of the random-forest models for each question budget.
rf_metrics = metrics_dict["Random_forest"]
plt_y_rf = [rf_metrics[f"n_questions_{i}"]["error_rate"] for i in n_questions_list]

plt.plot(n_questions_list, plt_y_rf, marker='o')
plt.xlabel('number of questions')
plt.ylabel('misdiagnosis rate')
plt.title('Random Forest - Questionnaire Experiments')

# Write the figure to disk, then clear it so the next plot starts empty.
rf_plot_path = f'{base_path}\\output\\error_analysis_questionnaire\\random_forest_experiments.jpg'
plt.savefig(rf_plot_path, bbox_inches='tight')
plt.clf()

## Logistic Regression

In [21]:
# Replace model_dict with one pickled logistic-regression model per disease.
# NOTE: unpickling executes code stored in the file - only load trusted files.
model_dict = {}
for disease in disease_list:
    # Folder/file stem: drop every character that is not a letter, digit, space,
    # newline or dot, then turn spaces into underscores. The pattern is a raw
    # string because "\." is an invalid escape sequence in a normal literal.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_logreg_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [22]:
# Evaluate the per-disease models in model_dict for every question budget and
# store the results under metrics_dict["Logistic_regression"].
for n_questions in n_questions_list:
    # Artefacts stored by the questionnaire simulation for this budget.
    diagnosis_df_valid = questionnaire_dict[f"questionnaire_data_{n_questions}"]["diagnosis_df_valid"]
    hit_rate = questionnaire_dict[f"questionnaire_data_{n_questions}"]["hit_rate"]
    questionnaire_df = questionnaire_dict[f"questionnaire_data_{n_questions}"]["questionnaire_df"]
    print(f"using {n_questions} questions predicting diagnosis...")
    # One model per disease; column index 1 of predict_proba is kept as that
    # disease's score. Any score columns already present are overwritten.
    for disease in disease_list:
        logreg_model = model_dict[disease]
        diagnosis_df_valid[disease] = logreg_model.predict_proba(diagnosis_df_valid[feature_columns])[:,1]
    diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)
    diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)
    diagnosis_df_valid["hit_rate"] = hit_rate
    print("validating...")
    acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
    # value_counts() omits outcomes that never occur, so a run without a single
    # misdiagnosis has no False key: count that as zero errors instead of
    # raising KeyError.
    acc["error_rate"] = acc.get(False, 0)/len(diagnosis_df_valid)
    acc["mean_hit_rate"] = np.mean(hit_rate)
    hit_rate_misdiagnosed = diagnosis_df_valid[diagnosis_df_valid["is_matched"]==False]["hit_rate"]
    acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)
    metrics_dict["Logistic_regression"][f"n_questions_{n_questions}"] = acc

using 1 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:25<00:00, 262.14it/s]
100%|██████████| 132448/132448 [00:01<00:00, 68009.36it/s]


validating...
using 2 questions predicting diagnosis...


100%|██████████| 132448/132448 [07:27<00:00, 296.07it/s]
100%|██████████| 132448/132448 [00:01<00:00, 78091.07it/s]


validating...
using 3 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:40<00:00, 254.53it/s]
100%|██████████| 132448/132448 [00:02<00:00, 58248.41it/s]


validating...
using 4 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:57<00:00, 246.59it/s]
100%|██████████| 132448/132448 [00:02<00:00, 61586.15it/s]


validating...
using 5 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:38<00:00, 255.42it/s]
100%|██████████| 132448/132448 [00:02<00:00, 58883.25it/s]


validating...
using 6 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:30<00:00, 401.12it/s]
100%|██████████| 132448/132448 [00:01<00:00, 95244.00it/s] 


validating...
using 7 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:17<00:00, 417.28it/s]
100%|██████████| 132448/132448 [00:01<00:00, 77151.63it/s]


validating...
using 8 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:01<00:00, 366.10it/s]
100%|██████████| 132448/132448 [00:01<00:00, 93035.13it/s]


validating...
using 9 questions predicting diagnosis...


100%|██████████| 132448/132448 [07:19<00:00, 301.02it/s]
100%|██████████| 132448/132448 [00:02<00:00, 58084.46it/s]


validating...
using 10 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:44<00:00, 327.04it/s]
100%|██████████| 132448/132448 [00:01<00:00, 82616.68it/s]


validating...
using 11 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:49<00:00, 379.29it/s]
100%|██████████| 132448/132448 [00:01<00:00, 85467.99it/s]


validating...
using 12 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:39<00:00, 331.77it/s]
100%|██████████| 132448/132448 [00:02<00:00, 50407.69it/s]


validating...
using 13 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:29<00:00, 401.73it/s]
100%|██████████| 132448/132448 [00:00<00:00, 134228.69it/s]


validating...
using 14 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:47<00:00, 325.28it/s]
100%|██████████| 132448/132448 [00:01<00:00, 78916.88it/s]


validating...
using 15 questions predicting diagnosis...


100%|██████████| 132448/132448 [07:36<00:00, 289.89it/s]
100%|██████████| 132448/132448 [00:01<00:00, 86173.95it/s]


validating...
using 16 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:05<00:00, 362.70it/s]
100%|██████████| 132448/132448 [00:01<00:00, 81704.88it/s]


validating...
using 17 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:33<00:00, 336.20it/s]
100%|██████████| 132448/132448 [00:01<00:00, 86400.06it/s]


validating...
using 18 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:50<00:00, 378.06it/s]
100%|██████████| 132448/132448 [00:01<00:00, 80052.07it/s]


validating...
using 19 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:03<00:00, 436.21it/s]
100%|██████████| 132448/132448 [00:01<00:00, 121823.62it/s]


validating...
using 20 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:35<00:00, 257.01it/s]
100%|██████████| 132448/132448 [00:02<00:00, 59844.75it/s]


validating...
using 21 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:46<00:00, 326.09it/s]
100%|██████████| 132448/132448 [00:01<00:00, 70165.03it/s]


validating...
using 22 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:18<00:00, 265.67it/s]
100%|██████████| 132448/132448 [00:02<00:00, 62007.26it/s]


validating...
using 23 questions predicting diagnosis...


100%|██████████| 132448/132448 [07:06<00:00, 310.39it/s]
100%|██████████| 132448/132448 [00:01<00:00, 80497.57it/s]


validating...
using 24 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:53<00:00, 248.29it/s]
100%|██████████| 132448/132448 [00:02<00:00, 46890.04it/s]


validating...
using 25 questions predicting diagnosis...


100%|██████████| 132448/132448 [09:55<00:00, 222.55it/s]
100%|██████████| 132448/132448 [00:01<00:00, 93787.66it/s]


validating...
using 26 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:09<00:00, 270.73it/s]
100%|██████████| 132448/132448 [00:02<00:00, 49172.28it/s]


validating...
using 27 questions predicting diagnosis...


100%|██████████| 132448/132448 [10:20<00:00, 213.49it/s]
100%|██████████| 132448/132448 [00:02<00:00, 48601.63it/s]


validating...
using 28 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:41<00:00, 253.88it/s]
100%|██████████| 132448/132448 [00:02<00:00, 49791.59it/s]


validating...
using 29 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:18<00:00, 350.04it/s]
100%|██████████| 132448/132448 [00:01<00:00, 120851.14it/s]


validating...
using 30 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:34<00:00, 335.47it/s]
100%|██████████| 132448/132448 [00:03<00:00, 37029.58it/s]


validating...


In [32]:
# Misdiagnosis rate of the logistic-regression models for each question budget.
logreg_metrics = metrics_dict["Logistic_regression"]
plt_y_logreg = [logreg_metrics[f"n_questions_{i}"]["error_rate"] for i in n_questions_list]

plt.plot(n_questions_list, plt_y_logreg, marker='o')
plt.xlabel('number of questions')
plt.ylabel('misdiagnosis rate')
plt.title('Logistic Regression - Questionnaire Experiments')

# Write the figure to disk, then clear it so the next plot starts empty.
logreg_plot_path = f'{base_path}\\output\\error_analysis_questionnaire\\logistic_regression_experiments.jpg'
plt.savefig(logreg_plot_path, bbox_inches='tight')
plt.clf()

## Decision Tree

In [27]:
# Replace model_dict with one pickled decision-tree model per disease.
# NOTE: unpickling executes code stored in the file - only load trusted files.
model_dict = {}
for disease in disease_list:
    # Folder/file stem: drop every character that is not a letter, digit, space,
    # newline or dot, then turn spaces into underscores. The pattern is a raw
    # string because "\." is an invalid escape sequence in a normal literal.
    disease_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
    with open(f'{base_path}\\output\\diseases\\{disease_filename}\\{disease_filename}_dt_model.pkl', 'rb') as f:
        model_dict[disease] = pickle.load(f)

In [28]:
# Evaluate the per-disease models in model_dict for every question budget and
# store the results under metrics_dict["Decision_Tree"].
for n_questions in n_questions_list:
    # Artefacts stored by the questionnaire simulation for this budget.
    diagnosis_df_valid = questionnaire_dict[f"questionnaire_data_{n_questions}"]["diagnosis_df_valid"]
    hit_rate = questionnaire_dict[f"questionnaire_data_{n_questions}"]["hit_rate"]
    questionnaire_df = questionnaire_dict[f"questionnaire_data_{n_questions}"]["questionnaire_df"]
    print(f"using {n_questions} questions predicting diagnosis...")
    # One model per disease; column index 1 of predict_proba is kept as that
    # disease's score. Any score columns already present are overwritten.
    for disease in disease_list:
        dt_model = model_dict[disease]
        diagnosis_df_valid[disease] = dt_model.predict_proba(diagnosis_df_valid[feature_columns])[:,1]
    diagnosis_df_valid["predicted_diagnosis"] = diagnosis_df_valid[disease_list].progress_apply(lambda x: pred(x), axis=1)
    diagnosis_df_valid["is_matched"] = diagnosis_df_valid[["PATHOLOGY", "predicted_diagnosis"]].progress_apply(lambda x: validate(x), axis=1)
    diagnosis_df_valid["hit_rate"] = hit_rate
    print("validating...")
    acc = diagnosis_df_valid.is_matched.value_counts().to_dict()
    # value_counts() omits outcomes that never occur, so a run without a single
    # misdiagnosis has no False key: count that as zero errors instead of
    # raising KeyError.
    acc["error_rate"] = acc.get(False, 0)/len(diagnosis_df_valid)
    acc["mean_hit_rate"] = np.mean(hit_rate)
    hit_rate_misdiagnosed = diagnosis_df_valid[diagnosis_df_valid["is_matched"]==False]["hit_rate"]
    acc["mean_hit_rate_misdiagnosed"] = np.mean(hit_rate_misdiagnosed)
    metrics_dict["Decision_Tree"][f"n_questions_{n_questions}"] = acc

using 1 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:33<00:00, 258.04it/s]
100%|██████████| 132448/132448 [00:03<00:00, 42265.38it/s]


validating...
using 2 questions predicting diagnosis...


100%|██████████| 132448/132448 [09:24<00:00, 234.83it/s]
100%|██████████| 132448/132448 [00:01<00:00, 90102.49it/s]


validating...
using 3 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:54<00:00, 449.40it/s]
100%|██████████| 132448/132448 [00:01<00:00, 119698.03it/s]


validating...
using 4 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:46<00:00, 585.38it/s]
100%|██████████| 132448/132448 [00:01<00:00, 116444.34it/s]


validating...
using 5 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:41<00:00, 469.91it/s]
100%|██████████| 132448/132448 [00:01<00:00, 92537.34it/s] 


validating...
using 6 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:31<00:00, 398.94it/s]
100%|██████████| 132448/132448 [00:03<00:00, 42781.06it/s]


validating...
using 7 questions predicting diagnosis...


100%|██████████| 132448/132448 [11:02<00:00, 199.84it/s]
100%|██████████| 132448/132448 [00:01<00:00, 127298.43it/s]


validating...
using 8 questions predicting diagnosis...


100%|██████████| 132448/132448 [09:20<00:00, 236.39it/s]
100%|██████████| 132448/132448 [00:02<00:00, 47718.05it/s]


validating...
using 9 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:56<00:00, 447.12it/s]
100%|██████████| 132448/132448 [00:01<00:00, 90195.93it/s]


validating...
using 10 questions predicting diagnosis...


100%|██████████| 132448/132448 [07:30<00:00, 293.69it/s]
100%|██████████| 132448/132448 [00:03<00:00, 41861.85it/s]


validating...
using 11 questions predicting diagnosis...


100%|██████████| 132448/132448 [10:37<00:00, 207.84it/s]
100%|██████████| 132448/132448 [00:03<00:00, 39751.57it/s]


validating...
using 12 questions predicting diagnosis...


100%|██████████| 132448/132448 [08:17<00:00, 266.00it/s]
100%|██████████| 132448/132448 [00:01<00:00, 115254.03it/s]


validating...
using 13 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:45<00:00, 588.51it/s]
100%|██████████| 132448/132448 [00:01<00:00, 118857.36it/s]


validating...
using 14 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:49<00:00, 578.14it/s]
100%|██████████| 132448/132448 [00:01<00:00, 85127.36it/s] 


validating...
using 15 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:57<00:00, 445.94it/s]
100%|██████████| 132448/132448 [00:01<00:00, 79301.41it/s]


validating...
using 16 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:46<00:00, 462.66it/s]
100%|██████████| 132448/132448 [00:01<00:00, 118794.20it/s]


validating...
using 17 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:19<00:00, 510.30it/s]
100%|██████████| 132448/132448 [00:01<00:00, 115129.20it/s]


validating...
using 18 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:47<00:00, 381.05it/s]
100%|██████████| 132448/132448 [00:01<00:00, 104488.12it/s]


validating...
using 19 questions predicting diagnosis...


100%|██████████| 132448/132448 [05:15<00:00, 419.70it/s]
100%|██████████| 132448/132448 [00:02<00:00, 56635.54it/s]


validating...
using 20 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:27<00:00, 495.31it/s]
100%|██████████| 132448/132448 [00:01<00:00, 119571.14it/s]


validating...
using 21 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:47<00:00, 582.31it/s]
100%|██████████| 132448/132448 [00:01<00:00, 96489.56it/s] 


validating...
using 22 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:05<00:00, 539.63it/s]
100%|██████████| 132448/132448 [00:01<00:00, 125332.53it/s]


validating...
using 23 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:46<00:00, 585.90it/s]
100%|██████████| 132448/132448 [00:01<00:00, 121000.70it/s]


validating...
using 24 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:41<00:00, 598.11it/s]
100%|██████████| 132448/132448 [00:01<00:00, 115692.00it/s]


validating...
using 25 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:42<00:00, 595.39it/s]
100%|██████████| 132448/132448 [00:01<00:00, 123917.05it/s]


validating...
using 26 questions predicting diagnosis...


100%|██████████| 132448/132448 [03:52<00:00, 568.87it/s]
100%|██████████| 132448/132448 [00:01<00:00, 111002.96it/s]


validating...
using 27 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:32<00:00, 486.53it/s]
100%|██████████| 132448/132448 [00:01<00:00, 92061.94it/s] 


validating...
using 28 questions predicting diagnosis...


100%|██████████| 132448/132448 [06:14<00:00, 353.41it/s]
100%|██████████| 132448/132448 [00:01<00:00, 76007.45it/s]


validating...
using 29 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:42<00:00, 468.12it/s]
100%|██████████| 132448/132448 [00:01<00:00, 120338.58it/s]


validating...
using 30 questions predicting diagnosis...


100%|██████████| 132448/132448 [04:16<00:00, 516.18it/s]
100%|██████████| 132448/132448 [00:01<00:00, 92367.01it/s] 

validating...





In [30]:
# Misdiagnosis rate of the decision-tree models for each question budget.
dt_metrics = metrics_dict["Decision_Tree"]
plt_y_dt = [dt_metrics[f"n_questions_{i}"]["error_rate"] for i in n_questions_list]

plt.plot(n_questions_list, plt_y_dt, marker='o')
plt.xlabel('number of questions')
plt.ylabel('misdiagnosis rate')
plt.title('Decision Tree - Questionnaire Experiments')

# Write the figure to disk, then clear it so the next plot starts empty.
dt_plot_path = f'{base_path}\\output\\error_analysis_questionnaire\\decision_tree_experiments.jpg'
plt.savefig(dt_plot_path, bbox_inches='tight')
plt.clf()

<Figure size 640x480 with 0 Axes>

In [37]:
# Overlay the three misdiagnosis-rate curves on one figure with a shared 0-1 axis.
model_curves = (
    (plt_y_rf, "Random Forest"),
    (plt_y_logreg, "Logistic Regression"),
    (plt_y_dt, "Decision Tree"),
)
for curve_values, curve_label in model_curves:
    plt.plot(n_questions_list, curve_values, marker='o', label = curve_label)

plt.xlabel('number of questions')
plt.ylabel('misdiagnosis rate')
plt.title('Questionnaire Experiments')
plt.ylim(0, 1)
plt.legend()

# Write the figure to disk, then clear it.
combined_plot_path = f'{base_path}\\output\\error_analysis_questionnaire\\questionnaire_experiments.jpg'
plt.savefig(combined_plot_path, bbox_inches='tight')
plt.clf()

<Figure size 640x480 with 0 Axes>

In [35]:
# Persist the metrics collected for every model family and question budget.
metrics_json_path = f"{base_path}\\output\\error_analysis_questionnaire\\validation_metric_questionnaire_experiments.json"
with open(metrics_json_path, "w") as metrics_file:
    json.dump(metrics_dict, metrics_file, indent=True)

In [None]:
# The full notebook ran for approximately 920 minutes.
# 49 x 3 x 30 = 4410 model runs (presumably 49 diseases x 3 model types x 30 question budgets).