In [8]:
import pandas as pd
from constants import base_path, model_list
import matplotlib.pyplot as plt
import json
import ast
from collections import Counter

In [9]:
# Validation-split patient records.
# NOTE(review): this frame is not referenced by any of the cells visible here.
validate_patients_path = f"{base_path}\\input\\release_validate_patients"
diagnosis_df_valid = pd.read_csv(validate_patients_path)

In [11]:
# Load the evidence catalogue and build code <-> English-question lookups.
# The file is decoded explicitly as UTF-8 (the JSON interchange encoding)
# instead of relying on the platform's default text encoding.
with open(f"{base_path}\\input\\release_evidences.json", encoding="utf-8") as f:
  evidences = json.load(f)

# Keep only the evidences whose "possible-values" entry is empty.
# NOTE(review): the original comment read "only binary symptoms and no
# antecedents", but no antecedent check is applied here -- confirm the intent.
evidences_dict = {
  code: spec["question_en"]
  for code, spec in evidences.items()
  if not spec["possible-values"]
}
# Evidence codes that passed the filter, in file order.
evidences_list = list(evidences_dict)
# Reverse lookup; if two codes share the same English question, the later one wins.
evidences_en_to_code = {question: code for code, question in evidences_dict.items()}
# Alias (same dict object) used by the analysis functions below.
evidences_code_to_en = evidences_dict
evidences_list_en = list(evidences_en_to_code.keys())

In [12]:
# Load the condition catalogue. Its top-level keys are later compared against
# the PATHOLOGY column, so the file is decoded explicitly as UTF-8 (the JSON
# interchange encoding) rather than with the platform's default encoding, which
# could garble non-ASCII condition names and make them never match.
with open(f"{base_path}\\input\\release_conditions.json", encoding="utf-8") as f:
  disease_dict = json.load(f)
# Disease identifiers, in file order.
disease_list = list(disease_dict)

In [13]:
def get_misses(degraded_df, *, diseases=None, code_to_en=None, top_n=10):
    """Summarise, per disease, the most frequently missed and initial evidences.

    Parameters
    ----------
    degraded_df : pandas.DataFrame
        Must provide the columns ``PATHOLOGY``, ``INITIAL_EVIDENCE`` and
        ``missed_evidence``. Every ``missed_evidence`` cell is parsed with
        ``ast.literal_eval`` and is expected to be the textual form of a
        (possibly empty) list of evidence codes.
    diseases : iterable of str, optional
        Diseases to report on. Defaults to the module-level ``disease_list``.
    code_to_en : mapping, optional
        Evidence code -> English question text. Defaults to the module-level
        ``evidences_code_to_en``.
    top_n : int or None, optional
        How many of the most common entries to keep per disease (default 10).
        ``None`` keeps all of them.

    Returns
    -------
    dict
        ``{disease: {"top_missed_evidences": {question: count},
        "initial_evidences_count": {question: count}}}``. A disease with no
        rows in ``degraded_df`` maps to two empty dicts.

    Raises
    ------
    KeyError
        If a counted evidence code is absent from ``code_to_en``.
    ValueError, SyntaxError
        If a ``missed_evidence`` cell is not a valid Python literal.
    """
    # Fall back to the notebook-level lookups so existing one-argument calls
    # behave exactly as before.
    if diseases is None:
        diseases = disease_list
    if code_to_en is None:
        code_to_en = evidences_code_to_en

    missed_evidences_per_disease = {}
    for disease in diseases:
        disease_df = degraded_df[degraded_df["PATHOLOGY"] == disease]

        # Count the most missed evidences for this disease.
        missed_codes = []
        for raw_value in disease_df["missed_evidence"]:
            parsed = ast.literal_eval(raw_value)
            if parsed:
                missed_codes.extend(parsed)
        top_missed = {
            code_to_en[code]: count
            for code, count in Counter(missed_codes).most_common(top_n)
        }

        # Hypothesis: the initial evidence is not very specific, hence the
        # degradation in predictions -- so tally the initial evidences as well.
        initial_codes = list(disease_df["INITIAL_EVIDENCE"])
        top_initial = {
            code_to_en[code]: count
            for code, count in Counter(initial_codes).most_common(top_n)
        }

        missed_evidences_per_disease[disease] = {
            "top_missed_evidences": top_missed,
            "initial_evidences_count": top_initial,
        }
    return missed_evidences_per_disease

In [14]:
# Questionnaire results for the validation patients. This frame is joined by
# row index onto the degraded predictions in analyze_outputs; the saved
# "Unnamed: 0" column is discarded.
questionnaire_csv_path = f"{base_path}\\output\\error_analysis_questionnaire\\questionnaire_df.csv"
valid_df_questionnaire = (
    pd.read_csv(questionnaire_csv_path, index_col=False)
    .drop(columns=["Unnamed: 0"])
)

In [22]:
def _save_barh_chart(counts, title, path):
    """Draw ``counts`` as a horizontal bar chart on a fresh 6x8 figure, save it to ``path`` and close it."""
    fig, ax = plt.subplots(figsize=(6, 8))
    try:
        counts.plot.barh(ax=ax)
        ax.set_title(title)
        fig.savefig(path, bbox_inches='tight')
    finally:
        # Always release the figure so nothing leaks into later plots or into
        # the cell output.
        plt.close(fig)


def analyze_outputs(model_name):
    """Analyse predictions that were matched originally but not with the questionnaire.

    Reads the two per-model validation CSVs (original and questionnaire runs),
    joins them by row index, and keeps the rows where ``is_matched`` is True
    while ``is_matched_questionnaire`` is False ("degraded" predictions). For
    those rows it writes three files into the model's
    ``error_analysis_questionnaire`` folder:

    * ``pathology_freq_degraded.jpg`` -- degraded-row count per pathology,
    * ``missed_evidences.json`` -- the result of ``get_misses``,
    * ``initial_evidence_freq_degraded.jpg`` -- the ten most common initial
      evidences among the degraded rows.

    Parameters
    ----------
    model_name : str
        Name of the model sub-folder under both output directories.

    Returns
    -------
    None
    """
    print(f"Analyzing {model_name}...")
    valid_df_pred = pd.read_csv(f"{base_path}\\output\\error_analysis\\{model_name}\\validation_df_all_patients.csv", index_col=False).drop(["Unnamed: 0"], axis=1)
    valid_df_pred = valid_df_pred[["PATHOLOGY", "predicted_diagnosis", "is_matched"]]
    valid_df_pred_questionnaire = pd.read_csv(f"{base_path}\\output\\error_analysis_questionnaire\\{model_name}\\validation_df_all_patients_questionnaire.csv", index_col=False).drop(["Unnamed: 0"], axis=1)
    valid_df_pred_questionnaire = valid_df_pred_questionnaire[["predicted_diagnosis", "is_matched"]]

    # Index-aligned join: assumes both CSVs list the patients in the same row
    # order -- TODO confirm.
    match_df = valid_df_pred.join(valid_df_pred_questionnaire.add_suffix("_questionnaire"))
    degraded_mask = match_df["is_matched"].eq(True) & match_df["is_matched_questionnaire"].eq(False)
    degraded_df = match_df[degraded_mask]
    save_path = f"{base_path}\\output\\error_analysis_questionnaire\\{model_name}"

    # Each chart gets its own figure, closed after saving. The original cleared
    # one shared pyplot figure with plt.clf(), which left an empty 6x8 figure
    # open afterwards and made the second chart's size depend on that leftover.
    _save_barh_chart(
        degraded_df["PATHOLOGY"].value_counts().sort_values(),
        f"Pathology count - Degraded Predictions\n{model_name}",
        f'{save_path}\\pathology_freq_degraded.jpg',
    )

    # Attach the questionnaire columns (row-index join) needed by get_misses.
    degraded_df = degraded_df.join(valid_df_questionnaire)
    missed_evidences_per_disease = get_misses(degraded_df)
    with open(f"{save_path}\\missed_evidences.json", "w", encoding="utf-8") as outfile:
        # indent=1 is what the former indent=True evaluated to (bool is an int).
        json.dump(missed_evidences_per_disease, outfile, indent=1)

    # Most common initial evidences for the degraded rows. 6x8 is kept because
    # that is the size this chart previously inherited from the shared figure.
    _save_barh_chart(
        degraded_df.INITIAL_EVIDENCE.map(evidences_code_to_en).value_counts().sort_values().tail(10),
        f"Initial Evidence Count - Degraded Predictions\n{model_name}",
        f'{save_path}\\initial_evidence_freq_degraded.jpg',
    )

## Tree-based models

In [23]:
# Run the degraded-prediction analysis for every model registered under the
# "tree-based" key of model_list.
tree_based_models = model_list["tree-based"]
for model_name in tree_based_models:
    analyze_outputs(model_name)

Analyzing decision_tree...
Analyzing random_forest...
Analyzing gradient_boost...


<Figure size 600x800 with 0 Axes>

## Logistic Regression

In [24]:
# Same degraded-prediction analysis, run for the logistic regression outputs.
analyze_outputs("logistic_regression")

Analyzing logistic_regression...


<Figure size 600x800 with 0 Axes>