In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
from collections import Counter
import ast
import networkx as nx
import re
from constants import base_path, model_list, pathology_scope



In [2]:
with open(f"{base_path}\\input\\release_conditions.json") as f:
  disease_dict = json.load(f)
if pathology_scope:
  disease_list =  pathology_scope + ["NA"]
else:
  disease_list = list(disease_dict.keys()) + ["NA"]

In [3]:
def eval(model_name):
    print(f"Evaluating {model_name}...")
    error_df = pd.read_csv(f"{base_path}\\output\\error_analysis\\{model_name}\\validation_df_all_patients.csv").fillna("NA")
    error_df = error_df[error_df["is_matched"]==False]
    error_df = error_df[["PATHOLOGY", "predicted_diagnosis"]]
    error_df["predicted_diagnosis"] = [i if ast.literal_eval(i)!=[] else str(['NA']) for i in error_df["predicted_diagnosis"]]
    error_df["PATHOLOGY"].value_counts().sort_values().plot.barh(figsize=(6, 8))
    plt.title(f"Prediction Error Frequency\n({model_name})")
    plt.xlabel("Count")
    plt.savefig(f'{base_path}\\output\\error_analysis\\{model_name}\\pred_miss_freq.jpg', bbox_inches='tight')
    plt.clf()
    pred_miss_dict = {}
    for disease in disease_list:
        miss_list = []
        for idx, row in error_df[error_df["PATHOLOGY"]==disease].iterrows():
            miss_list.extend(ast.literal_eval(row["predicted_diagnosis"]))
        miss_list = [i for i in miss_list if i!=disease]
        if miss_list:
            count = dict(Counter(miss_list))
            pred_miss_dict[disease]={i: round(count[i] / len(miss_list), 3) for i in count}
            # pred_miss_dict[disease]=dict(Counter(miss_list))
    pred_miss_df = pd.DataFrame({"disease": pred_miss_dict.keys()})
    for disease in disease_list:
        pred_miss_df[disease]=[pred_miss_dict[i].get(disease, 0) for i in pred_miss_dict.keys()]
    pred_miss_df.set_index('disease', inplace=True)
    pred_miss_graph = pred_miss_df.stack()
    pred_miss_graph = pred_miss_graph.rename_axis(('Actual', 'Prediction Miss')).reset_index(name='weight')
    pred_miss_graph = pred_miss_graph[pred_miss_graph["weight"]>0]
    pred_miss_graph = pred_miss_graph.sort_values(['Actual','weight'], ascending=False)
    pred_miss_graph.to_csv(f"{base_path}\\output\\error_analysis\\{model_name}\\pred_miss_weigths.csv", index=False)
    G = nx.from_pandas_edgelist(pred_miss_graph, 'Actual', 'Prediction Miss', edge_attr='weight', create_using=nx.DiGraph())
    pos = nx.spring_layout(G, seed=0)
    fig = plt.figure(figsize=(15, 8))
    weights = [G[u][v]['weight'] for u,v in G.edges()]
    nx.draw(G, pos, edge_color=weights, edge_cmap=plt.cm.Blues, with_labels=True, arrowsize=20)
    plt.title("Prediction Miss", fontsize=20)
    plt.tight_layout()
    plt.savefig(f'{base_path}\\output\\error_analysis\\{model_name}\\pred_miss_network.jpg', bbox_inches='tight')
    plt.clf()
    for disease in pred_miss_dict:
        G = nx.from_pandas_edgelist(pred_miss_graph[(pred_miss_graph["Actual"]==disease) | (pred_miss_graph["Prediction Miss"]==disease) ], 'Actual', 'Prediction Miss', edge_attr='weight', create_using=nx.DiGraph())
        pos = nx.spring_layout(G, seed=0)
        fig = plt.figure(figsize=(15, 8))
        weights = [G[u][v]['weight'] for u,v in G.edges()]
        nx.draw(G, pos, edge_color=weights, edge_cmap=plt.cm.Blues, with_labels=True, arrowsize=20)
        plt.title(f"Prediction Miss - {disease}\n({model_name})", fontsize=20)
        plt.tight_layout()
        img_filename = re.sub('[^a-zA-Z0-9 \n\.]', '', disease).replace(" ", "_")
        plt.savefig(f'{base_path}\\output\\error_analysis\\{model_name}\\pred_miss_{img_filename}.jpg', bbox_inches='tight')
        plt.clf()


## Tree-based models

In [4]:
for model_name in model_list["tree-based"]:
    eval(model_name)

Evaluating decision_tree...


  plt.tight_layout()
  plt.tight_layout()


Evaluating random_forest...


  plt.tight_layout()
  plt.tight_layout()


Evaluating gradient_boost...


  plt.tight_layout()
  plt.tight_layout()


<Figure size 600x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 600x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 600x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

## Logistic Regression

In [5]:
eval("logistic_regression")

Evaluating logistic_regression...


  plt.tight_layout()
  plt.tight_layout()


<Figure size 600x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

<Figure size 1500x800 with 0 Axes>