In [1]:
import pandas as pd
import evaluate
import seaborn as sns

In [2]:
# Directory holding the processed CSV exports, relative to this notebook.
data_root = '../data/processed/'
# One prediction file per target; each contributes a *_pred_label and a
# *_pred_score column to the merged frame.
files = ['preds_qual.csv','preds_evidence.csv','preds_suggestion.csv','preds_connection.csv']

# Prediction files use their first CSV column as the index.
pred_frames = [pd.read_csv(data_root + name, index_col=0) for name in files]
# The base file is read with the default positional index, so its saved index
# shows up as the 'Unnamed: 0' column.
# NOTE(review): the axis=1 concat below aligns rows on index, so it assumes
# um.csv's row order matches the prediction files' index -- confirm.
base_frame = pd.read_csv(data_root + 'um.csv')

dfs = [base_frame] + pred_frames
# Side-by-side merge, then keep only rows that have a clerkship value.
df = pd.concat(dfs, axis=1).dropna(subset=['clerkship'])

In [3]:
# Sanity check: list the columns of the merged frame (base columns plus the
# per-target *_pred_label / *_pred_score columns).
df.columns

Index(['Unnamed: 0', 'assessor', 'assessor_role', 'learner', 'date_assigned',
       'date_completed', 'approx_date', 'qual', 'evidence', 'suggestion',
       'connection', 'text', 'clerkship', 'from_file', 'qual_pred_label',
       'qual_pred_score', 'evidence_pred_label', 'evidence_pred_score',
       'suggestion_pred_label', 'suggestion_pred_score',
       'connection_pred_label', 'connection_pred_score'],
      dtype='object')

In [4]:
# Averaging mode passed to f1 / precision / recall.
avg_type = 'macro'

# (metric id, extra keyword arguments for .compute()), listed in the order the
# metrics are loaded and later evaluated.
_metric_specs = [
    ('accuracy', {}),
    ('hyperml/balanced_accuracy', {}),
    ('f1', {'average': avg_type}),
    ('precision', {'average': avg_type}),
    ('recall', {'average': avg_type}),
    ('matthews_correlation', {}),
    ('mae', {}),
    ('confusion_matrix', {}),
]

# Maps metric id -> (loaded metric object, keyword arguments for .compute()).
metrics_with_args = {
    metric_id: (evaluate.load(metric_id), compute_kwargs)
    for metric_id, compute_kwargs in _metric_specs
}

In [5]:
# Score every target's predictions over all labelled rows (clerkships pooled).
# Resulting layout: overall_results[target] -> {metric name: value}.
overall_results = {}
target_cols = ['qual','evidence','suggestion','connection']
# Targets whose model is trained on inverted labels: their reference labels are
# flipped (0 <-> 1) before being compared with the predictions.
inverted_targets = ('suggestion', 'connection')
for target_col in target_cols:
    pred_col = target_col + '_pred_label'
    # Only rows carrying both a reference label and a prediction can be scored.
    df_t = df.dropna(subset=[target_col, pred_col])
    references = df_t[target_col]
    if target_col in inverted_targets:
        # Flip on a derived Series instead of assigning back into df_t: df_t is
        # a filtered result of df, and writing into it through .loc can raise
        # SettingWithCopyWarning (depending on the pandas version) while
        # needlessly mutating the slice.
        references = references * -1 + 1
    # Build the plain-Python lists once; they are identical for every metric.
    predictions = df_t[pred_col].tolist()
    references = references.tolist()
    results = {}
    for metric, args in metrics_with_args.values():
        # Each compute() returns a dict of named values; merge them into one row.
        results.update(metric.compute(predictions=predictions, references=references, **args))

    overall_results[target_col] = results

In [12]:
# Number of rows in the merged frame after dropping rows without a clerkship.
len(df)

4030

In [6]:
# Pooled results as a table: one row per target, one column per metric.
# NOTE: a later cell rebinds overall_results to the per-clerkship results, so
# this display is only valid when run before that cell.
pd.DataFrame(overall_results).T

Unnamed: 0,accuracy,balanced_accuracy,f1,precision,recall,matthews_correlation,mae,confusion_matrix
qual,0.39354,0.35963,0.337663,0.374225,0.35963,0.210294,1.029457,"[[56, 21, 13, 4, 0, 3], [14, 60, 129, 55, 28, ..."
evidence,0.565725,0.460638,0.436117,0.483192,0.460638,0.10088,0.509214,"[[10, 3, 0, 3], [0, 29, 5, 74], [2, 36, 19, 50..."
suggestion,0.855562,0.83955,0.830276,0.822914,0.83955,0.662255,0.144438,"[[1010, 140], [95, 382]]"
connection,0.663793,0.672595,0.653942,0.701646,0.672595,0.373112,0.336207,"[[676, 100], [446, 402]]"


In [14]:
# Repeat the evaluation separately for every clerkship.
# NOTE: this rebinds overall_results; from here on its layout is
# overall_results[target][clerkship] -> {metric name: value}.
overall_results = {}
target_cols = ['qual','evidence','suggestion','connection']
# Targets whose model is trained on inverted labels: their reference labels are
# flipped (0 <-> 1) before being compared with the predictions.
inverted_targets = ('suggestion', 'connection')
for target_col in target_cols:
    print(target_col)
    pred_col = target_col + '_pred_label'
    clerkship_results = {}
    # Only rows carrying both a reference label and a prediction can be scored.
    df_f = df.dropna(subset=[target_col, pred_col])
    # One pass over the frame; sort=False keeps clerkships in order of first
    # appearance, matching iteration over .unique().
    for clerkship, df_t in df_f.groupby('clerkship', sort=False):
        print(clerkship, df_t.shape)
        references = df_t[target_col]
        if target_col in inverted_targets:
            # Flip on a derived Series instead of assigning back into the
            # per-clerkship slice: writing into a filtered subset through .loc
            # can raise SettingWithCopyWarning (depending on the pandas
            # version) while needlessly mutating the slice.
            references = references * -1 + 1
        # Build the plain-Python lists once; they are identical for every metric.
        predictions = df_t[pred_col].tolist()
        references = references.tolist()
        results = {}
        for metric, args in metrics_with_args.values():
            # Each compute() returns a dict of named values; merge them together.
            results.update(metric.compute(predictions=predictions, references=references, **args))
        clerkship_results[clerkship] = results
    overall_results[target_col] = clerkship_results

qual
Emergency (991, 22)
Family Med (214, 22)
Internal Medicine (1017, 22)
Neuro (726, 22)
Ob (159, 22)
Peds (75, 22)
Surgery (688, 22)
evidence
Family Med (214, 22)
Neuro (726, 22)
Surgery (688, 22)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


suggestion
Family Med (213, 22)
Neuro (726, 22)
Surgery (688, 22)
connection
Family Med (212, 22)
Neuro (726, 22)
Surgery (686, 22)


In [8]:
# Per-clerkship results for the 'qual' target: columns are clerkships, rows are metrics.
pd.DataFrame.from_dict(overall_results['qual'])

Unnamed: 0,Emergency,Family Med,Internal Medicine,Neuro,Ob,Peds,Surgery
accuracy,0.618567,0.341121,0.214356,0.231405,0.427673,0.466667,0.505814
balanced_accuracy,0.552601,0.275647,0.295881,0.361282,0.216241,0.660235,0.547903
f1,0.544101,0.22005,0.203773,0.192366,0.218092,0.564296,0.50443
precision,0.636961,0.207679,0.236521,0.37087,0.416477,0.567723,0.5205
recall,0.552601,0.275647,0.295881,0.361282,0.216241,0.660235,0.547903
matthews_correlation,0.448136,0.102392,0.094448,0.060954,0.15038,0.361798,0.341553
mae,0.54894,1.102804,1.622419,1.139118,0.91195,0.653333,0.774709
confusion_matrix,"[[3, 1, 0, 0, 0, 0], [0, 31, 47, 4, 0, 4], [0,...","[[1, 0, 1, 0, 0, 0], [0, 0, 2, 6, 0, 4], [0, 0...","[[41, 19, 12, 4, 0, 3], [14, 7, 61, 30, 24, 46...","[[1, 0, 0, 0, 0, 0], [0, 1, 8, 7, 2, 3], [0, 0...","[[1, 7, 6, 0, 1], [0, 1, 5, 2, 1], [0, 4, 4, 2...","[[2, 0, 0, 0, 0, 0], [0, 5, 0, 0, 0, 0], [0, 2...","[[8, 1, 0, 0, 0, 0], [0, 15, 4, 2, 2, 0], [0, ..."


In [9]:
# Per-clerkship results for the 'evidence' target: columns are clerkships, rows are metrics.
pd.DataFrame.from_dict(overall_results['evidence'])

Unnamed: 0,Family Med,Neuro,Surgery
accuracy,0.546729,0.399449,0.747093
balanced_accuracy,0.310363,0.521911,0.614445
f1,0.260163,0.306141,0.576735
precision,0.262441,0.474301,0.608536
recall,0.310363,0.521911,0.614445
matthews_correlation,0.0471,0.094813,0.351495
mae,0.602804,0.661157,0.319767
confusion_matrix,"[[1, 0, 0, 3], [0, 0, 1, 24], [0, 0, 0, 68], [...","[[1, 0, 0, 0], [0, 4, 1, 42], [2, 2, 1, 389], ...","[[8, 3, 0, 0], [0, 25, 3, 8], [0, 34, 18, 47],..."


In [10]:
# Per-clerkship results for the 'suggestion' target: columns are clerkships, rows are metrics.
# Reference labels for this target were inverted before scoring.
pd.DataFrame.from_dict(overall_results['suggestion'])

Unnamed: 0,Family Med,Neuro,Surgery
accuracy,0.840376,0.88843,0.825581
balanced_accuracy,0.790962,0.750509,0.831984
f1,0.810141,0.741928,0.825575
precision,0.861861,0.734215,0.832687
recall,0.790962,0.750509,0.831984
matthews_correlation,0.648962,0.48445,0.664671
mae,0.159624,0.11157,0.174419
confusion_matrix,"[[132, 5], [29, 47]]","[[596, 44], [37, 49]]","[[282, 91], [29, 286]]"


In [11]:
# Per-clerkship results for the 'connection' target: columns are clerkships, rows are metrics.
# Reference labels for this target were inverted before scoring.
pd.DataFrame.from_dict(overall_results['connection'])

Unnamed: 0,Family Med,Neuro,Surgery
accuracy,0.674528,0.515152,0.817784
balanced_accuracy,0.698386,0.567587,0.822267
f1,0.657704,0.446306,0.817458
precision,0.77321,0.713005,0.827529
recall,0.698386,0.567587,0.822267
matthews_correlation,0.465623,0.23997,0.649775
mae,0.325472,0.484848,0.182216
confusion_matrix,"[[95, 2], [67, 48]]","[[315, 3], [349, 59]]","[[266, 95], [30, 295]]"
