## Debug MIRQI

In [None]:
import pandas as pd
import re

In [None]:
%run -n ../eval_report_generation_mirqi.py

### Load MIRQI output

In [None]:
# Read the MIRQI output CSV and replace missing cells with empty strings,
# then preview the first rows.
df = pd.read_csv('~/software/MIRQI/testing2.csv').fillna('')
df.head()

In [None]:
# Convert the raw 'attributes-gt' / 'attributes-gen' columns with
# _attributes_to_list. That helper is not defined in this notebook;
# presumably it comes from the %run of eval_report_generation_mirqi.py -- confirm.
attributes_gt = _attributes_to_list(df['attributes-gt'])
attributes_gen = _attributes_to_list(df['attributes-gen'])
# Display both lengths side by side.
len(attributes_gt), len(attributes_gen)

In [None]:
# Display the raw 'attributes-gt' column as loaded from the CSV.
df['attributes-gt']

In [None]:
%run -n ../eval_report_generation_mirqi.py

In [None]:
# Score the generated attributes against the ground truth with MIRQI_v2.
# MIRQI_v2 is not defined in this notebook; presumably it is loaded by the
# %run of eval_report_generation_mirqi.py -- confirm.
scores = MIRQI_v2(attributes_gt, attributes_gen)

In [None]:
# Display one entry of the MIRQI_v2 result.
# NOTE(review): the key name suggests attribute-level precision -- confirm in MIRQI_v2.
scores['MIRQI-v2-attr-p']

In [None]:
# Pick a single report by position and show its ground-truth and generated
# entries side by side.
idx = 2
attributes_gt[idx], attributes_gen[idx]

## MIRQI Examples

In [None]:
%run -n ../eval_report_generation_mirqi.py
# %run -n ~/software/MIRQI/evaluate.py

### Original MIRQI definition

In [None]:
def MIRQI(gt_list, cand_list, pos_weight=0.8, attribute_weight=0.3, verbose=False):
    """Score candidate reports against ground truth using the original MIRQI rules.

    Reports are compared pairwise (``zip(gt_list, cand_list)``; extra reports
    in the longer list are ignored). Each report is a list of entities, and an
    entity is read by position:

    * ``entity[1]`` -- the label used to match candidate and GT entities,
    * ``entity[2]`` -- ``'NEGATIVE'`` or anything else (treated as a
      positive/uncertain mention),
    * ``entity[3]`` -- ``'/'``-separated attribute string (``''`` for none).

    ``entity[0]`` is never read.

    Matching behaviour (kept exactly as in the original definition, since this
    function serves as the baseline in this notebook):

    * every candidate entity is compared with the *first* GT entity that has
      the same label; GT entities are never marked as consumed, so one GT
      entity can be matched by several candidates;
    * candidates whose label is absent from the GT contribute to no counter;
    * a GT attribute counts as hit when ``attribute in cand_entity[3]``
      (a substring test when the candidate attributes are a string).

    Args:
        gt_list: list of ground-truth reports (each a list of entities).
        cand_list: list of candidate reports (each a list of entities).
        pos_weight: weight of the positive-mention score when it is blended
            with the negative-mention score; the latter gets
            ``1 - pos_weight``. Blending only happens when at least one true
            negative was found.
        attribute_weight: share of a true-positive hit that depends on the
            attributes; the label match itself is worth
            ``1 - attribute_weight``.
        verbose: accepted for signature compatibility; currently ignored.

    Returns:
        dict with keys ``'MIRQI-r'`` (recall), ``'MIRQI-p'`` (precision) and
        ``'MIRQI-f'`` (harmonic mean of the two, ``0.0`` when both are zero).
        Each value is a list with one float per compared report pair.
    """
    # Added to every denominator so the ratios stay defined when counts are 0.
    eps = 0.000001

    MIRQI_r = []
    MIRQI_p = []
    MIRQI_f = []

    for gt_entities, cand_entities in zip(gt_list, cand_list):
        tp = 0.0
        fp = 0.0
        tn = 0.0
        fn = 0.0

        pos_count_in_gt = sum(1 for entity in gt_entities if entity[2] != 'NEGATIVE')
        pos_count_in_cand = 0

        for cand_entity in cand_entities:
            cand_is_negative = cand_entity[2] == 'NEGATIVE'
            if not cand_is_negative:
                pos_count_in_cand += 1

            # Only the first GT entity carrying the same label is considered.
            gt_entity = next(
                (entity for entity in gt_entities if entity[1] == cand_entity[1]),
                None,
            )
            if gt_entity is None:
                continue
            gt_is_negative = gt_entity[2] == 'NEGATIVE'

            if cand_is_negative:
                if gt_is_negative:
                    tn += 1     # true negative hit
                else:
                    fn += 1     # false negative hit
            elif gt_is_negative:
                fp += 1         # false positive hit
            else:
                # True positive, label part. Written as (tp + 1.0) - weight to
                # keep the floating-point evaluation order of the original.
                tp = tp + 1.0 - attribute_weight
                if gt_entity[3] != '':
                    # True positive, attribute part: fraction of GT attributes
                    # found in the candidate's attributes.
                    gt_attributes = gt_entity[3].split('/')
                    hit_count = sum(
                        1 for attribute in gt_attributes
                        if attribute in cand_entity[3]
                    )
                    tp = tp + hit_count / len(gt_attributes) * attribute_weight

        # Score for positive/uncertain mentions.
        if pos_count_in_gt == 0 and pos_count_in_cand == 0:
            score_r = 1.0
            score_p = 1.0
        elif pos_count_in_gt == 0 or pos_count_in_cand == 0:
            # Only one side has positive mentions.
            score_r = 0.0
            score_p = 0.0
        else:
            score_r = tp / (tp + fn + eps)
            score_p = tp / (tp + fp + eps)

        # Blend in the score for negative mentions, only when some were hit.
        if tn != 0:
            score_r = score_r * pos_weight + tn / (tn + fp + eps) * (1.0 - pos_weight)
            score_p = score_p * pos_weight + tn / (tn + fn + eps) * (1.0 - pos_weight)

        MIRQI_r.append(score_r)
        MIRQI_p.append(score_p)
        rec_prec = score_r + score_p
        MIRQI_f.append(2 * (score_r * score_p) / rec_prec if rec_prec != 0.0 else 0.0)

    return {
        'MIRQI-r': MIRQI_r,
        'MIRQI-p': MIRQI_p,
        'MIRQI-f': MIRQI_f,
    }

### Robust matching

#### Repeated nodes with different attributes

In [None]:
# Example: the same label ('Pleural Effusion') appears twice in the ground
# truth with different attributes, and the generated entities are the same two
# entries in reverse order.
# report_gt / report_gen only document the text the entities stand for; they
# are not passed to either metric.
report_gt = "right effusion with mild atelectasis. left effusion is also present."
entities_gt = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right/present'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left/pleural'],
]
report_gen = report_gt
entities_gen = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left/pleural'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right/present'],
]
# The original MIRQI compares every generated entity with the FIRST ground-truth
# entity that has the same label, so here both are checked against
# 'right/present' only. Both result dicts are merged for side-by-side display
# (on a duplicate key the MIRQI_v2 value would win).
{
    **MIRQI([entities_gt], [entities_gen]),
    **MIRQI_v2([entities_gt], [entities_gen]),
}

#### GT nodes matched twice

In [None]:
# Example: one ground-truth entity, but two generated entities with the same
# label and different attributes.
# report_gt / report_gen only document the text the entities stand for; they
# are not passed to either metric.
report_gt = "right pleural effusion."
entities_gt = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right'],
]
report_gen = "right pleural effusion. left pleural effusion"
entities_gen = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left'],
]
# The original MIRQI never marks a ground-truth entity as used, so both
# generated entities match the single ground-truth entity and each adds a
# label-level true positive. Both result dicts are merged for side-by-side
# display (on a duplicate key the MIRQI_v2 value would win).
{
    **MIRQI([entities_gt], [entities_gen]),
    **MIRQI_v2([entities_gt], [entities_gen]),
}