In [30]:
import numpy as np
import torch
import nltk
import json
import bert_score
import rouge
from collections import Counter

In [27]:
# Load the combined dataset; later cells read its 'dev' and 'test' entries.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding (JSON interchange files are UTF-8).
with open("data/combined.json", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Module-level BERTScore scorer; calc_bs calls scorer.score on summary pairs.
scorer = bert_score.BERTScorer(
    model_type="microsoft/deberta-xlarge-mnli",
    lang="en",
    rescale_with_baseline=True,
)

Some weights of the model checkpoint at microsoft/deberta-xlarge-mnli were not used when initializing DebertaModel: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Module-level ROUGE evaluator; stem() uses its tokenize_text and stem_tokens
# methods to normalise text.
evaluator = rouge.Rouge(
    metrics=["rouge-n", "rouge-l"],
    max_n=2,
    limit_length=False,
    apply_avg=True,
    stemming=True,
    ensure_compatibility=True,
)

In [37]:
def stem(x):
    """Return a Counter of the stemmed tokens of ``x``.

    The text is lower-cased, tokenised with ``evaluator.tokenize_text`` and
    stemmed with ``evaluator.stem_tokens``; the resulting tokens are counted.
    """
    lowered = x.lower()
    tokens = evaluator.tokenize_text(lowered)
    stemmed = evaluator.stem_tokens(tokens)
    return Counter(stemmed)


def calc_ds(summ_a, summ_b, summ_comm):
    """Token-overlap based score for a triple of summaries.

    Each summary is turned into a Counter of stems via ``stem``. The numerator
    is the sum of the three pairwise multiset-intersection sizes minus twice
    the size of the three-way intersection; the denominator is the size of the
    multiset union of all three. The function returns ``1 - numerator /
    denominator``.

    Raises:
        ZeroDivisionError: if all three summaries produce no tokens.
    """
    bag_a = stem(summ_a)
    bag_b = stem(summ_b)
    bag_c = stem(summ_comm)

    overlap_ab = sum((bag_a & bag_b).values())
    overlap_ac = sum((bag_a & bag_c).values())
    overlap_bc = sum((bag_b & bag_c).values())
    overlap_abc = sum((bag_a & bag_b & bag_c).values())

    shared = overlap_ab + overlap_ac + overlap_bc - 2.0 * overlap_abc
    total = sum((bag_a | bag_b | bag_c).values())
    return 1.0 - (shared / total)

 
def calc_bs(summ_a, summ_b, summ_comm):
    """Average the symmetrised ``scorer.score`` results over three summary pairs.

    For each of the pairs (A, B), (A, common) and (B, common) the module-level
    ``scorer`` is called in both argument orders and the two results are
    averaged element-wise; the three pairwise results are then averaged.

    Fixes relative to the previous version:
      * the reverse direction of the (A, B) pair is now actually scored with
        the arguments swapped (it used to repeat the forward call);
      * the (B, common) pair now averages its own reverse score instead of the
        (common, A) score.

    Args:
        summ_a: Summary text for entity A.
        summ_b: Summary text for entity B.
        summ_comm: Common summary text.

    Returns:
        numpy array with one entry per value returned by ``scorer.score``,
        in the same order.
    """

    def _first_items(x, y):
        # scorer.score is given single-element lists; keep element 0 of each
        # returned tensor.
        return np.array([s.detach().numpy()[0] for s in scorer.score([x], [y])])

    def _symmetric(x, y):
        # Score in both argument orders and average, so the pair's result does
        # not depend on which text is passed first.
        return (_first_items(x, y) + _first_items(y, x)) / 2.0

    a_b = _symmetric(summ_a, summ_b)
    a_c = _symmetric(summ_a, summ_comm)
    b_c = _symmetric(summ_b, summ_comm)
    return (a_b + a_c + b_c) / 3.0

In [39]:
# Score every (entity A summary, entity B summary, common summary) combination
# of each dev example; results are appended in iteration order.
dev_bs, dev_ds = [], []
for idx, example in enumerate(data['dev']):
    print(idx)
    triples = [
        (a_summ, b_summ, comm_summ)
        for a_summ in example['entity_a_summary']
        for b_summ in example['entity_b_summary']
        for comm_summ in example['common_summary']
    ]
    for a_summ, b_summ, comm_summ in triples:
        dev_bs.append(calc_bs(a_summ, b_summ, comm_summ))
        dev_ds.append(calc_ds(a_summ, b_summ, comm_summ))

0
1
2
3
4
5
6
7
8
9


In [45]:
# Write the dev scores to disk. Each dev_bs entry is converted to a list of
# plain Python floats before being handed to json.dump.
dev_bs_json = [list(map(float, scores)) for scores in dev_bs]
for path, payload in (("data/dev_bs.json", dev_bs_json), ("data/dev_ds.json", dev_ds)):
    with open(path, "w") as out:
        json.dump(payload, out, indent=4)

In [46]:
# Score every (entity A summary, entity B summary, common summary) combination
# of each test example; results are appended in iteration order.
test_bs, test_ds = [], []
for idx, example in enumerate(data['test']):
    print(idx)
    triples = [
        (a_summ, b_summ, comm_summ)
        for a_summ in example['entity_a_summary']
        for b_summ in example['entity_b_summary']
        for comm_summ in example['common_summary']
    ]
    for a_summ, b_summ, comm_summ in triples:
        test_bs.append(calc_bs(a_summ, b_summ, comm_summ))
        test_ds.append(calc_ds(a_summ, b_summ, comm_summ))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


In [48]:
# Write the test scores to disk. Each test_bs entry is converted to a list of
# plain Python floats before being handed to json.dump.
test_bs_json = [list(map(float, scores)) for scores in test_bs]
for path, payload in (("data/test_bs.json", test_bs_json), ("data/test_ds.json", test_ds)):
    with open(path, "w") as out:
        json.dump(payload, out, indent=4)

In [10]:
# def calc_ds(summ_1, summ_2):
#     s_1, s_2 = stem(summ_1), stem(summ_2)
#     return 1.0 - (sum((s_1 & s_2).values()) / sum((s_1 | s_2).values()))


# dev_scores = dict(
#     ab=list(),
#     ba=list(),
#     comm_a=list(),
#     a_comm=list(),
#     comm_b=list(),
#     b_comm=list(),
#     gen_ab=list(),
#     gen_ba=list(),
#     gen_comm_a=list(),
#     gen_a_comm=list(),
#     gen_comm_b=list(),
#     gen_b_comm=list()
# )
# dev_ds = dict(
#     ab=list(),
#     ba=list(),
#     comm_a=list(),
#     a_comm=list(),
#     comm_b=list(),
#     b_comm=list(),
#     gen_ab=list(),
#     gen_ba=list(),
#     gen_comm_a=list(),
#     gen_a_comm=list(),
#     gen_comm_b=list(),
#     gen_b_comm=list()
# )

# for d in data['dev']:
#     for a_summ in d['entity_a_summary']:
#         for b_summ in d['entity_b_summary']:
#             dev_scores['ab'].append(scorer.score(a_summ, b_summ))
#             dev_scores['ba'].append(scorer.score(b_summ, a_summ))
#             dev_ds['ab'].append(calc_ds(a_summ, b_summ))
#             dev_ds['ba'].append(calc_ds(b_summ, a_summ))
#     for comm_summ in d['common_summary']:
#         for a_summ in d['entity_a_summary']:
#             dev_scores['comm_a'].append(scorer.score(comm_summ, a_summ))
#             dev_scores['a_comm'].append(scorer.score(a_summ, comm_summ))
#             dev_ds['comm_a'].append(calc_ds(comm_summ, a_summ))
#             dev_ds['a_comm'].append(calc_ds(a_summ, comm_summ))
#         for b_summ in d['entity_b_summary']:
#             dev_scores['comm_b'].append(scorer.score(comm_summ, b_summ))
#             dev_scores['b_comm'].append(scorer.score(b_summ, comm_summ))
#             dev_ds['comm_b'].append(calc_ds(comm_summ, b_summ))
#             dev_ds['b_comm'].append(calc_ds(b_summ, comm_summ))
#     gen_a = d['gen_cont_a']
#     gen_b = d['gen_cont_b']
#     gen_comm = d['gen_comm_a']  # assuming comm summary is same for entitiy a and entity b
#     dev_scores['gen_ab'].append(scorer.score(gen_a, gen_b))
#     dev_scores['gen_ba'].append(scorer.score(gen_b, gen_a))
#     dev_ds['gen_ab'].append(calc_ds(gen_a, gen_b))
#     dev_ds['gen_ba'].append(calc_ds(gen_b, gen_a))
#     dev_scores['gen_comm_a'].append(scorer.score(gen_comm, gen_a))
#     dev_scores['gen_a_comm'].append(scorer.score(gen_a, gen_comm))
#     dev_ds['gen_comm_a'].append(calc_ds(gen_comm, gen_a))
#     dev_ds['gen_a_comm'].append(calc_ds(gen_a, gen_comm))
#     dev_scores['gen_comm_b'].append(scorer.score(gen_comm, gen_b))
#     dev_scores['gen_b_comm'].append(scorer.score(gen_b, gen_comm))
#     dev_ds['gen_comm_b'].append(calc_ds(gen_comm, gen_b))
#     dev_ds['gen_b_comm'].append(calc_ds(gen_b, gen_comm))