### Sentence indices mapping

In [1]:
import hashlib
import os


def hashhex(s):
    """Return the hexadecimal SHA-1 digest of ``s``.

    Args:
        s: The value to hash, as ``bytes`` or ``str``. A ``str`` is encoded
            as UTF-8 first, because ``hashlib`` only accepts bytes-like
            input.

    Returns:
        The 40-character hexadecimal digest as a ``str``.
    """
    if isinstance(s, str):
        s = s.encode('utf8')
    return hashlib.sha1(s).hexdigest()


def get_url_hashes(url_list):
    """Return the ``hashhex`` digest of every entry of ``url_list``, in order."""
    return list(map(hashhex, url_list))


def read_text_file(text_file):
    """Read a text file into a list of stripped, UTF-8 encoded lines.

    Args:
        text_file: Path of the file to read.

    Returns:
        A list with one ``bytes`` object per line of the file, with leading
        and trailing whitespace removed. Blank lines are kept as ``b''``.
    """
    # Decode explicitly as UTF-8 so the bytes produced below do not depend on
    # the platform's default text encoding.
    with open(text_file, encoding='utf-8') as f:
        return [line.strip().encode('utf8') for line in f]


def map_cnndailymail(url_file):
    """Build the list of ``.story`` file names for the URLs in ``url_file``.

    Each line of ``url_file`` is hashed, and the digest plus the ``.story``
    suffix gives the file name. The number of stories is printed as a
    side effect.

    Returns:
        The story file names, in the order of the lines of ``url_file``.
    """
    story_fnames = []
    for digest in get_url_hashes(read_text_file(url_file)):
        story_fnames.append(digest + ".story")
    num_stories = len(story_fnames)
    print(f'num_stories={num_stories}')
    return story_fnames

In [2]:
# Map every URL listed in data/all_test.txt to its "<sha1>.story" file name
story_fnames = map_cnndailymail('data/all_test.txt')

num_stories=11490


In [3]:
# Position of each story in story_fnames -> its story file name
idx2fname = dict(enumerate(story_fnames))

In [4]:
# Story file name (the original CNN/DM id plus ".story") of the sample at index 0
idx2fname[0]

'469c6ac05092ca5997728c9dfc19f9ab6b936e40.story'

In [6]:
def read_one_cnndm_new(idx):
    """Load the article and abstract of one preprocessed test sample.

    The sample is read from
    ``./cnn-dailymail/finished_files/test/{idx}.json``. Please refer to
    https://github.com/ChenRocks/fast_abs_rl to generate the json file of each
    sample; their sentence tokenization results are used in the annotation.

    Args:
        idx: The idx of the sample in the test set.

    Returns:
        A tuple ``(article, abstract)`` holding the values stored under the
        ``'article'`` and ``'abstract'`` keys of the json file.

    Raises:
        FileNotFoundError: If the json file of the sample does not exist.
        KeyError: If the file lacks the ``'article'`` or ``'abstract'`` key.
    """
    # Imported here because the visible part of this file never imports json.
    import json

    # JSON text is UTF-8, so do not rely on the platform default encoding.
    with open(f'./cnn-dailymail/finished_files/test/{idx}.json',
              encoding='utf-8') as f:
        data = json.load(f)
    return data['article'], data['abstract']


def match_extract_idx_all(FAMs):
    """Find the indices of the extracted sentences for every low_abs sample.

    For each sample idx in ``FAMs['low_abs']`` the raw model output is read
    from ``../your_model_output/<story hash>.model`` (feed your model output
    there) and every kept line is matched to an article sentence with
    ``match_extract_idx``.

    Returns:
        A ``defaultdict(list)`` mapping sample idx -> indices of the
        extracted sentences.
    """
    idx2labels = defaultdict(list)
    for idx in FAMs['low_abs']:
        # Drop the trailing ".story" (6 characters) to get the bare hash.
        fname = idx2fname[idx][:-6]
        ext_l = []
        with open(f"../your_model_output/{fname}.model") as f:
            for raw_line in f:
                # Skip lines of at most one character (e.g. a lone newline).
                if len(raw_line) <= 1:
                    continue
                ext_l.append(raw_line.strip().lower())
        match_extract_idx(idx, ext_l, idx2labels)
    return idx2labels


def match_extract_idx(idx, ext_l, idx2labels):
    """Match each extracted sentence to the closest article sentence.

    Since the results of sentence tokenization might vary by models, the
    closest article sentence by TF-IDF cosine similarity is used to recover
    the sentence idx used in the annotation. Every match is printed so that
    it can be checked that both sentences are (almost) identical.

    Args:
        idx: The idx of the sample in the test set.
        ext_l: A list of sentence str representing the extracted summary.
        idx2labels: Mapping sample idx -> list of matched sentence indices;
            the matches are appended to ``idx2labels[idx]`` in place.

    NOTE(review): ``TfidfVectorizer`` and ``cosine_similarity`` are not
    imported in the visible part of this file; presumably they come from
    scikit-learn. Confirm the imports exist before running.
    """
    article_sents, _ = read_one_cnndm_new(idx)
    vectorizer = TfidfVectorizer()
    article_vecs = vectorizer.fit_transform(article_sents)
    for sent in ext_l:
        query_vec = vectorizer.transform([sent])
        similarity = cosine_similarity(query_vec, article_vecs).squeeze()
        # Sorting the negated scores ascending puts the best match first.
        best = np.argsort(-similarity)[0]
        idx2labels[idx].append(best)
        # make sure they are (almost) identical
        print('Original sentence', sent)
        print('Found sentence', article_sents[best])
        print()
    print('*' * 20)

### Facet-Aware Evaluation

In [7]:
import pickle
from collections import defaultdict
import numpy as np

In [14]:
def facet_aware_eval(FAMs, method='lead3', num_sent_max=100, num_sup_group_max=100):
    """Score one extractive method against the facet-aware mappings (FAMs).

    For every sample in ``FAMs['low_abs']`` the extracted sentence indices
    are compared with the annotated support sentences; the per-sample scores
    are averaged, printed on one line and returned.

    Args:
        FAMs: Mapping whose ``'low_abs'`` entry maps sample idx ->
            {facet idx -> list of support groups}, each support group being
            a collection of sentence indices.
        method: ``'lead3'`` (always sentences 0, 1 and 2) or one of
            ``'fast_ext_rl'``, ``'refresh'``, ``'neusum'``, ``'banditsum'``,
            ``'unified_extract'``, which read the matching module-level
            ``idx2labels_*`` mapping.
        num_sent_max: Keep at most this many leading extracted sentences per
            sample (not applied to ``'lead3'``).
        num_sup_group_max: Use at most this many leading support groups per
            facet.

    Returns:
        A tuple ``(P, SAR, F1, FAR)``: the means over samples of the
        support-aware precision, recall and F1, and of the facet-aware
        recall.

    Raises:
        NotImplementedError: If ``method`` is not one of the supported names.
        AssertionError: If a sample has no extracted sentence.
    """
    def calc_prec(p, g):
        return len(p & g) / len(p)

    def calc_recall(p, g):
        return len(p & g) / len(g)

    def calc_f1(p, g):
        corr = len(p & g)
        P = corr / len(p)
        R = corr / len(g)
        # The epsilon keeps the division defined when P and R are both zero.
        return 2 * P * R / (P + R + 1e-6)

    def calc_facet_recall(p, sup_dict, n_sup=100):
        # A facet is covered as soon as one of its support groups is fully
        # contained in the extracted sentences.
        cover = 0
        for ref_i in sup_dict:
            if any(set(sup_group) <= p for sup_group in sup_dict[ref_i][:n_sup]):
                cover += 1
        return cover / len(sup_dict)

    f1_l = []
    prec_l = []
    SAR_l = []
    FAR_l = []
    for idx in FAMs['low_abs']:
        facets = FAMs['low_abs'][idx]
        # a set of salient (support) sentences
        all_sup_sents = {i for ref in facets
                         for sup_group in facets[ref][:num_sup_group_max]
                         for i in sup_group}
        if method == 'lead3':
            ext_sents = set(range(3))
        elif method == 'fast_ext_rl':
            ext_sents = set(idx2labels_fastrl[idx][:num_sent_max])
        elif method == 'refresh':
            ext_sents = set(idx2labels_refresh[idx][:num_sent_max])
        elif method == 'neusum':
            ext_sents = set(idx2labels_neusum[idx][:num_sent_max])
        elif method == 'banditsum':
            ext_sents = set(idx2labels_bs[idx][:num_sent_max])
        elif method == 'unified_extract':
            ext_sents = set(idx2labels_unified[idx][:num_sent_max])
        # TODO add your method here
        else:
            # NotImplementedError subclasses RuntimeError, so code that
            # handled the RuntimeError of a bare ``raise`` keeps working.
            raise NotImplementedError(f'unsupported method: {method!r}')
        assert len(ext_sents) != 0, f'no extracted sentences for sample {idx}'

        prec_l.append(calc_prec(ext_sents, all_sup_sents))
        SAR_l.append(calc_recall(ext_sents, all_sup_sents))
        f1_l.append(calc_f1(ext_sents, all_sup_sents))
        FAR_l.append(calc_facet_recall(ext_sents, facets, num_sup_group_max))

    P, SAR, F1, FAR = np.mean(prec_l), np.mean(
        SAR_l), np.mean(f1_l), np.mean(FAR_l)
    print(f'{method:15} #samples: {len(f1_l)} SAP: {P:.3f} SAR: {SAR:.3f} SAF1: {F1:.3f} FAR: {FAR:.3f}')
    return P, SAR, F1, FAR

#### Load FAMs and system outputs

In [15]:
def _load_pickle(path):
    """Load the object pickled at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Extracted sentence indices of each system, loaded from data/*.pkl
idx2labels_unified = _load_pickle('data/idx2labels_unified.pkl')
idx2labels_neusum = _load_pickle('data/idx2labels_neusum.pkl')
idx2labels_bs = _load_pickle('data/idx2labels_bs.pkl')
idx2labels_refresh = _load_pickle('data/idx2labels_refresh.pkl')
idx2labels_fastrl = _load_pickle('data/idx2labels_fastrl.pkl')

In [27]:
# Load the facet-aware mappings; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('data/FAMs.pkl', 'rb') as f:
    FAMs = pickle.load(f)

In [28]:
# Show how many entries are stored under each top-level key of FAMs
for k, entries in FAMs.items():
    print(k, len(entries))

low_abs 89
noise 41
high_abs 20
all_idx 150


In [29]:
# FAMs of low_abs sample 1, stored as {facet idx: [support groups]}:
# Facet 0 has two support groups, {0} and {2}
# Facet 1 has one support group, {3, 5}
FAMs['low_abs'][1]

defaultdict(list, {0: [{0}, {2}], 1: [{3, 5}]})

In [39]:
# Print every support group of each low_abs sample, one blank line between
# samples, stopping after the first sample whose idx is greater than 1
for idx, facets in FAMs['low_abs'].items():
    for facet in facets:
        for sup_group in facets[facet]:
            print(f'sample_idx: {idx} facet-{facet} sup_group: {sup_group}')
    print()
    if idx > 1:
        break

sample_idx: 0 facet-0 sup_group: {1}
sample_idx: 0 facet-1 sup_group: {19}
sample_idx: 0 facet-2 sup_group: {25}

sample_idx: 1 facet-0 sup_group: {0}
sample_idx: 1 facet-0 sup_group: {2}
sample_idx: 1 facet-1 sup_group: {3, 5}

sample_idx: 2 facet-0 sup_group: {13}
sample_idx: 2 facet-1 sup_group: {0}
sample_idx: 2 facet-1 sup_group: {1}
sample_idx: 2 facet-2 sup_group: {6, 14}



In [12]:
# List of sample ids stored under FAMs['all_idx']
FAMs['all_idx']

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 105,
 379,
 420,
 438,
 472,
 524,
 678,
 791,
 835,
 852,
 977,
 1249,
 1560,
 1683,
 2023,
 2043,
 2085,
 2371,
 2521,
 2542,
 2644,
 2725,
 2807,
 2843,
 3143,
 3183,
 3214,
 3479,
 3545,
 3594,
 3643,
 3704,
 4148,
 4177,
 4189,
 4223,
 4249,
 4742,
 5005,
 5368,
 5386,
 5388,
 5771,
 5814,
 5941,
 5978,
 6018,
 6056,
 6079,
 6525,
 6571,
 6582,
 6586,
 6626,
 6729,
 6839,
 6842,
 6846,
 6852,
 6886,
 7164,
 7168,
 7331,
 7360,
 7428,
 7435,
 7494,
 7570,
 7579,
 7675,
 7709,
 7784,
 8043,
 8061,
 8066,
 8093,
 8157,
 8219,
 8226,
 8249,
 8299,
 8513,
 8799,
 8925,
 9166,
 9711,
 9715,
 9980,
 10128,
 10290,
 10325,
 10395,
 10636,
 10681,
 10686,
 10739,
 11269,
 11364,
 11388,
 11395]

#### Evaluation

In [16]:
# Results in the paper (note that only low_abs samples can be evaluated).
# Every method is evaluated with num_sent_max=3; each call prints one result line.
for method in ['lead3', 'fast_ext_rl', 'banditsum', 'neusum', 'refresh', 'unified_extract']:
    res = facet_aware_eval(FAMs=FAMs, method=method, num_sent_max=3)

lead3           #samples: 89 SAP: 0.610 SAR: 0.373 SAF1: 0.445 FAR: 0.506
fast_ext_rl     #samples: 89 SAP: 0.648 SAR: 0.406 SAF1: 0.479 FAR: 0.508
banditsum       #samples: 89 SAP: 0.586 SAR: 0.343 SAF1: 0.417 FAR: 0.447
neusum          #samples: 89 SAP: 0.639 SAR: 0.395 SAF1: 0.468 FAR: 0.512
refresh         #samples: 89 SAP: 0.610 SAR: 0.375 SAF1: 0.447 FAR: 0.513
unified_extract #samples: 89 SAP: 0.669 SAR: 0.413 SAF1: 0.488 FAR: 0.548
