In [1]:
import rouge

from tqdm import tqdm

In [2]:
def prepare_results(p, r, f, metric=None):
    """Format one metric's precision, recall and F1 as a tab-separated line.

    The three scores are multiplied by 100 and printed with two decimals.

    Args:
        p: Precision value.
        r: Recall value.
        f: F1 value.
        metric: Name shown at the start of the line. When omitted, the
            notebook-level global ``metric`` is used, which keeps existing
            three-argument calls (made inside ``for metric, ... in ...`` loops)
            working unchanged.

    Returns:
        The formatted line, e.g. ``'\\trouge-1:\\tP: 40.41\\tR: 51.69\\tF1: 44.29'``.

    Raises:
        NameError: If ``metric`` is not passed and no global ``metric`` exists.
    """
    if metric is None:
        # Backward compatibility: older call sites rely on a loop variable
        # called `metric` being visible in the module namespace.
        try:
            metric = globals()['metric']
        except KeyError:
            raise NameError("name 'metric' is not defined") from None
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def _build_rouge_evaluator(apply_avg, apply_best):
    """Create a rouge.Rouge scorer with the settings shared across this notebook.

    Only the two aggregation flags differ between the evaluators below, so
    every other option lives here and the configurations cannot drift apart.
    A fresh ``metrics`` list is built on each call so the evaluators never
    share a mutable argument.
    """
    return rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                       max_n=4,
                       limit_length=True,
                       length_limit=200,
                       length_limit_type='words',
                       apply_avg=apply_avg,
                       apply_best=apply_best,
                       alpha=0.5, # Default F1_score
                       weight_factor=1.2,
                       stemming=True)


# Evaluator created with apply_avg=True / apply_best=False.
avg_evaluator = _build_rouge_evaluator(apply_avg=True, apply_best=False)

# Evaluator created with apply_avg=False / apply_best=True.
best_evaluator = _build_rouge_evaluator(apply_avg=False, apply_best=True)

#### CNN/DailyMail

In [3]:
from utils import read_lines

In [4]:
# Input files for the CNN/DailyMail evaluation: (hypotheses, references).
# NOTE(review): the 'tokenzied' spelling is kept as-is; it presumably matches
# the file name on disk — confirm before renaming.
cnndm_preds, cnndm_target = (
    'preds/cnndm_test_bm4_all.hypo.tokenized',
    'preds/test.target.tokenzied',
)

In [5]:
# Load the hypothesis file first, then the reference file, via utils.read_lines.
preds, target = map(read_lines, (cnndm_preds, cnndm_target))

In [6]:
# Number of hypothesis lines, then number of reference lines (one per output line).
print(len(preds), len(target), sep='\n')

45960
11490


In [7]:
# Number of hypotheses kept per sample in the CNN/DailyMail hypothesis file.
beam_size = 4
# Regroup the flat list by beam rank: preds[k] takes every beam_size-th line
# starting at offset k.
# assumes the beam_size hypotheses of one sample sit on consecutive lines — TODO confirm
# WARNING: this overwrites `preds`, so re-running the cell re-slices the
# already-split list; re-run the loading cell first.
preds = [preds[rank::beam_size] for rank in range(beam_size)]

In [8]:
# Every beam rank must hold exactly one hypothesis per reference. Checking only
# the first rank would let a hypothesis file whose line count is not a multiple
# of beam_size slip through, because the later ranks would then be shorter.
assert all(len(beam_pred) == len(target) for beam_pred in preds), \
    'each beam rank must contain exactly one hypothesis per target'

In [9]:
# Score each beam rank against the references with avg_evaluator and print one
# line per metric. The header reads "top k" to match both the recorded output
# of this cell and the equivalent XSum cell.
for rank, beam_pred in enumerate(preds, start=1):
    print('- top {}:'.format(rank))
    scores = avg_evaluator.get_scores(beam_pred, target)
    # The loop variable must stay named `metric`: prepare_results reads it
    # from the notebook namespace when formatting the line.
    for metric in sorted(scores):
        results = scores[metric]
        print(prepare_results(results['p'], results['r'], results['f']))

- top 1:
	rouge-1:	P: 40.41	R: 51.69	F1: 44.29
	rouge-2:	P: 19.34	R: 24.69	F1: 21.16
	rouge-3:	P: 11.69	R: 14.89	F1: 12.77
	rouge-4:	P:  8.02	R: 10.19	F1:  8.74
	rouge-l:	P: 34.11	R: 42.16	F1: 37.08
	rouge-w:	P: 22.07	R: 13.21	F1: 16.02
- top 2:
	rouge-1:	P: 41.37	R: 50.52	F1: 44.41
	rouge-2:	P: 19.84	R: 24.14	F1: 21.25
	rouge-3:	P: 12.00	R: 14.54	F1: 12.82
	rouge-4:	P:  8.24	R:  9.94	F1:  8.77
	rouge-l:	P: 34.87	R: 41.45	F1: 37.23
	rouge-w:	P: 22.72	R: 12.98	F1: 16.01
- top 3:
	rouge-1:	P: 42.41	R: 49.10	F1: 44.41
	rouge-2:	P: 20.41	R: 23.52	F1: 21.31
	rouge-3:	P: 12.39	R: 14.21	F1: 12.89
	rouge-4:	P:  8.53	R:  9.73	F1:  8.84
	rouge-l:	P: 35.75	R: 40.62	F1: 37.38
	rouge-w:	P: 23.51	R: 12.72	F1: 16.00
- top 4:
	rouge-1:	P: 43.62	R: 47.39	F1: 44.31
	rouge-2:	P: 21.00	R: 22.67	F1: 21.24
	rouge-3:	P: 12.74	R: 13.66	F1: 12.83
	rouge-4:	P:  8.75	R:  9.32	F1:  8.77
	rouge-l:	P: 36.70	R: 39.53	F1: 37.39
	rouge-w:	P: 24.32	R: 12.36	F1: 15.88


In [18]:
# For every reference, keep the beam hypothesis with the highest ROUGE-1 F1
# against that reference. Selection looks at the reference itself, so the
# resulting scores are a best-of-beam figure, not a score for any single rank.
# This issues one get_scores call per (sample, beam rank) pair, which is slow.
best_predictions = []
for index, t in enumerate(tqdm(target)):
    candidates = [preds[s][index] for s in range(beam_size)]
    # max() returns the first maximal element, so ties go to the lower beam
    # rank; default=None covers an empty candidate list.
    best_predictions.append(
        max(candidates,
            key=lambda c: avg_evaluator.get_scores([c], [t])['rouge-1']['f'],
            default=None)
    )

100%|██████████| 11490/11490 [07:31<00:00, 25.44it/s]


In [19]:
# ROUGE of the best-of-beam CNN/DailyMail predictions, one line per metric in
# alphabetical order of the metric name.
scores = avg_evaluator.get_scores(best_predictions, target)
# Keep the name `metric`: prepare_results reads it from the notebook namespace.
for metric in sorted(scores):
    results = scores[metric]
    print(prepare_results(results['p'], results['r'], results['f']))

	rouge-1:	P: 44.82	R: 52.40	F1: 47.25
	rouge-2:	P: 22.21	R: 25.79	F1: 23.33
	rouge-3:	P: 13.71	R: 15.80	F1: 14.34
	rouge-4:	P:  9.53	R: 10.91	F1:  9.93
	rouge-l:	P: 37.35	R: 42.75	F1: 39.24
	rouge-w:	P: 24.77	R: 13.50	F1: 16.98


#### XSum

In [20]:
# Input files for the XSum evaluation: (hypotheses, references).
# NOTE(review): the 'tokenzied' spelling is kept as-is; it presumably matches
# the file name on disk — confirm before renaming.
xsum_preds_path, xsum_target_path = (
    'preds/xsum_test_bm6_all.hypo.tokenized',
    'preds/test_xsum.target.tokenzied',
)

In [21]:
# Load the XSum hypothesis file first, then the reference file, via utils.read_lines.
xsum_preds, xsum_target = map(read_lines, (xsum_preds_path, xsum_target_path))

In [22]:
# Number of hypothesis lines, then number of reference lines (one per output line).
print(len(xsum_preds), len(xsum_target), sep='\n')

67806
11301


In [23]:
# Number of hypotheses kept per sample in the XSum hypothesis file. This
# rebinds the same `beam_size` name that the CNN/DailyMail section set.
beam_size = 6
# Regroup the flat list by beam rank: xsum_preds[k] takes every beam_size-th
# line starting at offset k.
# assumes the beam_size hypotheses of one sample sit on consecutive lines — TODO confirm
# WARNING: this overwrites `xsum_preds`, so re-running the cell re-slices the
# already-split list; re-run the loading cell first.
xsum_preds = [xsum_preds[rank::beam_size] for rank in range(beam_size)]

In [24]:
# Every beam rank must hold exactly one hypothesis per reference. Checking only
# the first rank would let a hypothesis file whose line count is not a multiple
# of beam_size slip through, because the later ranks would then be shorter.
assert all(len(beam_pred) == len(xsum_target) for beam_pred in xsum_preds), \
    'each beam rank must contain exactly one hypothesis per target'

In [26]:
# Score each XSum beam rank against the references with avg_evaluator and
# print one line per metric, in alphabetical order of the metric name.
for i, p in enumerate(xsum_preds):
    header = '- top {}:'.format(i + 1)
    print(header)
    scores = avg_evaluator.get_scores(p, xsum_target)
    # Keep the name `metric`: prepare_results reads it from the notebook namespace.
    for metric in sorted(scores):
        results = scores[metric]
        print(prepare_results(results['p'], results['r'], results['f']))

- top 1:
	rouge-1:	P: 41.50	R: 49.71	F1: 44.58
	rouge-2:	P: 19.55	R: 23.54	F1: 21.04
	rouge-3:	P: 10.91	R: 13.20	F1: 11.76
	rouge-4:	P:  6.57	R:  8.00	F1:  7.08
	rouge-l:	P: 39.06	R: 45.47	F1: 41.58
	rouge-w:	P: 27.78	R: 18.28	F1: 21.63
- top 2:
	rouge-1:	P: 42.29	R: 49.24	F1: 44.82
	rouge-2:	P: 19.93	R: 23.28	F1: 21.14
	rouge-3:	P: 11.11	R: 13.02	F1: 11.79
	rouge-4:	P:  6.69	R:  7.88	F1:  7.11
	rouge-l:	P: 39.72	R: 45.16	F1: 41.81
	rouge-w:	P: 28.37	R: 18.14	F1: 21.70
- top 3:
	rouge-1:	P: 42.96	R: 48.49	F1: 44.86
	rouge-2:	P: 20.24	R: 22.92	F1: 21.16
	rouge-3:	P: 11.22	R: 12.76	F1: 11.75
	rouge-4:	P:  6.70	R:  7.66	F1:  7.02
	rouge-l:	P: 40.32	R: 44.66	F1: 41.92
	rouge-w:	P: 28.90	R: 17.91	F1: 21.70
- top 4:
	rouge-1:	P: 43.91	R: 47.77	F1: 45.02
	rouge-2:	P: 20.82	R: 22.68	F1: 21.36
	rouge-3:	P: 11.62	R: 12.68	F1: 11.92
	rouge-4:	P:  7.00	R:  7.66	F1:  7.18
	rouge-l:	P: 41.24	R: 44.27	F1: 42.21
	rouge-w:	P: 29.75	R: 17.76	F1: 21.81
- top 5:
	rouge-1:	P: 45.10	R: 46.82	F1: 45.17
	roug

In [27]:
# For every XSum reference, keep the beam hypothesis with the highest ROUGE-1
# F1 against that reference. Selection looks at the reference itself, so the
# resulting scores are a best-of-beam figure, not a score for any single rank.
# This issues one get_scores call per (sample, beam rank) pair, which is slow.
best_predictions = []
for index, t in enumerate(tqdm(xsum_target)):
    candidates = [xsum_preds[s][index] for s in range(beam_size)]
    # max() returns the first maximal element, so ties go to the lower beam
    # rank; default=None covers an empty candidate list.
    best_predictions.append(
        max(candidates,
            key=lambda c: avg_evaluator.get_scores([c], [t])['rouge-1']['f'],
            default=None)
    )

100%|██████████| 11301/11301 [03:28<00:00, 54.08it/s]


In [28]:
# ROUGE of the best-of-beam XSum predictions, one line per metric in
# alphabetical order of the metric name.
scores = avg_evaluator.get_scores(best_predictions, xsum_target)
# Keep the name `metric`: prepare_results reads it from the notebook namespace.
for metric in sorted(scores):
    results = scores[metric]
    print(prepare_results(results['p'], results['r'], results['f']))

	rouge-1:	P: 49.15	R: 51.97	F1: 49.76
	rouge-2:	P: 24.57	R: 25.77	F1: 24.77
	rouge-3:	P: 14.44	R: 15.04	F1: 14.50
	rouge-4:	P:  9.09	R:  9.43	F1:  9.10
	rouge-l:	P: 45.10	R: 47.21	F1: 45.63
	rouge-w:	P: 33.19	R: 19.19	F1: 23.89
