In [1]:
import rouge
import os
import sys

from tqdm import tqdm
sys.path.append('/home/ml/cadencao/summary_reward_no_reference/')
from rewarder import Rewarder

In [2]:
def prepare_results(p, r, f, metric=None):
    """Format one metric's precision / recall / F1 as a tab-separated report line.

    Args:
        p: Precision; multiplied by 100 for display.
        r: Recall; multiplied by 100 for display.
        f: F1; multiplied by 100 for display.
        metric: Label printed at the start of the line. When omitted, the
            module-level variable named ``metric`` is used, which is what
            the original three-argument calls relied on.

    Returns:
        A line of the form "<TAB>label:<TAB>P: xx.xx<TAB>R: xx.xx<TAB>F1: xx.xx",
        each number right-aligned in a 5-character field with two decimals.

    Raises:
        NameError: If ``metric`` is omitted and no global ``metric`` exists.
    """
    if metric is None:
        # Backward compatibility: existing callers pass only (p, r, f) from
        # inside a `for metric, results in ...` loop and rely on that loop
        # variable being visible as a global.
        try:
            metric = globals()['metric']
        except KeyError:
            raise NameError("name 'metric' is not defined") from None
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
        metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

# ROUGE scorer reused by every evaluation cell below (CNN/DailyMail and XSum).
avg_evaluator = rouge.Rouge(
    # Metric families; with max_n=4 the reports show rouge-1 .. rouge-4,
    # plus rouge-l and rouge-w.
    metrics=['rouge-n', 'rouge-l', 'rouge-w'],
    max_n=4,
    # Length limiting is enabled: 200, counted in words.
    limit_length=True,
    length_limit_type='words',
    length_limit=200,
    # Averaged scores are requested, best-of scores are not.
    apply_avg=True,
    apply_best=False,
    alpha=0.5,  # Default F1_score
    weight_factor=1.2,
    stemming=True,
)

In [3]:
# Load the reward model from the 'sample.model' checkpoint. It is called later
# as rewarder(document, candidate_summary) to score beam candidates.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the original author's machine -- consider a configurable base directory.
rewarder = Rewarder(os.path.join('/home/ml/cadencao/summary_reward_no_reference/trained_models','sample.model'))

#### CNN/DailyMail

In [4]:
from utils import read_lines

In [14]:
# Input files for the CNN/DailyMail experiment.
# The hypothesis file holds several beam candidates per test document (it is
# regrouped with beam_size = 4 further down); source/target hold one line per
# document.
# NOTE(review): source/target are absolute, user-specific paths.
cnndm_preds = 'preds/cnndm_test_bm4_all.hypo'
cnndm_source = '/home/ml/cadencao/cnn-dailymail/cnn_dm/test.source'
cnndm_target = '/home/ml/cadencao/cnn-dailymail/cnn_dm/test.target'

In [15]:
# Read hypotheses, source documents and reference summaries, in that order.
preds, source, target = (
    read_lines(path) for path in (cnndm_preds, cnndm_source, cnndm_target)
)

In [16]:
# Sanity check: line counts of hypotheses, sources and references (one per line).
print(len(preds), len(source), len(target), sep='\n')

45960
11490
11490


In [17]:
# Number of beam candidates stored per document in the CNN/DailyMail hypothesis
# file. NOTE: `beam_size` is rebound to 6 in the XSum section further down, so
# re-run this cell before re-running the CNN/DailyMail cells after XSum.
beam_size = 4

# Regroup the flat list (candidates of one document stored consecutively) into
# one list per beam rank, so that preds[rank][doc_index] is a single hypothesis.
# The guard keeps the cell idempotent: once regrouped, preds[0] is a list rather
# than a line of text, and slicing it again would silently corrupt the grouping.
if not preds or isinstance(preds[0], str):
    assert len(preds) % beam_size == 0, (
        'hypothesis count {} is not a multiple of beam_size {}'.format(
            len(preds), beam_size))
    preds = [preds[i::beam_size] for i in range(beam_size)]

In [18]:
# Every beam rank (not just rank 0) must line up one-to-one with the source
# documents and the reference summaries.
assert len(source) == len(target), 'source and target line counts differ'
assert all(len(rank_preds) == len(source) for rank_preds in preds), (
    'each beam rank needs exactly one hypothesis per source document')

In [24]:
# Rerank the beam candidates of the first 1000 test documents with the
# rewarder: for each document keep the candidate that receives the highest
# score. On ties the lower beam rank wins (strict '>' comparison).
best_predictions = []
for index, d in enumerate(tqdm(source[:1000])):
    # Only the first 350 whitespace-separated tokens of the document are scored.
    d = ' '.join(d.split()[:350])
    # Start from -inf rather than -1.0: nothing visible here bounds the
    # rewarder's output, and with the old sentinel a document whose candidates
    # all scored <= -1.0 would have contributed None.
    best_score, best_pred = float('-inf'), None
    for s in range(beam_size):
        p = preds[s][index]
        score = rewarder(d, p)
        if score > best_score:
            best_score, best_pred = score, p
    best_predictions.append(best_pred)



  0%|          | 0/1000 [00:00<?, ?it/s][A[A

  0%|          | 1/1000 [00:00<15:24,  1.08it/s][A[A

  0%|          | 2/1000 [00:01<14:39,  1.13it/s][A[A

  0%|          | 3/1000 [00:02<15:33,  1.07it/s][A[A

  0%|          | 4/1000 [00:03<16:38,  1.00s/it][A[A

  0%|          | 5/1000 [00:05<17:15,  1.04s/it][A[A

  1%|          | 6/1000 [00:05<15:46,  1.05it/s][A[A

  1%|          | 7/1000 [00:06<15:51,  1.04it/s][A[A

  1%|          | 8/1000 [00:07<14:48,  1.12it/s][A[A

  1%|          | 9/1000 [00:08<16:07,  1.02it/s][A[A

  1%|          | 10/1000 [00:09<14:32,  1.14it/s][A[A

  1%|          | 11/1000 [00:10<16:11,  1.02it/s][A[A

  1%|          | 12/1000 [00:11<16:32,  1.00s/it][A[A

  1%|▏         | 13/1000 [00:12<15:01,  1.09it/s][A[A

  1%|▏         | 14/1000 [00:13<14:36,  1.13it/s][A[A

  2%|▏         | 15/1000 [00:13<12:51,  1.28it/s][A[A

  2%|▏         | 16/1000 [00:14<15:04,  1.09it/s][A[A

  2%|▏         | 17/1000 [00:16<16:19,  1.00it/

In [25]:
# Spot check: the candidate the rewarder selected for document 56.
best_predictions[56]

'David Lynch has confirmed he will no longer direct the revival of "Twin Peaks" He said he felt the network was not offering enough money to produce the show "the way it needed to be done" The groundbreaking series is considered one of the most influential shows in television history.'

In [26]:
# Reference summary for the same document, for a side-by-side comparison.
target[56]

'David Lynch says he won\'t be directing new episodes of Twin Peaks . Showtime "saddened" over decision, which involved a dispute over money .'

In [27]:
# ROUGE of the rewarder-selected summaries against the matching references;
# target[:1000] pairs with the 1000 documents reranked above.
scores = avg_evaluator.get_scores(best_predictions, target[:1000])
# The loop variable has to be named `metric`: prepare_results reads the label
# from that global rather than taking it as an argument.
for metric in sorted(scores):
    results = scores[metric]
    print(prepare_results(results['p'], results['r'], results['f']))

	rouge-1:	P: 29.47	R: 44.91	F1: 34.87
	rouge-2:	P: 12.18	R: 18.87	F1: 14.49
	rouge-3:	P:  6.89	R: 10.75	F1:  8.22
	rouge-4:	P:  4.56	R:  7.14	F1:  5.43
	rouge-l:	P: 26.76	R: 38.39	F1: 31.09
	rouge-w:	P: 17.34	R: 13.60	F1: 14.80


#### XSum

In [None]:
# Input files for the XSum experiment: beam hypotheses and reference summaries.
# NOTE(review): 'tokenzied' in the target file name looks like a typo of
# 'tokenized' -- left untouched because it must match the file on disk; confirm.
xsum_preds_path = 'preds/xsum_test_bm6_all.hypo.tokenized'
xsum_target_path = 'preds/test_xsum.target.tokenzied'

In [None]:
# Read XSum hypotheses and reference summaries, in that order.
xsum_preds, xsum_target = (
    read_lines(path) for path in (xsum_preds_path, xsum_target_path)
)

In [None]:
# Sanity check: number of hypothesis lines, then number of reference lines.
print(len(xsum_preds), len(xsum_target), sep='\n')

In [None]:
# Number of beam candidates stored per document in the XSum hypothesis file.
# NOTE: this rebinds the `beam_size` that the CNN/DailyMail cells set to 4.
beam_size = 6

# Regroup the flat list into one list per beam rank, so that
# xsum_preds[rank][doc_index] is a single hypothesis.
# The guard keeps the cell idempotent: once regrouped, xsum_preds[0] is a list
# rather than a line of text, and slicing it again would corrupt the grouping.
# (Assumes read_lines yields str lines, as the CNN/DailyMail outputs show.)
if not xsum_preds or isinstance(xsum_preds[0], str):
    assert len(xsum_preds) % beam_size == 0, (
        'hypothesis count {} is not a multiple of beam_size {}'.format(
            len(xsum_preds), beam_size))
    xsum_preds = [xsum_preds[i::beam_size] for i in range(beam_size)]

In [None]:
# Every beam rank (not just rank 0) must have exactly one hypothesis per
# reference summary.
assert all(len(rank_preds) == len(xsum_target) for rank_preds in xsum_preds), (
    'each beam rank needs exactly one hypothesis per reference summary')

In [None]:
# ROUGE of each beam rank on its own: rank 1 first, then rank 2, and so on.
for rank, p in enumerate(xsum_preds, start=1):
    print('- top {}:'.format(rank))
    scores = avg_evaluator.get_scores(p, xsum_target)
    # The loop variable has to be named `metric`: prepare_results reads the
    # label from that global rather than taking it as an argument.
    for metric in sorted(scores):
        results = scores[metric]
        print(prepare_results(results['p'], results['r'], results['f']))

In [None]:
# For every reference, keep the beam candidate with the highest ROUGE-1 F
# against that reference. The selection looks at the reference itself.
# On ties the lower beam rank wins (strict '>' comparison).
best_predictions = []
for index, t in enumerate(tqdm(xsum_target)):
    best_rouge_1, best_pred = -1.0, None
    for s in range(beam_size):
        p = xsum_preds[s][index]
        rouge_1_f = avg_evaluator.get_scores([p], [t])['rouge-1']['f']
        if rouge_1_f > best_rouge_1:
            best_rouge_1, best_pred = rouge_1_f, p
    best_predictions.append(best_pred)

In [None]:
# ROUGE of the selected XSum candidates against all references.
scores = avg_evaluator.get_scores(best_predictions, xsum_target)
# The loop variable has to be named `metric`: prepare_results reads the label
# from that global rather than taking it as an argument.
for metric in sorted(scores):
    results = scores[metric]
    print(prepare_results(results['p'], results['r'], results['f']))