In [25]:
import numpy as np
import pandas as pd
import os
import json

# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Label placed in the first column of every printed LaTeX row.
model = "Pegasus Baseline"

# NOTE(review): `num_trails` is not referenced by the visible cells, which
# loop over `sample_size` instead -- presumably both describe the number of
# result files ("trails") to load; confirm which one is intended.
num_trails = 10
sample_size = 10

# Result files are located relative to the current working directory.
# NOTE(review): the slice drops the last 28 characters of the cwd, so the
# paths are only right when the notebook is launched from the directory the
# author used -- confirm before running from anywhere else.
cwd = os.getcwd()
_root_prefix = cwd[:-28]

# Both templates take the trial index through `str.format`.
results_path = _root_prefix + "Results/Pegasus/all_scores_range_0_138_trail_{}.npz"
results_intent_path = _root_prefix + "Results/Pegasus/all_scores_range_by_intent_0_138_trail_{}.txt"

# Intent questions; used as keys into the per-intent result files.
intents = [
    "What is the state's policy regarding education?",
    "What are the major historical events in this state?",
    "How is the weather of the state?",
    "Which places seem interesting to you for visiting in this state?",
    "What are some of the most interesting things about this state?",
]



In [26]:
full_results = []

# One list of per-sample score entries for every intent question.
total_data_intent = {intent: [] for intent in intents}

# --- Overall ("normal") scores ---------------------------------------------
# The loop index is formatted into the "trail_{}" file name, so the loop runs
# once per trial; `num_trails` is the matching constant.
# NOTE(review): the original looped over `sample_size` (same value, 10) --
# confirm `num_trails` is the intended bound.
per_trial_scores = []
for i in range(num_trails):
    # An .npz archive keeps its file handle open until closed; the context
    # manager releases it as soon as the array has been read.
    with np.load(results_path.format(i)) as temp:
        data = temp['scores']
        # Average over axis 1 first: each "user-sample" contains a set of
        # predicted summaries, so those sets are averaged before averaging
        # across all of the data/trials.
        per_trial_scores.append(data.mean(axis=1))

if not per_trial_scores:
    raise ValueError("No result files were loaded; num_trails must be at least 1.")

# Concatenate once instead of re-copying the accumulated array on every trial.
total_data = np.concatenate(per_trial_scores, axis=0)

# --- Per-intent scores -----------------------------------------------------
for i in range(num_trails):
    with open(results_intent_path.format(i)) as f:
        res = json.load(f)

    for intent in intents:
        # Same axis-1 averaging as above, kept as plain lists so that the
        # entries from every trial can be appended to one another.
        total_data_intent[intent].extend(np.mean(res[intent], axis=1).tolist())

# --- Summary statistics ----------------------------------------------------
# Mean and standard deviation across all samples of all trials.
total_mean = np.mean(total_data, axis=0)
total_stddev = np.std(total_data, axis=0)

mean_intent = {intent: np.mean(total_data_intent[intent], axis=0) for intent in intents}
stddev_intent = {intent: np.std(total_data_intent[intent], axis=0) for intent in intents}

Print the overall results (averaged across all intents).

Note: the four values printed for each Rouge score are, in order, precision, recall, F1, and F2.

In [32]:
metrics = ["Rouge-1", "Rouge-2", "Rouge-L"]

# Print one LaTeX table row per metric, in the form
#   <model> & <mean> $\pm$ <std> & ... \\
# with every value rounded to four decimals.
for metric_idx, (means, stddevs) in enumerate(zip(total_mean, total_stddev)):
    cells = [
        str(np.round(m, 4)) + " $\\pm$ " + str(np.round(s, 4))
        for m, s in zip(means, stddevs)
    ]
    # Joining with " & " yields the column separators without a trailing one.
    latex_row = " & ".join([model] + cells) + " \\\\"
    print(metrics[metric_idx])
    print(latex_row)

Rouge-1
Pegasus Baseline & 0.1398 $\pm$ 0.0436 & 0.6225 $\pm$ 0.1186 & 0.2205 $\pm$ 0.0574 & 0.1378 $\pm$ 0.0359 \\
Rouge-2
Pegasus Baseline & 0.0713 $\pm$ 0.0416 & 0.3383 $\pm$ 0.1702 & 0.1139 $\pm$ 0.0626 & 0.0712 $\pm$ 0.0391 \\
Rouge-L
Pegasus Baseline & 0.1058 $\pm$ 0.0388 & 0.4812 $\pm$ 0.1361 & 0.1677 $\pm$ 0.0547 & 0.1048 $\pm$ 0.0342 \\


In [33]:
metrics = ["Rouge-1", "Rouge-2", "Rouge-L"]

# For each intent, print a heading followed by one tab-indented LaTeX table
# row per metric, in the form
#   <model> & <mean> $\pm$ <std> & ... \\
# with every value rounded to four decimals.
for intent in intents:
    print("\nResults for intent {}\n".format(intent))
    per_metric_stats = zip(mean_intent[intent], stddev_intent[intent])
    for metric_idx, (means, stddevs) in enumerate(per_metric_stats):
        cells = [
            str(np.round(m, 4)) + " $\\pm$ " + str(np.round(s, 4))
            for m, s in zip(means, stddevs)
        ]
        # Joining with " & " yields the column separators without a trailing one.
        latex_row = " & ".join(["\t" + model] + cells) + " \\\\"
        print("\t" + metrics[metric_idx])
        print(latex_row)


Results for intent What is the state's policy regarding education?

	Rouge-1
	Pegasus Baseline & 0.159 $\pm$ 0.0392 & 0.5848 $\pm$ 0.1176 & 0.2437 $\pm$ 0.0528 & 0.1523 $\pm$ 0.033 \\
	Rouge-2
	Pegasus Baseline & 0.0818 $\pm$ 0.0412 & 0.3201 $\pm$ 0.1582 & 0.127 $\pm$ 0.062 & 0.0794 $\pm$ 0.0388 \\
	Rouge-L
	Pegasus Baseline & 0.1218 $\pm$ 0.0364 & 0.4579 $\pm$ 0.1282 & 0.1875 $\pm$ 0.0523 & 0.1172 $\pm$ 0.0327 \\

Results for intent What are the major historical events in this state?

	Rouge-1
	Pegasus Baseline & 0.12 $\pm$ 0.0407 & 0.6673 $\pm$ 0.09 & 0.1972 $\pm$ 0.0545 & 0.1232 $\pm$ 0.034 \\
	Rouge-2
	Pegasus Baseline & 0.0609 $\pm$ 0.0339 & 0.3568 $\pm$ 0.15 & 0.1011 $\pm$ 0.0523 & 0.0632 $\pm$ 0.0327 \\
	Rouge-L
	Pegasus Baseline & 0.0913 $\pm$ 0.0346 & 0.5162 $\pm$ 0.1123 & 0.1505 $\pm$ 0.049 & 0.0941 $\pm$ 0.0306 \\

Results for intent How is the weather of the state?

	Rouge-1
	Pegasus Baseline & 0.1542 $\pm$ 0.047 & 0.6867 $\pm$ 0.1152 & 0.2437 $\pm$ 0.0633 & 0.1523 $\pm$ 0