Correlation experiments from the paper [Rethinking the Evaluation of Video Summaries](https://arxiv.org/abs/1903.11328)

In [8]:
import sys
import h5py
import numpy as np
from scipy import stats
sys.path.append("../../")
from summarizer.utils.eval import evaluate_summary, evaluate_scores, upsample
from summarizer.utils.io import load_tvsum_mat

# TVSum

In [2]:
# Load the first TVSum video from two sources so they can be cross-checked:
# the preprocessed .h5 dataset and the original .mat annotation file.
dataset = h5py.File("summarizer_dataset_tvsum_google_pool5.h5", "r")
video_1_h5 = dataset["video_1"]
video_name_h5 = video_1_h5["video_name"][...].item()
n_frames_h5 = video_1_h5["n_frames"][...].item()
positions_h5 = video_1_h5["picks"][...]
# gtscore is stored at the picked positions; upsample it and add a leading axis -> (1, ...)
gtscore_h5 = np.expand_dims(upsample(video_1_h5["gtscore"][...], n_frames_h5, positions_h5), axis=0)

dataset = load_tvsum_mat("videos/tvsum/ydata-tvsum50.mat")
video_1_mat = dataset[0]
video_name_mat = video_1_mat["video"]
n_frames_mat = int(video_1_mat["nframes"])

user_summary = video_1_h5["user_summary"][...] # (n_users, n_frames) [0 1 ... 1]
user_scores = video_1_mat["user_anno"].T       # (n_users, n_frames) [1 5 ... 2]
user_scores = (user_scores-1.0)/(5.0-1.0)      # rescale the 1..5 annotations to [0, 1]
n_users, n_frames = user_scores.shape

# Both sources must describe the same video with the same number of frames.
assert video_name_h5 == video_name_mat
assert n_frames_h5 == n_frames_mat
assert n_frames == n_frames_mat, "user_anno length disagrees with nframes"

# Report each source's own frame count (not the one derived from user_scores).
print(".h5: ", video_name_h5, user_summary.shape, n_frames_h5)
print(".mat: ", video_name_mat, user_scores.shape, n_frames_mat)

.h5:  AwmHb44_ouw (20, 10597) 10597
.mat:  AwmHb44_ouw (20, 10597) 10597


## Using summaries `[0 1 ... 1]` as target

In [3]:
# Random scores: should be 0
# A seeded generator makes the reported mean reproducible across re-runs.
rng = np.random.RandomState(0)
corrs = []
for _ in range(50):
    machine_scores = rng.uniform(0, 1, (n_frames,))
    corrs.append(evaluate_scores(machine_scores, user_summary))
np.mean(corrs)

0.00033878737760119457

In [4]:
# Human baseline: expected to be the highest achievable value.
# Leave-one-out: each annotator's scores are evaluated against the
# summaries of all the other annotators.
corrs = []
for held_out in range(n_users):
    machine_scores = user_scores[held_out]
    other_summaries = np.delete(user_summary, held_out, axis=0)
    corrs.append(evaluate_scores(machine_scores, other_summaries))
np.mean(corrs)

0.17628410427445923

## Using scores `[0.8 0.3 ... 0.1]` as target

In [5]:
# Random scores: should be 0
# A seeded generator makes the reported mean reproducible across re-runs.
rng = np.random.RandomState(0)
corrs = []
for _ in range(50):
    machine_scores = rng.uniform(0, 1, (n_frames,))
    corrs.append(evaluate_scores(machine_scores, user_scores))
np.mean(corrs)

-0.00123821421247682

In [6]:
# Human baseline: expected to be the highest achievable value.
# Leave-one-out: each annotator's scores are evaluated against the
# scores of all the other annotators.
corrs = []
for held_out in range(n_users):
    machine_scores = user_scores[held_out]
    other_scores = np.delete(user_scores, held_out, axis=0)
    corrs.append(evaluate_scores(machine_scores, other_scores))
np.mean(corrs)

0.2740630927883905

_Observations:_
* Using `user_summary` is consistent: random is near 0 and human is high (~0.17)
* Using `user_scores` matches the paper: random also near 0 and human close to 0.26 from paper (us ~0.27)

The paper reports 0.26, which is presumably averaged over all videos in TVSum, whereas our ~0.27 is computed on a single video.

We should use `user_scores` for correlation because `user_summary` contains actual summaries: they were produced with KTS, knapsack selection and the 15% length constraint, so they do not reflect the absolute importance of a single frame.

# SumMe

In [7]:
# Load the first SumMe video from the preprocessed .h5 dataset.
dataset = h5py.File("summarizer_dataset_summe_google_pool5.h5", "r")
video_1 = dataset["video_1"]
video_name = video_1["video_name"][...].item()
# .item() gives a plain Python int (as in the TVSum cell) instead of a 0-d array
n_frames = video_1["n_frames"][...].item()
positions = video_1["picks"][...]

user_summary = video_1["user_summary"][...] # (n_users, n_frames)
n_users, _ = user_summary.shape
# For SumMe, we consider that all users annotated the same scores, the gtscore
# Transform it to (1, n_frames)
user_scores = np.expand_dims(upsample(video_1["gtscore"][...], n_frames, positions), axis=0)

print(".h5: ", video_name, user_summary.shape, user_scores.shape, n_frames)

.h5:  b'Air_Force_One' (15, 4494) (1, 4494) 4494


## Using summaries `[0 1 ... 1]` as target

In [8]:
# Random scores: should be 0
# A seeded generator makes the reported mean reproducible across re-runs.
rng = np.random.RandomState(0)
corrs = []
for _ in range(50):
    machine_scores = rng.uniform(0, 1, (n_frames,))
    corrs.append(evaluate_scores(machine_scores, user_summary))
np.mean(corrs)

-0.0002016706546212282

In [9]:
# Human baseline: expected to be the highest achievable value.
# Only summaries (no per-user scores) are available here, so each annotator's
# summary is used as the "machine" output and evaluated against the others.
corrs = []
for held_out in range(n_users):
    machine_scores = user_summary[held_out]
    other_summaries = np.delete(user_summary, held_out, axis=0)
    corrs.append(evaluate_scores(machine_scores, other_summaries))
np.mean(corrs)

0.22560639713572397

## Using scores `[0.8 0.3 ... 0.1]` as target

In [10]:
# Random scores: should be 0
# A seeded generator makes the reported mean reproducible across re-runs.
rng = np.random.RandomState(0)
corrs = []
for _ in range(50):
    machine_scores = rng.uniform(0, 1, (n_frames,))
    corrs.append(evaluate_scores(machine_scores, user_scores))
np.mean(corrs)

-0.00012561221892703857

In [11]:
# Human baseline: expected to be the highest achievable value.
# Only summaries (no per-user scores) are available here, so each annotator's
# summary is evaluated against the frame-wise mean of the remaining summaries.
corrs = []
for held_out in range(n_users):
    machine_scores = user_summary[held_out]
    remaining = np.delete(user_summary, held_out, axis=0)
    # Mean over the remaining annotators, reshaped to a single row: (1, n_frames)
    user_scores_ = remaining.mean(axis=0)[np.newaxis, :]
    corrs.append(evaluate_scores(machine_scores, user_scores_))
np.mean(corrs)

0.3319613038488442

_Observations:_
* Using `user_summary` is consistent: random scores lead to ~0 and human is high (~0.22)
* Using `user_scores` is also consistent: random is also near 0 and human is high (ours: ~0.33)

In the paper they did not run this experiment on SumMe, since it has no importance-score annotations.

Here, we should use `user_scores` (in other words `/gtscore`) to be consistent with TVSum, even though `/gtscore` is computed from the annotators' summaries instead of absolute frame importance scores. However, these summaries were made by the annotators directly, without using KTS, so they are not biased by KTS (that bias being the dark side of using F-score). So we can assume we're fine?

# Human performances

In [16]:
# TVSum
# Human performance with a leave-one-out protocol: every annotator is evaluated
# against the others, results are averaged per video and then over all videos.
human_avg_fscore, human_max_fscore, human_corr = [], [], []
# The context manager closes the HDF5 file once every video has been processed.
with h5py.File("summarizer_dataset_tvsum_google_pool5.h5", "r") as dataset:
    for key in list(dataset.keys()):
        d = dataset[key]
        user_scores = d["user_scores"][...]
        user_summary = d["user_summary"][...]
        avg_fscores, max_fscores, avg_corrs = [], [], []
        # Score-based evaluation: annotator i's scores vs. everyone else's scores.
        for i in range(user_scores.shape[0]):
            machine_scores = user_scores[i]
            avg_corr = evaluate_scores(machine_scores, np.delete(user_scores, i, 0))
            avg_corrs.append(avg_corr)
        # Summary-based evaluation: annotator i's summary vs. everyone else's summaries.
        for i in range(user_summary.shape[0]):
            machine_summary = user_summary[i]
            avg_f_score, max_f_score = evaluate_summary(machine_summary, np.delete(user_summary, i, 0))
            avg_fscores.append(avg_f_score)
            max_fscores.append(max_f_score)
        human_avg_fscore.append(np.mean(avg_fscores))
        human_max_fscore.append(np.mean(max_fscores))
        human_corr.append(np.mean(avg_corrs))
np.mean(human_avg_fscore), np.mean(human_max_fscore), np.mean(human_corr)

(0.5383566506493562, 0.7754353055538261, 0.2041724108142788)

In [17]:
# SumMe
# Human performance with a leave-one-out protocol: every annotator is evaluated
# against the others, results are averaged per video and then over all videos.
human_avg_fscore, human_max_fscore, human_corr = [], [], []
# The context manager closes the HDF5 file once every video has been processed.
with h5py.File("summarizer_dataset_summe_google_pool5.h5", "r") as dataset:
    for key in list(dataset.keys()):
        d = dataset[key]
        user_summary = d["user_summary"][...]
        avg_fscores, max_fscores, avg_corrs = [], [], []
        # Score-based evaluation: annotator i's summary vs. the mean of the other summaries.
        for i in range(user_summary.shape[0]):
            machine_scores = user_summary[i] # here we don't have scores from user but only summary
            user_scores_ = np.expand_dims(np.delete(user_summary, i, 0).mean(axis=0), axis=0)
            avg_corr = evaluate_scores(machine_scores, user_scores_)
            avg_corrs.append(avg_corr)
        # Summary-based evaluation: annotator i's summary vs. everyone else's summaries.
        for i in range(user_summary.shape[0]):
            machine_summary = user_summary[i]
            avg_f_score, max_f_score = evaluate_summary(machine_summary, np.delete(user_summary, i, 0))
            avg_fscores.append(avg_f_score)
            max_fscores.append(max_f_score)
        human_avg_fscore.append(np.mean(avg_fscores))
        human_max_fscore.append(np.mean(max_fscores))
        human_corr.append(np.mean(avg_corrs))
np.mean(human_avg_fscore), np.mean(human_max_fscore), np.mean(human_corr)

(0.310737693643112, 0.5432709621250025, 0.3292369598490524)

# About correlation metrics

In [12]:
# Two short score vectors used to probe the correlation metrics.
x, y = map(np.asarray, ([0.9, 0.3, 0.7], [0.4, 0.8, 1.0]))
x, y

(array([0.9, 0.3, 0.7]), array([0.4, 0.8, 1. ]))

In [13]:
# Kendall's tau computed three ways: on descending ranks, on ascending ranks,
# and on the raw values. One result is printed per line.
print(
    *(
        stats.kendalltau(u, v)
        for u, v in (
            (stats.rankdata(-x), stats.rankdata(-y)),
            (stats.rankdata(x), stats.rankdata(y)),
            (x, y),
        )
    ),
    sep="\n",
)

KendalltauResult(correlation=-0.33333333333333337, pvalue=1.0)
KendalltauResult(correlation=-0.33333333333333337, pvalue=1.0)
KendalltauResult(correlation=-0.33333333333333337, pvalue=1.0)


In [14]:
# Spearman's rho computed three ways: on descending ranks, on ascending ranks,
# and on the raw values. One result is printed per line.
print(
    *(
        stats.spearmanr(u, v)
        for u, v in (
            (stats.rankdata(-x), stats.rankdata(-y)),
            (stats.rankdata(x), stats.rankdata(y)),
            (x, y),
        )
    ),
    sep="\n",
)

SpearmanrResult(correlation=-0.5, pvalue=0.6666666666666667)
SpearmanrResult(correlation=-0.5, pvalue=0.6666666666666667)
SpearmanrResult(correlation=-0.5, pvalue=0.6666666666666667)


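Conclusion: converting the values to ranks beforehand (in ascending or descending order) has no effect: Kendall's τ and Spearman's ρ give the same result as on the raw values.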