## Methods performance estimation for 
## *Compressed Video Quality Assessment Dataset (CVQAD)*
### and
## Results reproduction for paper 
## "Video compression dataset and benchmark of learning-based video-quality metrics"

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from statsmodels.stats.weightstats import DescrStatsW
from matplotlib.legend import Legend
import json
import scipy.stats as stats
import ipywidgets as widgets

In [3]:
# read dataframe with metrics data
df = pd.read_csv('Metric_scores_example.csv')

# read dict with "content category"-"list of corresponding videos" mapping
# (the encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding)
with open("video_categories.json", encoding="utf-8") as f:
    cat_dict = json.load(f)

<font size="4">Generating table with correlation coefficients for each "sequence-preset" group</font>

#### Due to our subjective study methodology, correlation coefficients should only be computed within each "sequence-preset" group (without mixing) and then averaged.

#### (!) Note that single codecs and bitrate ranges are available only for the full dataset (CVQAD + hidden part), which is used to estimate methods performance only in the MSU Video Quality Metrics Benchmark

In [4]:
# choose compression standards
standard_list = ['all'] #['all', 'h265', 'av1', 'vvc'] - for full (open + hidden parts) dataset

#choose bitrate categories
bit_cat_list = ['all'] #['all', 'high', 'low'] - for full (open + hidden parts) dataset

# Descriptor columns removed before correlating, so that only the subjective
# score and the metric columns take part in the correlation matrix
meta_columns = ["codec", "preset", "sequence", "crf",
                "standard", "real_bitrate", "bitrate"]

# One record per (sequence, preset, standard, bitrate category, coefficient).
# Records are gathered in a list and converted to a DataFrame once at the end,
# rather than re-copying a growing DataFrame with pd.concat on every iteration.
corr_records = []

for seq in tqdm(df.sequence.unique(), desc='Video sequences processing'):
    for preset in df.preset.unique():
        for standard in standard_list:
            for bit_cat in bit_cat_list:

                # Rows belonging to the current "sequence-preset" group,
                # optionally narrowed to one bitrate category / standard
                flt = ((df.sequence == seq) & (df.preset == preset))
                if bit_cat != 'all':
                    flt = flt & (df['bitrate'] == bit_cat)
                if standard != 'all':
                    flt = flt & (df['standard'] == standard)
                subj = df[flt].drop(meta_columns, axis=1)

                # Groups with fewer than 3 rows are skipped
                if subj.shape[0] < 3:
                    continue

                for corr in ['spearman', 'kendall']:
                    # Correlation calculation between each method and subjective scores
                    # Pandas KROCC implementation isn't stable in the presence of duplicates
                    if corr == 'kendall':
                        subj_corr = subj.corr(method=lambda x, y: stats.kendalltau(x, y)[0])['subjective_score']
                    else:
                        subj_corr = subj.corr(method=corr)['subjective_score']

                    corr_records.append({
                        'corr': corr,
                        'sample_size': subj.shape[0],
                        'sequence': seq,
                        'preset': preset,
                        'standard': standard,
                        'bitrate': bit_cat,
                        # the self-correlation of the subjective score is not a result
                        **subj_corr.drop("subjective_score"),
                    })

corrs = pd.DataFrame(corr_records)

Video sequences processing:   0%|          | 0/36 [00:00<?, ?it/s]

In [5]:
# Preview the first rows of the per-group correlation table
corrs.head()

Unnamed: 0,corr,sample_size,sequence,preset,standard,bitrate,FR YUV-SSIM,FR Y-MS-SSIM,FR Y-VMAF NEG,FR Y-VMAF (v061),...,NR SPAQ MT-S,NR SPAQ MT-A,NR MEON,NR NIMA,FR GREED,FR DISTS,FR LPIPS,FR FOV VIDEO,FR DVQA,FR YUV-PSNR
0,spearman,12,crowd-run-2019,subjective,all,all,0.993007,0.951049,0.958042,0.965035,...,0.965035,0.965035,-0.482517,0.825175,0.48951,-0.93007,-0.895105,0.429588,0.951049,0.979021
1,kendall,12,crowd-run-2019,subjective,all,all,0.969697,0.848485,0.848485,0.878788,...,0.878788,0.878788,-0.454545,0.606061,0.424242,-0.787879,-0.727273,0.356686,0.878788,0.909091
2,spearman,12,kayak-trip-2019,subjective,all,all,0.965035,0.923077,0.986014,0.979021,...,0.958042,0.944056,-0.160839,0.895105,0.72028,-0.916084,-0.902098,0.746497,0.909091,0.944056
3,kendall,12,kayak-trip-2019,subjective,all,all,0.878788,0.818182,0.939394,0.909091,...,0.848485,0.818182,-0.242424,0.757576,0.515152,-0.787879,-0.757576,0.635831,0.757576,0.848485
4,spearman,12,making-alcohol-2019,subjective,all,all,0.958042,0.958042,0.972028,0.972028,...,0.972028,0.965035,0.090909,0.93007,0.895105,-0.902098,-0.93007,0.816113,0.923077,0.944056


<font size="4">Computing means and confidence intervals</font>

In [6]:
def weigh_func(col, weights, mode='mean'):
    """Weighted mean of ``col`` or one bound of its 1.96-standard-error interval.

    Parameters
    ----------
    col : values to average, passed to ``DescrStatsW`` as the data.
    weights : per-value weights, passed to ``DescrStatsW``.
    mode : ``'mean'`` for the weighted mean, ``'-se'`` for the interval bound
        nearer to zero, ``'+se'`` for the bound farther from zero.

    Returns
    -------
    The requested statistic. Interval bounds are clipped so that they keep the
    sign of the mean and do not exceed ``arctanh(0.99999)`` in magnitude.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported values.
    """
    weighted = DescrStatsW(col, weights=weights)
    center = weighted.mean
    below = center - 1.96 * weighted.std_mean
    above = center + 1.96 * weighted.std_mean
    limit = np.arctanh(0.99999)

    if center > 0:
        inner = np.clip(below, 0, limit)
        outer = np.clip(above, 0, limit)
    else:
        # For a non-positive mean the roles are swapped: the bound nearer to
        # zero is the numerically larger one.
        inner = np.clip(above, -limit, 0)
        outer = np.clip(below, -limit, 0)

    by_mode = {'mean': center, '-se': inner, '+se': outer}
    if mode not in by_mode:
        raise ValueError('Unknown mode')
    return by_mode[mode]

<font size="4">Dataset subsets presented in the paper</font>

In [7]:
# Each pool maps a display name to a
# (bitrate category, compression standard, list of videos) triple.
pools = {
    "FULL DATASET": ('all', 'all', cat_dict["FULL DATASET"]),
    "LOW BITRATE": ('low', 'all', cat_dict["FULL DATASET"]),
    "HIGH BITRATE": ('high', 'all', cat_dict["FULL DATASET"]),
    "H.265": ('all', 'h265', cat_dict["FULL DATASET"]),
    "AV1": ('all', 'av1', cat_dict["FULL DATASET"]),
    "VVC": ('all', 'vvc', cat_dict["FULL DATASET"]),
    "UGC": ('all', 'all', cat_dict["ugc"]),
    "SHAKING": ('all', 'all', cat_dict["shaking"]),
    "SPORTS": ('all', 'all', cat_dict["sports"]),
    "NATURE": ('all', 'all', cat_dict["nature"]),
    "GAMING and ANIMATION": ('all', 'all', cat_dict["gaming_animation"]),
}

# Shared widget styling
style = {'description_width': 'initial'}

# Selector for the subset of videos to evaluate on
pool_option = widgets.ToggleButtons(
    options=pools.keys(),
    description='Videos Category:',
    button_style='primary',
    style=style,
)

# Selector for the rank correlation coefficient to report
corr_option = widgets.ToggleButtons(
    options=["spearman", "kendall"],
    description='Correlation Coefficient:',
    button_style='primary',
    style=style,
)

<font size="4">Choosing the correlation coefficient and video category</font>

### (!) Note that only "FULL DATASET" is available for the CVQAD

In [8]:
# Render the video category selector; its current value is read when the final results are generated
pool_option

ToggleButtons(button_style='primary', description='Videos Category:', options=('FULL DATASET', 'LOW BITRATE', …

In [9]:
# Render the correlation coefficient selector; its current value is read when the final results are generated
corr_option

ToggleButtons(button_style='primary', description='Correlation Coefficient:', options=('spearman', 'kendall'),…

<font size="4">Generating the final results</font>

In [10]:
# Minimum group size required for a group to contribute to the average
min_samples_srocc = 15
min_samples_krocc = 6
# The first six columns of `corrs` describe the group (corr, sample_size,
# sequence, preset, standard, bitrate); the remaining ones are per-metric correlations.
cols = corrs.columns[6:]
bit_cat, standard, seq = pools[pool_option.value]
corr = corr_option.value
preset = 'all'

# Correlation dataframe filtering

flt = (corrs['corr'] == corr)

# For sequences and presets a bare 'all' means "no restriction"
if isinstance(seq, list):
    flt = flt & (corrs.sequence.isin(seq))
elif seq != 'all':
    flt = flt & (corrs.sequence == seq)

if isinstance(preset, list):
    flt = flt & (corrs.preset.isin(preset))
elif preset != 'all':
    flt = flt & (corrs.preset == preset)

# For standard and bitrate 'all' is matched literally, like any other value
if isinstance(standard, list):
    flt = flt & (corrs.standard.isin(standard))
else:
    flt = flt & (corrs.standard == standard)

if isinstance(bit_cat, list):
    flt = flt & (corrs.bitrate.isin(bit_cat))
else:
    flt = flt & (corrs.bitrate == bit_cat)

min_samples = min_samples_srocc if corr == "spearman" else min_samples_krocc

flt = flt & (corrs.sample_size >= min_samples)

# Z-Fisher transform (Inverse Hyperbolic Tangent) applying
# to average correlation coefficients from different groups ("sequence-preset" pairs)
# and compute confidence intervals

# The selection and its Fisher transform are shared by all three statistics,
# so they are computed once instead of once per statistic.
selected_groups = corrs[flt]
group_weights = selected_groups['sample_size']
# arctanh(+-1) is infinite; substitute the transform of +-0.99 instead
fisher_z = (
    selected_groups[cols]
    .apply(np.arctanh)
    .replace([np.inf, -np.inf], [np.arctanh(0.99), np.arctanh(-0.99)])
)


def _pooled_correlation(mode):
    """Per-metric pooled statistic selected by `mode` (a `weigh_func` mode).

    The statistic is computed on the Fisher-transformed values, mapped back
    with tanh, taken in absolute value and sorted in descending order.
    """
    return (
        fisher_z
        .apply(lambda x: weigh_func(x, group_weights, mode))
        .apply(np.tanh)
        .abs()
        .replace([0.99], 1)
        .sort_values(ascending=False)
    )


d_mean_correlation = _pooled_correlation('mean')
d_ci_lower_bound = _pooled_correlation('-se')
d_ci_upper_bound = _pooled_correlation('+se')

In [11]:
# Put the lower bound, the mean and the upper bound side by side (one row per
# metric) and turn the metric names from the index into a regular column.
d_final_correlation = pd.concat(
    [d_ci_lower_bound, d_mean_correlation, d_ci_upper_bound],
    axis=1,
).reset_index()
d_final_correlation.columns = [
    "Metric",
    "CI Lower Bound",
    f"Mean {corr[0].upper()}{corr[1:]} Correlation",
    "CI Upper Bound",
]

In [12]:
# Show the per-metric table with the mean correlation and its confidence bounds
d_final_correlation

Unnamed: 0,Metric,CI Lower Bound,Mean Spearman Correlation,CI Upper Bound
0,FR YUV-PSNR,0.94674,0.94988,0.952839
1,FR YUV-VMAF (v061),0.946099,0.949192,0.952113
2,FR YUV-SSIM,0.94563,0.949001,0.952168
3,FR YUV-VMAF NEG,0.94477,0.947771,0.950614
4,FR Y-MS-SSIM,0.943964,0.94711,0.950084
5,FR Y-VMAF NEG,0.94306,0.94591,0.948622
6,FR Y-VMAF (v061),0.942823,0.945633,0.948308
7,NR MDTVSFA,0.928352,0.931813,0.935113
8,NR SPAQ MT-S,0.910137,0.916205,0.921879
9,NR SPAQ BL,0.908784,0.915402,0.92156
