In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import scipy.stats as st
from tqdm import tqdm

In [2]:
from NOT import transform_full_df, load_models
from score_methods import *

In [3]:
data = pd.read_feather('results.ft')

In [4]:
def chunker(df, size):
    return [df.iloc[pos:pos + size] for pos in range(0, len(df), size)]


metric_to_evaluate = 'meta-iqa'
data_to_evaluate = data\
    [[
    'dataset',
    'attack',
    f'{metric_to_evaluate}_clear',
    f'{metric_to_evaluate}_attacked',
    f'{metric_to_evaluate}_ssim',
    f'{metric_to_evaluate}_psnr',
    f'{metric_to_evaluate}_mse'
    ]].reset_index(drop=True)
data_to_evaluate

Unnamed: 0,dataset,attack,meta-iqa_clear,meta-iqa_attacked,meta-iqa_ssim,meta-iqa_psnr,meta-iqa_mse
0,DERF,amifgsm,0.368074,0.953593,0.941180,39.916359,0.000102
1,DERF,amifgsm,0.373570,0.955572,0.941655,39.939890,0.000101
2,DERF,amifgsm,0.395932,0.945653,0.941388,39.931558,0.000102
3,DERF,amifgsm,0.378934,0.945657,0.940658,39.895225,0.000102
4,DERF,amifgsm,0.368836,0.955125,0.939671,39.839911,0.000104
...,...,...,...,...,...,...,...
77227,VIMEO,std-fgsm,0.533421,0.702039,0.568967,28.130854,0.001538
77228,VIMEO,std-fgsm,0.207382,0.290742,0.488029,28.130804,0.001538
77229,VIMEO,std-fgsm,0.490816,0.576149,0.604439,28.177142,0.001522
77230,VIMEO,std-fgsm,0.213780,0.446270,0.674324,28.178245,0.001521


Domain transformation

In [5]:
from tqdm import tqdm
CHUNK_SIZE = 1000
def chunker(df, size):
    return [df.iloc[pos:pos + size] for pos in range(0, len(df), size)]


# Run domain transform by chunks
data_transformed = pd.DataFrame()
for cur_df in tqdm(chunker(data_to_evaluate, 1000)):
    cur_data_transformed = transform_full_df(df=cur_df, metrics=[metric_to_evaluate], path_to_models='./models', domain='mdtvsfa')
    data_transformed = pd.concat([data_transformed, cur_data_transformed])
data_transformed    

100%|██████████| 78/78 [00:11<00:00,  6.86it/s]


Unnamed: 0,dataset,attack,meta-iqa_clear,meta-iqa_attacked,meta-iqa_ssim,meta-iqa_psnr,meta-iqa_mse
0,DERF,amifgsm,0.587065,1.183652,0.941180,39.916359,0.000102
1,DERF,amifgsm,0.594696,1.185642,0.941655,39.939890,0.000101
2,DERF,amifgsm,0.620342,1.174646,0.941388,39.931558,0.000102
3,DERF,amifgsm,0.599683,1.174953,0.940658,39.895225,0.000102
4,DERF,amifgsm,0.589288,1.185706,0.939671,39.839911,0.000104
...,...,...,...,...,...,...,...
77227,VIMEO,std-fgsm,0.788274,0.916142,0.568967,28.130854,0.001538
77228,VIMEO,std-fgsm,0.337719,0.460866,0.488029,28.130804,0.001538
77229,VIMEO,std-fgsm,0.736035,0.824355,0.604439,28.177142,0.001522
77230,VIMEO,std-fgsm,0.346962,0.680013,0.674324,28.178245,0.001521


Evaluate results

Overall scores (on all datasets and attacks)

In [6]:
abs_gain_scores = normalized_absolute_gain(data_transformed[f'{metric_to_evaluate}_clear'], data_transformed[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(abs_gain_scores)-1, loc=np.mean(abs_gain_scores), scale=st.sem(abs_gain_scores))
print('Absolute gain: {:.3f} ({:.3f}, {:.3f})'.format(abs_gain_scores.mean(), low, high))

rel_gain_scores = normalized_relative_gain(data_transformed[f'{metric_to_evaluate}_clear'], data_transformed[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rel_gain_scores)-1, loc=np.mean(rel_gain_scores), scale=st.sem(rel_gain_scores))
print('Relative gain: {:.3f} ({:.3f}, {:.3f})'.format(rel_gain_scores.mean(), low, high))

rob_scores = robustness_score(data_transformed[f'{metric_to_evaluate}_clear'], data_transformed[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rob_scores)-1, loc=np.mean(rob_scores), scale=st.sem(rob_scores))
print('Robustness score: {:.3f} ({:.3f}, {:.3f})'.format(rob_scores.mean(), low, high))

print('Wasserstein score: {:.3f}'.format(calc_wasserstein_score(data_transformed[f'{metric_to_evaluate}_clear'], data_transformed[f'{metric_to_evaluate}_attacked'])))
print('Energy score: {:.3f}'.format(energy_distance_score(data_transformed[f'{metric_to_evaluate}_clear'], data_transformed[f'{metric_to_evaluate}_attacked'])))

Absolute gain: 0.240 (0.238, 0.243)
Relative gain: 0.181 (0.179, 0.184)
Robustness score: 1.169 (1.161, 1.176)
Wasserstein score: 0.240
Energy score: 0.324


Scores on selected dataset\attack

In [7]:
# Available attacks
data.attack.unique()

array(['amifgsm', 'cumulative-uap_COCO_amp0.2',
       'cumulative-uap_COCO_amp0.4', 'cumulative-uap_COCO_amp0.8',
       'cumulative-uap_VOC2012_amp0.2', 'cumulative-uap_VOC2012_amp0.4',
       'cumulative-uap_VOC2012_amp0.8', 'default-uap_COCO_amp0.2',
       'default-uap_COCO_amp0.4', 'default-uap_COCO_amp0.8',
       'default-uap_VOC2012_amp0.2', 'default-uap_VOC2012_amp0.4',
       'default-uap_VOC2012_amp0.8', 'generative-uap_COCO_amp0.2',
       'generative-uap_COCO_amp0.4', 'generative-uap_COCO_amp0.8',
       'generative-uap_VOC2012_amp0.2', 'generative-uap_VOC2012_amp0.4',
       'generative-uap_VOC2012_amp0.8', 'ifgsm', 'korhonen-et-al', 'madc',
       'mifgsm', 'std-fgsm'], dtype=object)

In [8]:
# Available datasets
data.dataset.unique()

array(['DERF', 'NIPS', 'VIMEO'], dtype=object)

In [9]:
dataset = 'NIPS'  # "NIPS", "DERF", "VIMEO" or "all"
attack = 'std-fgsm'  # attack name or "all"
eval_df = data_transformed.copy()
if dataset != 'all':
    eval_df = data_transformed[data_transformed.dataset == dataset]
if attack != 'all':
    eval_df = data_transformed[data_transformed.attack == attack]

abs_gain_scores = normalized_absolute_gain(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(abs_gain_scores)-1, loc=np.mean(abs_gain_scores), scale=st.sem(abs_gain_scores))
print('Absolute gain: {:.3f} ({:.3f}, {:.3f})'.format(abs_gain_scores.mean(), low, high))

rel_gain_scores = normalized_relative_gain(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rel_gain_scores)-1, loc=np.mean(rel_gain_scores), scale=st.sem(rel_gain_scores))
print('Relative gain: {:.3f} ({:.3f}, {:.3f})'.format(rel_gain_scores.mean(), low, high))

rob_scores = robustness_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rob_scores)-1, loc=np.mean(rob_scores), scale=st.sem(rob_scores))
print('Robustness score: {:.3f} ({:.3f}, {:.3f})'.format(rob_scores.mean(), low, high))

print('Wasserstein score: {:.3f}'.format(calc_wasserstein_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])))
print('Energy score: {:.3f}'.format(energy_distance_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])))

Absolute gain: 0.335 (0.329, 0.342)
Relative gain: 0.252 (0.247, 0.258)
Robustness score: 0.400 (0.387, 0.412)
Wasserstein score: 0.335
Energy score: 0.530
