In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
#import seaborn as sns
#from matplotlib import pyplot as plt
#from tqdm import tqdm
import scipy.stats as st
import os
from pathlib import Path 
import re

In [2]:
from NOT import transform_full_df, load_models
from score_methods import *

In [5]:
# get attack name from filename
def get_atk(filename, metric):
    """Return the part of ``filename`` that precedes ``_<metric>``.

    ``metric`` is matched literally: it is escaped before being placed in the
    regular expression, so characters such as ``.``, ``+`` or ``(`` in a
    metric name are not interpreted as regex syntax.

    Args:
        filename: File name (or stem) of the form ``<attack>_<metric>...``.
        metric: Metric name to look for.

    Returns:
        The text before the last ``_<metric>`` occurrence (the leading group
        is greedy), or ``None`` when ``_<metric>`` is absent or nothing
        precedes it.
    """
    match = re.search(f"(.+)_{re.escape(metric)}", filename)
    return match.group(1) if match else None


# parse results for one metric
def parse_results(path='./res', metric='maniqa'):
    """Collect attack results for one metric from the ``*test.csv`` files in ``path``.

    A file is used when its name ends with ``test.csv`` and ``get_atk`` finds
    an attack name in its stem for ``metric``. The ``test_dataset`` column is
    renamed to ``dataset`` and the dataset label ``NIPS2017`` is replaced by
    ``NIPS``.

    Files whose stem contains ``uap`` are split into one attack per
    (amplitude, train dataset) combination, named
    ``<attack>_<train_set>_amp<amp>``; a bare ``uap`` attack is reported as
    ``default-uap``.

    Args:
        path: Directory holding the result CSV files.
        metric: Metric name; used to pick files and to prefix value columns.

    Returns:
        A DataFrame with the columns ``dataset``, ``<metric>_clear``,
        ``<metric>_attacked``, ``<metric>_ssim``, ``<metric>_psnr``,
        ``<metric>_mse`` and ``attack``. Rows keep the index they had in
        their CSV file. An empty DataFrame is returned when no file matches.
    """
    required_cols = ['clear', 'attacked', 'ssim', 'psnr', 'mse']
    uap_amps = [0.2, 0.4, 0.8]
    uap_train_dsets = ['VOC2012', 'COCO']
    renamed_cols = {x: f'{metric}_{x}' for x in required_cols}

    def _prepare(rows, attack_name):
        # Select/rename the value columns and label the rows with the attack.
        part = rows[['dataset'] + required_cols].rename(columns=renamed_cols)
        part['attack'] = attack_name
        # Assign the result back: an inplace replace on `part.dataset` is
        # chained assignment and does not update `part` under Copy-on-Write.
        part['dataset'] = part['dataset'].replace({'NIPS2017': 'NIPS'})
        return part

    parts = []
    # sorted() keeps the row order independent of the directory listing order.
    for file in sorted(Path(path).iterdir()):
        if not file.name.endswith('test.csv'):
            continue
        atk_name = get_atk(str(file.stem), metric)
        if atk_name is None:
            continue

        cur_df = pd.read_csv(file).rename(columns={'test_dataset': 'dataset'})

        if 'uap' not in str(file.stem):
            parts.append(_prepare(cur_df, atk_name))
            continue

        base_name = 'default-uap' if atk_name == 'uap' else atk_name
        for amp in uap_amps:
            for train_set in uap_train_dsets:
                # One combined mask instead of chained boolean indexing,
                # which makes pandas reindex the second mask with a warning.
                mask = (cur_df['amplitude'] == amp) & (cur_df['train_dataset'] == train_set)
                parts.append(_prepare(cur_df[mask], f'{base_name}_{train_set}_amp{str(amp)}'))

    if not parts:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop copies it every time.
    return pd.concat(parts, axis=0)


In [9]:
# Metric analysed in this notebook; it selects the result files and
# prefixes the value columns.
metric_to_evaluate = 'maniqa'

# parse demo results
data = parse_results('./res', metric_to_evaluate)

# results as of the time of writing
# available at https://calypso.gml-team.ru:5001/sharing/NFLRz05g9 (password: 'neurips_benchmark_2023')
#data = pd.read_feather('results.ft')

In [10]:
# Columns kept for evaluation: identifiers plus the values of the chosen metric.
selected_columns = [
    'dataset',
    'attack',
    f'{metric_to_evaluate}_clear',
    f'{metric_to_evaluate}_attacked',
    f'{metric_to_evaluate}_ssim',
    f'{metric_to_evaluate}_psnr',
    f'{metric_to_evaluate}_mse',
]
# The parsed frame keeps per-file indices, so renumber the rows from 0.
data_to_evaluate = data[selected_columns].reset_index(drop=True)
data_to_evaluate

Unnamed: 0,dataset,attack,maniqa_clear,maniqa_attacked,maniqa_ssim,maniqa_psnr,maniqa_mse
0,NIPS,amifgsm,0.497564,1.109380,0.810342,30.222343,0.000950
1,NIPS,amifgsm,0.531079,1.077701,0.875497,29.726215,0.001065
2,NIPS,amifgsm,0.668703,1.114359,0.855742,29.728792,0.001064
3,NIPS,amifgsm,0.568488,1.090326,0.906851,29.779349,0.001052
4,NIPS,amifgsm,0.659713,1.073918,0.747340,30.407143,0.000911
...,...,...,...,...,...,...,...
77275,VIMEO,default-uap_COCO_amp0.8,0.520685,0.149498,0.384374,24.441853,0.003596
77276,VIMEO,default-uap_COCO_amp0.8,0.411854,0.204567,0.546673,24.441853,0.003596
77277,VIMEO,default-uap_COCO_amp0.8,0.370396,0.178519,0.506352,24.441853,0.003596
77278,VIMEO,default-uap_COCO_amp0.8,0.420029,0.203137,0.391060,24.441853,0.003596


Domain transformation

In [11]:
from tqdm import tqdm

# affects VRAM usage if run on GPU
CHUNK_SIZE = 1000


def chunker(df, size):
    """Split ``df`` into consecutive row slices of at most ``size`` rows.

    Args:
        df: Object that supports ``len()`` and positional ``.iloc`` slicing.
        size: Maximum number of rows per chunk.

    Returns:
        A list of ``df.iloc`` slices in original row order; the last one may
        be shorter. An empty ``df`` gives an empty list.
    """
    chunks = []
    for start in range(0, len(df), size):
        stop = start + size
        chunks.append(df.iloc[start:stop])
    return chunks


# Run domain transform by chunks
# The chunk length comes from CHUNK_SIZE so that the constant actually
# controls how many rows are passed to the transform at once.
transformed_chunks = []
for cur_df in tqdm(chunker(data_to_evaluate, CHUNK_SIZE)):
    transformed_chunks.append(
        transform_full_df(df=cur_df, metrics=[metric_to_evaluate], path_to_models='./models', domain='mdtvsfa', dev='cuda:0')
    )
# Concatenate once at the end instead of re-copying a growing frame per chunk;
# with no chunks at all, fall back to an empty frame as before.
data_transformed = pd.concat(transformed_chunks) if transformed_chunks else pd.DataFrame()
data_transformed

100%|██████████| 78/78 [00:13<00:00,  5.99it/s]


Unnamed: 0,dataset,attack,maniqa_clear,maniqa_attacked,maniqa_ssim,maniqa_psnr,maniqa_mse
0,NIPS,amifgsm,0.592009,1.385288,0.810342,30.222343,0.000950
1,NIPS,amifgsm,0.641324,1.346277,0.875497,29.726215,0.001065
2,NIPS,amifgsm,0.838071,1.389977,0.855742,29.728792,0.001064
3,NIPS,amifgsm,0.698492,1.362370,0.906851,29.779349,0.001052
4,NIPS,amifgsm,0.829641,1.340757,0.747340,30.407143,0.000911
...,...,...,...,...,...,...,...
77275,VIMEO,default-uap_COCO_amp0.8,0.624594,0.099642,0.384374,24.441853,0.003596
77276,VIMEO,default-uap_COCO_amp0.8,0.440777,0.116113,0.546673,24.441853,0.003596
77277,VIMEO,default-uap_COCO_amp0.8,0.371981,0.110458,0.506352,24.441853,0.003596
77278,VIMEO,default-uap_COCO_amp0.8,0.456308,0.117133,0.391060,24.441853,0.003596


Evaluate results

Overall scores (on all datasets and attacks)

In [12]:
eval_df = data_transformed.copy()

# W/o domain transform:
# eval_df = data.copy()

# Metric values for the clean and the attacked inputs, looked up once.
clear_scores = eval_df[f'{metric_to_evaluate}_clear']
attacked_scores = eval_df[f'{metric_to_evaluate}_attacked']

# The first three scores are printed as mean plus a 95% Student-t interval
# built from the standard error of the mean.
abs_gain_scores = normalized_absolute_gain(clear_scores, attacked_scores)
low, high = st.t.interval(
    0.95,
    len(abs_gain_scores) - 1,
    loc=np.mean(abs_gain_scores),
    scale=st.sem(abs_gain_scores),
)
print('Absolute gain: {:.3f} ({:.3f}, {:.3f})'.format(abs_gain_scores.mean(), low, high))

rel_gain_scores = normalized_relative_gain(clear_scores, attacked_scores)
low, high = st.t.interval(
    0.95,
    len(rel_gain_scores) - 1,
    loc=np.mean(rel_gain_scores),
    scale=st.sem(rel_gain_scores),
)
print('Relative gain: {:.3f} ({:.3f}, {:.3f})'.format(rel_gain_scores.mean(), low, high))

rob_scores = robustness_score(clear_scores, attacked_scores)
low, high = st.t.interval(
    0.95,
    len(rob_scores) - 1,
    loc=np.mean(rob_scores),
    scale=st.sem(rob_scores),
)
print('Robustness score: {:.3f} ({:.3f}, {:.3f})'.format(rob_scores.mean(), low, high))

# The remaining two scores are single numbers and are printed without an interval.
wasserstein_value = calc_wasserstein_score(clear_scores, attacked_scores)
print('Wasserstein score: {:.3f}'.format(wasserstein_value))
energy_value = energy_distance_score(clear_scores, attacked_scores)
print('Energy score: {:.3f}'.format(energy_value))

Absolute gain: 0.094 (0.091, 0.097)
Relative gain: 0.073 (0.070, 0.075)
Robustness score: 0.645 (0.641, 0.650)
Wasserstein score: 0.203
Energy score: 0.238


In [13]:
# Attack names present in the parsed results
data['attack'].unique()

array(['amifgsm', 'cumulative-uap_VOC2012_amp0.2',
       'cumulative-uap_COCO_amp0.2', 'cumulative-uap_VOC2012_amp0.4',
       'cumulative-uap_COCO_amp0.4', 'cumulative-uap_VOC2012_amp0.8',
       'cumulative-uap_COCO_amp0.8', 'generative-uap_VOC2012_amp0.2',
       'generative-uap_COCO_amp0.2', 'generative-uap_VOC2012_amp0.4',
       'generative-uap_COCO_amp0.4', 'generative-uap_VOC2012_amp0.8',
       'generative-uap_COCO_amp0.8', 'ifgsm', 'korhonen-et-al', 'madc',
       'mifgsm', 'std-fgsm', 'default-uap_VOC2012_amp0.2',
       'default-uap_COCO_amp0.2', 'default-uap_VOC2012_amp0.4',
       'default-uap_COCO_amp0.4', 'default-uap_VOC2012_amp0.8',
       'default-uap_COCO_amp0.8'], dtype=object)

In [14]:
# Dataset names present in the parsed results
data['dataset'].unique()

array(['NIPS', 'DERF', 'VIMEO'], dtype=object)

Scores on a selected dataset / attack

In [20]:
dataset = 'NIPS'  # "NIPS", "DERF", "VIMEO" or "all"
attack = 'std-fgsm'  # attack name or "all" or "uap-based" or "iterative"
# Every attack whose name contains 'uap' counts as UAP-based.
uap_attacks = [ x for x in data_transformed.attack.unique() if 'uap' in x]

eval_df = data_transformed.copy()

# W/o domain transform
# eval_df = data.copy()


if dataset != 'all':
    eval_df = eval_df[eval_df.dataset == dataset]

if attack == 'uap-based':
    eval_df = eval_df[eval_df.attack.isin(uap_attacks)]
elif attack == 'iterative':
    # "iterative" selects every attack that is not UAP-based.
    eval_df = eval_df[~eval_df.attack.isin(uap_attacks)]
elif attack != 'all':
    eval_df = eval_df[eval_df.attack == attack]

# A mistyped dataset or attack name would otherwise leave an empty selection
# and the statistics below would be computed on no data.
if eval_df.empty:
    raise ValueError(
        f'No rows selected for dataset={dataset!r}, attack={attack!r}; '
        'check the names against the available datasets and attacks.'
    )

# Each of the three scores below is printed as mean plus a 95% Student-t
# interval built from the standard error of the mean.
abs_gain_scores = normalized_absolute_gain(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(abs_gain_scores)-1, loc=np.mean(abs_gain_scores), scale=st.sem(abs_gain_scores))
print('Absolute gain: {:.3f} ({:.3f}, {:.3f})'.format(abs_gain_scores.mean(), low, high))

rel_gain_scores = normalized_relative_gain(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rel_gain_scores)-1, loc=np.mean(rel_gain_scores), scale=st.sem(rel_gain_scores))
print('Relative gain: {:.3f} ({:.3f}, {:.3f})'.format(rel_gain_scores.mean(), low, high))

rob_scores = robustness_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])
low, high = st.t.interval(0.95, len(rob_scores)-1, loc=np.mean(rob_scores), scale=st.sem(rob_scores))
print('Robustness score: {:.3f} ({:.3f}, {:.3f})'.format(rob_scores.mean(), low, high))

print('Wasserstein score: {:.3f}'.format(calc_wasserstein_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])))
print('Energy score: {:.3f}'.format(energy_distance_score(eval_df[f'{metric_to_evaluate}_clear'], eval_df[f'{metric_to_evaluate}_attacked'])))

Absolute gain: 0.297 (0.288, 0.305)
Relative gain: 0.200 (0.193, 0.207)
Robustness score: 0.399 (0.380, 0.419)
Wasserstein score: 0.297
Energy score: 0.530
