# Import Libraries

In [1]:
import sys
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm

import pprint

# Put the project's modeling toolbox first on the import path (relative path, so it
# depends on the directory the kernel was started from)
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# Project-local modules; presumably resolved through the sys.path entry added above
from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inline in the notebook output
%matplotlib inline

Using TensorFlow backend.


# Data preparation

In [2]:
# Location of the metrics dataset (relative path)
path = '../../machine_learning/cloud_functions/data-large.csv'

# Columns handed to MetricProcessor as the model features
features = [
    'dimension',
    'size',
    'temporal_dct-mean',
    'temporal_gaussian_mse-mean',
    'temporal_gaussian_difference-mean',
    'temporal_threshold_gaussian_difference-mean',
]

# NOTE(review): 'UL' presumably selects the unsupervised-learning mode of MetricProcessor — confirm
metric_processor = MetricProcessor(features, 'UL', path, reduced=False, bins=0)
df = metric_processor.read_and_process_data()

# Rows and columns of the processed frame
df.shape

dimension
size
temporal_dct-mean
temporal_gaussian_mse-mean
temporal_gaussian_difference-mean
temporal_threshold_gaussian_difference-mean


(201040, 9)

In [3]:
# Preview the first rows of the processed frame
display(df.head())

Unnamed: 0,dimension,size,temporal_dct-mean,temporal_gaussian_mse-mean,temporal_gaussian_difference-mean,temporal_threshold_gaussian_difference-mean,attack_ID,title,attack
0,1080,5841311,0.0,0.0,0.0,0.0,1,/tmp/1080p/-3MYFnEaYu4.mp4,1080p
1,1080,5264133,11633.920898,1.40216,8391589.0,9367.222222,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_black_and_white
2,1080,1194152,2241.402283,0.085664,3293602.0,0.0,12,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_low_bitrate_4
3,1080,599858,3447.552338,0.30867,6842059.0,5.333333,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_low_bitrate_8
4,1920,5978909,314221.152344,424.608446,324256900.0,168671.666667,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_rotate_90_clockwise


In [4]:
# Only tampering attacks are of interest here, so the low-bitrate renditions are dropped.
# Rotation attacks are dropped as well: the pre-verifier already detects them just by
# checking the output dimensions.
is_low_bitrate = df['attack'].str.contains('low_bitrate')
is_rotation = df['attack'].str.contains('rotate')
df = df[~(is_low_bitrate | is_rotation)]

In [5]:
# Preview the frame after removing the low-bitrate and rotation rows
df.head()

Unnamed: 0,dimension,size,temporal_dct-mean,temporal_gaussian_mse-mean,temporal_gaussian_difference-mean,temporal_threshold_gaussian_difference-mean,attack_ID,title,attack
0,1080,5841311,0.0,0.0,0.0,0.0,1,/tmp/1080p/-3MYFnEaYu4.mp4,1080p
1,1080,5264133,11633.920898,1.40216,8391589.0,9367.222222,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_black_and_white
5,1080,5526314,51759.428101,11.632621,37175390.0,81018.888889,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_vignette
6,1080,4815850,3380.053368,1.571478,3589129.0,7434.333333,11,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_watermark
7,1080,4828433,1580.337811,0.546626,1679861.0,3206.777778,11,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_watermark-345x114


In [6]:
# The processor returns two triples: the feature matrices and the frames they were taken from
feature_splits, frame_splits = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = feature_splits
df_train, df_test, df_attacks = frame_splits

# Report the size of every split
for template, matrix in (('Shape of train: {}', X_train),
                         ('Shape of test: {}', X_test),
                         ('Shape of attacks: {}', X_attacks)):
    print(template.format(matrix.shape))

Shape of train: (18014, 6)
Shape of test: (4504, 6)
Shape of attacks: (111778, 6)


The train and test sets are composed **only** of legit assets.

In [7]:
# Standardise the features: the scaler is fitted on the training split only and the same
# transformation is then applied to the train, test and attack splits
ss = StandardScaler().fit(X_train)
x_train, x_test, x_attacks = (ss.transform(split) for split in (X_train, X_test, X_attacks))

# One Class SVM

In [8]:
# Fit the one-class SVM on the scaled training split
ocsvm_params = dict(kernel='rbf', gamma='auto', nu=0.01, cache_size=5000)
OCSVM = svm.OneClassSVM(**ocsvm_params)
OCSVM.fit(x_train)

OneClassSVM(cache_size=5000, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.01, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [9]:
# Evaluate the fitted model on the three scaled splits with the project's evaluation helper.
# NOTE(review): judging by the names and the report printed afterwards, the results are presumably
# an F-beta score, the ROC AUC, the true negative rate and the train/test true positive rates — confirm
fb, area, tnr, tpr_train, tpr_test = evaluation.unsupervised_evaluation(OCSVM, x_train, x_test, x_attacks)

In [10]:
# Global classification results: the rates first, then the aggregate scores
rates_report = 'TNR: {}\nTPR_test: {}\nTPR_train: {}\n'.format(tnr, tpr_test, tpr_train)
scores_report = 'F20: {}\nAUC: {}'.format(fb, area)
print(rates_report)
print(scores_report)

TNR: 0.8769972624308898
TPR_test: 0.9824600355239786
TPR_train: 0.984845120461863

F20: 0.9750798725556029
AUC: 0.9297286489774341


In [11]:
# Distance of every sample to the decision function. A negative distance means that the
# sample is classified as an attack.
train_scores, test_scores, attack_scores = (
    OCSVM.decision_function(split) for split in (x_train, x_test, x_attacks)
)

# Mean distance per split
mean_scores = [np.mean(scores) for scores in (train_scores, test_scores, attack_scores)]
print('Mean score values:\n-Train: {}\n-Test: {}\n-Attacks: {}'.format(*mean_scores))

Mean score values:
-Train: 0.697270521920494
-Test: 0.6871460213667405
-Attacks: -4.114003380259059


In [12]:
# Class predictions of the fitted model for each scaled split
train_preds, test_preds, attack_preds = (
    OCSVM.predict(split) for split in (x_train, x_test, x_attacks)
)

In [13]:
# Store each sample's decision-function distance next to its metadata
for frame, scores in ((df_train, train_scores),
                      (df_test, test_scores),
                      (df_attacks, attack_scores)):
    frame['dist_to_dec_funct'] = scores

In [14]:
# Store each sample's predicted class next to its metadata
for frame, preds in ((df_train, train_preds),
                     (df_test, test_preds),
                     (df_attacks, attack_preds)):
    frame['prediction'] = preds

# Report

In [15]:
# Distances of the test set to the decision function, summarised per resolution:
# count, mean, standard deviation, min, max and percentiles.
test_distances = df_test[['dist_to_dec_funct', 'dimension']]
test_distance_summary = test_distances.groupby('dimension').describe()
display(test_distance_summary)

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
144,750.0,0.384982,0.526982,-5.478606,0.299524,0.423076,0.561472,0.96478
240,751.0,0.645751,0.409889,-5.028284,0.580943,0.671511,0.758675,1.186582
360,751.0,0.858006,0.370707,-5.395669,0.737261,0.913881,1.031872,1.466536
480,751.0,1.155328,0.361671,-5.118767,1.020612,1.235215,1.354436,1.660667
720,751.0,0.96532,0.435993,-5.440966,0.730039,1.046307,1.264385,1.60472
1080,750.0,0.112322,0.115657,-0.353376,0.00945,0.063926,0.21219,0.330692


In [16]:
# Distances of the attack set to the decision function, summarised per resolution:
# count, mean, standard deviation, min, max and percentiles.
attack_distances = df_attacks[['dist_to_dec_funct', 'dimension']]
attack_distance_summary = attack_distances.groupby('dimension').describe()
display(attack_distance_summary)

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
144,18630.0,-3.006042,2.50369,-5.507489,-5.507489,-3.581898,-0.33828,1.04031
240,18630.0,-3.48822,2.564594,-5.507489,-5.507489,-5.257354,-0.942326,1.364775
360,18629.0,-3.9518,2.48056,-5.507489,-5.507489,-5.505116,-2.534756,1.644762
480,18629.0,-4.260586,2.279355,-5.507489,-5.507489,-5.507488,-4.221773,1.816107
720,18630.0,-4.74096,1.736887,-5.507489,-5.507489,-5.507489,-5.480272,1.598262
1080,18630.0,-5.236412,0.899734,-5.507489,-5.507489,-5.507489,-5.507489,0.52312


In [17]:
# Distances of the attack set to the decision function, summarised per attack type:
# count, mean, standard deviation, min, max and percentiles.
# The attack label has the form '<resolution>p_<attack type>' (e.g. '1080p_watermark'). The leading
# resolution is stripped so that the same attack is grouped across resolutions. The pattern is
# anchored on the numeric prefix, so a label that lacks it is left untouched instead of being
# truncated at an arbitrary position.
df_attacks['attack_'] = df_attacks['attack'].str.replace(r'^\d+p_?', '', regex=True)
display(df_attacks[['dist_to_dec_funct', 'attack_']].groupby(['attack_']).describe())

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
attack_,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
black_and_white,22518.0,-5.308967,0.989769,-5.507489,-5.507489,-5.507489,-5.507489,1.488898
flip_vertical,810.0,-5.507484,0.000112,-5.507489,-5.507489,-5.507489,-5.507489,-5.504521
vignette,22518.0,-5.499352,0.197645,-5.507489,-5.507489,-5.507489,-5.507489,1.352268
watermark,22517.0,-3.691618,2.328051,-5.507489,-5.507489,-5.303898,-1.53638,1.75764
watermark-345x114,21708.0,-1.209895,2.386246,-5.507489,-3.298119,-0.159237,0.781334,1.816107
watermark-856x856,21707.0,-4.727679,1.408149,-5.507489,-5.507489,-5.50391,-4.638264,1.364005


In [18]:
# Distinct resolutions present in the attack set, in ascending order
resolutions = sorted(df_attacks['dimension'].unique())

In [19]:
# Pretty-printer used to display the result dictionaries below
pp = pprint.PrettyPrinter()

In [20]:
# Accuracy on the test set by resolution: share of legit assets predicted as legit (+1)
results = {}
for res in resolutions:
    selection = df_test[df_test['dimension'] == res]
    # Vectorised count instead of iterating the Series with the builtin sum()
    count = int((selection['prediction'] == 1).sum())
    # `resolutions` comes from the attack set, so a resolution may have no test rows:
    # report NaN rather than failing with ZeroDivisionError
    results[res] = count / len(selection) if len(selection) else float('nan')
pp.pprint(results)

{144: 0.968,
 240: 0.9906790945406125,
 360: 0.9893475366178429,
 480: 0.9946737683089214,
 720: 0.9946737683089214,
 1080: 0.9573333333333334}


In [21]:
# Accuracy on the attack set by resolution: share of attacks predicted as attacks (-1)
results = {}
for res in resolutions:
    selection = df_attacks[df_attacks['dimension'] == res]
    # Vectorised count instead of iterating the Series with the builtin sum()
    count = int((selection['prediction'] == -1).sum())
    # Guard against an empty selection so the cell never fails with ZeroDivisionError
    results[res] = count / len(selection) if len(selection) else float('nan')
pp.pprint(results)

{144: 0.7895866881374127,
 240: 0.8008588298443371,
 360: 0.8459391271673198,
 480: 0.8863062966342798,
 720: 0.9447128287707998,
 1080: 0.9945786366076221}


In [22]:
# Distinct values of the 'attack_' column in the attack set
attacks = df_attacks['attack_'].unique()

In [23]:
# Accuracy on the attack set by attack type: share of attacks predicted as attacks (-1)
results = {}
for attk in attacks:
    selection = df_attacks[df_attacks['attack_'] == attk]
    # Vectorised count instead of iterating the Series with the builtin sum()
    count = int((selection['prediction'] == -1).sum())
    # Guard against an empty selection so the cell never fails with ZeroDivisionError
    results[attk] = count / len(selection) if len(selection) else float('nan')
pp.pprint(results)

{'black_and_white': 0.9810373923083755,
 'flip_vertical': 1.0,
 'vignette': 0.999333866240341,
 'watermark': 0.8812896922325354,
 'watermark-345x114': 0.5208218168417174,
 'watermark-856x856': 0.9893122034366794}


In [24]:

# Accuracy on the attack set by resolution and attack type: for every (resolution, attack)
# pair, the share of attacks predicted as attacks (-1)
results = {}
for res in resolutions:
    results[res] = {}
    for attk in attacks:
        selection = df_attacks[(df_attacks['attack_'] == attk) & (df_attacks['dimension'] == res)]
        # Vectorised count instead of iterating the Series with the builtin sum()
        count = int((selection['prediction'] == -1).sum())
        # Not every attack necessarily exists at every resolution: report NaN for an empty
        # combination rather than failing with ZeroDivisionError
        results[res][attk] = count / len(selection) if len(selection) else float('nan')
pp.pprint(results)

{144: {'black_and_white': 0.9661604050093259,
       'flip_vertical': 1.0,
       'vignette': 0.9992006394884093,
       'watermark': 0.7503330668798295,
       'watermark-345x114': 0.2269209508015478,
       'watermark-856x856': 0.9845218352681039},
 240: {'black_and_white': 0.9757527311484147,
       'flip_vertical': 1.0,
       'vignette': 0.9992006394884093,
       'watermark': 0.7679189981348254,
       'watermark-345x114': 0.2517965726920951,
       'watermark-856x856': 0.9894969596462134},
 360: {'black_and_white': 0.980548894217959,
       'flip_vertical': 1.0,
       'vignette': 0.9992006394884093,
       'watermark': 0.8723687716493472,
       'watermark-345x114': 0.37092316196793806,
       'watermark-856x856': 0.989217583632845},
 480: {'black_and_white': 0.9845456967759126,
       'flip_vertical': 1.0,
       'vignette': 0.9992006394884093,
       'watermark': 0.9224413646055437,
       'watermark-345x114': 0.5226644555002764,
       'watermark-856x856': 0.9892205638474295