# Import Libraries

In [1]:
import sys
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm

import pprint

# Put the modeling toolbox directory at the front of the module search path; the
# `metric_processor` and `evaluation` modules imported below presumably live there.
# NOTE(review): the path is relative, so this likely assumes the kernel's working
# directory is this notebook's folder — confirm.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# These imports must stay after the sys.path change above.
from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Using TensorFlow backend.


# Data preparation

In [2]:
# Columns handed to MetricProcessor as the model's input features.
features = [
    'dimension',
    'size',
    'temporal_dct-mean',
    'temporal_gaussian-mean',
    'temporal_gaussian_difference-mean',
    'temporal_gaussian_difference_threshold-mean',
]

# CSV with the computed metrics (path relative to the notebook).
path = '../../machine_learning/cloud_functions/data-large.csv'

# NOTE(review): 'UL' presumably selects the unsupervised-learning mode — confirm
# against MetricProcessor.
metric_processor = MetricProcessor(features, 'UL', path, reduced=False, bins=0)
df = metric_processor.read_and_process_data()

# Last expression of the cell: the (rows, columns) shape of the loaded frame.
df.shape

dimension
size
temporal_dct-mean
temporal_gaussian-mean
temporal_gaussian_difference-mean
temporal_gaussian_difference_threshold-mean


(201040, 9)

In [3]:
# Preview the first rows of the processed frame.
preview = df.head()
display(preview)

Unnamed: 0,dimension,size,temporal_dct-mean,temporal_gaussian-mean,temporal_gaussian_difference-mean,temporal_gaussian_difference_threshold-mean,attack_ID,title,attack
0,1080,5841311,0.0,0.0,0.0,0.0,1,/tmp/1080p/-3MYFnEaYu4.mp4,1080p
1,1080,5264133,11633.920898,1.40216,8391589.0,9367.222222,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_black_and_white
2,1080,1194152,2241.402283,0.085664,3293602.0,0.0,12,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_low_bitrate_4
3,1080,599858,3447.552338,0.30867,6842059.0,5.333333,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_low_bitrate_8
4,1920,5978909,314221.152344,424.608446,324256900.0,168671.666667,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_rotate_90_clockwise


In [4]:
# Keep only the legit assets and the tampering attacks:
# - low-bitrate renditions are dropped because this model focuses on tampering only;
# - rotations are dropped because the pre-verifier already detects them just by
#   checking the output dimensions.
# `na=False` makes a missing attack label count as "no match". Without it,
# `str.contains` yields NaN for such rows and the resulting mask cannot be safely
# negated or used as a boolean filter.
excluded_attacks = df['attack'].str.contains('low_bitrate|rotate', na=False)
df = df[~excluded_attacks]

In [5]:
# Preview the frame after filtering (first five rows, the default of `head`).
df.head(n=5)

Unnamed: 0,dimension,size,temporal_dct-mean,temporal_gaussian-mean,temporal_gaussian_difference-mean,temporal_gaussian_difference_threshold-mean,attack_ID,title,attack
0,1080,5841311,0.0,0.0,0.0,0.0,1,/tmp/1080p/-3MYFnEaYu4.mp4,1080p
1,1080,5264133,11633.920898,1.40216,8391589.0,9367.222222,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_black_and_white
5,1080,5526314,51759.428101,11.632621,37175390.0,81018.888889,10,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_vignette
6,1080,4815850,3380.053368,1.571478,3589129.0,7434.333333,11,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_watermark
7,1080,4828433,1580.337811,0.546626,1679861.0,3206.777778,11,/tmp/1080p/-3MYFnEaYu4.mp4,1080p_watermark-345x114


In [6]:
# The split returns two triples: one unpacked into the X_* variables and one into
# the df_* frames for the same train / test / attack sets.
feature_sets, frame_sets = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = feature_sets
df_train, df_test, df_attacks = frame_sets

# Report the size of each feature matrix.
shape_reports = (
    ('Shape of train: {}', X_train),
    ('Shape of test: {}', X_test),
    ('Shape of attacks: {}', X_attacks),
)
for template, matrix in shape_reports:
    print(template.format(matrix.shape))

Shape of train: (18014, 6)
Shape of test: (4504, 6)
Shape of attacks: (111778, 6)


The train and test sets are composed **only** of legitimate assets.

In [7]:
# Standardise the features: the scaler is fitted on the training set only and then
# reused, unchanged, for the test and attack sets.
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test, x_attacks = (ss.transform(matrix) for matrix in (X_test, X_attacks))

# One Class SVM

In [8]:
# Hyper-parameters of the one-class SVM, kept together so they are easy to tune.
ocsvm_params = dict(kernel='rbf', gamma='auto', nu=0.01, cache_size=5000)
OCSVM = svm.OneClassSVM(**ocsvm_params)

# Fit on the scaled training matrix; the fitted estimator is the cell's output.
OCSVM.fit(x_train)

OneClassSVM(cache_size=5000, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.01, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [9]:
# Score the fitted model on the three scaled sets with the project's evaluation helper.
evaluation_metrics = evaluation.unsupervised_evaluation(OCSVM, x_train, x_test, x_attacks)
fb, area, tnr, tpr_train, tpr_test = evaluation_metrics

In [10]:
# Print the global classification results: the rates first, then the summary scores.
rates_report = 'TNR: {}\nTPR_test: {}\nTPR_train: {}\n'.format(tnr, tpr_test, tpr_train)
score_report = 'F20: {}\nAUC: {}'.format(fb, area)
for report in (rates_report, score_report):
    print(report)

TNR: 0.8770151550394532
TPR_test: 0.9844582593250444
TPR_train: 0.9859553680470745

F20: 0.9770593274685251
AUC: 0.9307367071822488


In [11]:
# Signed distance of every sample to the decision function. A negative distance
# means that the sample is classified as an attack.
train_scores = OCSVM.decision_function(x_train)
test_scores = OCSVM.decision_function(x_test)
attack_scores = OCSVM.decision_function(x_attacks)

# Mean distance per set, in the order train / test / attacks.
mean_scores = [np.mean(scores) for scores in (train_scores, test_scores, attack_scores)]
print('Mean score values:\n-Train: {}\n-Test: {}\n-Attacks: {}'.format(*mean_scores))

Mean score values:
-Train: 0.6965852009781303
-Test: 0.686450436003615
-Attacks: -4.113957529024418


In [12]:
# Class predictions for each scaled set, in the order train / test / attacks.
train_preds, test_preds, attack_preds = (
    OCSVM.predict(matrix) for matrix in (x_train, x_test, x_attacks)
)

In [13]:
# Attach each asset's decision-function score to its metadata frame.
for frame, scores in ((df_train, train_scores), (df_test, test_scores), (df_attacks, attack_scores)):
    frame['dist_to_dec_funct'] = scores

In [14]:
# Attach each asset's predicted class to its metadata frame.
for frame, preds in ((df_train, train_preds), (df_test, test_preds), (df_attacks, attack_preds)):
    frame['prediction'] = preds

# Report

In [15]:
# Distribution of the test-set distances to the decision function, broken down by
# resolution: count, mean, standard deviation, min, quartiles and max.
test_dist_by_res = df_test.groupby('dimension')[['dist_to_dec_funct']]
display(test_dist_by_res.describe())

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
144,750.0,0.385148,0.527022,-5.478552,0.299384,0.423409,0.561527,0.964925
240,751.0,0.645752,0.409859,-5.02826,0.580821,0.671429,0.758521,1.187137
360,751.0,0.857473,0.370588,-5.395615,0.736928,0.914088,1.031171,1.466897
480,751.0,1.154538,0.36142,-5.118713,1.019623,1.233863,1.353688,1.659532
720,751.0,0.96435,0.435477,-5.440912,0.729421,1.045379,1.262629,1.60286
1080,750.0,0.110274,0.114431,-0.354699,0.009396,0.060812,0.208075,0.327893


In [16]:
# Distribution of the attack-set distances to the decision function, broken down by
# resolution: count, mean, standard deviation, min, quartiles and max.
attack_dist_by_res = df_attacks.groupby('dimension')[['dist_to_dec_funct']]
display(attack_dist_by_res.describe())

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
144,18630.0,-3.005954,2.503763,-5.507435,-5.507435,-3.581987,-0.338547,1.040526
240,18630.0,-3.48816,2.564628,-5.507435,-5.507435,-5.257299,-0.942295,1.364772
360,18629.0,-3.951768,2.480545,-5.507435,-5.507435,-5.505062,-2.534613,1.644551
480,18629.0,-4.260564,2.279305,-5.507435,-5.507435,-5.507434,-4.221907,1.815469
720,18630.0,-4.740928,1.736823,-5.507435,-5.507435,-5.507435,-5.480217,1.596314
1080,18630.0,-5.236371,0.899641,-5.507435,-5.507435,-5.507435,-5.507435,0.520427


In [17]:
# Zoom in on the distances of the attack set to the decision function by attack type.
# Percentiles, standard deviation, min and max values are shown too.
#
# Attack labels look like '<resolution>p_<attack name>' (e.g. '1080p_watermark').
# Strip the resolution prefix so the same attack is grouped across resolutions.
# Anchoring on the prefix leaves any label that does not follow the pattern untouched,
# instead of slicing it at an arbitrary position.
df_attacks['attack_'] = df_attacks['attack'].str.replace(r'^\d+p_', '', regex=True)
display(df_attacks[['dist_to_dec_funct', 'attack_']].groupby(['attack_']).describe())

Unnamed: 0_level_0,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct,dist_to_dec_funct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
attack_,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
black_and_white,22518.0,-5.308923,0.989709,-5.507435,-5.507435,-5.507435,-5.507435,1.487182
flip_vertical,810.0,-5.50743,0.000112,-5.507435,-5.507435,-5.507435,-5.507435,-5.504466
vignette,22518.0,-5.499298,0.19764,-5.507435,-5.507435,-5.507435,-5.507435,1.352202
watermark,22517.0,-3.691577,2.328045,-5.507435,-5.507435,-5.303841,-1.536398,1.75733
watermark-345x114,21708.0,-1.209834,2.386288,-5.507435,-3.298315,-0.159175,0.78173,1.815469
watermark-856x856,21707.0,-4.727649,1.408098,-5.507435,-5.507435,-5.503856,-4.638262,1.363131


In [18]:
# Distinct resolutions present in the attack set, in ascending order.
resolutions = list(df_attacks['dimension'].unique())
resolutions.sort()

In [19]:
# Shared pretty-printer for the result dictionaries below (standard indent and width).
pp = pprint.PrettyPrinter(indent=1, width=80)

In [20]:
# Accuracy of the test set by resolution: the share of test assets predicted as
# inliers (prediction == 1).
# The rates are grouped on the test frame's own resolutions, so a resolution with
# no test rows cannot cause a division by zero.
is_accepted = df_test['prediction'].eq(1)
acceptance_by_res = is_accepted.groupby(df_test['dimension']).mean()
# Plain floats keep the printed dictionary readable.
results = {res: float(rate) for res, rate in acceptance_by_res.items()}
pp.pprint(results)

{144: 0.968,
 240: 0.9906790945406125,
 360: 0.9893475366178429,
 480: 0.9946737683089214,
 720: 0.9946737683089214,
 1080: 0.9693333333333334}


In [21]:
# Accuracy on the attack set by resolution: the share of attacks flagged as
# outliers (prediction == -1).
# A single vectorized groupby replaces the per-resolution filter and the Python-level
# `sum` over the boolean Series.
is_flagged = df_attacks['prediction'].eq(-1)
detection_by_res = is_flagged.groupby(df_attacks['dimension']).mean()
# Plain floats keep the printed dictionary readable.
results = {res: float(rate) for res, rate in detection_by_res.items()}
pp.pprint(results)

{144: 0.7896403650026839,
 240: 0.8008588298443371,
 360: 0.8459391271673198,
 480: 0.8863599763809115,
 720: 0.9447128287707998,
 1080: 0.9945786366076221}


In [22]:
# Distinct attack types (resolution prefix already stripped), in order of appearance.
attack_labels = df_attacks['attack_']
attacks = attack_labels.unique()

In [23]:
# Accuracy on the attack set by attack type: the share of attacks flagged as
# outliers (prediction == -1).
# A single vectorized groupby replaces the per-attack filter and the Python-level
# `sum` over the boolean Series.
is_flagged = df_attacks['prediction'].eq(-1)
detection_by_attack = is_flagged.groupby(df_attacks['attack_']).mean()
# Plain floats keep the printed dictionary readable.
results = {attk: float(rate) for attk, rate in detection_by_attack.items()}
pp.pprint(results)

{'black_and_white': 0.9810373923083755,
 'flip_vertical': 1.0,
 'vignette': 0.999333866240341,
 'watermark': 0.8813341031220855,
 'watermark-345x114': 0.5208678828081813,
 'watermark-856x856': 0.9893122034366794}
