In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import random_projection
from sklearn import svm
import xgboost as xgb
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import time
import pickle
import json
import sys

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

Using TensorFlow backend.


# Metamodel

In [2]:
# Columns given to the unsupervised (one-class) model.
UL_features = [
    'temporal_canny-euclidean',
    'temporal_cross_correlation-euclidean',
    'temporal_difference-euclidean',
    'temporal_histogram_distance-euclidean',
    'temporal_dct-euclidean',
    'size',
    'dimension',
    'fps',
    'temporal_dct-std',
    'temporal_dct-manhattan',
    'temporal_gaussian-euclidean',
]

# Columns given to the supervised model: three asset-level columns followed
# by every (temporal metric, aggregate) combination, grouped by metric.
SL_features = ['dimension', 'fps', 'size'] + [
    'temporal_{}-{}'.format(metric, aggregate)
    for metric in ('canny', 'cross_correlation', 'dct', 'difference',
                   'gaussian', 'histogram_distance')
    for aggregate in ('euclidean', 'manhattan', 'max', 'mean', 'std')
]

# CSV holding the pre-computed features, relative to this notebook.
path = '../../machine_learning/cloud_functions/data-large.csv'

In [3]:
# Read the feature table from `path`; `data` keeps the parsed result and `df`
# is the DataFrame object the rest of the notebook works on.
data = pd.read_csv(filepath_or_buffer=path)
df = pd.DataFrame(data=data)
# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,attack,dimension,fps,kind,path,size,temporal_canny-euclidean,temporal_canny-manhattan,temporal_canny-max,...,temporal_gaussian-mean,temporal_gaussian-series,temporal_gaussian-std,temporal_histogram_distance-euclidean,temporal_histogram_distance-manhattan,temporal_histogram_distance-max,temporal_histogram_distance-mean,temporal_histogram_distance-series,temporal_histogram_distance-std,title
0,0,1080p_black_and_white,1080,29,asset_input,/tmp/1080p_black_and_white/-3MYFnEaYu4.mp4,5264133,7.16759,0.013203,0.130887,...,1.514054,[1.86029 1.87435 1.89874 1.86723 1.88355 1.911...,0.285477,3.340637,0.028868,0.38537,0.309919,[0.29224 0.29607 0.29938 0.29908 0.3001 0.310...,0.031505,/tmp/-3MYFnEaYu4.mp4
1,1,1080p_low_bitrate_4,1080,29,asset_input,/tmp/1080p_low_bitrate_4/-3MYFnEaYu4.mp4,1194152,7.732898,0.012222,0.064453,...,0.030526,[0.01059 0.01044 0.01212 0.00831 0.00823 0.009...,0.018843,0.366303,0.394302,0.062149,0.030749,[0.01043 0.0098 0.01126 0.01094 0.00974 0.009...,0.014875,/tmp/-3MYFnEaYu4.mp4
2,2,1080p_rotate_90_clockwise,1080,29,asset_input,/tmp/1080p_rotate_90_clockwise/-3MYFnEaYu4.mp4,5978909,5.940646,0.016015,0.274441,...,208.458241,[218.2681 213.59853 216.40738 216.20153 217.3...,20.621722,0.046587,1.976843,0.006103,0.004297,[0.00477 0.00423 0.0037 0.00391 0.00474 0.004...,0.000639,/tmp/-3MYFnEaYu4.mp4
3,3,1080p_vignette,1080,29,asset_input,/tmp/1080p_vignette/-3MYFnEaYu4.mp4,5526314,7.290457,0.012975,0.117133,...,11.046699,[10.59078 10.40433 10.5296 10.54755 10.62077 ...,0.847564,3.906239,0.024558,0.466188,0.362786,[0.33418 0.32359 0.33261 0.33229 0.33937 0.349...,0.032724,/tmp/-3MYFnEaYu4.mp4
4,4,1080p_watermark,1080,29,asset_input,/tmp/1080p_watermark/-3MYFnEaYu4.mp4,4815850,7.570566,0.012493,0.10989,...,0.805858,[0.82355 0.81863 0.81722 0.81286 0.81449 0.819...,0.030146,0.236711,0.651023,0.024057,0.022053,[0.02078 0.02065 0.0212 0.02081 0.0213 0.020...,0.000959,/tmp/-3MYFnEaYu4.mp4


In [4]:
# Label every row by the kind of rendition it describes:
#   1 -> the attack name is just a resolution (legit asset),
#   2 -> the attack name mentions a bitrate change or a watermark,
#   0 -> any other attack.
# The three masks are computed column-wise instead of looping over rows with
# `iterrows`, which is slow on a frame of this size. Rows whose attack name is
# missing fall through to 0 (`na=False`) instead of raising a TypeError.
attack_names = df['attack']
is_plain_rendition = attack_names.isin(['1080p', '720p', '480p', '360p', '240p', '144p'])
is_bitrate_or_watermark = (
    attack_names.str.contains('bitrate', regex=False, na=False)
    | attack_names.str.contains('watermark', regex=False, na=False)
)

# np.select picks the first condition that holds, so the resolution check
# takes priority over the substring check and everything else gets 0.
attack_IDs = np.select(
    [is_plain_rendition, is_bitrate_or_watermark], [1, 2], default=0
).tolist()

df['attack_ID'] = attack_IDs


In [5]:
# Preview the first rows again; the frame now ends with the attack_ID column.
df.head()

Unnamed: 0.1,Unnamed: 0,attack,dimension,fps,kind,path,size,temporal_canny-euclidean,temporal_canny-manhattan,temporal_canny-max,...,temporal_gaussian-series,temporal_gaussian-std,temporal_histogram_distance-euclidean,temporal_histogram_distance-manhattan,temporal_histogram_distance-max,temporal_histogram_distance-mean,temporal_histogram_distance-series,temporal_histogram_distance-std,title,attack_ID
0,0,1080p_black_and_white,1080,29,asset_input,/tmp/1080p_black_and_white/-3MYFnEaYu4.mp4,5264133,7.16759,0.013203,0.130887,...,[1.86029 1.87435 1.89874 1.86723 1.88355 1.911...,0.285477,3.340637,0.028868,0.38537,0.309919,[0.29224 0.29607 0.29938 0.29908 0.3001 0.310...,0.031505,/tmp/-3MYFnEaYu4.mp4,0
1,1,1080p_low_bitrate_4,1080,29,asset_input,/tmp/1080p_low_bitrate_4/-3MYFnEaYu4.mp4,1194152,7.732898,0.012222,0.064453,...,[0.01059 0.01044 0.01212 0.00831 0.00823 0.009...,0.018843,0.366303,0.394302,0.062149,0.030749,[0.01043 0.0098 0.01126 0.01094 0.00974 0.009...,0.014875,/tmp/-3MYFnEaYu4.mp4,2
2,2,1080p_rotate_90_clockwise,1080,29,asset_input,/tmp/1080p_rotate_90_clockwise/-3MYFnEaYu4.mp4,5978909,5.940646,0.016015,0.274441,...,[218.2681 213.59853 216.40738 216.20153 217.3...,20.621722,0.046587,1.976843,0.006103,0.004297,[0.00477 0.00423 0.0037 0.00391 0.00474 0.004...,0.000639,/tmp/-3MYFnEaYu4.mp4,0
3,3,1080p_vignette,1080,29,asset_input,/tmp/1080p_vignette/-3MYFnEaYu4.mp4,5526314,7.290457,0.012975,0.117133,...,[10.59078 10.40433 10.5296 10.54755 10.62077 ...,0.847564,3.906239,0.024558,0.466188,0.362786,[0.33418 0.32359 0.33261 0.33229 0.33937 0.349...,0.032724,/tmp/-3MYFnEaYu4.mp4,0
4,4,1080p_watermark,1080,29,asset_input,/tmp/1080p_watermark/-3MYFnEaYu4.mp4,4815850,7.570566,0.012493,0.10989,...,[0.82355 0.81863 0.81722 0.81286 0.81449 0.819...,0.030146,0.236711,0.651023,0.024057,0.022053,[0.02078 0.02065 0.0212 0.02081 0.0213 0.020...,0.000959,/tmp/-3MYFnEaYu4.mp4,2


In [6]:
# Remove the columns that are not needed downstream, then discard every row
# that still has a missing value. The shape is printed before and after the
# row filter so the number of dropped rows is visible.
# NOTE: `df` is rebound here, so running this cell a second time raises a
# KeyError because the three columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'path', 'kind'])
print(df.shape)
df = df.dropna(axis=0, how='any')
print(df.shape)

(138069, 42)
(128164, 42)


In [7]:
# Share of the rows (counted from the top of df) used as the training pool;
# the remaining rows form the held-out pool.
train_prop = 0.8

In [8]:
# Positional split: the first `train_prop` share of the rows is the training
# pool, the rest is held out. Nothing is shuffled at this stage.
num_train = int(len(df) * train_prop)

df_train_all = df.iloc[:num_train]
df_test_all = df.iloc[num_train:]

# All attacks except watermarks and low bitrates (attack_ID 0), taken from
# the whole frame rather than from one of the two pools.
df_attacks = df.loc[df['attack_ID'] == 0]

In [9]:
# Split both pools by label. attack_ID 2 = watermarks and low bitrates,
# attack_ID 1 = legit assets; rows with attack_ID 0 are left out here.
train_ids = df_train_all['attack_ID']
test_ids = df_test_all['attack_ID']

df_train_1 = df_train_all.loc[train_ids.eq(2)]  # watermarks and low bitrates
df_train_0 = df_train_all.loc[train_ids.eq(1)]  # legit assets

df_test_1 = df_test_all.loc[test_ids.eq(2)]  # watermarks and low bitrates
df_test_0 = df_test_all.loc[test_ids.eq(1)]  # legit assets

In [10]:
# Shuffle the rows of each class subset. A fixed seed makes every shuffle
# repeat exactly when the cell is run again on the same input; 7 is the seed
# this notebook already uses for its random projections.
df_train_1 = df_train_1.sample(frac=1, random_state=7)
df_train_0 = df_train_0.sample(frac=1, random_state=7)

df_test_1 = df_test_1.sample(frac=1, random_state=7)
df_test_0 = df_test_0.sample(frac=1, random_state=7)

In [11]:
# Build class-balanced train and test frames: draw as many watermark /
# low-bitrate rows as there are legit rows, stack the two groups, then shuffle.
# - `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
# - `random_state` makes the draw and the shuffle repeatable for a given input.
# - `sample(n=...)` raises ValueError if the *_1 frame has fewer rows than the
#   matching *_0 frame.
df_sample_train = df_train_1.sample(n=df_train_0.shape[0], random_state=7)
df_train = pd.concat([df_train_0, df_sample_train])
df_train = df_train.sample(frac=1, random_state=7)

df_sample_test = df_test_1.sample(n=df_test_0.shape[0], random_state=7)
df_test = pd.concat([df_test_0, df_sample_test])
df_test = df_test.sample(frac=1, random_state=7)


In [12]:
# Supervised inputs and labels for the balanced train / test frames.
X_train, Y_train = df_train[SL_features], df_train['attack_ID']
X_test, Y_test = df_test[SL_features], df_test['attack_ID']

# Same columns and label for the attack-only frame.
X_attacks, Y_attacks = df_attacks[SL_features], df_attacks['attack_ID']

# Unsupervised inputs: keep only rows whose attack name mentions neither
# 'bitrate' nor 'watermark', restricted to the UL feature columns.
train_is_tampered = (df_train['attack'].str.contains('bitrate')
                     | df_train['attack'].str.contains('watermark'))
UL_x_train = df_train.loc[~train_is_tampered, UL_features]

test_is_tampered = (df_test['attack'].str.contains('bitrate')
                    | df_test['attack'].str.contains('watermark'))
UL_x_test = df_test.loc[~test_is_tampered, UL_features]

UL_x_attacks = df_attacks[UL_features]


In [13]:
# Print the shape of every feature frame on one line, supervised ones first.
print(*[frame.shape for frame in (X_train, X_test, X_attacks,
                                  UL_x_train, UL_x_test, UL_x_attacks)])

(29100, 33) (7276, 33) (66259, 33) (14550, 11) (3638, 11) (66259, 11)


In [14]:
# Convert every frame / label series into a plain NumPy array.
# Supervised features and labels:
x_train, x_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(Y_train), np.array(Y_test)

# Attack-only features and labels:
x_attacks, y_attacks = np.array(X_attacks), np.array(Y_attacks)

# Unsupervised features:
ul_train, ul_test = np.array(UL_x_train), np.array(UL_x_test)
ul_attacks = np.array(UL_x_attacks)

In [15]:
# Supervised branch: min-max scaling fitted on the training array only, then
# applied unchanged to the test and attack arrays.
mM = MinMaxScaler()
sl_x_train = mM.fit_transform(x_train)
sl_x_test, sl_x_attacks = mM.transform(x_test), mM.transform(x_attacks)

# Unsupervised branch: standardisation fitted on the training rows only.
# NOTE(review): this scaler is given the UL_x_* DataFrames, not the ul_*
# arrays created in the previous cell — confirm that is intended.
ss = StandardScaler()
ul_x_train = ss.fit_transform(UL_x_train)
ul_x_test, ul_x_attacks = ss.transform(UL_x_test), ss.transform(UL_x_attacks)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [16]:
# Print the shapes of the scaled supervised training matrix and its label
# vector so that a row-count mismatch would be visible.
print(sl_x_train.shape, y_train.shape)

(29100, 33) (29100,)


In [17]:
# Supervised branch: sparse random projection to 33 components, fitted on the
# training matrix and reused for the test and attack matrices.
# NOTE: each sl_x_* / ul_x_* name is overwritten with its projected version,
# so running this cell twice projects the data twice.
sl_rp = random_projection.SparseRandomProjection(random_state=7, n_components=33)
sl_x_train = sl_rp.fit_transform(sl_x_train)
sl_x_test, sl_x_attacks = sl_rp.transform(sl_x_test), sl_rp.transform(sl_x_attacks)

# Unsupervised branch: same procedure with 10 components.
ul_rp = random_projection.SparseRandomProjection(random_state=7, n_components=10)
ul_x_train = ul_rp.fit_transform(ul_x_train)
ul_x_test, ul_x_attacks = ul_rp.transform(ul_x_test), ul_rp.transform(ul_x_attacks)

In [18]:
# Gradient-boosted classifier with default settings except the tree depth.
grid = {'max_depth': 10}
xgb_classifier = xgb.XGBClassifier(**grid)

# Targets are 2 - attack_ID: legit rows (attack_ID 1) become 1 and
# watermark / low-bitrate rows (attack_ID 2) become 0.
xgb_classifier.fit(sl_x_train, 2 - y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [19]:
# Score the supervised model on the balanced test set, using the same
# 2 - attack_ID target encoding that was used for training.
sl_scores = evaluation.supervised_evaluation(xgb_classifier, sl_x_test, 2 - y_test)
fb, area, tnr, tpr = sl_scores

In [20]:
# Display the four values returned by evaluation.supervised_evaluation.
# NOTE(review): presumably F-beta, area under the curve, true-negative rate
# and true-positive rate — confirm against the evaluation module.
fb, area, tnr, tpr

(0.9650840937506855,
 0.9637163276525563,
 0.9623419461242441,
 0.9650907091808686)

In [21]:
# One-class SVM fitted on the unsupervised training matrix only.
ocsvm_params = dict(kernel='rbf', gamma='auto', nu=0.01, cache_size=7000)
ocsvm_classifier = svm.OneClassSVM(**ocsvm_params)
ocsvm_classifier.fit(ul_x_train)

OneClassSVM(cache_size=7000, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.01, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [22]:
# Score the one-class model on its train, test and attack matrices.
# NOTE: fb, area and tnr are rebound here and no longer hold the supervised
# model's values.
ul_scores = evaluation.unsupervised_evaluation(
    ocsvm_classifier, ul_x_train, ul_x_test, ul_x_attacks)
fb, area, tnr, tpr_train, tpr_test = ul_scores

In [23]:
# Display the five values returned by evaluation.unsupervised_evaluation.
# NOTE(review): presumably F-beta, area under the curve, true-negative rate
# and the train / test true-positive rates — confirm against the evaluation module.
fb, area, tnr, tpr_train, tpr_test

(0.9835690433157346,
 0.9861275020001847,
 0.9881978297287916,
 0.9900343642611684,
 0.9840571742715778)

In [24]:
# Preview the held-out pool (every attack_ID, before any class balancing).
df_test_all.head()

Unnamed: 0,attack,dimension,fps,size,temporal_canny-euclidean,temporal_canny-manhattan,temporal_canny-max,temporal_canny-mean,temporal_canny-series,temporal_canny-std,...,temporal_gaussian-series,temporal_gaussian-std,temporal_histogram_distance-euclidean,temporal_histogram_distance-manhattan,temporal_histogram_distance-max,temporal_histogram_distance-mean,temporal_histogram_distance-series,temporal_histogram_distance-std,title,attack_ID
110297,240p_black_and_white,240,29,316036,6.759143,0.014136,0.254123,0.159917,[0.18772 0.13859 0.20858 0.12728 0.13745 0.145...,0.042924,...,[13.46098 13.3865 13.4334 13.37493 13.29295 ...,6.639035,20.995767,0.004463,2.076829,1.957038,[1.94013 1.9393 1.9631 1.95279 1.9565 1.960...,0.056915,/tmp/nZo2aqM1WLA.mp4,0
110298,240p_low_bitrate_4,240,29,75433,6.688808,0.014293,0.249566,0.166703,[0.18991 0.13837 0.20888 0.12674 0.13618 0.143...,0.043495,...,[0.40349 0.39298 0.40838 0.45251 0.47301 0.488...,0.077633,2.866402,0.033861,0.322376,0.265502,[0.29704 0.29349 0.29764 0.29988 0.29999 0.299...,0.030892,/tmp/nZo2aqM1WLA.mp4,2
110299,240p_rotate_90_clockwise,240,29,355590,6.190281,0.015538,0.310818,0.215431,[0.25727 0.20535 0.2755 0.21102 0.20543 0.229...,0.050422,...,[71.91643 71.47779 71.19989 70.92352 71.09838 ...,2.92017,1.508558,0.066075,0.160927,0.140298,[0.13777 0.14152 0.14164 0.14358 0.14031 0.139...,0.01028,/tmp/nZo2aqM1WLA.mp4,0
110300,240p_vignette,240,29,313250,6.780251,0.01409,0.248589,0.15793,[0.18435 0.13384 0.20283 0.1231 0.13729 0.144...,0.042682,...,[18.42755 18.24399 18.20767 18.08262 18.12104 ...,0.744284,6.359597,0.014901,0.639582,0.592273,[0.63692 0.63164 0.63958 0.62809 0.62193 0.623...,0.030061,/tmp/nZo2aqM1WLA.mp4,0
110301,240p_watermark,240,29,288171,6.555434,0.014609,0.267524,0.179849,[0.2034 0.15245 0.22217 0.15945 0.14993 0.174...,0.045044,...,[0.6658 0.65717 0.65073 0.66724 0.66387 0.680...,0.158152,2.365951,0.041118,0.247919,0.220175,[0.23004 0.23161 0.22889 0.22861 0.21624 0.223...,0.0141,/tmp/nZo2aqM1WLA.mp4,2


In [25]:
# Split the whole held-out pool into legit rows (attack_ID 1, suffix _1) and
# everything else (suffix _0), once per feature set.
is_legit_test = df_test_all['attack_ID'] == 1

test_UL_0 = df_test_all.loc[~is_legit_test, UL_features]
test_UL_1 = df_test_all.loc[is_legit_test, UL_features]

test_SL_0 = df_test_all.loc[~is_legit_test, SL_features]
test_SL_1 = df_test_all.loc[is_legit_test, SL_features]

In [26]:
# Apply the already-fitted scalers to the held-out subsets.
# NOTE: each name is overwritten with its scaled version, so running this
# cell twice scales the data twice.
test_SL_0, test_SL_1 = mM.transform(test_SL_0), mM.transform(test_SL_1)

test_UL_0, test_UL_1 = ss.transform(test_UL_0), ss.transform(test_UL_1)

  after removing the cwd from sys.path.
  """


In [27]:
# Apply the already-fitted random projections to the scaled held-out subsets.
# NOTE: each name is overwritten with its projected version, so this cell is
# not safe to run twice.
test_SL_0, test_SL_1 = sl_rp.transform(test_SL_0), sl_rp.transform(test_SL_1)

test_UL_0, test_UL_1 = ul_rp.transform(test_UL_0), ul_rp.transform(test_UL_1)

In [28]:
# Predictions of the supervised model on the non-legit (_0) and legit (_1)
# held-out subsets.
pred_SL_0, pred_SL_1 = (xgb_classifier.predict(test_SL_0),
                        xgb_classifier.predict(test_SL_1))

# Predictions of the one-class model on the same two subsets.
pred_UL_0, pred_UL_1 = (ocsvm_classifier.predict(test_UL_0),
                        ocsvm_classifier.predict(test_UL_1))

In [29]:
# Supervised model on the full held-out pool:
#   TNR = share of non-legit rows predicted 0,
#   TPR = share of legit rows predicted 1.
sl_true_negatives = sum(pred_SL_0 == 0)
sl_true_positives = sum(pred_SL_1 == 1)
tnr_sl = sl_true_negatives / pred_SL_0.shape[0]
tpr_sl = sl_true_positives / pred_SL_1.shape[0]
print('The SL TNR is {} and the TPR is {}'.format(tnr_sl, tpr_sl))

The SL TNR is 0.9824505569447601 and the TPR is 0.9650907091808686


In [30]:
# One-class model on the full held-out pool. It marks rejected rows with -1,
# so:
#   TNR = share of non-legit rows predicted -1,
#   TPR = share of legit rows predicted 1.
ul_true_negatives = sum(pred_UL_0 == -1)
ul_true_positives = sum(pred_UL_1 == 1)
tnr_ul = ul_true_negatives / pred_UL_0.shape[0]
tpr_ul = ul_true_positives / pred_UL_1.shape[0]
print('The UL TNR is {} and the TPR is {}'.format(tnr_ul, tpr_ul))

The UL TNR is 0.7778131393498522 and the TPR is 0.9840571742715778


In [31]:
# Re-encode the one-class predictions from {-1, 1} to {0.0, 1.0} so they use
# the same convention as the supervised predictions.
# A threshold is used instead of (x + 1) / 2 because it gives the same result
# on {-1, 1} and leaves an already converted {0, 1} array unchanged, so
# running this cell again does not corrupt the predictions.
pred_UL_0 = (pred_UL_0 > 0).astype(float)
pred_UL_1 = (pred_UL_1 > 0).astype(float)

In [32]:
# OR metamodel: a row is predicted 1 when at least one of the two models
# outputs a non-zero value for it.
pred_metamodel_0 = (pred_UL_0 != 0) | (pred_SL_0 != 0)
pred_metamodel_1 = (pred_UL_1 != 0) | (pred_SL_1 != 0)


In [33]:
# TNR / TPR of the OR metamodel on the full held-out pool.
# The results get their own names: storing them in tnr_ul / tpr_ul would
# overwrite the one-class model's rates computed earlier in the notebook.
tnr_meta_or = sum(pred_metamodel_0 == 0)/pred_metamodel_0.shape[0]
tpr_meta_or = sum(pred_metamodel_1 == 1)/pred_metamodel_1.shape[0]
print('The metamodel TNR is {} and the TPR is {}'.format(tnr_meta_or, tpr_meta_or))

The metamodel TNR is 0.7759490793362128 and the TPR is 0.997526113249038


In [34]:
# AND metamodel: a row is predicted 1 only when both models output a non-zero
# value for it. This rebinds pred_metamodel_0 / pred_metamodel_1.
pred_metamodel_0 = (pred_UL_0 != 0) & (pred_SL_0 != 0)
pred_metamodel_1 = (pred_UL_1 != 0) & (pred_SL_1 != 0)


In [35]:
# TNR / TPR of the AND metamodel on the full held-out pool.
# The results get their own names: storing them in tnr_ul / tpr_ul would
# overwrite the one-class model's rates computed earlier in the notebook.
tnr_meta_and = sum(pred_metamodel_0 == 0)/pred_metamodel_0.shape[0]
tpr_meta_and = sum(pred_metamodel_1 == 1)/pred_metamodel_1.shape[0]
print('The metamodel TNR is {} and the TPR is {}'.format(tnr_meta_and, tpr_meta_and))

The metamodel TNR is 0.9843146169583996 and the TPR is 0.9516217702034084
