In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import random_projection
from sklearn import svm
import xgboost as xgb
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import time
import pickle
import json
import sys

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

# Metamodel

In [None]:
# Columns consumed by the unsupervised (one-class SVM) branch of the notebook.
UL_features = [
    'temporal_canny-euclidean',
    'temporal_cross_correlation-euclidean',
    'temporal_difference-euclidean',
    'temporal_histogram_distance-euclidean',
    'temporal_dct-euclidean',
    'size',
    'dimension',
    'fps',
    'temporal_dct-std',
    'temporal_dct-manhattan',
    'temporal_gaussian-euclidean',
]

# Columns consumed by the supervised (XGBoost) branch: three asset-level
# columns followed by every temporal metric combined with every aggregation,
# metric-major, in the order listed below.
_SL_METRICS = ('canny', 'cross_correlation', 'dct', 'difference',
               'gaussian', 'histogram_distance')
_SL_AGGREGATIONS = ('euclidean', 'manhattan', 'max', 'mean', 'std')

SL_features = ['dimension', 'fps', 'size'] + [
    'temporal_{}-{}'.format(metric, aggregation)
    for metric in _SL_METRICS
    for aggregation in _SL_AGGREGATIONS
]

# Location of the input CSV, relative to the notebook's working directory.
path = '../../machine_learning/cloud_functions/data-large.csv'

In [None]:
# Read the CSV at `path`.
data = pd.read_csv(path)
# read_csv already returns a DataFrame; this re-wrap only binds it to `df`,
# the name every later cell works on.
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

In [None]:
# Tag every row with a coarse class derived from its `attack` string:
#   1 -> the attack is one of the plain resolution names ('1080p' ... '144p')
#   2 -> the attack name contains 'bitrate' or 'watermark'
#   0 -> anything else
# This is vectorised instead of looping with iterrows(): it avoids a
# Python-level pass over the large CSV, and rows whose `attack` is missing
# fall through to class 0 (they are removed by the later dropna()) instead of
# raising TypeError on the substring test.
rendition_names = ['1080p', '720p', '480p', '360p', '240p', '144p']
attack_names = df['attack']

is_rendition = attack_names.isin(rendition_names).to_numpy(dtype=bool)
# regex=False keeps plain substring semantics, matching the original `in` test.
mentions_bitrate = attack_names.str.contains('bitrate', regex=False, na=False)
mentions_watermark = attack_names.str.contains('watermark', regex=False, na=False)
is_bitrate_or_watermark = (mentions_bitrate | mentions_watermark).to_numpy(dtype=bool)

# np.select picks the first matching condition, mirroring the if/elif order.
attack_IDs = np.select([is_rendition, is_bitrate_or_watermark], [1, 2], default=0)

df['attack_ID'] = attack_IDs


In [None]:
# Preview the frame again now that the attack_ID column has been added.
df.head()

In [None]:
# Remove three columns that are neither features nor labels below, then
# discard every row that still has a missing value. The shape is printed
# before and after the dropna so the number of discarded rows is visible.
unused_columns = ['Unnamed: 0', 'path', 'kind']
df = df.drop(columns=unused_columns)
print(df.shape)
df = df.dropna(axis=0, how='any')
print(df.shape)

In [None]:
# Fraction of the rows (taken from the top of df) that form the training partition.
train_prop = 0.8

In [None]:
# Positional split in row order (no shuffling): the first `train_prop` share
# of the rows is the training partition, the remainder is the test partition.
num_train = int(train_prop * len(df))

# All attacks except Watermarks and Lowbitrates
df_attacks = df.loc[df['attack_ID'] == 0]
df_train_all = df.iloc[:num_train]
df_test_all = df.iloc[num_train:]

In [None]:
# Split both partitions by class label.
#   suffix _1 -> attack_ID 2 (Watermarks and Lowbitrates)
#   suffix _0 -> attack_ID 1 (Legit assets)
df_train_1 = df_train_all.loc[df_train_all['attack_ID'].eq(2)]
df_train_0 = df_train_all.loc[df_train_all['attack_ID'].eq(1)]

df_test_1 = df_test_all.loc[df_test_all['attack_ID'].eq(2)]
df_test_0 = df_test_all.loc[df_test_all['attack_ID'].eq(1)]

In [None]:
# Shuffle each class subset (sample(frac=1) returns all rows in random order).
# No random_state is passed here, so unless NumPy's global RNG is seeded
# elsewhere the resulting order changes from run to run.
df_train_1, df_train_0, df_test_1, df_test_0 = [
    subset.sample(frac=1)
    for subset in (df_train_1, df_train_0, df_test_1, df_test_0)
]

In [None]:
# Build class-balanced train and test sets: draw as many attack_ID == 2 rows
# as there are attack_ID == 1 rows, stack both groups, then shuffle.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the stacked result is the same.
# Note: sample() raises ValueError if a partition holds fewer attack_ID == 2
# rows than attack_ID == 1 rows (sampling is without replacement).
df_sample_train = df_train_1.sample(df_train_0.shape[0])
df_train = pd.concat([df_train_0, df_sample_train])
df_train = df_train.sample(frac=1)

df_sample_test = df_test_1.sample(df_test_0.shape[0])
df_test = pd.concat([df_test_0, df_sample_test])
df_test = df_test.sample(frac=1)


In [None]:
# Supervised inputs (SL_features columns) and labels (attack_ID) for the
# balanced train/test sets and for the held-out attack_ID == 0 rows.
X_train, Y_train = df_train[SL_features], df_train['attack_ID']
X_test, Y_test = df_test[SL_features], df_test['attack_ID']
X_attacks, Y_attacks = df_attacks[SL_features], df_attacks['attack_ID']


def _ul_rows_without_bitrate_or_watermark(frame):
    """Return the UL_features columns of the rows whose `attack` mentions
    neither 'bitrate' nor 'watermark'."""
    attack_names = frame['attack']
    excluded = attack_names.str.contains('bitrate') | attack_names.str.contains('watermark')
    return frame.loc[~excluded, UL_features]


# Unsupervised inputs: only rows that are not bitrate/watermark attacks.
UL_x_train = _ul_rows_without_bitrate_or_watermark(df_train)

UL_x_test = _ul_rows_without_bitrate_or_watermark(df_test)

UL_x_attacks = df_attacks[UL_features]


In [None]:
# Sanity check: (rows, columns) of every feature frame, supervised first.
feature_frames = (X_train, X_test, X_attacks, UL_x_train, UL_x_test, UL_x_attacks)
print(*(frame.shape for frame in feature_frames))

In [None]:
# Convert every frame/series to a plain NumPy array.
# Supervised features.
x_train, x_test, x_attacks = (np.array(frame) for frame in (X_train, X_test, X_attacks))

# Supervised labels.
y_train, y_test, y_attacks = (np.array(labels) for labels in (Y_train, Y_test, Y_attacks))

# Unsupervised features.
ul_train, ul_test, ul_attacks = (
    np.array(frame) for frame in (UL_x_train, UL_x_test, UL_x_attacks)
)

In [None]:
# Min-max scaling for the supervised features: fitted on the training rows
# only, then applied unchanged to the test and attack rows.
mM = MinMaxScaler().fit(x_train)
sl_x_train, sl_x_test, sl_x_attacks = (
    mM.transform(block) for block in (x_train, x_test, x_attacks)
)

# Standardisation for the unsupervised features, again fitted on the training
# rows only. The inputs here are the UL_x_* DataFrames, not the ul_* arrays.
ss = StandardScaler().fit(UL_x_train)
ul_x_train, ul_x_test, ul_x_attacks = (
    ss.transform(block) for block in (UL_x_train, UL_x_test, UL_x_attacks)
)


In [None]:
# Sanity check: the scaled supervised features and their labels should have the same number of rows.
print(sl_x_train.shape, y_train.shape)

In [None]:
# Sparse random projection of the scaled supervised features (33 output
# components, fixed seed), fitted on the training rows and reused for the rest.
sl_rp = random_projection.SparseRandomProjection(n_components=33, random_state=7)
sl_rp.fit(sl_x_train)
sl_x_train, sl_x_test, sl_x_attacks = (
    sl_rp.transform(block) for block in (sl_x_train, sl_x_test, sl_x_attacks)
)

# Same procedure for the unsupervised features, with 10 output components.
ul_rp = random_projection.SparseRandomProjection(n_components=10, random_state=7)
ul_rp.fit(ul_x_train)
ul_x_train, ul_x_test, ul_x_attacks = (
    ul_rp.transform(block) for block in (ul_x_train, ul_x_test, ul_x_attacks)
)

In [None]:
# Gradient-boosted classifier for the supervised branch.
grid = {'max_depth': 10}
xgb_classifier = xgb.XGBClassifier(**grid)

# y_train holds attack_ID values 1 and 2; `2 - y` re-encodes them so that
# 1 stays 1 and 2 becomes 0.
xgb_classifier.fit(sl_x_train, 2 - y_train)

In [None]:
# Score the supervised model on the balanced test set, using the same label
# re-encoding (attack_ID 1 -> 1, attack_ID 2 -> 0) as for training.
sl_metrics = evaluation.supervised_evaluation(xgb_classifier, sl_x_test, 2 - y_test)
fb, area, tnr, tpr = sl_metrics

In [None]:
# Display the four values returned by evaluation.supervised_evaluation.
fb, area, tnr, tpr

In [None]:
# One-class SVM for the unsupervised branch. It is fitted only on ul_x_train,
# i.e. the training rows that are not bitrate/watermark attacks.
ocsvm_params = dict(kernel='rbf', gamma='auto', nu=0.01, cache_size=7000)
ocsvm_classifier = svm.OneClassSVM(**ocsvm_params)
ocsvm_classifier.fit(ul_x_train)

In [None]:
# Score the one-class model on its training rows, its test rows and the
# attack_ID == 0 rows.
ul_metrics = evaluation.unsupervised_evaluation(
    ocsvm_classifier, ul_x_train, ul_x_test, ul_x_attacks
)
fb, area, tnr, tpr_train, tpr_test = ul_metrics

In [None]:
# Display the five values returned by evaluation.unsupervised_evaluation.
fb, area, tnr, tpr_train, tpr_test

In [None]:
# Preview the full (unbalanced) test partition used by the cells below.
df_test_all.head()

In [None]:
# Split the whole test partition into legit rows (attack_ID == 1, suffix _1)
# and everything else (suffix _0), once per feature set.
legit_rows = df_test_all['attack_ID'] == 1

test_UL_0 = df_test_all.loc[~legit_rows, UL_features]
test_UL_1 = df_test_all.loc[legit_rows, UL_features]

test_SL_0 = df_test_all.loc[~legit_rows, SL_features]
test_SL_1 = df_test_all.loc[legit_rows, SL_features]

In [None]:
# Apply the already-fitted scalers to the hold-out subsets. The names are
# rebound in place, so this cell should be run once per kernel session.
test_SL_0, test_SL_1 = (mM.transform(part) for part in (test_SL_0, test_SL_1))

test_UL_0, test_UL_1 = (ss.transform(part) for part in (test_UL_0, test_UL_1))

In [None]:
# Apply the already-fitted random projections to the scaled hold-out subsets.
# The names are rebound in place, so this cell should be run once per session.
test_SL_0, test_SL_1 = (sl_rp.transform(part) for part in (test_SL_0, test_SL_1))

test_UL_0, test_UL_1 = (ul_rp.transform(part) for part in (test_UL_0, test_UL_1))

In [None]:
# Predictions of both models on the non-legit (_0) and legit (_1) subsets.
pred_SL_0, pred_SL_1 = (xgb_classifier.predict(part) for part in (test_SL_0, test_SL_1))

pred_UL_0, pred_UL_1 = (ocsvm_classifier.predict(part) for part in (test_UL_0, test_UL_1))

In [None]:
# Supervised model: share of non-legit rows predicted 0 (TNR) and share of
# legit rows predicted 1 (TPR).
sl_true_negatives = np.count_nonzero(pred_SL_0 == 0)
sl_true_positives = np.count_nonzero(pred_SL_1 == 1)
tnr_sl = sl_true_negatives / pred_SL_0.shape[0]
tpr_sl = sl_true_positives / pred_SL_1.shape[0]
sl_report = 'The SL TNR is {} and the TPR is {}'.format(tnr_sl, tpr_sl)
print(sl_report)

In [None]:
# One-class model: share of non-legit rows predicted -1 (TNR) and share of
# legit rows predicted 1 (TPR).
ul_true_negatives = np.count_nonzero(pred_UL_0 == -1)
ul_true_positives = np.count_nonzero(pred_UL_1 == 1)
tnr_ul = ul_true_negatives / pred_UL_0.shape[0]
tpr_ul = ul_true_positives / pred_UL_1.shape[0]
ul_report = 'The UL TNR is {} and the TPR is {}'.format(tnr_ul, tpr_ul)
print(ul_report)

In [None]:
# From [-1, 1] to [0, 1]
# Thresholding at 0 gives the same float result as (pred + 1) / 2 on the
# first run (-1 -> 0.0, 1 -> 1.0), but unlike that formula it is idempotent:
# re-running the cell leaves 0.0/1.0 untouched instead of turning 0 into 0.5,
# which np.logical_or / np.logical_and would then treat as True.
pred_UL_0 = (pred_UL_0 > 0).astype(float)
pred_UL_1 = (pred_UL_1 > 0).astype(float)

In [None]:
# Metamodel is just a logic OR of both outputs:
# a row is accepted (True) when either model predicts 1 for it.
pred_metamodel_0, pred_metamodel_1 = (
    np.logical_or(ul_pred, sl_pred)
    for ul_pred, sl_pred in ((pred_UL_0, pred_SL_0), (pred_UL_1, pred_SL_1))
)


In [None]:
# TNR/TPR of the current metamodel predictions.
# NOTE(review): the results are stored in tnr_ul / tpr_ul, so they overwrite
# the one-class SVM rates computed earlier.
meta_true_negatives = np.count_nonzero(pred_metamodel_0 == 0)
meta_true_positives = np.count_nonzero(pred_metamodel_1 == 1)
tnr_ul = meta_true_negatives / pred_metamodel_0.shape[0]
tpr_ul = meta_true_positives / pred_metamodel_1.shape[0]
meta_report = 'The metamodel TNR is {} and the TPR is {}'.format(tnr_ul, tpr_ul)
print(meta_report)

In [None]:
# Metamodel is just a logic AND of both outputs:
# a row is accepted (True) only when both models predict 1 for it.
pred_metamodel_0, pred_metamodel_1 = (
    np.logical_and(ul_pred, sl_pred)
    for ul_pred, sl_pred in ((pred_UL_0, pred_SL_0), (pred_UL_1, pred_SL_1))
)


In [None]:
# TNR/TPR of the current metamodel predictions.
# NOTE(review): the results are stored in tnr_ul / tpr_ul, so they overwrite
# whatever those names held before this cell ran.
meta_true_negatives = np.count_nonzero(pred_metamodel_0 == 0)
meta_true_positives = np.count_nonzero(pred_metamodel_1 == 1)
tnr_ul = meta_true_negatives / pred_metamodel_0.shape[0]
tpr_ul = meta_true_positives / pred_metamodel_1.shape[0]
meta_report = 'The metamodel TNR is {} and the TPR is {}'.format(tnr_ul, tpr_ul)
print(meta_report)