In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn

%matplotlib inline

In [None]:
# Load the per-rendition metrics table.
df = pd.read_csv('../../data_analytics/output/metrics-large.csv')
df['title'] = df['level_0']

# The second-to-last path component of `level_1` is the rendition/attack label
# (e.g. '720p'). Vectorised string ops replace the former row-by-row loops.
df['attack'] = df['level_1'].str.split('/').str[-2]

# attack_ID is 1 for the plain resolution renditions listed here and 0 for
# every other label.
untouched_renditions = ['1080p', '720p', '480p', '360p', '240p', '144p']
df['attack_ID'] = df['attack'].isin(untouched_renditions).astype(int)

# The label starts with the resolution: take the part before the first '_'
# and strip the 'p' to get an integer (e.g. '720p' -> 720).
df['dimension'] = (df['attack']
                   .str.split('_').str[0]
                   .str.replace('p', '', regex=False)
                   .astype(int))

# Drop bookkeeping columns and the raw time-series metrics. 'dimension' and
# the remaining '-cosine' temporal metrics are kept on purpose.
df = df.drop(['Unnamed: 0',
              'temporal_canny-series',
              'temporal_cross_correlation-series',
              'temporal_difference-series',
              'temporal_histogram_distance-series',
              'temporal_histogram_distance-cosine',
              'level_0',
              'level_1'], axis=1)
#df =df.dropna(axis=0)

In [None]:
# There were some errors when calculating metrics, so only the DCT euclidean
# column is taken from this second file. The assignment aligns rows on the
# index labels of the two frames.
new_df = pd.read_csv('../../data_analytics/output/metrics.csv')
df['temporal_dct-euclidean'] = new_df['temporal_dct-euclidean']

In [None]:
# Preview the first five rows of the assembled table.
df.head()

In [None]:
# Remove every column that contains at least one missing value, then preview.
df = df.dropna(axis='columns')
df.head()

In [None]:
# Spearman rank correlation between the numeric columns, computed directly on
# the data (not on an intermediate correlation matrix) and rendered as a
# colour-graded table. Non-numeric columns such as 'title' and 'attack' are
# excluded explicitly so the call does not depend on pandas' implicit handling.
corr = df.select_dtypes(include=[np.number]).corr('spearman')
corr.style.background_gradient().set_precision(2)

In [None]:
# Fraction of the legit (attack_ID == 1) rows used for training.
train_prop = 0.8

# Columns removed before building the feature matrices.
non_feature_columns = ['title', 'attack', 'attack_ID', 'dimension', 'vmaf']

df_1 = df[df['attack_ID'] == 1]
df_0 = df[df['attack_ID'] == 0]

# Positional split of the legit rows: the first `train_prop` share trains the
# models, the remainder is the legit test set. Attack rows are evaluation-only.
num_train = int(df_1.shape[0] * train_prop)
df_train = df_1[0:num_train]
df_test = df_1[num_train:]
df_attacks = df_0

# Shuffle the rows inside each split (the split membership itself is unchanged).
df_train = df_train.sample(frac=1)
df_test = df_test.sample(frac=1)
df_attacks = df_attacks.sample(frac=1)

X_train = np.asarray(df_train.drop(non_feature_columns, axis=1))
X_test = np.asarray(df_test.drop(non_feature_columns, axis=1))
X_attacks = np.asarray(df_attacks.drop(non_feature_columns, axis=1))

print('Shape of train: {}'.format(X_train.shape))
print('Shape of test: {}'.format(X_test.shape))
print('Shape of attacks: {}'.format(X_attacks.shape))


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

# The scaler is fitted on the training rows only and then reused for the
# legit test rows and the attack rows.
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)
x_attacks = ss.transform(X_attacks)

# Fit a PCA for every possible number of components, from all features down
# to one, and record the total explained variance of each fit.
component_counts = list(range(x_test.shape[1], 0, -1))
variances = []
for n_kept in component_counts:
    pca = PCA(n_components=n_kept)
    pca.fit(x_train)
    variances.append(sum(pca.explained_variance_ratio_))

plt.plot(component_counts, variances)
plt.grid()
plt.title('PCA')
plt.xlabel('Number of components')
plt.ylabel('Explained variance')

In [None]:
# Two-component PCA fitted on the scaled training data; the same projection is
# then applied to the legit test rows and the attack rows.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(x_train)
test_reduced, attack_reduced = pca.transform(x_test), pca.transform(x_attacks)


In [None]:
# One panel per split, each showing the first two principal components.
f, ax = plt.subplots(1, 3, figsize=(25, 10))
panels = (
    ("Train set", X_reduced, 'black'),
    ("Test set", test_reduced, 'red'),
    ("Attack set", attack_reduced, 'blue'),
)
for axis, (panel_title, points, colour) in zip(ax, panels):
    axis.set_title(panel_title)
    axis.scatter(points[:, 0], points[:, 1], color=colour)

In [None]:
# All three splits in a single figure; attacks are drawn first so the legit
# points end up on top of them.
plt.figure(figsize=(20, 10))
layers = (
    (attack_reduced, 'red', 'attack'),
    (X_reduced, 'green', 'Train'),
    (test_reduced, 'yellow', 'Test'),
)
for points, colour, name in layers:
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)
plt.legend()


# One Class SVM

In [None]:
from sklearn import metrics 

nus = [0.1, 0.01, 0.001, 0.0001]

svm_columns = ['gamma', 'nu', 'n_components', 'TPR_train',
               'TPR_test', 'TNR', 'model', 'auc', 'f_beta']

# Grid search over (number of PCA components, nu, gamma) for a One-Class SVM
# trained on legit data only. Result rows are collected in a list and turned
# into a DataFrame once at the end: appending to a DataFrame inside the loop
# copies the whole frame on every iteration and `DataFrame.append` no longer
# exists in recent pandas.
svm_rows = []
for n in reversed(range(1, x_test.shape[1] + 1)):
    pca = PCA(n_components=n)
    X_reduced = pca.fit_transform(x_train)
    test_reduced = pca.transform(x_test)
    attack_reduced = pca.transform(x_attacks)

    # Ground truth for the evaluation set: +1 for legit test rows, -1 for
    # attacks. It only depends on the set sizes, so build it once per n.
    y_true = np.concatenate([np.ones(test_reduced.shape[0]),
                             -1 * np.ones(attack_reduced.shape[0])])

    gammas = [X_reduced.shape[1], 2 * X_reduced.shape[1], X_reduced.shape[1] / 2, 'auto']
    for nu, gamma in product(nus, gammas):
        classifier = svm.OneClassSVM(kernel='rbf', gamma=gamma, nu=nu, cache_size=5000)
        classifier.fit(X_reduced)

        # predict() yields +1 for samples considered inliers and -1 for outliers.
        y_pred_train = classifier.predict(X_reduced)
        y_pred_test = classifier.predict(test_reduced)
        y_pred_outliers = classifier.predict(attack_reduced)
        n_accurate_train = y_pred_train[y_pred_train == 1].size
        n_accurate_test = y_pred_test[y_pred_test == 1].size
        n_accurate_outliers = y_pred_outliers[y_pred_outliers == -1].size

        # Both scores are computed from the hard +1/-1 predictions.
        # beta=20 makes the F-score weigh recall on legit rows far more than precision.
        y_pred = np.concatenate([y_pred_test, y_pred_outliers])
        fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)
        fb = metrics.fbeta_score(y_true, y_pred, beta=20, pos_label=1)

        svm_rows.append({'nu': nu, 'gamma': gamma, 'n_components': n,
                         'TPR_train': n_accurate_train / X_reduced.shape[0],
                         'TPR_test': n_accurate_test / test_reduced.shape[0],
                         'TNR': n_accurate_outliers / attack_reduced.shape[0],
                         'model': 'svm', 'auc': metrics.auc(fpr, tpr), 'f_beta': fb})

svm_results = pd.DataFrame(svm_rows, columns=svm_columns)


In [None]:
# Ten SVM configurations with the highest F-beta score.
svm_results.sort_values(by='f_beta', ascending=False).iloc[:10]

In [None]:
# Twenty SVM configurations with the highest AUC.
svm_results.sort_values(by='auc', ascending=False).iloc[:20]

In [None]:
# Let's create a metric in order to find the best model.
# We only keep models whose test and attack accuracies are both above 0.7, and score them by the sum of the two.

In [None]:
def metric(row, th=0.7):
    """Combined score for one grid-search result row.

    The score is the sum of the accuracy on legit test samples and the
    accuracy on attacks, or 0 when either of the two is below ``th``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the legit-test accuracy under ``'TPR_test'`` and the
        attack accuracy under ``'TNR'`` (the column names used by the result
        tables in this notebook). The older names ``'test_acc'`` and
        ``'attack_acc'`` are still accepted as a fallback.
    th : float, default 0.7
        Minimum value both accuracies must reach for a non-zero score.

    Returns
    -------
    float or int
        ``test accuracy + attack accuracy``, or ``0`` if either is below ``th``.

    Raises
    ------
    KeyError
        If neither the current nor the legacy key is present for a value.
    """
    # `in` checks the index labels of a Series and the keys of a dict.
    test_acc = row['TPR_test'] if 'TPR_test' in row else row['test_acc']
    attack_acc = row['TNR'] if 'TNR' in row else row['attack_acc']

    if test_acc < th or attack_acc < th:
        return 0
    return test_acc + attack_acc

In [None]:
# Add a 'score' column by applying `metric` to every row (axis=1) of the SVM results.
svm_results['score'] = svm_results.apply(metric, axis=1)

In [None]:
# Best-scoring SVM configurations among those whose accuracy on legit test
# rows ('TPR_test') exceeds their accuracy on attacks ('TNR'). These are the
# column names the results table actually has.
svm_results[svm_results['TPR_test'] > svm_results['TNR']].sort_values('score', ascending=False).head(10)

# Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

estimators = [100, 150, 200]
contaminations = [0.01]
isolation_columns = ['estimators', 'contamination', 'n_components', 'max_features',
                     'TPR_test', 'TPR_train', 'TNR', 'model', 'auc', 'f_beta']

# Grid search over (number of PCA components, n_estimators, contamination,
# max_features) for an Isolation Forest trained on legit data only. Rows are
# gathered in a list and converted to a DataFrame once, instead of appending
# to a DataFrame on every iteration.
isolation_rows = []
for n in reversed(range(1, x_test.shape[1] + 1)):
    pca = PCA(n_components=n)
    X_reduced = pca.fit_transform(x_train)
    test_reduced = pca.transform(x_test)
    attack_reduced = pca.transform(x_attacks)
    max_features = list(range(1, n + 1))

    # Ground truth for the evaluation set: +1 for legit test rows, -1 for attacks.
    y_true = np.concatenate([np.ones(test_reduced.shape[0]),
                             -1 * np.ones(attack_reduced.shape[0])])

    for estimator, contamination, max_feature in product(estimators, contaminations, max_features):
        classifier = IsolationForest(n_estimators=estimator,
                                     contamination=contamination,
                                     max_features=max_feature,
                                     n_jobs=5)

        classifier.fit(X_reduced)
        # predict() yields +1 for samples considered inliers and -1 for outliers.
        y_pred_train = classifier.predict(X_reduced)
        y_pred_test = classifier.predict(test_reduced)
        y_pred_outliers = classifier.predict(attack_reduced)
        # Counts of correct decisions: legit rows accepted, attacks rejected.
        n_accurate_train = y_pred_train[y_pred_train == 1].size
        n_accurate_test = y_pred_test[y_pred_test == 1].size
        n_accurate_outliers = y_pred_outliers[y_pred_outliers == -1].size

        # Both scores are computed from the hard +1/-1 predictions.
        y_pred = np.concatenate([y_pred_test, y_pred_outliers])
        fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)
        fb = metrics.fbeta_score(y_true, y_pred, beta=20, pos_label=1)

        isolation_rows.append({'estimators': estimator, 'contamination': contamination,
                               'n_components': n, 'max_features': max_feature,
                               'TPR_train': n_accurate_train / X_reduced.shape[0],
                               # Test rate uses the test counts and the test set size.
                               'TPR_test': n_accurate_test / test_reduced.shape[0],
                               'TNR': n_accurate_outliers / attack_reduced.shape[0],
                               'model': 'isolation_forest',
                               'auc': metrics.auc(fpr, tpr),
                               'f_beta': fb})

isolation_results = pd.DataFrame(isolation_rows, columns=isolation_columns)

In [None]:
# Add a 'score' column by applying `metric` to every row (axis=1) of the Isolation Forest results.
isolation_results['score'] = isolation_results.apply(metric, axis=1)

In [None]:
# Isolation Forest configurations whose accuracy on legit test rows
# ('TPR_test') exceeds their accuracy on attacks ('TNR'), ranked by the former.
# These are the column names the results table actually has.
isolation_results[isolation_results['TPR_test'] > isolation_results['TNR']].sort_values('TPR_test', ascending=False).head(10)

In [None]:
# Ten Isolation Forest configurations with the highest F-beta score.
isolation_results.sort_values(by='f_beta', ascending=False).iloc[:10]

# Let's plot ROCs

In [None]:
def plot_roc(fpr, tpr, title):
    """Plot a ROC curve together with the chance diagonal.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, as returned by
        ``metrics.roc_curve``.
    title : str
        Text appended to the figure title.

    The area under the curve is computed with ``metrics.auc`` and shown in
    the legend. A new figure is created and displayed immediately.
    """
    line_width = 2
    area = metrics.auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.2f)' % area

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label=curve_label)
    # Dashed diagonal running from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: {}'.format(title))
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Re-train the One-Class SVM configuration that reached the highest F-beta in
# the grid search and plot its ROC curve on legit-test vs. attack rows.
best_svm = svm_results.sort_values('f_beta', ascending=False).iloc[0]

pca = PCA(n_components=best_svm['n_components'])
X_reduced = pca.fit_transform(x_train)
test_reduced, attack_reduced = pca.transform(x_test), pca.transform(x_attacks)

classifier = svm.OneClassSVM(kernel='rbf', gamma=best_svm['gamma'],
                             nu=best_svm['nu'], cache_size=5000)
classifier.fit(X_reduced)

# +1 = accepted as legit, -1 = flagged as outlier.
y_pred_train = classifier.predict(X_reduced)
y_pred_test = classifier.predict(test_reduced)
y_pred_outliers = classifier.predict(attack_reduced)
n_accurate_train = np.count_nonzero(y_pred_train == 1)
n_accurate_test = np.count_nonzero(y_pred_test == 1)
n_accurate_outliers = np.count_nonzero(y_pred_outliers == -1)

# NOTE(review): the curve is built from hard +1/-1 predictions, so it has a
# single operating point between (0, 0) and (1, 1).
y_true = np.concatenate([np.ones(y_pred_test.shape[0]),
                         -1 * np.ones(y_pred_outliers.shape[0])])
y_pred = np.concatenate([y_pred_test, y_pred_outliers])
fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)

plot_roc(fpr, tpr, 'One Class SVM')

In [None]:
# Isolation Forest on the first principal component only, with fixed
# hyper-parameters, evaluated on legit-test vs. attack rows.
pca = PCA(n_components=1)
X_reduced = pca.fit_transform(x_train)
test_reduced, attack_reduced = pca.transform(x_test), pca.transform(x_attacks)

forest_params = dict(n_estimators=150, contamination=0.01, max_features=1, n_jobs=5)
classifier = IsolationForest(**forest_params)
classifier.fit(X_reduced)

# +1 = accepted as legit, -1 = flagged as outlier.
y_pred_train = classifier.predict(X_reduced)
y_pred_test = classifier.predict(test_reduced)
y_pred_outliers = classifier.predict(attack_reduced)
n_accurate_train = np.count_nonzero(y_pred_train == 1)
n_accurate_test = np.count_nonzero(y_pred_test == 1)
n_accurate_outliers = np.count_nonzero(y_pred_outliers == -1)

y_true = np.concatenate([np.ones(y_pred_test.shape[0]),
                         -1 * np.ones(y_pred_outliers.shape[0])])
y_pred = np.concatenate([y_pred_test, y_pred_outliers])
fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)

plot_roc(fpr, tpr, 'Isolation Forest')

In [None]:
# Refit the best One-Class SVM (by F-beta) for the per-attack analysis and the
# decision-boundary plot that follow.
best_svm = svm_results.sort_values('f_beta', ascending=False).iloc[0]

# NOTE(review): one component fewer than the best configuration is used here.
# The contour plot later feeds two-column grid points to decision_function,
# so this presumably targets a 2-D projection - confirm.
pca = PCA(n_components=best_svm['n_components']-1)
X_reduced = pca.fit_transform(x_train)
test_reduced = pca.transform(x_test)
attack_reduced = pca.transform(x_attacks)

classifier = svm.OneClassSVM(kernel='rbf',gamma=best_svm['gamma'], nu=best_svm['nu'], cache_size=5000)
# A single fit is enough; fitting again on the same data only repeats the work.
classifier.fit(X_reduced)

In [None]:
# Predictions for the attack rows: -1 means the attack was flagged as an
# outlier, +1 means it was accepted as legit (i.e. it went undetected).
y_pred_outliers = classifier.predict(attack_reduced)

In [None]:
# Re-number the attack rows 0..n-1 so they line up positionally with the
# prediction vector, which was computed on the rows in this same order.
df_attacks_reset = df_attacks.reset_index()

accurate_outliers_df = pd.DataFrame({'pred': y_pred_outliers})

# Attacks predicted as +1 were accepted as legit, i.e. they were not detected.
is_undetected = accurate_outliers_df['pred'] == 1
undetected = df_attacks_reset.loc[is_undetected, ['attack', 'title', 'dimension']]


In [None]:
# Number of undetected attacks per (resolution, attack type) pair.
undetected.groupby(by=['dimension', 'attack']).count()

In [None]:
# Number of undetected attacks per title, most affected titles first.
undetected_per_title = undetected.groupby(by='title').count()
undetected_per_title.sort_values(['attack'], ascending=False)

In [None]:
import matplotlib.font_manager

# Decision-boundary plot for the fitted classifier. It requires a model
# trained on exactly two features, because the grid below has two columns.
xx, yy = np.meshgrid(np.linspace(-3, 30, 500), np.linspace(-3, 30, 500))

y_pred_train = classifier.predict(X_reduced)
y_pred_test = classifier.predict(test_reduced)
y_pred_outliers = classifier.predict(attack_reduced)
# Error counts: legit rows rejected (-1) and attack rows accepted (+1).
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# Evaluate the decision function on the grid: xx supplies the first feature
# (horizontal axis) and yy the second one (vertical axis).
Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(15,10))
plt.title("Novelty Detection")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

# The points must live in the same space and axis order as the grid: use the
# PCA-projected arrays the classifier was trained/evaluated on, with component
# 0 on the horizontal axis and component 1 on the vertical axis.
s = 40
c = plt.scatter(attack_reduced[:, 0], attack_reduced[:, 1], c='gold', s=s,
                edgecolors='k')
b2 = plt.scatter(test_reduced[:, 0], test_reduced[:, 1], c='blueviolet', s=s,
                 edgecolors='k')

b1 = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c='white', s=s, edgecolors='k')

plt.axis('tight')
plt.xlim((-3, 30))
plt.ylim((-3, 30))
plt.legend([a.collections[0], b1, b2, c],
           ["learned frontier", "training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper right",
           prop=matplotlib.font_manager.FontProperties(size=11))
plt.xlabel(
    "train: %d ; novel regular: %d ; "
    "novel abnormal: %d"
    % (n_error_train, n_error_test, n_error_outliers))
plt.show()

# Conclusions

After analysing the predictions of the best SVM, we have reached some interesting findings:
* The attacks that go undetected belong to the same attack types: watermarks and low bitrate
* The attacks that go undetected are not confined to a single resolution

This means that we might be able to generalize to other attacks and resolutions

# Autoencoder

In [None]:
latent_dim = 3
n_features = X_train.shape[1]

# Single-bottleneck autoencoder: n_features -> latent_dim (ReLU) -> n_features.
# The output layer has no activation and carries an L1 activity penalty
# (10e-5, i.e. 1e-4).
input_vector = Input(shape=(n_features,))
encoded = Dense(latent_dim, activation='relu')(input_vector)
decoded = Dense(n_features, activity_regularizer=regularizers.l1(10e-5))(encoded)

# `encoder` reuses the input and bottleneck layers of `autoencoder`, so it
# exposes the latent representation learned during training.
autoencoder = Model(input_vector, decoded)
encoder = Model(input_vector, encoded)

# Train to reconstruct the scaled legit training rows; the scaled legit test
# rows are used as validation data.
autoencoder.compile(optimizer=Adam(lr=0.001), loss='mse')
network_history = autoencoder.fit(
    x_train, x_train,
    epochs=100,
    batch_size=16,
    shuffle=True,
    verbose=True,
    validation_data=(x_test, x_test),
)


In [None]:
def plot_history(network_history, title='Autoencoder Loss'):
    """Plot training and validation loss per epoch.

    Parameters
    ----------
    network_history : object
        Must expose a ``history`` mapping with ``'loss'`` and ``'val_loss'``
        sequences (e.g. the value returned by ``autoencoder.fit``).
    title : str, default 'Autoencoder Loss'
        Figure title.
    """
    losses = network_history.history

    plt.figure(figsize=(10, 5))
    # Plot order matters: it has to match the legend entries below.
    for series_key in ('loss', 'val_loss'):
        plt.plot(losses[series_key])

    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(['Training', 'Validation'])
    plt.show()

In [None]:
# Show the training/validation loss curves of the autoencoder fit.
plot_history(network_history)

In [None]:
# Reconstruction loss of the trained autoencoder on each split (input == target).
loss_reports = (
    ('Mean loss on train: {}', x_train),
    ('Mean loss on test: {}', x_test),
    ('Mean loss on attacks: {}', x_attacks),
)
for message, samples in loss_reports:
    print(message.format(autoencoder.evaluate(samples, samples, batch_size=8, verbose=False)))

In [None]:
# Reconstruct every split with the trained autoencoder.
prediction_options = dict(batch_size=8)
x_train_pred = autoencoder.predict(x_train, **prediction_options)
x_test_pred = autoencoder.predict(x_test, **prediction_options)
x_attacks_pred = autoencoder.predict(x_attacks, **prediction_options)

In [None]:
# Per-sample reconstruction error: mean squared difference over the features.
mse_train = (((x_train - x_train_pred)**2).mean(axis=1))
mse_test = (((x_test - x_test_pred)**2).mean(axis=1))
mse_attacks = (((x_attacks - x_attacks_pred)**2).mean(axis=1))

plt.figure()
# `density=True` normalises each histogram to unit area, so splits of
# different sizes can be compared. It replaces the `normed` keyword, which
# Matplotlib has removed.
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=200)

plt.hist(mse_train, **kwargs)
plt.hist(mse_test, **kwargs)
plt.hist(mse_attacks, **kwargs)
plt.legend(['Train', 'Test', 'Attacks'])
plt.title('Histograms of mse')
plt.xlim([0, 5])


In [None]:
# Latent (bottleneck) representation of every split.
encode_options = dict(batch_size=8)
x_train_red = encoder.predict(x_train, **encode_options)
x_test_red = encoder.predict(x_test, **encode_options)
x_attacks_red = encoder.predict(x_attacks, **encode_options)

In [None]:
# First two latent dimensions of each split; attacks are drawn first so the
# legit points end up on top of them.
plt.figure(figsize=(20, 10))
latent_layers = (
    (x_attacks_red, 'red', 'attack'),
    (x_train_red, 'green', 'Train'),
    (x_test_red, 'yellow', 'Test'),
)
for codes, colour, name in latent_layers:
    plt.scatter(codes[:, 0], codes[:, 1], color=colour, label=name)
plt.legend()

In [None]:
ae_svm_columns = ['gamma', 'nu', 'n_components', 'TPR_train',
                  'TPR_test', 'TNR', 'model', 'auc', 'f_beta']

# Grid search for a One-Class SVM trained on the autoencoder's latent codes.
# Every fit/predict call uses the encoder outputs (x_*_red), not the PCA
# projections left over from earlier cells. Rows are collected in a list and
# converted to a DataFrame once at the end.
ae_svm_rows = []

# Ground truth for the evaluation set: +1 for legit test rows, -1 for attacks.
y_true = np.concatenate([np.ones(x_test_red.shape[0]),
                         -1 * np.ones(x_attacks_red.shape[0])])

gammas = [x_train_red.shape[1], 2*x_train_red.shape[1], x_train_red.shape[1]/2, 'auto']
for nu, gamma in product(nus, gammas):
    classifier = svm.OneClassSVM(kernel='rbf', gamma=gamma, nu=nu, cache_size=5000)
    classifier.fit(x_train_red)

    # predict() yields +1 for samples considered inliers and -1 for outliers.
    y_pred_train = classifier.predict(x_train_red)
    y_pred_test = classifier.predict(x_test_red)
    y_pred_outliers = classifier.predict(x_attacks_red)
    n_accurate_train = y_pred_train[y_pred_train == 1].size
    n_accurate_test = y_pred_test[y_pred_test == 1].size
    n_accurate_outliers = y_pred_outliers[y_pred_outliers == -1].size

    # Both scores are computed from the hard +1/-1 predictions.
    y_pred = np.concatenate([y_pred_test, y_pred_outliers])
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)
    fb = metrics.fbeta_score(y_true, y_pred, beta=20, pos_label=1)

    ae_svm_rows.append({'nu': nu, 'gamma': gamma,
                        # Dimensionality of the latent space the SVM was trained on.
                        'n_components': x_train_red.shape[1],
                        'TPR_train': n_accurate_train / x_train_red.shape[0],
                        'TPR_test': n_accurate_test / x_test_red.shape[0],
                        'TNR': n_accurate_outliers / x_attacks_red.shape[0],
                        'model': 'ae-svm', 'auc': metrics.auc(fpr, tpr), 'f_beta': fb})

ae_svm_results = pd.DataFrame(ae_svm_rows, columns=ae_svm_columns)


In [None]:
# Ten autoencoder + SVM configurations with the highest F-beta score.
ae_svm_results.sort_values(by='f_beta', ascending=False).iloc[:10]

In [None]:
# One-Class SVM on the autoencoder's latent codes, with fixed hyper-parameters.
# The model is trained and evaluated on the encoder outputs (x_*_red) so that
# the curve really describes the "Autoencoder + SVM" pipeline named in the title.
classifier = svm.OneClassSVM(kernel='rbf',gamma='auto', nu=0.01, cache_size=5000)
classifier.fit(x_train_red)

# +1 = accepted as legit, -1 = flagged as outlier.
y_pred_train = classifier.predict(x_train_red)
y_pred_test = classifier.predict(x_test_red)
y_pred_outliers = classifier.predict(x_attacks_red)
n_accurate_train = y_pred_train[y_pred_train == 1].size
n_accurate_test = y_pred_test[y_pred_test == 1].size
n_accurate_outliers = y_pred_outliers[y_pred_outliers == -1].size

# Ground truth: +1 for legit test rows, -1 for attacks.
fpr, tpr, _ = metrics.roc_curve(np.concatenate([np.ones(y_pred_test.shape[0]),
                                                 -1*np.ones(y_pred_outliers.shape[0])]),
                                  np.concatenate([y_pred_test, y_pred_outliers]) , pos_label=1)

plot_roc(fpr, tpr, 'Autoencoder + SVM')