In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy import signal

from keras import Sequential
from keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Dropout, Input
from keras.optimizers import Adam 

offline.init_notebook_mode(connected=True)

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

## Background

This notebook explores how reducing the number of frames used to aggregate the model's features (`temporal_dct` and `temporal_gaussian_mse`) influences the results.
The experiment evaluates the F20 score for different numbers of sampled frames.

In [None]:
path = '../../machine_learning/cloud_functions/data-large.csv'
data = pd.read_csv(path)

columns = ['attack',
           'dimension',
           'size',
           'title',
           'temporal_dct-series',
           'temporal_gaussian_mse-series']

# Keep only the columns used in this experiment and discard incomplete rows
df = data[columns].dropna()

# Flag rows whose attack label is one of the plain resolution names
resolution_labels = ['1080p', '720p', '480p', '360p', '240p', '144p']
df['attack_ID'] = df['attack'].isin(resolution_labels)


def _parse_series(text):
    """Turn a bracketed, space-separated string of numbers into a float array."""
    # The builtin `float` replaces the `np.float` alias, which was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    return np.fromstring(text.replace('[', '').replace(']', ''), dtype=float, sep=' ')


for column in columns:
    if 'series' in column:
        # Work on the single column instead of applying a function row by row
        df[column] = df[column].map(_parse_series)
        df['{}-len'.format(column)] = df[column].map(len)

In [None]:
# Show the first five rows, then the summary statistics of the numeric columns
display(df.head(5))
df.describe()

# OCSVM

We will conduct the experiments on the model that has achieved the best results so far: the One-Class Support Vector Machine (OCSVM).

In [None]:
# Helper function to evaluate models with different data sets

def evaluate_data_set(df, X_train_120):
    """Evaluate `evaluation.one_class_svm` on `df` and return its best row.

    Args:
        df: Data frame that `MetricProcessor.split_test_and_train` splits
            into train, test and attack sets. Only the test and attack
            matrices of that split are used here.
        X_train_120: Training matrix used to fit the scaler and handed to
            `evaluation.one_class_svm` as training data.

    Returns:
        A one-row data frame: the entry of the results table with the
        highest `f_beta`. The same row is also displayed.
    """
    processor = MetricProcessor(df.columns, 'UL', path)
    (_, X_test, X_attacks), (_, _, _) = processor.split_test_and_train(df)

    # The scaler is fitted on the reference training matrix only and then
    # reused for the test and attack matrices
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train_120)
    scaled_test = scaler.transform(X_test)
    scaled_attacks = scaler.transform(X_attacks)

    # Empty table that evaluation.one_class_svm receives and returns
    result_columns = ['gamma', 'nu', 'n_components', 'TPR_test',
                      'TNR', 'model', 'auc', 'f_beta', 'projection']
    results = evaluation.one_class_svm(scaled_train, scaled_test, scaled_attacks,
                                       pd.DataFrame(columns=result_columns))

    best = results.sort_values('f_beta', ascending=False).head(1)
    display(best)
    return best

In [None]:
frame_nums = [1, 5, 10, 15, 30, 60, 90, 120]

# Fixed seed so the random frame sampling below yields the same scores on
# every Restart & Run All
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

features = df.columns
df_samples = df.copy()
print(df_samples.shape)
metric_processor = MetricProcessor(features,'UL', path)
df_results = pd.DataFrame(columns=['frames','gamma', 'nu', 'n_components', 'TPR_test',
                                        'TNR', 'model', 'auc', 'f_beta', 'projection'])

series_columns = [column for column in columns if 'series' in column]

# Reference data set: every series is replaced by the mean of its first 120 frames
for column in series_columns:
    df_samples[column] = df_samples[column].map(lambda series: np.mean(series[:120]))

(X_train_120, X_test, X_attacks), (df_train, df_test, df_attacks) = metric_processor.split_test_and_train(df_samples)

# Best row per frame count; concatenated once after the loop instead of
# growing df_results on every iteration
best_rows = []
for frame_num in frame_nums:
    df_samples = df[df['temporal_dct-series-len']>100].copy()

    print('**********************************')
    print('Frame number:', frame_num)
    for column in series_columns:
        # Mean of `frame_num` frames drawn at random (with replacement) from the series
        df_samples[column] = df_samples[column].map(
            lambda series: np.mean(np.random.choice(series, frame_num)))

    # Tag the row with its frame count here so the label cannot drift out of
    # step with the result it belongs to
    best_rows.append(evaluate_data_set(df_samples, X_train_120).assign(frames=frame_num))

df_results = pd.concat([df_results] + best_rows, axis=0, sort=False)


In [None]:
# Results were collected in the order of frame_nums, so label them positionally
df_results = df_results.assign(frames=frame_nums)
display(df_results)

## Plot results

In [None]:
n_frames = 100

# Keep only rows whose DCT series has at least n_frames values; the same rows
# are used for both features
long_enough = df['temporal_dct-series'].map(len) >= n_frames

# Truncate every kept series to its first n_frames values
list_dct = [series[:n_frames]
            for series in df.loc[long_enough, 'temporal_dct-series']]
list_gaussian = [series[:n_frames]
                 for series in df.loc[long_enough, 'temporal_gaussian_mse-series']]
print(len(list_gaussian))

# One row per kept series, one column per frame position
df_dct = pd.DataFrame(data=list_dct)
df_gaussian = pd.DataFrame(data=list_gaussian)



In [None]:
# Column-wise mean: one value per frame position, averaged over all kept rows
mean_gaussian = df_gaussian.mean()
mean_gaussian.plot(title='Mean gaussian')

In [None]:
# Column-wise mean: one value per frame position, averaged over all kept rows
mean_dct = df_dct.mean()
mean_dct.plot(title='Mean DCT')