In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
from sklearn.manifold import TSNE


# Initialise plotly's offline notebook mode so that figures can later be
# rendered inline with offline.iplot.
offline.init_notebook_mode(connected=False)

# Prepend the project's modeling toolbox directory to the import path so the
# local modules imported at the end of this cell can be resolved.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# Load the autoreload extension.
%load_ext autoreload
# Set the extension to reload modules every time before executing code, so
# edits to the local modules are picked up without restarting the kernel.
%autoreload 2

# Project-local modules (presumably living in the toolbox directory added to
# sys.path above; verify if the directory layout changes).
from metric_processor import MetricProcessor
import evaluation


# Source videos analysis

In [None]:
# Read the temporal metrics dataset.
path = '../../machine_learning/cloud_functions/data-temporal-large.csv'
data = pd.read_csv(path)

# Source videos are the rows whose path is identical to their title.
is_source = data['path'] == data['title']
sources_df = data[is_source]

# Keep only the columns that hold at least one non-zero value among the sources.
has_nonzero = (sources_df != 0).any(axis=0)
sources_df = sources_df.loc[:, has_nonzero]

# Discard the bookkeeping columns that are not used in the analysis below.
sources_df = sources_df.drop(
    columns=['Unnamed: 0', 'attack', 'kind', 'path', 'dimension'])
sources_df.head()

In [None]:
# Keep only the columns whose name contains neither 'series' nor 'ssim'
# (each regex is a negative lookahead applied along the whole column name).
without_series = sources_df.filter(regex='^((?!series).)*$')
non_series_sources_df = without_series.filter(regex='^((?!ssim).)*$')

print(non_series_sources_df.shape)
non_series_sources_df.head()

## Clustering algorithm analysis

In [None]:
import time
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn import cluster, mixture
from sklearn.neighbors import kneighbors_graph
from itertools import cycle, islice

# Clustering features: mean and standard deviation of the temporal spatial
# complexity. Alternatives tried earlier and currently disabled: every column
# except 'title' and 'fps', and the 'temporal_difference-mean' /
# 'temporal_difference-std' columns.
clustering_columns = ['temporal_spatial_complexity-mean',
                      'temporal_spatial_complexity-std']
X = non_series_sources_df[clustering_columns].values

# Standardise the selected features with scikit-learn's StandardScaler.
X = StandardScaler().fit_transform(X)

In [None]:
# Seed shared by the stochastic estimators in this cell. Without it the
# mixture components and the t-SNE embedding change on every run, so the
# cluster labels attached to the data further down would not be reproducible
# after a kernel restart.
RANDOM_STATE = 42

# ============
# Create cluster object
# ============
# Gaussian mixture with 5 components and a full covariance matrix per
# component; it is fitted in the plotting cell.
gmm = mixture.GaussianMixture(
    n_components=5, covariance_type='full', random_state=RANDOM_STATE)

clustering_algorithms = [
    ('GaussianMixture', gmm)
]

# ============
# Create TSNE reduction
# ============
# 2-D embedding of the standardised features, used only for visualisation.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=50, learning_rate=200,
            n_iter=2000, random_state=RANDOM_STATE)
tsne_results = tsne.fit_transform(X)

In [None]:
# Enable plotly's offline notebook mode before drawing.
offline.init_notebook_mode(connected=False)

# Fit the Gaussian mixture on the feature matrix.
gmm.fit(X)

# Marker appearance: colour encodes the predicted mixture component and the
# marker size is the video size divided by (50000 * fps).
marker_style = dict(
    color=gmm.predict(X),
    size=non_series_sources_df['size'] / (50000 * non_series_sources_df['fps']),
    line=dict(color='rgb(255, 255, 255)', width=0),
    showscale=False,
    opacity=0.8,
)

# One point per source video, placed at its t-SNE coordinates; hovering shows
# the video title.
traceTSNE = go.Scatter(
    x=tsne_results[:, 0],
    y=tsne_results[:, 1],
    mode='markers',
    text=non_series_sources_df['title'].values,
    showlegend=True,
    marker=marker_style,
)
data = [traceTSNE]

layout = dict(
    title='TSNE (T-Distributed Stochastic Neighbour Embedding)',
    hovermode='closest',
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False),
    showlegend=True,
)

fig = dict(data=data, layout=layout)

offline.iplot(fig)

In [None]:
# Load the full dataset and reduce the attack rows to the selected feature
# columns; the clustering information is attached in the next cell.
features = [
    'temporal_dct-mean',
    'temporal_gaussian-mean',
    'size',
    'dimension',
    'temporal_gaussian_difference-mean',
    'temporal_difference-mean',
]

path = '../../machine_learning/cloud_functions/data-large.csv'
data = pd.read_csv(path)

# Attacks are the rows whose path differs from their title.
is_attack = data['path'] != data['title']
attacks_df = data[is_attack]

# Keep only the columns with at least one non-zero value among the attacks.
nonzero_columns = (attacks_df != 0).any(axis=0)
attacks_df = attacks_df.loc[:, nonzero_columns]
display(attacks_df.head())

# Restrict the frame to the feature columns listed above.
attacks_df = attacks_df[features]

attacks_df.head(100)

In [None]:
# Attach the predicted cluster and the t-SNE coordinates to every source
# video. assign() builds a new frame instead of writing into one that may be
# a slice of another frame, and re-running the cell gives the same result.
non_series_sources_df = non_series_sources_df.assign(
    gauss_cluster=gmm.predict(X),
    tsne_x=tsne_results[:, 0],
    tsne_y=tsne_results[:, 1],
)
display(non_series_sources_df.head())

# Lookup table: source title -> cluster label. The 'path' column was dropped
# from the sources frame, but source rows were selected with path == title,
# so 'title' carries the same key. Duplicated titles are collapsed (first
# occurrence wins) because Series.map needs a uniquely indexed lookup.
cluster_by_title = (
    non_series_sources_df
    .drop_duplicates(subset='title')
    .set_index('title')['gauss_cluster']
)

# attacks_df only holds feature columns, so each attack's title is recovered
# from the raw frame it was sliced from (`data`, the attacks CSV loaded in the
# previous cell); both frames share the same row index.
# NOTE(review): assumes an attack row's 'title' names its source video - confirm.
attack_titles = data.loc[attacks_df.index, 'title']

# Attacks whose source has no cluster entry receive NaN.
attacks_df = attacks_df.assign(
    gauss_cluster=attack_titles.map(cluster_by_title))
attacks_df.head()

In [None]:
# Empty frame, one column per reported quantity, that collects the results.
result_columns = ['gamma', 'nu', 'n_components', 'TPR_test',
                  'TNR', 'model', 'auc', 'f_beta', 'projection']
svm_results = pd.DataFrame(columns=result_columns)

# Train the models.
# NOTE(review): x_train, x_test and x_attacks are not defined in any cell
# visible in this notebook; they must be created before this cell can run on
# a fresh kernel.
svm_results = evaluation.one_class_svm(x_train, x_test, x_attacks, svm_results)

