## Import necessary dependencies

In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn

%matplotlib inline
pd.options.display.max_columns = 999

## Clean up features from dataset

Data coming from the preprocessing stage may contain experimental columns and unnecessary features, so they are cleaned up here before the data is used further.

In [None]:
# Load the output CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv('../../machine_learning/cloud_functions/data.csv')

In [None]:
# Clean the raw frame: derive the rendition name, label and dimension of every
# row, convert stringified array cells into numeric values, then drop unused
# and incomplete columns.
df = pd.DataFrame(data)
display(df.head())
# Expose 'level_0' and 'level_1' under descriptive names. These are copies,
# not renames: 'level_1' is dropped further down, 'level_0' is left in place.
df['title'] = df['level_0']
df['variation'] = df['level_1']

# Define a series containing the name of the kind of rendition variation
attack_series = []
# Define a series containing the label
attack_IDs = []
# Define a series containing the dimensions of the asset
dimensions_series = []

# Define a series with the variations that are only scaled down versions of the original
# at a bitrate defined by Youtube's metadata
renditions = ['1080p', '720p', '480p', '360p', '240p', '144p']

# Scan all the rows
for index, row in df.iterrows():
    # The rendition name is the second-to-last component of the variation path
    variation = row['variation'].split('/')[-2]
    # The leading token before the first '_' is the resolution, e.g. '720p' -> 720
    dimension = int(variation.split('_')[0].replace('p', ''))
    dimensions_series.append(dimension)
    attack_series.append(variation)

    for column in df.columns:
        cell_value = str(row[column])

        # Cells containing '[' hold a stringified array and need to be parsed
        if '[' in cell_value:
            # NOTE(review): splitting on a double space assumes the array was
            # printed with two-space separators - confirm against the CSV.
            cell_value = cell_value.replace('[', '').replace(']', '').split('  ')
            if len(cell_value) == 1:
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # supported scalar setter.
                df.at[index, column] = float(cell_value[0])
            else:
                # Unparseable fragments become NaN because of errors='coerce'
                values = pd.to_numeric(cell_value, downcast='float', errors='coerce')
                print(values)
                # NOTE(review): np.histogram returns a (counts, bin_edges)
                # tuple and the whole tuple is stored in the cell - confirm
                # that downstream code expects the tuple and not just counts.
                histogram = np.histogram(values, bins=[1, 2, 3, 4, 5])
                df.at[index, column] = histogram

    # Every variation not belonging to the list of renditions is considered as an attack
    # whose encodings were generated with good settings.
    # Attacks (negative) are labeled as 0
    # Non-attacks (positive) are labeled 1
    if variation in renditions:
        attack_IDs.append(1)
    else:
        attack_IDs.append(0)

# Add the created series as columns of the dataframe
df['attack'] = attack_series
df['attack_ID'] = attack_IDs
df['dimension'] = dimensions_series

# Clean up: remove the saved index column and the original 'level_1' column
df = df.drop(['Unnamed: 0', 'level_1'], axis=1)

# Drop every column that still contains at least one missing value
df = df.dropna(axis=1)

In [None]:
# Preview the first five rows of df
df.head(5)

In [None]:
# Write df to CSV. With the default to_csv settings the index is written too,
# as an unnamed first column.
df.to_csv('../../data_analytics/output/metrics-clean.csv')