# Import dependencies and set up the notebook

In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.regularizers import l2
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.metrics import roc_curve, auc, fbeta_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import xgboost as xgb
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn
import time
import pickle
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

%matplotlib inline

## Helper functions

In [None]:
def _plot_metric(history, key, ylabel, **legend_kwargs):
    """Plot the training series for ``key`` and, if recorded, its ``val_`` twin."""
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.plot(history[key])
    labels = ['Training']
    val_key = 'val_' + key
    # Only draw and label the validation curve when it was actually recorded,
    # so the legend never names a line that is not on the plot.
    if val_key in history:
        plt.plot(history[val_key])
        labels.append('Validation')
    plt.legend(labels, **legend_kwargs)


def plot_history(network_history,title='Loss and accuracy (Keras model)'):
    """Plot per-epoch loss and accuracy recorded by a Keras ``fit`` call.

    Args:
        network_history: Object returned by ``model.fit``; only its
            ``history`` dict (metric name -> per-epoch values) is read.
        title: Title drawn above the loss panel.

    Raises:
        KeyError: If the history holds no loss or no accuracy series.
    """
    history = network_history.history
    # The accuracy series is stored under 'acc' by older Keras releases and
    # under 'accuracy' by newer ones; accept either.
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    plt.figure(figsize=(15,10))

    plt.subplot(211)
    plt.title(title)
    _plot_metric(history, 'loss', 'Loss')

    plt.subplot(212)
    _plot_metric(history, acc_key, 'Accuracy', loc='lower right')
    plt.show()

In [None]:
def plot_roc(fpr, tpr, title):
    """Draw a ROC curve with its area in the legend, plus the diagonal.

    Args:
        fpr: False positive rates, passed to ``auc`` and used as x values.
        tpr: True positive rates, passed to ``auc`` and used as y values.
        title: Text appended to the figure title.
    """
    line_width = 2
    area_under_curve = auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label=curve_label)
    # Dashed reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # Axis ranges; the y axis gets a little headroom above 1.0.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic {}'.format(title))
    plt.legend(loc="lower right")
    plt.show()

# Prepare features from generated data

In [None]:
# Load the generated data set.
data = pd.read_csv('../../machine_learning/cloud_functions/data-large.csv')
df = pd.DataFrame(data)

# Label each row: 1 when the 'attack' value is a plain resolution name
# (a legit rendition), 0 for anything else (an attack).
# The membership test is vectorised; the previous per-row loop also parsed the
# resolution into an unused integer, which raised on any non-numeric label.
legit_renditions = ['1080p', '720p', '480p', '360p', '240p', '144p']
df['attack_ID'] = df['attack'].isin(legit_renditions).astype('int64')

# Remove the identifier/descriptive columns and the '*-series' columns, so
# that only the remaining columns are used as model input.
df = df.drop(['Unnamed: 0',
              'attack',
              'title',
              'path',
              'kind',
              'temporal_canny-series',
              'temporal_cross_correlation-series',
              'temporal_difference-series',
              'temporal_histogram_distance-series',
              'temporal_dct-series',
              ], axis=1)
# Drop every column that still contains at least one missing value.
df = df.dropna(axis=1)
print('Number of attacks:',df[df['attack_ID']==0].shape)
print('Number of legit renditions:',df[df['attack_ID']==1].shape)

# Data Visualization

In [None]:
# Preview the first 25 rows of the prepared feature table.
df.head(25)

In [None]:
# Summary statistics for the columns of the prepared feature table.
df.describe()

In [None]:
# Pairwise correlation matrix of the columns (pandas' default method).
df_corr = df.corr()

In [None]:
# Spearman rank correlation between the features themselves (label excluded).
# It is computed from the feature columns directly: correlating the already
# computed correlation matrix would compare correlation profiles, not features.
# No matplotlib figure is opened because the Styler renders as a table.
# NOTE(review): Styler.set_precision is deprecated in newer pandas releases in
# favour of Styler.format(precision=...); switch when the pinned version allows.
corr = df.drop(['attack_ID'],axis=1).corr('spearman')
corr.style.background_gradient().set_precision(2)

# Define input data

We choose the input data for our models, trying to build a balanced dataset.

In [None]:
# Number of rows that make up 80% of the data set (shown as the cell output).
int(df.shape[0]*0.8)

In [None]:
# Define training set as 80% of total specimens. The split is positional:
# the first 80% of the rows are used for training, the remainder for testing.
num_train = int(df.shape[0]*0.8)

df_train_all = df[0:num_train]
df_test_all = df[num_train:]
print('We have {} train specimens and {} test specimens'.format(len(df_train_all), len(df_test_all)))

# Balance the training dataset by limiting the number of negative specimens to resemble what we have of positive
df_train_1 = df_train_all[df_train_all['attack_ID'] == 1]
df_train_0 = df_train_all[df_train_all['attack_ID'] == 0]

print('We have {} positive training specimens and {} negative training specimens'.format(len(df_train_1), len(df_train_0)))
# Take a sample from the training negatives, as large as the set of positives.
# The size is capped at the number of available negatives so that sample()
# cannot raise when negatives happen to be the minority class.
num_negative_samples = min(len(df_train_1), len(df_train_0))
df_sample_train = df_train_0.sample(num_negative_samples)
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_train = pd.concat([df_train_1, df_sample_train])
# Shuffle the rows so positives and negatives are interleaved.
df_train = df_train.sample(frac=1)
print('Balanced training set established with shape {}'.format(df_train.shape))

df_test_1 = df_test_all[df_test_all['attack_ID'] == 1]
df_test_0 = df_test_all[df_test_all['attack_ID'] == 0]

print('We have {} positive testing specimens and {} negative testing specimens'.format(len(df_test_1), len(df_test_0)))
# NOTE(review): every negative is drawn here (the sample only shuffles them)
# and a random 60% of the combined rows is kept, so df_test keeps roughly the
# class ratio of df_test_all and is not balanced, despite the message below.
# Confirm whether sample(df_test_1.shape[0]) and frac=1 were intended.
df_sample_test = df_test_0.sample(df_test_0.shape[0])
df_test = pd.concat([df_test_1, df_sample_test])
df_test = df_test.sample(frac=0.6)
print('Balanced test set established with shape {}'.format(df_test.shape))

# Full (unsampled) test split as arrays.
X_test_all = np.asarray(df_test_all.drop(['attack_ID'],axis=1))
y_test_all = np.asarray(df_test_all['attack_ID'])

# Balanced training set as arrays.
X_train = np.asarray(df_train.drop(['attack_ID'],axis=1))
y_train = np.asarray(df_train['attack_ID'])

# Sampled test subset as arrays.
X_test = np.asarray(df_test.drop(['attack_ID'],axis=1))
y_test = np.asarray(df_test['attack_ID'])

First, we define a model that takes all variables as input:

In [None]:
# Report the feature-matrix shapes, then the columns of the training frame.
for label, features in (('TRAIN:', X_train), ('TEST:', X_test)):
    print(label, features.shape)
print(df_train.columns)

Then we can normalize input data to facilitate model convergence

In [None]:
# Fit both scalers on the training features only; the two test matrices are
# then transformed with the statistics learned from the training set.
MinMax_scaler = MinMaxScaler()
MinMax_scaler.fit(X_train)

Standard_scaler = StandardScaler()
Standard_scaler.fit(X_train)

# Min-max scaled variants.
X_train_scaled_MinMax = MinMax_scaler.transform(X_train)
X_test_scaled_MinMax = MinMax_scaler.transform(X_test)
X_test_scaled_MinMax_all = MinMax_scaler.transform(X_test_all)

# Standardised variants.
X_train_scaled_standard = Standard_scaler.transform(X_train)
X_test_scaled_standard = Standard_scaler.transform(X_test)
X_test_scaled_standard_all = Standard_scaler.transform(X_test_all)

# Define models
We will explore results with different ML techniques

## Keras neural network

In [None]:
def model():
    """Build, compile and train the Keras binary classifier, then return it.

    Reads the notebook globals ``X_train``, ``X_train_scaled_standard``,
    ``y_train``, ``X_test_scaled_standard`` and ``y_test``. The test subset is
    passed to ``fit`` as validation data, and the training curves are drawn
    with ``plot_history`` before the fitted model is returned.
    """
    n_features = X_train.shape[1]

    # Layers are created in the order in which they are stacked.
    # NOTE(review): two Dropout layers sit back to back (0.3 then 0.2), and the
    # last 256-unit Dense layer has no activation; confirm both are intended.
    layers = [
        Dense(100, input_shape=(n_features,), activation= "relu", kernel_regularizer=l2(0.01)),
        Dropout(0.3),
        Dense(100, activation= "relu", kernel_regularizer=l2(0.01)),
        Dropout(0.3),
        Dropout(0.2),
        Dense(128, kernel_initializer='glorot_uniform', activation='sigmoid'),
        Dropout(0.2),
        Dense(256, kernel_initializer='glorot_uniform', activation= "relu"),
        Dropout(0.5),
        Dense(256, kernel_initializer='glorot_uniform', activation= "relu"),
        Dropout(0.4),
        Dense(256, kernel_initializer='glorot_uniform'),
        Dropout(0.3),
        # Single sigmoid unit for the binary output.
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.summary()

    training_log = network.fit(
        X_train_scaled_standard,
        y_train,
        batch_size=128,
        epochs=500,
        verbose=0,
        validation_data=(X_test_scaled_standard, y_test),
    )
    plot_history(training_log)
    return network

# Train the network and list the names of the metrics it reports.
NN_model = model()
print(NN_model.metrics_names)

# Evaluate on the sampled test subset (standard-scaled features).
NN_model.evaluate(X_test_scaled_standard, y_test)

# Persist the trained weights ...
NN_model.save_weights('../output/models/NN_model_weights.h5')

# ... and, separately, the architecture as JSON.
with open('../output/models/model_architecture.json', 'w') as architecture_file:
    architecture_file.write(NN_model.to_json())

In [None]:
# Network outputs for the sampled test subset (standard-scaled features).
y_pred = NN_model.predict(X_test_scaled_standard)

In [None]:
# Round the first (only) output of each prediction to obtain a 0/1 label,
# then compare those labels with the true ones.
rounded = list(map(round, y_pred[:, 0]))
y_pred_bin = np.asarray(rounded, dtype='int64')
confusion_matrix(y_test, y_pred_bin)

In [None]:
# ROC points are used for the plot only.
fpr, tpr, _ = roc_curve(y_test, y_pred_bin)
# TNR/TPR are taken from the confusion matrix rather than from fpr[1]/tpr[1]:
# when every prediction falls in one class, roc_curve returns only two points
# and index 1 is the (1, 1) end point, not the classifier's operating point.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_bin, labels=[0, 1]).ravel()
fb = fbeta_score(y_test, y_pred_bin, beta=20, pos_label=1)
print('TNR={}, TPR={}, F20={}'.format(tn / (tn + fp), tp / (tp + fn), fb))
plot_roc(fpr, tpr, 'Neural network')

Test only with Class : 0

# Random forest with scikit-learn

In [None]:
# The random forest uses the min-max scaled features; these names are aliases
# of the scaled arrays, not copies.
X_train_RF = X_train_scaled_MinMax
X_test_RF = X_test_scaled_MinMax

# 150 trees with a fixed seed; the fit call is the cell's displayed output.
random_forest = RandomForestClassifier(n_estimators=150, random_state=12)
random_forest.fit(X_train_RF, y_train)

In [None]:
# Score of the forest on the sampled test subset.
random_forest.score(X_test_RF, y_test)

In [None]:
# Predicted class labels for the sampled test subset.
prediction_RF = random_forest.predict(X_test_RF)

In [None]:
# Confusion matrix of true labels against the forest's predictions.
confusion_matrix(y_test, prediction_RF)

In [None]:
# ROC points are used for the plot only.
fpr, tpr, _ = roc_curve(y_test, prediction_RF)
# TNR/TPR are taken from the confusion matrix rather than from fpr[1]/tpr[1]:
# when every prediction falls in one class, roc_curve returns only two points
# and index 1 is the (1, 1) end point, not the classifier's operating point.
tn, fp, fn, tp = confusion_matrix(y_test, prediction_RF, labels=[0, 1]).ravel()
fb = fbeta_score(y_test, prediction_RF, beta=20, pos_label=1)
print('TNR={}, TPR={}, F20={}'.format(tn / (tn + fp), tp / (tp + fn), fb))
plot_roc(fpr, tpr, 'Random Forest')

### Export model to use in CLI

In [None]:
# Save model to file; the context manager closes the handle once the dump
# has finished, so the pickle is fully flushed to disk.
with open("../output/models/random_forest.pickle.dat", "wb") as model_file:
    pickle.dump(random_forest, model_file)

# AdaBoost with scikit-learn

In [None]:
# AdaBoost uses the min-max scaled features; these names are aliases of the
# scaled arrays, not copies.
X_train_AB = X_train_scaled_MinMax
X_test_AB = X_test_scaled_MinMax

# Fixed seed; the fit call is the cell's displayed output.
AdaBoost = AdaBoostClassifier(learning_rate=0.55, random_state=3)
AdaBoost.fit(X_train_AB, y_train)

In [None]:
# Score of the AdaBoost model on the sampled test subset.
AdaBoost.score(X_test_AB, y_test)

In [None]:
# Predicted class labels for the sampled test subset.
prediction_AdaBoost = AdaBoost.predict(X_test_AB)

In [None]:
# Confusion matrix of true labels against the AdaBoost predictions.
confusion_matrix(y_test, prediction_AdaBoost)

In [None]:
# ROC points are used for the plot only.
fpr, tpr, _ = roc_curve(y_test, prediction_AdaBoost)
# TNR/TPR are taken from the confusion matrix rather than from fpr[1]/tpr[1]:
# when every prediction falls in one class, roc_curve returns only two points
# and index 1 is the (1, 1) end point, not the classifier's operating point.
tn, fp, fn, tp = confusion_matrix(y_test, prediction_AdaBoost, labels=[0, 1]).ravel()
fb = fbeta_score(y_test, prediction_AdaBoost, beta=20, pos_label=1)
print('TNR={}, TPR={}, F20={}'.format(tn / (tn + fp), tp / (tp + fn), fb))
plot_roc(fpr, tpr, 'AdaBoost')

In [None]:
# Print the estimator's textual representation.
print(AdaBoost)

### Export model to use in CLI

In [None]:
# Save model to file; the context manager closes the handle once the dump
# has finished, so the pickle is fully flushed to disk.
with open("../output/models/AdaBoost.pickle.dat", "wb") as model_file:
    pickle.dump(AdaBoost, model_file)

# SVM with scikit-learn

In [None]:
# The SVM trains on the standardised training features and is evaluated on
# the full (unsampled) test split rather than on the sampled subset.
X_train_SVM = X_train_scaled_standard
X_test_SVM = X_test_scaled_standard_all
Y_test_SVM = y_test_all

# RBF-kernel classifier; fit returns the estimator itself.
classifier = svm.SVC(kernel='rbf').fit(X_train_SVM, y_train)
prediction_SVM = classifier.predict(X_test_SVM)

In [None]:
# Score of the SVM on the full test split.
classifier.score(X_test_SVM,Y_test_SVM)

In [None]:
# Confusion matrix of true labels against the SVM predictions.
confusion_matrix(Y_test_SVM, prediction_SVM)

In [None]:
# ROC points are used for the plot only.
fpr, tpr, _ = roc_curve(Y_test_SVM, prediction_SVM)
# TNR/TPR are taken from the confusion matrix rather than from fpr[1]/tpr[1]:
# when every prediction falls in one class, roc_curve returns only two points
# and index 1 is the (1, 1) end point, not the classifier's operating point.
tn, fp, fn, tp = confusion_matrix(Y_test_SVM, prediction_SVM, labels=[0, 1]).ravel()
fb = fbeta_score(Y_test_SVM, prediction_SVM, beta=20, pos_label=1)
print('TNR={}, TPR={}, F20={}'.format(tn / (tn + fp), tp / (tp + fn), fb))
plot_roc(fpr, tpr, 'SVM')

# XGBoost

In [None]:
# XGBoost trains on the unscaled training features and is evaluated on the
# full (unsampled) test split.
X_train_XGB = X_train
X_test_XGB = X_test_all
Y_test_XGB = y_test_all

# Hyper-parameter overrides applied on top of the estimator defaults.
grid = {'max_depth':10}
XGBoost = xgb.XGBClassifier().set_params(**grid)

XGBoost.fit(X_train_XGB, y_train)
prediction_XGB = XGBoost.predict(X_test_XGB)

In [None]:
# Score of the XGBoost model on the full test split.
XGBoost.score(X_test_XGB,Y_test_XGB)

In [None]:
# Confusion matrix of true labels against the XGBoost predictions.
confusion_matrix(Y_test_XGB, prediction_XGB)

In [None]:
# ROC points are used for the plot only.
fpr, tpr, _ = roc_curve(Y_test_XGB, prediction_XGB)
# TNR/TPR are taken from the confusion matrix rather than from fpr[1]/tpr[1]:
# when every prediction falls in one class, roc_curve returns only two points
# and index 1 is the (1, 1) end point, not the classifier's operating point.
tn, fp, fn, tp = confusion_matrix(Y_test_XGB, prediction_XGB, labels=[0, 1]).ravel()
# Y_test_XGB is used throughout so all metrics in this cell share one label array.
fb = fbeta_score(Y_test_XGB, prediction_XGB, beta=20, pos_label=1)
print('TNR={}, TPR={}, F20={}'.format(tn / (tn + fp), tp / (tp + fn), fb))
plot_roc(fpr, tpr, 'XGB')

### Export model to use in CLI

In [None]:
# Save model to file; the context manager closes the handle once the dump
# has finished, so the pickle is fully flushed to disk.
with open("../output/models/XGBoost.pickle.dat", "wb") as model_file:
    pickle.dump(XGBoost, model_file)