<a href="https://colab.research.google.com/github/ntauth/28th-summer-school-hpc/blob/master/src/ml/PTAnomalyDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---
## Dependencies

In [1]:
# Install Dependencies
!pip install numpy pandas matplotlib scikit-learn tensorflow pyod

Collecting pyod
[?25l  Downloading https://files.pythonhosted.org/packages/17/d5/12bf702a918ac36e944fedb2257d51167b049b951f2b72503fc9a54dc9e8/pyod-0.7.7.1.tar.gz (87kB)
[K     |████████████████████████████████| 92kB 3.0MB/s 
Collecting combo
  Downloading https://files.pythonhosted.org/packages/78/52/e880bd923eba122515307d29ab43c1c356bad60610c27bed2cdec25d0240/combo-0.1.0.tar.gz
Collecting suod
[?25l  Downloading https://files.pythonhosted.org/packages/8b/b9/aeaa3371f71fc039da982979ea6d6ffd9e356926c9fc630f42a5dc9d494e/suod-0.0.3.tar.gz (2.1MB)
[K     |████████████████████████████████| 2.1MB 9.8MB/s 
Building wheels for collected packages: pyod, combo, suod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-0.7.7.1-cp36-none-any.whl size=98961 sha256=b79242860a7b17f4b3fe83b9daa47ad08dc85293e784e296d22a8910ee845af0
  Stored in directory: /root/.cache/pip/wheels/9c/9e/bc/5bac69bdb00bfefbbfa74a43409d328962c349ad1b0954470a
  Building wheel 



---
## Imports & Definitions

In [31]:
import numpy as np
import pandas as pd
import torch as T

from sklearn import manifold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import altair as alt

from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize


class PTAutoEncoder(T.nn.Module):
    """Plain (non-variational) fully connected autoencoder.

    The network takes a 126-wide input, encodes it through 16- and 8-unit
    layers, applies an 8-unit embedding layer, and decodes through a 16-unit
    layer back to 126 units. ``tanh`` follows every layer, including the
    last one, so every output value is bounded to [-1, 1].
    """

    # Layer widths. Names with two leading underscores are mangled to
    # `_PTAutoEncoder__hl_*`, so methods must read them through `self.`;
    # a bare `__hl_1_units` inside a method is resolved as a (missing) global.
    __hl_1_units = 126
    __hl_2_units = 16
    __hl_3_units = 8
    __hl_4_units = 8
    __hl_5_units = 16
    __hl_6_units = 126

    def __init__(self):
        """Create the activation and the five linear layers."""
        super().__init__()

        self.tanh = T.nn.Tanh()

        # Encoder
        self.layer1 = T.nn.Linear(self.__hl_1_units, self.__hl_2_units)
        self.layer2 = T.nn.Linear(self.__hl_2_units, self.__hl_3_units)

        # Embedding
        self.layer3 = T.nn.Linear(self.__hl_3_units, self.__hl_4_units)

        # Decoder
        self.layer4 = T.nn.Linear(self.__hl_4_units, self.__hl_5_units)
        self.layer5 = T.nn.Linear(self.__hl_5_units, self.__hl_6_units)

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor whose last dimension is 126 (the width of ``layer1``).

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 126.
        """
        # Encoder
        z = self.tanh(self.layer1(x))
        z = self.tanh(self.layer2(z))

        # Embedding
        z = self.tanh(self.layer3(z))

        # Decoder
        z = self.tanh(self.layer4(z))
        z = self.tanh(self.layer5(z))

        return z


def unpack_trace_sample(x):
  """Return the hexadecimal value of every character of ``x`` as a list of ints."""
  return [int(digit, 16) for digit in x]


# Lift Altair's row limit so the charts in later cells can embed large frames.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [0]:
# Load data
# Each file is read as a single unnamed column of trace strings. dtype=str
# keeps a column made only of digits from being parsed as numbers, which
# would break the per-character decoding below.
cfg_data = pd.read_csv('cfggen.out', header=None, dtype=str)
rop_data = pd.read_csv('ropgen.out', header=None, dtype=str)

# Expand every trace string into one integer column per hex digit.
cfg_data = cfg_data[0].apply(lambda x: pd.Series(unpack_trace_sample(x)))
rop_data = rop_data[0].apply(lambda x: pd.Series(unpack_trace_sample(x)))

cfg_data['label'] = 0  # benign trace
rop_data['label'] = 1  # malicious trace (`mprotect` chain)

# ignore_index gives every row a unique label; without it both frames keep
# their own 0..n-1 labels and an index-based split cannot tell them apart.
data = pd.concat([cfg_data, rop_data], ignore_index=True)

# NOTE(review): the scaler is fitted on all rows (train and test). It is kept
# that way because later cells reuse `feature_scaler` with their own splits.
feature_scaler = StandardScaler().fit(data.loc[:, data.columns != 'label'])

# Split on the original row labels first and reset the indices afterwards.
# Resetting before the membership test would compare against 0..n/2-1
# instead of the labels of the rows that were actually sampled.
train_rows = data.sample(frac=0.5)
test_rows = data.drop(index=train_rows.index)

train_data = train_rows.reset_index(drop=True)
test_data = test_rows.reset_index(drop=True)

X_train = pd.DataFrame(feature_scaler.transform(
    train_data.loc[:, train_data.columns != 'label']))
y_train = train_data.loc[:, ['label']]

X_test = pd.DataFrame(feature_scaler.
                      transform(test_data.loc[:, test_data.columns != 'label']))
y_test = test_data.loc[:, ['label']]



---
## Principal Component Analysis


In [4]:
# PCA
# Project the scaled training features onto two principal components.
pca = PCA(n_components=2)
x_pca = pd.DataFrame(pca.fit_transform(X_train), columns=['PC1', 'PC2'])

# Plot
# Attach the labels column-wise, then keep a random 10% of the rows.
x_pca = pd.concat([x_pca, y_train], axis=1).sample(frac=0.1)

alt.Chart(x_pca).mark_point().encode(
  x='PC1',
  y='PC2',
  color='label:O'
).interactive()



---
## t-SNE


In [5]:
# t-SNE
# Draw 5000 random rows, split off the label column, and standardise the
# remaining feature columns with a scaler fitted on this subsample only.
data2 = data.sample(n=5000).reset_index(drop=True)
y_train2 = data2.loc[:, ['label']]
X_train2 = pd.DataFrame(
    StandardScaler().fit_transform(data2.drop(columns='label')))

# Two-dimensional embedding, PCA-initialised, with a fixed random state.
man = manifold.TSNE(n_components=2, init='pca', random_state=0)
tsne = man.fit_transform(X_train2)

# Embedding coordinates side by side with the labels.
dtsne = pd.concat([pd.DataFrame(tsne, columns=["x1", "x2"]), y_train2], axis=1)

alt.Chart(dtsne).mark_circle().encode(
    x='x1',
    y='x2',
    color=alt.Color('label:O', scale=alt.Scale(scheme='viridis')),
    size=alt.value(50),
    tooltip=['x1', 'x2', 'label:O'],
).properties(title="t-SNE").interactive()



---
## Autoencoder
- Hidden Layer Configuration: 36, 16, 6, 16, 36


In [97]:
# Autoencoder
# Sample 50% of the malicious traces and 45% of the benign ones for training;
# the rows that were not sampled form the test set.
rop_data1 = rop_data.sample(frac=0.5)
rop_data2 = rop_data.drop(index=rop_data1.index)

cfg_data1 = cfg_data.sample(frac=0.45)
cfg_data2 = cfg_data.drop(index=cfg_data1.index)

ae_data = pd.concat([cfg_data1, rop_data1])
ae_train_data = ae_data
ae_test_data = pd.concat([cfg_data2, rop_data2])

# Scale the feature columns with the scaler fitted in the data-loading cell.
X_train_ae = pd.DataFrame(
    feature_scaler.transform(ae_train_data.drop(columns='label')))
X_test_ae = pd.DataFrame(
    feature_scaler.transform(ae_test_data.drop(columns='label')))

y_train_ae = ae_train_data[['label']]
y_test_ae = ae_test_data[['label']]

# Share of benign rows in the training set, passed to the model as its
# contamination value.
c = len(cfg_data1) / len(ae_data)

print(c)

ae1 = AutoEncoder(hidden_neurons=[36, 16, 6, 16, 36], contamination=c, epochs=30)
ae1.fit(X_train_ae)

0.48891786179921776
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_75 (Dense)             (None, 36)                1332      
_________________________________________________________________
dropout_65 (Dropout)         (None, 36)                0         
_________________________________________________________________
dense_76 (Dense)             (None, 36)                1332      
_________________________________________________________________
dropout_66 (Dropout)         (None, 36)                0         
_________________________________________________________________
dense_77 (Dense)             (None, 36)                1332      
_________________________________________________________________
dropout_67 (Dropout)         (None, 36)                0         
_________________________________________________________________
dense_78 (Dense)             (Non

AutoEncoder(batch_size=32, contamination=0.48891786179921776,
      dropout_rate=0.2, epochs=30, hidden_activation='relu',
      hidden_neurons=[36, 16, 6, 16, 36], l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7fd67455da60>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)



---
### Anomaly Scores


In [98]:
# Scores and labels the fitted model assigned to its own training data
y_train_scores = ae1.decision_scores_
y_train_pred = ae1.labels_

# Predicted labels and outlier scores for the test set
y_test_pred = ae1.predict(X_test_ae)
y_test_scores = pd.DataFrame(ae1.decision_function(X_test_ae),
                             columns=['score'])

# Histogram of the test scores
hist = alt.Chart(y_test_scores).mark_bar().encode(
    x=alt.X("score:Q"),
    y='count()',
)

# The model's decision threshold, drawn as a red vertical rule
thresh = pd.DataFrame({"threshold": [ae1.threshold_]})
line = alt.Chart(thresh).mark_rule(color='red').encode(x='threshold:Q')

layered = hist + line
layered.properties(title="Test Set Anomaly Scores").interactive()



---
### ROC

In [111]:
# Test AE
# Per-sample view of the test set: scaled features, outlier score, and a
# cluster id that is 0 below the model threshold and 1 otherwise.
df_test = X_test_ae.copy()
df_test['score'] = y_test_scores
df_test['cluster'] = np.where(df_test['score'] < ae1.threshold_, 0, 1)

# Invert the labels (0 <-> 1) before comparing them with the model output.
# NOTE(review): presumably because the benign traces play the outlier role
# for this detector - confirm.
y_train_ae_ = 1 - y_train_ae
y_test_ae_ = 1 - y_test_ae

model_name = 'Non-Variational Deep Autoencoder'

print("[+] On Training Data:")
evaluate_print(model_name, y_train_ae_, y_train_scores)
print("[+] On Test Data:")
evaluate_print(model_name, y_test_ae_, y_test_scores)

# Flatten the one-column label frame so it compares element-wise with the
# predicted labels.
y_test_ae_ = y_test_ae_.values.ravel()

n_correct = np.sum(y_test_ae_ == y_test_pred)
accuracy_pct = 100 * (n_correct / float(len(y_test_ae_)))
print("[+] Model Accuracy: {}".format(accuracy_pct))

[0]
[+] On Training Data:
Non-Variational Deep Autoencoder ROC:1.0, precision @ rank n:0.9991
[+] On Test Data:
Non-Variational Deep Autoencoder ROC:1.0, precision @ rank n:0.9995
[+] Model Accuracy: 99.92160193381898
