In [12]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import joblib

# Ask PyTorch to release GPU memory held by its caching allocator before any
# work starts (presumably to clear leftovers from earlier runs in this kernel
# -- TODO confirm this is still needed).
torch.cuda.empty_cache()

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the trained model object and prepare it for inference.
#
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
#
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU session still loads on a CPU-only machine (without it
# the load fails there, defeating the CPU fallback chosen for `device`).
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled modules -- if this load fails on upgrade, pass
# weights_only=False explicitly (trusted file only) or switch to a state_dict.
model = torch.load("anomaly_detection.pth", map_location=device)
model = model.to(device)
# eval() puts the module in inference mode.
model = model.eval()

In [4]:
# Read the test set; the CSV column named "Unnamed: 0" is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="test_data.csv",
    index_col="Unnamed: 0",
)

In [5]:
# Preview the first rows to check the columns and the `label` values loaded as expected.
df.head()

Unnamed: 0,level_0,index,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,0,0,udp,private,SF,105,146,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,1,3,0,udp,private,SF,105,146,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
2,2,4,0,udp,private,SF,105,146,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
3,3,5,0,udp,private,SF,105,146,0,0,...,255,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,4,6,0,udp,domain_u,SF,29,0,0,0,...,3,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,normal.


In [6]:
# Feature columns selected from the test frame and passed to the preprocessing
# pipeline (one categorical column followed by six numeric ones).
columns = [
    "protocol_type",
    "logged_in",
    "count",
    "srv_count",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_same_src_port_rate",
]

In [7]:
# Keep every row, but only the selected feature columns.
testing_data = df.loc[:, columns]

In [8]:
# Load the preprocessing pipeline; its `transform` is applied to the feature
# columns below before they are fed to the model.
# NOTE(review): joblib.load unpickles the file, so it must only be used on
# artifacts from a trusted source.
pipeline = joblib.load("pipeline_updated.pkl")

In [9]:
# Preprocess the selected features, then hand them to PyTorch as a float32
# tensor that lives on the same device as the model.
input_array = pipeline.transform(testing_data)
input_data = torch.tensor(input_array, dtype=torch.float32).to(device)

In [10]:
# Reconstruct every test row with the model.
# This is inference only: no_grad() stops autograd from recording the forward
# pass, so the result carries no grad_fn and the computation graph for the
# whole test set is not kept alive in (GPU) memory.
with torch.no_grad():
    preds = model(input_data)

In [11]:
# Display the model outputs (one row of values per input row).
preds

tensor([[-0.9626, -1.1387,  0.3218,  ...,  0.2199,  0.0182,  0.0142],
        [-0.9615, -1.1377,  0.3230,  ...,  0.2194,  0.0181,  0.0145],
        [-0.9619, -1.1380,  0.3224,  ...,  0.2196,  0.0182,  0.0146],
        ...,
        [-0.9607, -1.1370,  0.3236,  ...,  0.2191,  0.0181,  0.0149],
        [-1.2380, -0.8450, -2.5745,  ...,  0.7130,  0.1873,  0.3173],
        [-1.2273, -0.8586, -2.4503,  ...,  0.6921,  0.1805,  0.3054]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [13]:
# Mean squared error between a model output and its input; used below as the
# per-row reconstruction error. "mean" is the default reduction, spelled out
# here for clarity.
criterion = nn.MSELoss(reduction="mean")

In [24]:
# Collect the row positions whose reconstruction error exceeds 0.5.
#
# Each prediction is scored against its OWN input row. The previous version
# compared every prediction with input_data[1], so the "error" measured the
# distance to one fixed row rather than the reconstruction error.
#
# The per-row MSE is computed for all rows at once (mean of squared
# differences along the feature axis) instead of one loss call per row, and
# under no_grad() because nothing here needs gradients.
# NOTE(review): the 0.5 threshold differs from the 0.1 used for `predictions`
# further down -- confirm which one is intended.
# NOTE(review): a later cell rebinds `attacks` to a DataFrame of labelled
# attack rows, which discards this list.
with torch.no_grad():
    row_errors = (preds - input_data).pow(2).mean(dim=1)
attacks = torch.nonzero(row_errors > 0.5).flatten().tolist()

In [14]:
# Ground-truth attack rows: every record whose label is not "normal.".
is_attack = df["label"].ne("normal.")
attacks = df.loc[is_attack]

In [19]:
# Display the labelled attack rows (pandas truncates the rendering of long frames).
attacks

Unnamed: 0,level_0,index,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
1,1,3,0,udp,private,SF,105,146,0,0,...,254,1.00,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
2,2,4,0,udp,private,SF,105,146,0,0,...,254,1.00,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
3,3,5,0,udp,private,SF,105,146,0,0,...,255,1.00,0.00,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
9,9,13,0,udp,private,SF,105,146,0,0,...,252,0.99,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
14,14,19,0,udp,private,SF,105,146,0,0,...,254,1.00,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77112,77112,309594,0,udp,private,SF,105,105,0,0,...,253,0.99,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
77113,77113,309595,0,udp,private,SF,105,105,0,0,...,253,0.99,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
77145,77145,309742,0,udp,private,SF,105,147,0,0,...,255,1.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
77178,77178,309995,0,udp,private,SF,105,147,0,0,...,254,1.00,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


In [20]:
# Walk one labelled attack row (position 77112) through the whole path --
# raw features -> pipeline -> model -> reconstruction error -- and print each
# intermediate value as a sanity check of the per-row scoring.

# A list indexer returns a one-row DataFrame directly, so every column keeps
# its original dtype. Going through a single-row Series and rebuilding a frame
# from it would pass the values through object dtype and rely on re-inference.
point_orig = df.iloc[[77112]]
label = point_orig.label
point = point_orig[columns]
point_tr = pipeline.transform(point)
point_inp = torch.tensor(point_tr, dtype=torch.float32)
point_inp = point_inp.to(device=device)
# Inference only: skip autograd bookkeeping for the forward pass and the loss.
with torch.no_grad():
    prediction = model(point_inp)
    error = criterion(prediction, point_inp)

print(f"Original datapoint: {point}")
print(f"Pipeline output: {point_tr}")
print(f"Input to the model: {point_inp}")
print(f"Output of the model: {prediction}")
print(f"Label is: {label}")
print(f"Error: {error}")

Original datapoint:       protocol_type  logged_in  count  srv_count  srv_diff_host_rate  \
77112           udp          0      2          1                 0.0   

       dst_host_count  dst_host_same_src_port_rate  
77112             255                          0.0  
Pipeline output: [[-1.54887479 -1.18516921  0.34815115  0.          0.          1.
   0.          0.          0.        ]]
Input to the model: tensor([[-1.5489, -1.1852,  0.3482,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
          0.0000]], device='cuda:0')
Output of the model: tensor([[-0.9618, -1.1380,  0.3227, -0.0114,  0.9145,  0.1168,  0.2195,  0.0181,
          0.0144]], device='cuda:0', grad_fn=<AddmmBackward0>)
Label is: 77112    snmpgetattack.
Name: label, dtype: object
Error: 0.22365699708461761


In [58]:
# Turn reconstruction errors into binary predictions:
# 1 (anomaly) when a row's MSE is >= 0.1, otherwise 0 (normal).
#
# The per-row MSE is computed for the whole batch in one shot (mean of squared
# differences along the feature axis). The row-by-row loop it replaces issued
# one tiny loss computation plus one device-to-host sync per row, and recorded
# autograd history for each of them.
# NOTE(review): 0.1 is a hand-picked threshold (and differs from the 0.5 used
# in the earlier `attacks` cell) -- consider tuning it on validation data.
with torch.no_grad():
    row_errors = (preds - input_data).pow(2).mean(dim=1)
predictions = (row_errors >= 0.1).int().tolist()

# One prediction per test row, in the same order as `df`.
assert len(predictions) == df.shape[0]

In [59]:
# Put the ground-truth label next to the model's 0/1 prediction, row by row
# (the frame keeps df's index; `predictions` is matched by position).
results_df = df[["label"]].assign(prediction=predictions)

In [60]:
# Encode the ground truth as 0 = "normal." and 1 = any attack type.
# The values are derived from the untouched df["label"] rather than from
# results_df["label"] itself, so re-running this cell gives the same result.
# (Mapping the column onto itself turned every already-encoded 0/1 into 1 on
# a second run.) The vectorized comparison also replaces a per-row lambda.
results_df["label"] = df["label"].ne("normal.").astype(int)

In [61]:
from sklearn.metrics import accuracy_score

# Fraction of rows where the thresholded prediction equals the binary label.
accuracy_score(
    y_true=results_df["label"],
    y_pred=results_df["prediction"],
)

0.5959296683960617