In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import optim
from tqdm import tqdm

In [2]:
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('object') or True):
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            #drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df


def preprocessing(df):
    # df["label"][df["label"]=="benign"] = 0
    # df["label"][df["label"]=="outlier"] = 1
    # df["label"][df["label"]=="malicious"] = 2
    # df["label"] = df["label"].astype(np.int16)

    df["dest_port"] = df["dest_port"] / df["dest_port"].max()
    df["src_port"] = df["src_port"] / df["src_port"].max()
    df["dest_port"][df["dest_port"].isna()] = -1
    df["src_port"][df["src_port"].isna()] = -1


class ThreatDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.features = self.df.drop(columns = ['label']).values
        df_enc = oneHotEncode(self.df, ["label"])
        self.label = df_enc[["label_benign", "label_malicious", "label_outlier"]].to_numpy()
        # self.label = self.df.label.values
        
    def __len__(self):
        return (len(self.df))
    
    def __getitem__(self, idx):
        return (torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.label[idx], dtype=torch.float))

In [3]:
path = "Dataset/2020/06/2020.06.19/2020.06.19.csv"
df = pd.read_csv(path)
df

Unnamed: 0,avg_ipt,bytes_in,bytes_out,dest_ip,dest_port,entropy,num_pkts_out,num_pkts_in,proto,src_ip,src_port,time_end,time_start,total_entropy,label,duration
0,7.500000,342,3679,786,9200.0,5.436687,2,2,6,786,57392.0,1592533725648144,1592533725632946,21860.918000,benign,0.015198
1,0.000000,0,0,786,55972.0,0.000000,1,1,6,49453,49493.0,1592533744644904,1592533744644904,0.000000,outlier,0.000000
2,0.500000,15440,942,786,9300.0,2.203135,3,3,6,786,60512.0,1592533770936279,1592533770933553,36091.754000,benign,0.002726
3,20.217391,622,31010,786,9300.0,1.189945,23,5,6,786,60490.0,159253376770238,15925337672353,37640.355000,benign,0.467080
4,0.000000,0,0,786,59498.0,0.000000,1,1,6,786,9300.0,1592533772973114,1592533772973087,0.000000,benign,0.000027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765355,118.000000,270,191,786,445.0,4.570315,6,6,6,38592,50822.0,1592611182688869,1592611181766454,2106.915300,outlier,0.922415
765356,6.000000,340,611,786,9200.0,6.196277,2,2,6,786,51148.0,159261118308182,1592611183069313,5892.659700,benign,0.012507
765357,1.111111,348,9126,786,9200.0,2.999871,9,3,6,786,51146.0,1592611183081913,1592611183070674,28420.775000,benign,0.011239
765358,1602.500000,112,112,786,,1.060336,2,2,1,16509,,1592611179165795,159261117275518,237.515150,malicious,6.410615


In [4]:
preprocessing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dest_port"][df["dest_port"].isna()] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["src_port"][df["src_port"].isna()] = -1


In [5]:
X = df.drop(columns = ['label'])
y = df["label"]
# X = torch.tensor(X, dtype=torch.float32)
# y = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)
df_train = pd.concat([pd.DataFrame(X_train), y_train], axis=1)
df_test = pd.concat([pd.DataFrame(X_test), y_test], axis=1)

In [6]:
train_loader = DataLoader(ThreatDataset(df_train), batch_size=64, shuffle=True)
test_loader = DataLoader(ThreatDataset(df_test), batch_size=64, shuffle=False)

In [10]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(15, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 3),
            nn.Sigmoid(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [11]:
model = NeuralNetwork().float()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [12]:
def test(dataloader, model):
    size = len(test_loader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in tqdm(test_loader):
            pred = model(X)
            test_loss += loss_fn(pred, y).item()  # View target labels as column vector
            correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [13]:
# Train the model (adjusting for the different feature size)
epochs = 5
size = len(train_loader.dataset)
for t in (range(epochs)):
    for batch, (X, y) in tqdm(enumerate(train_loader), total=size/64):
        # Compute prediction and loss
        X = X.view(-1, 15)  # Reshape input to match the feature size
        pred = model(X)
        loss = loss_fn(pred, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss_val, current = loss.item(), batch * len(X)
    print(f"loss: {loss_val:>7f}  [{current:>5d}/{size:>5d}]")

test(test_loader, model)

torch.save(model.state_dict(), 'model3.pt')


  0%|          | 0/8371.125 [00:00<?, ?it/s]

8372it [00:25, 334.00it/s]                              


loss: 1.164458  [66968/535752]


8372it [00:26, 310.56it/s]                              


loss: 1.098612  [66968/535752]


8372it [00:27, 304.89it/s]                              


loss: 1.098612  [66968/535752]


8372it [00:28, 292.82it/s]                              


loss: 1.069035  [66968/535752]


8372it [00:29, 280.63it/s]                              


loss: 1.069035  [66968/535752]


100%|██████████| 3588/3588 [00:06<00:00, 525.38it/s]

Test Error: 
 Accuracy: 48.8%, Avg loss: 0.017415 




