In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import openai

In [2]:
# Load the dataset
train_data = pd.read_csv("UNSW_NB15_training-set.csv")
test_data = pd.read_csv("UNSW_NB15_testing-set.csv")
print(pd.isnull(train_data).values.any(), pd.isnull(test_data).values.any())

False False


## Data Preprosessing 

In [3]:
train_data.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [5]:
test_data.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

In [7]:
# encode obj columns
le = LabelEncoder()
train_data[['proto','service','state', 'attack_cat']] = train_data[['proto','service','state', 'attack_cat']].apply(lambda col: le.fit_transform(col))
test_data[['proto','service','state', 'attack_cat']] = test_data[['proto','service','state', 'attack_cat']].apply(lambda col: le.fit_transform(col))

In [8]:
# The input and label batches are extracted from the tensors using slicing.
input_ids = torch.tensor(train_data.iloc[:, 1:44].values, dtype=torch.long)
labels = torch.tensor(train_data["label"].values, dtype=torch.long)

## Constracting a model

In [15]:
class GPT2Model(nn.Module):
    def __init__(self, vocab_size, hidden_size=768, nhead=12, num_layers=12):
        super(GPT2Model, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = nn.Transformer(hidden_size, nhead, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x = self.transformer(x)
        x = self.fc(x)
        return x

In [16]:
# Load the pre-trained model
model = GPT2Model(vocab_size=50257)
state = openai.Model(engine="davinci")

In [17]:
# Fine-tune the model
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
num_epochs = 10
batch_size = 10
accumulation_steps = 4  # number of steps to accumulate gradients before updating the model parameters

In [21]:
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(0, input_ids.shape[0], batch_size):
        if (i + batch_size > input_ids.shape[0]):
            break
        optimizer.zero_grad()
        input_batch = input_ids[i:i + batch_size]
        label_batch = labels[i:i + batch_size]
        output = model(input_batch)
        loss = criterion(output, label_batch) / accumulation_steps  # divide the loss by the accumulation steps
        loss.backward()
        if (input_batch.shape[0] % accumulation_steps == 0):  # update the parameters every "accumulation_steps" steps
            optimizer.step()
        total_loss += loss.item()
    print("Epoch: {} Loss: {}".format(epoch, total_loss / (input_ids.shape[0] / batch_size)))

IndexError: index out of range in self

### Evaluate the model on the test dataset

In [None]:
# The input and label batches are extracted from the tensors using slicing.
input_ids = torch.tensor(test_data.iloc[:, 1:44].values, dtype=torch.long)
labels = torch.tensor(test_data["label"].values, dtype=torch.long)

In [None]:
with torch.no_grad():
    outputs = model(input_ids_test)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == labels_test).sum().item() / labels_test.shape[0]
    print("Accuracy on test dataset: {:.2f}%".format(accuracy * 100))