In [39]:
import torch 
import pandas as pd 
import torch.nn as nn 
from torch.utils.data import random_split, DataLoader, TensorDataset 
import torch.nn.functional as F 
import numpy as np 
import torch.optim as optim 
from torch.optim import Adam 

In [40]:
# Read the dataset from the CSV file and preview its first rows
csv_path = 'dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Initial packet size,Initial TTL,Window size,DF flag,port,Max Segment Size,Window Scalling Value,sackOK,Target
0,66,128,32,1,49201,1460,8,1,windows
1,66,128,32,1,49202,1460,8,1,windows
2,66,54,15525,1,49202,1410,8,1,windows
3,54,128,2,1,49202,0,0,0,windows
4,66,55,15525,1,49201,1410,8,1,windows


In [41]:
# Remove the 'port' column so it is not fed to the model as a feature.
# drop(..., errors='ignore') returns a new frame and keeps this cell re-runnable:
# the previous `del df['port']` raised KeyError when the cell was executed twice.
df = df.drop(columns=['port'], errors='ignore')
df

Unnamed: 0,Initial packet size,Initial TTL,Window size,DF flag,Max Segment Size,Window Scalling Value,sackOK,Target
0,66,128,32,1,1460,8,1,windows
1,66,128,32,1,1460,8,1,windows
2,66,54,15525,1,1410,8,1,windows
3,54,128,2,1,0,0,0,windows
4,66,55,15525,1,1410,8,1,windows
...,...,...,...,...,...,...,...,...
4856,180,64,65535,1,0,0,0,android
4857,590,64,65535,1,0,0,0,android
4858,384,64,65535,1,0,0,0,android
4859,312,16,65535,1,0,0,0,android


In [42]:
# Class balance of the label column. The column is selected by name because the
# last column stops being 'Target' once a later cell appends 'Target_num'.
df['Target'].value_counts()

windows    3978
android     883
Name: Target, dtype: int64

In [43]:
# Convert the Target labels into numeric class ids: windows=0, android=1
labels = {'windows':0, 'android':1}

# Fail loudly on a label that has no class id; a bare .map() would silently
# insert NaN and turn the column into floats.
unknown_targets = set(df['Target'].unique()) - set(labels)
if unknown_targets:
    raise KeyError('Unmapped Target values: %s' % sorted(map(str, unknown_targets)))

df['Target_num'] = df['Target'].map(labels).astype('int64')   # new numeric label column
df

Unnamed: 0,Initial packet size,Initial TTL,Window size,DF flag,Max Segment Size,Window Scalling Value,sackOK,Target,Target_num
0,66,128,32,1,1460,8,1,windows,0
1,66,128,32,1,1460,8,1,windows,0
2,66,54,15525,1,1410,8,1,windows,0
3,54,128,2,1,0,0,0,windows,0
4,66,55,15525,1,1410,8,1,windows,0
...,...,...,...,...,...,...,...,...,...
4856,180,64,65535,1,0,0,0,android,1
4857,590,64,65535,1,0,0,0,android,1
4858,384,64,65535,1,0,0,0,android,1
4859,312,16,65535,1,0,0,0,android,1


In [44]:
# Define input and output datasets
# Features are every column except the two label columns. Dropping them by name
# (instead of slicing off "the last two columns") keeps a label from leaking into
# the features if the column order changes or another column is appended to df.
# NOTE: `input` shadows the Python builtin; the name is kept because later cells read it.
input = df.drop(columns=['Target', 'Target_num'])   # inputs.
print('\nInput values are:')
print(input.head())
output = df.loc[:, 'Target_num']   # targets
print('\nThe output value is:')
print(output.head())


Input values are:
   Initial packet size  Initial TTL  Window size  DF flag  Max Segment Size  \
0                   66          128           32        1              1460   
1                   66          128           32        1              1460   
2                   66           54        15525        1              1410   
3                   54          128            2        1                 0   
4                   66           55        15525        1              1410   

   Window Scalling Value  sackOK  
0                      8       1  
1                      8       1  
2                      8       1  
3                      0       0  
4                      8       1  

The output value is:
0    0
1    0
2    0
3    0
4    0
Name: Target_num, dtype: int64


In [45]:
# Convert Input and Output data to Tensors and create a TensorDataset
# torch.tensor(..., dtype=...) replaces the legacy torch.Tensor(...) constructor and
# states the float32 conversion explicitly.
input = torch.tensor(input.to_numpy(), dtype=torch.float32)
print('\nInput format: ', input.shape, input.dtype)     # Input format: torch.Size([4861, 7]) torch.float32
output = torch.tensor(output.to_numpy())        # dtype follows the NumPy array: torch.int64
print('Output format: ', output.shape, output.dtype)  # Output format: torch.Size([4861]) torch.int64
data = TensorDataset(input, output)    # pairs each feature row with its class id for splitting and batching


Input format:  torch.Size([4861, 7]) torch.float32
Output format:  torch.Size([4861]) torch.int64


In [46]:
# Split to Train, Validate and Test sets using random_split
train_batch_size = 128
split_seed = 42             # fixed seed: the same rows land in the same split on every run
number_rows = len(input)    # The size of our dataset or the number of rows in csv table.
test_split = int(number_rows*0.1)
validate_split = int(number_rows*0.1)
# The training set takes the remainder so the three sizes always sum to number_rows.
train_split = number_rows - test_split - validate_split
split_generator = torch.Generator().manual_seed(split_seed)
train_set, validate_set, test_set = random_split(
    data, [train_split, validate_split, test_split], generator=split_generator)

# Create Dataloader to read the data within batch sizes and put into memory.
train_loader = DataLoader(train_set, batch_size = train_batch_size, shuffle = True)
# The evaluation loaders yield one sample per batch (batch_size = 1).
validate_loader = DataLoader(validate_set, batch_size = 1)
test_loader = DataLoader(test_set, batch_size = 1)

In [47]:
# Model hyper-parameters
learning_rate = 0.001
input_size = input.shape[1]     # one input unit per feature column (7 for this dataset)
output_size = len(labels)       # one output logit per class in `labels`


# Define neural network 
class Network(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    Args:
        input_size: number of features in each input row.
        output_size: number of classes (length of the returned logit vector).
        hidden_size: width of both hidden layers. Defaults to 24, the value
            that was previously hard-coded.
    """

    def __init__(self, input_size, output_size, hidden_size=24):
        super().__init__()

        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw class logits for a batch of feature rows.

        Args:
            x: float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with last dimension ``output_size``. No softmax is applied;
            the notebook's nn.CrossEntropyLoss works on raw logits.
        """
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        return self.layer3(hidden)
 
# Build the network instance that train() optimises and saveModel() writes to disk
model = Network(input_size=input_size, output_size=output_size)

In [48]:
# Function to save the model 
def saveModel(path="./NetModel.pth"):
    """Write the learned parameters of the global ``model`` to disk.

    Only the ``state_dict`` is stored, so loading it back requires building a
    network with the same layer sizes first.

    Args:
        path: destination file. Defaults to "./NetModel.pth", the location that
            was previously hard-coded, so existing ``saveModel()`` calls are
            unaffected.
    """
    torch.save(model.state_dict(), path)

In [49]:
# Define the loss function with Classification Cross-Entropy loss and an optimizer with Adam optimizer
loss_fn = nn.CrossEntropyLoss()
# Use the learning_rate defined with the model parameters instead of repeating the
# literal, so changing that single value actually changes the optimizer.
optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)

In [50]:
# Training Function 
def train(num_epochs):
    """Train the global ``model`` and checkpoint the best-scoring epoch.

    Every epoch runs one optimisation pass over ``train_loader`` and then
    scores the model on ``validate_loader``. Whenever the validation accuracy
    exceeds the best value seen so far, the weights are written to disk with
    ``saveModel()``. One summary line is printed per epoch.

    Args:
        num_epochs: number of full passes over the training set.
    """
    best_accuracy = 0.0

    print("Begin training...")
    for epoch in range(1, num_epochs+1):
        running_train_loss = 0.0
        running_accuracy = 0.0
        running_vall_loss = 0.0
        total = 0

        # Training Loop
        # The validation phase below puts the model in eval mode, so training
        # mode has to be restored at the start of every epoch.
        model.train()
        for inputs, outputs in train_loader:
            optimizer.zero_grad()   # zero the parameter gradients
            predicted_outputs = model(inputs)   # predict output from the model
            train_loss = loss_fn(predicted_outputs, outputs)   # calculate loss for the predicted output
            train_loss.backward()   # backpropagate the loss
            optimizer.step()        # adjust parameters based on the calculated gradients
            running_train_loss += train_loss.item()  # track the loss value

        # Calculate training loss value (mean over batches; guarded for an empty loader)
        train_loss_value = running_train_loss / max(len(train_loader), 1)

        # Validation Loop
        model.eval()
        with torch.no_grad():
            for inputs, outputs in validate_loader:
                predicted_outputs = model(inputs)
                val_loss = loss_fn(predicted_outputs, outputs)

                # The label with the highest value will be our prediction
                _, predicted = torch.max(predicted_outputs, 1)
                running_vall_loss += val_loss.item()
                total += outputs.size(0)
                running_accuracy += (predicted == outputs).sum().item()

        # Calculate validation loss value (mean over batches; guarded for an empty loader)
        val_loss_value = running_vall_loss / max(len(validate_loader), 1)

        # Accuracy = correct validation predictions / all validation predictions, in percent.
        accuracy = (100 * running_accuracy / total) if total else 0.0

        # Save the model if the accuracy is the best
        if accuracy > best_accuracy:
            saveModel()
            best_accuracy = accuracy

        # Print the statistics of the epoch
        print('Completed training batch', epoch, 'Training Loss is: %.4f' %train_loss_value, 'Validation Loss is: %.4f' %val_loss_value, 'Accuracy is %d %%' % (accuracy))

In [51]:
# Function to test the model 
def test():
    """Print the overall accuracy of the saved model on ``test_loader``.

    A fresh ``Network`` is built and the weights stored in "NetModel.pth" are
    loaded into it, so the figure reflects the saved checkpoint rather than
    the current state of the global model.
    """
    # Load the model that was saved during training
    model = Network(input_size, output_size)
    path = "NetModel.pth"
    model.load_state_dict(torch.load(path))
    model.eval()    # inference mode for evaluation

    running_accuracy = 0
    total = 0

    with torch.no_grad():
        for inputs, outputs in test_loader:
            predicted_outputs = model(inputs)
            _, predicted = torch.max(predicted_outputs, 1)
            total += outputs.size(0)
            # Labels stay integer class ids; they are compared to the argmax directly.
            running_accuracy += (predicted == outputs).sum().item()

    # Report the number of samples actually evaluated and avoid dividing by zero
    # when the test set is empty.
    accuracy = (100 * running_accuracy / total) if total else 0.0
    print('Accuracy of the model based on the test set of', total ,'inputs is: %d %%' % accuracy)
 
 
# Optional: Function to report which class (windows or android) was easier to predict
def test_species():
    """Print the accuracy of the saved model separately for every class.

    A fresh ``Network`` is built and the weights stored in "NetModel.pth" are
    loaded into it. Predictions on ``test_loader`` are then tallied per true
    class id. Batches of any size are handled, and a class with no test
    samples is reported as such instead of causing a division by zero.
    """
    # Load the model that was saved during training
    model = Network(input_size, output_size)
    path = "NetModel.pth"
    model.load_state_dict(torch.load(path))
    model.eval()    # inference mode for evaluation

    labels_length = len(labels)             # number of classes in the label mapping
    labels_correct = [0] * labels_length    # correct predictions per class id
    labels_total = [0] * labels_length      # test samples seen per class id

    with torch.no_grad():
        for inputs, outputs in test_loader:
            predicted_outputs = model(inputs)
            _, predicted = torch.max(predicted_outputs, 1)

            # Tally sample by sample so the result does not depend on batch size.
            for true_id, predicted_id in zip(outputs.tolist(), predicted.tolist()):
                labels_total[true_id] += 1
                if predicted_id == true_id:
                    labels_correct[true_id] += 1

    # Resolve names through the mapping itself rather than relying on the
    # dictionary's insertion order matching the numeric ids.
    for name, class_id in sorted(labels.items(), key=lambda item: item[1]):
        if labels_total[class_id] == 0:
            print('Accuracy to predict %5s : n/a (no test samples)' % name)
            continue
        print('Accuracy to predict %5s : %2d %%' % (name, 100 * labels_correct[class_id] / labels_total[class_id]))

In [52]:
if __name__ == "__main__":
    # Train first, then report overall and per-class accuracy of the saved model.
    num_epochs = 50
    train(num_epochs)
    print('Finished Training\n')
    for evaluate in (test, test_species):
        evaluate()

Begin training...
Completed training batch 1 Training Loss is: 43.2055 Validation Loss is: 14.1142 Accuracy is 33 %
Completed training batch 2 Training Loss is: 6.6382 Validation Loss is: 10.1986 Accuracy is 79 %
Completed training batch 3 Training Loss is: 17.6113 Validation Loss is: 30.4066 Accuracy is 79 %
Completed training batch 4 Training Loss is: 18.5211 Validation Loss is: 30.5431 Accuracy is 79 %
Completed training batch 5 Training Loss is: 13.4846 Validation Loss is: 12.8586 Accuracy is 36 %
Completed training batch 6 Training Loss is: 11.0713 Validation Loss is: 4.2560 Accuracy is 47 %
Completed training batch 7 Training Loss is: 12.5369 Validation Loss is: 17.7198 Accuracy is 79 %
Completed training batch 8 Training Loss is: 12.1830 Validation Loss is: 8.3947 Accuracy is 41 %
Completed training batch 9 Training Loss is: 9.9076 Validation Loss is: 8.2076 Accuracy is 78 %
Completed training batch 10 Training Loss is: 8.5026 Validation Loss is: 7.3332 Accuracy is 79 %
Complete