In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset,DataLoader

In [2]:
columns= [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
        'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
        'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
        'attack_label', 'difficulty'
    ]

In [8]:
df=pd.read_csv('data/KDDTrain+.txt',names=columns)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [43]:

categorical_features = ['protocol_type', 'service', 'flag']
df_encoded = pd.get_dummies(df, columns=categorical_features)


df_processed = df_encoded.drop('attack_label', axis=1)
scaler = MinMaxScaler()
df_normalized = scaler.fit_transform(df_processed)


In [44]:
df_normalized.shape

(125973, 123)

In [45]:
class NSLKDD(Dataset):
    def __init__(self,data,lables):
        self.data=torch.tensor(data,dtype=torch.float32)
        self.lables=torch.tensor(lables,dtype=torch.long)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
         return self.data[index], self.lables[index]
    

In [46]:
dataset=NSLKDD(df_normalized,df['attack_label'].apply(lambda x:0 if x=="normal" else 1))
data_loader=DataLoader(dataset,batch_size=32,shuffle=True)


In [47]:
train_batch=next(iter(data_loader))[0]
print(train_batch.shape)

torch.Size([32, 123])


In [66]:

model=nn.Sequential(
    nn.Conv1d(1,10,kernel_size=3),
    nn.BatchNorm1d(10),
    nn.ReLU(),
    nn.MaxPool1d(2,2),
    nn.Flatten(),
    nn.Linear(600,128),
    nn.ReLU(),
    nn.Linear(128,2)
)

In [70]:
crit=nn.CrossEntropyLoss()
opt=optim.Adam(model.parameters(),lr=0.01)

for epoch in range(10):
    for batch_idx,(data,lable) in enumerate(data_loader):
        
        opt.zero_grad()
        data=data.unsqueeze(1)
        outputs=model(data)
        loss=crit(outputs,lable)
        loss.backward()
        opt.step()
    print(f"epoch :{epoch+1}, loss={loss.item()}")

epoch :1, loss=0.04767533764243126
epoch :2, loss=0.00010196235962212086
epoch :3, loss=0.0017329599941149354
epoch :4, loss=0.0002173951215809211
epoch :5, loss=0.0003811268543358892
epoch :6, loss=0.007362586446106434
epoch :7, loss=3.3237669413210824e-05
epoch :8, loss=6.454496906371787e-05
epoch :9, loss=0.00020141180721111596
epoch :10, loss=0.0


In [71]:
test=pd.read_csv('data/KDDTest-21.txt',names=columns)

In [72]:
test_encoded = pd.get_dummies(test, columns=categorical_features)
test_encoded = test_encoded.reindex(columns=df_encoded.columns, fill_value=0)
test_processed = test_encoded.drop('attack_label', axis=1)
test_normalized = scaler.transform(test_processed)

In [73]:
test_dataset=NSLKDD(test_normalized,test['attack_label'].apply(lambda x:0 if x=="normal" else 1))
test_loader=DataLoader(test_dataset,batch_size=32,shuffle=False)

In [74]:

all_probs = torch.tensor([])
model.eval()

with torch.no_grad():
    for idx,(data,lable) in enumerate(test_loader):
        output=model(data.unsqueeze(1))
        probs=torch.nn.functional.softmax(output,dim=1)

        all_probs=torch.cat((all_probs,probs),dim=0)

prediction=torch.argmax(all_probs,dim=1)

In [75]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get true labels from your test dataset
true_labels = test['attack_label'].apply(lambda x: 0 if x == "normal" else 1).values

# Calculate metrics
accuracy = accuracy_score(true_labels, prediction.numpy())
precision = precision_score(true_labels, prediction.numpy())
recall = recall_score(true_labels, prediction.numpy())
f1 = f1_score(true_labels, prediction.numpy())

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.7102
Precision: 0.9516
Recall: 0.6806
F1-score: 0.7936
