In [23]:
import glob
import os
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [32]:
# dataset_csv_path = './csv_files/'
# csv_files_pattern = '*.csv'
# full_path = os.path.join(dataset_csv_path, csv_files_pattern)
# csv_files = glob.glob(full_path)
csv_files = ["./csv_files/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv", "./csv_files/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv", 
             "./csv_files/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"]

In [33]:
# loading the dataset into a pandas dataframe
df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# df = pd.read_csv('./csv_files/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,389,113095465,48,24,9668,10012,403,0,201.416667,203.548293,...,32,203985.5,575837.3,1629110,379,13800000.0,4277541.0,16500000,6737603,BENIGN
1,389,113473706,68,40,11364,12718,403,0,167.117647,171.919413,...,32,178326.875,503426.9,1424245,325,13800000.0,4229413.0,16500000,6945512,BENIGN
2,0,119945515,150,0,0,0,0,0,0.0,0.0,...,0,6909777.333,11700000.0,20400000,6,24400000.0,24300000.0,60100000,5702188,BENIGN
3,443,60261928,9,7,2330,4221,1093,0,258.888889,409.702161,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,53,269,2,2,102,322,51,51,51.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [34]:
def replace_negatives_with_zeros(df):
    num = df._get_numeric_data()
    num[num < 0] = 0

def drop_zero_variance_columns(df):
    zero_variance_cols=[]
    for col in df.columns:
        if len(df[col].unique()) == 1:
            zero_variance_cols.append(col)
    df.drop(columns = zero_variance_cols, axis = 1, inplace = True)
    return zero_variance_cols

def drop_infinite_and_nan(df):
    df.replace([np.inf,-np.inf],np.nan,inplace=True)
    print(df.isna().any(axis = 1).sum(),"rows dropped")
    df.dropna(inplace = True)
    print("Shape after Removing NaN: ", df.shape)

def drop_identical_columns(df):
    column_pairs = [(i,j) for i,j in combinations(df,2) if df[i].equals(df[j])]
    ide_cols=[]
    for col_pair in column_pairs:
        ide_cols.append(col_pair[1])
    df.drop(columns=ide_cols,axis=1,inplace=True)
    return column_pairs

def data_cleaning(df):
    df.columns=df.columns.str.strip()
    print("Dataset Shape: ",df.shape)
    replace_negatives_with_zeros(df)
    
    # dropping the zero variance columns to eliminate its negative impacts on the model
    zero_variance_cols = drop_zero_variance_columns(df)
    print("Zero Variance Columns: ", zero_variance_cols, "are dropped.")
    print("Shape after removing the zero variance columns: ", df.shape)
    
    # cleaning all infinite and non-numeric values
    drop_infinite_and_nan(df)
    
    # removing duplicates
    df.drop_duplicates(inplace=True)
    print("Shape after dropping duplicates: ", df.shape)
    
    # eliminating columns with identical values if any
    column_pairs = drop_identical_columns(df)
    print("Columns which have identical values: ",column_pairs," dropped!")
    print("Shape after removing identical value columns: ",df.shape)
    return df
    
df=data_cleaning(df)

Dataset Shape:  (682578, 79)
Zero Variance Columns:  ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'CWE Flag Count', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'] are dropped.
Shape after removing the zero variance columns:  (682578, 69)
540 rows dropped
Shape after Removing NaN:  (682038, 69)
Shape after dropping duplicates:  (595765, 69)
Columns which have identical values:  [('Total Fwd Packets', 'Subflow Fwd Packets'), ('Total Backward Packets', 'Subflow Bwd Packets'), ('Total Length of Fwd Packets', 'Subflow Fwd Bytes'), ('Fwd Packet Length Mean', 'Avg Fwd Segment Size'), ('Fwd PSH Flags', 'SYN Flag Count'), ('Fwd Header Length', 'Fwd Header Length.1'), ('RST Flag Count', 'ECE Flag Count')]  dropped!
Shape after removing identical value columns:  (595765, 62)


In [35]:
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,389,113095465,48,24,9668,10012,403,0,201.416667,203.548293,...,32,203985.5,575837.3,1629110,379,13800000.0,4277541.0,16500000,6737603,BENIGN
1,389,113473706,68,40,11364,12718,403,0,167.117647,171.919413,...,32,178326.875,503426.9,1424245,325,13800000.0,4229413.0,16500000,6945512,BENIGN
2,0,119945515,150,0,0,0,0,0,0.0,0.0,...,0,6909777.333,11700000.0,20400000,6,24400000.0,24300000.0,60100000,5702188,BENIGN
3,443,60261928,9,7,2330,4221,1093,0,258.888889,409.702161,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,53,269,2,2,102,322,51,51,51.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [36]:
# Separate features and target variable
X = df.drop('Label', axis=1)  # Features
y = df['Label']  # Target variable (attack or benign)

In [37]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Standardizing features 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Creating and training the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_scaled, y_train)

In [29]:
# Trying predictions on the test set
y_pred = model.predict(X_test_scaled)

In [30]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9871477297781006
Precision: 0.9838168290484332
Recall: 0.9871477297781006
F1-score: 0.9848589028793053


In [31]:
data_crosstab = pd.crosstab(y_pred, y_test)

print(data_crosstab)

Label                       BENIGN   DDoS  Web Attack � Brute Force  \
row_0                                                                 
BENIGN                       50589     31                       262   
DDoS                           524  25517                         0   
Web Attack � Brute Force        13      0                        10   
Web Attack � Sql Injection       2      0                         0   
Web Attack � XSS                16      0                         0   

Label                       Web Attack � Sql Injection  Web Attack � XSS  
row_0                                                                     
BENIGN                                               7               136  
DDoS                                                 0                 0  
Web Attack � Brute Force                             0                 0  
Web Attack � Sql Injection                           0                 0  
Web Attack � XSS                                    