In [1]:
import glob
import os
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [2]:
# Locate every CSV export of the dataset inside the local data directory.
dataset_csv_path = './csv_files/'
csv_files_pattern = '*.csv'
full_path = os.path.join(dataset_csv_path, csv_files_pattern)
# glob.glob() returns its matches in arbitrary, filesystem-dependent order.
# Sorting pins the order in which the files are later concatenated, so the
# row order of the combined frame (and with it the seeded train/test split)
# is the same on every machine and every run.
csv_files = sorted(glob.glob(full_path))
# To experiment on a subset, assign an explicit list of paths instead, e.g.:
# csv_files = ["./csv_files/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv", "./csv_files/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv", 
#              "./csv_files/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"]

In [3]:
# Read every file listed in csv_files and stack the results into a single
# DataFrame; ignore_index=True replaces the per-file row labels with one
# continuous 0..n-1 index.
# (For a quick single-file run, read just one CSV with pd.read_csv instead.)
df = pd.concat((pd.read_csv(csv_path) for csv_path in csv_files), ignore_index=True)
# Preview the first rows of the combined data.
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [4]:
def replace_negatives_with_zeros(df):
    """Set every negative value in the numeric columns of ``df`` to 0, in place.

    Non-numeric columns are left untouched. NaN entries stay NaN (they are
    not less than zero), while ``-inf`` is negative and therefore becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is updated in place.

    Returns
    -------
    None
    """
    numeric_cols = df.select_dtypes(include="number").columns
    if numeric_cols.empty:
        # Nothing numeric to clamp.
        return
    # Write back through the frame itself instead of mutating the object
    # returned by the private ``_get_numeric_data()``. That object is only a
    # view of ``df`` under some pandas versions/settings, so writes to it are
    # not guaranteed to reach ``df`` (they do not under Copy-on-Write).
    df[numeric_cols] = df[numeric_cols].clip(lower=0)

def drop_zero_variance_columns(df):
    """Drop, in place, every column of ``df`` that holds exactly one distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The constant columns are removed in place.

    Returns
    -------
    list
        Names of the dropped columns, in their original column order.
    """
    constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
    df.drop(columns=constant_cols, inplace=True)
    return constant_cols

def drop_infinite_and_nan(df):
    """Remove, in place, every row of ``df`` that contains an infinite or NaN value.

    Infinite values are first converted to NaN so that a single ``dropna``
    handles both cases. The number of affected rows and the resulting shape
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    infinities = [np.inf, -np.inf]
    df.replace(to_replace=infinities, value=np.nan, inplace=True)
    # Count the rows with at least one missing value before removing them.
    incomplete_rows = df.isna().any(axis=1)
    print(incomplete_rows.sum(), "rows dropped")
    df.dropna(axis=0, how="any", inplace=True)
    print("Shape after Removing NaN: ", df.shape)

def drop_identical_columns(df):
    """Drop, in place, columns whose contents equal those of an earlier column.

    Every unordered pair of columns is compared with ``Series.equals``. For
    each matching pair the second (later) column is dropped and the first
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    list of tuple
        The ``(kept, dropped)`` column-name pairs that were found identical.
    """
    column_pairs = []
    for first, second in combinations(df, 2):
        if df[first].equals(df[second]):
            column_pairs.append((first, second))
    redundant_cols = [second for _, second in column_pairs]
    df.drop(columns=redundant_cols, inplace=True)
    return column_pairs

def data_cleaning(df):
    """Run the whole cleaning pipeline on ``df`` in place and return it.

    Stages, in order: strip whitespace from the column names, zero out
    negative numeric values, drop single-valued columns, drop rows that
    contain infinite or NaN values, drop duplicate rows, and drop columns
    that duplicate an earlier column. Progress is printed after each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, cleaned.
    """
    df.columns = df.columns.str.strip()
    print("Dataset Shape: ",df.shape)

    # Negatives are zeroed before the infinity check below, so a -inf value
    # becomes 0 here rather than causing its row to be dropped later.
    replace_negatives_with_zeros(df)

    # Constant columns carry no information for the model, so remove them.
    constant_cols = drop_zero_variance_columns(df)
    print("Zero Variance Columns: ", constant_cols, "are dropped.")
    print("Shape after removing the zero variance columns: ", df.shape)

    # Remove rows holding infinite or missing (NaN) values.
    drop_infinite_and_nan(df)

    # Remove exact duplicate rows.
    df.drop_duplicates(inplace=True)
    print("Shape after dropping duplicates: ", df.shape)

    # Remove columns whose values replicate another column, if there are any.
    identical_pairs = drop_identical_columns(df)
    print("Columns which have identical values: ",identical_pairs," dropped!")
    print("Shape after removing identical value columns: ",df.shape)
    return df
    
# Clean the combined frame. data_cleaning modifies df in place and returns
# that same object, so this rebinding keeps the name pointing at it.
df=data_cleaning(df)

Dataset Shape:  (2830743, 79)
Zero Variance Columns:  ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'] are dropped.
Shape after removing the zero variance columns:  (2830743, 71)
2867 rows dropped
Shape after Removing NaN:  (2827876, 71)
Shape after dropping duplicates:  (2520798, 71)
Columns which have identical values:  [('Total Fwd Packets', 'Subflow Fwd Packets'), ('Total Backward Packets', 'Subflow Bwd Packets'), ('Fwd PSH Flags', 'SYN Flag Count'), ('Fwd URG Flags', 'CWE Flag Count'), ('Fwd Header Length', 'Fwd Header Length.1')]  dropped!
Shape after removing identical value columns:  (2520798, 66)


In [5]:
# Preview the first rows of the frame after cleaning.
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [6]:
# Split the cleaned frame into the model inputs and the prediction target
X = df.drop(columns='Label')  # Features: every column except 'Label'
y = df['Label']  # Target variable: the 'Label' column

In [7]:
# Split data into training (80%) and testing (20%) sets.
# A purely random split of a classification target can leave rare labels
# under-represented in -- or entirely absent from -- the test set, which makes
# the reported metrics unreliable. Stratifying on y keeps the label
# proportions the same in both splits. Stratification needs at least two
# samples of every label, so fall back to a plain random split otherwise.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

In [8]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits, so nothing
# about the test data leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Create and train the logistic-regression classifier on the scaled features.
# The liblinear solver shuffles the data internally, so a fixed random_state
# is needed for the fitted coefficients (and every metric computed from them)
# to be reproducible across runs. The seed matches the one used for the split.
# NOTE(review): if 'Label' holds more than two classes, liblinear fits them
# one-vs-rest; check how the installed scikit-learn version handles that.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train_scaled, y_train)

In [None]:
# Predict a label for every row of the scaled test set with the fitted model
y_pred = model.predict(X_test_scaled)

In [None]:
# Score the test-set predictions. Precision, recall and F1 are computed per
# class and combined with the "weighted" average.
weighted_avg = {"average": "weighted"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

# Report the four scores, one per line.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)

In [None]:
# Confusion table: one row per predicted label, one column per actual label.
# y_pred is a plain array, so without explicit names its axis would get an
# auto-generated label and a reader could not tell which axis is the
# prediction. Naming both axes makes the table self-explanatory.
data_crosstab = pd.crosstab(
    y_pred, y_test, rownames=["Predicted"], colnames=["Actual"]
)

# Leave the table as the cell's last expression so the notebook renders it as
# a formatted table instead of line-wrapped plain text.
data_crosstab