## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


## Load Dataset

In [2]:
df = pd.read_csv("dos_ddos_dataset.csv", encoding="ISO-8859-1", encoding_errors='replace', on_bad_lines="skip", low_memory=False)
df.columns = df.columns.str.strip()  # Strip whitespace from column names
df.head()

##df = pd.read_csv("dos_ddos_dataset.csv", encoding="ISO-8859-1", on_bad_lines="skip", low_memory=False)
##df.head()


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,445444,172.16.0.5-192.168.50.4-9429-9429-6,172.16.0.5,9429,192.168.50.4,9429,6,2018-11-03 11:36:28.607338,36063894,7,...,29.444864,52.0,1.0,12021280.0,6253623.0,18628035.0,6193840.0,0,1,Syn
1,113842,172.16.0.5-192.168.50.4-60224-60224-6,172.16.0.5,60224,192.168.50.4,60224,6,2018-11-03 11:36:28.607339,44851366,8,...,0.0,1.0,1.0,20662680.0,11697830.0,28934293.0,12391060.0,0,1,Syn
2,176377,172.16.0.5-192.168.50.4-33827-11746-6,192.168.50.4,11746,172.16.0.5,33827,6,2018-11-03 11:36:28.607388,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,Syn
3,24777,172.16.0.5-192.168.50.4-33828-1431-6,172.16.0.5,33828,192.168.50.4,1431,6,2018-11-03 11:36:28.607391,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn
4,85100,172.16.0.5-192.168.50.4-5311-5311-6,172.16.0.5,5311,192.168.50.4,5311,6,2018-11-03 11:36:28.607442,35731470,8,...,33.234019,48.0,1.0,11910470.0,1849493.0,13693985.0,10001398.0,0,1,Syn


## Preprocessing - Handle Missing Values

In [3]:
import numpy as np
from sklearn.impute import SimpleImputer

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
non_numeric_cols = df.select_dtypes(exclude=['number']).columns

# Impute numeric columns with mean
imputer = SimpleImputer(strategy="mean")
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Drop rows with missing values in non-numeric columns
df.dropna(subset=non_numeric_cols, inplace=True)

# Keep only first 10,000 rows
df = df.head(10000)

print("Remaining NaNs:", df.isnull().sum().sum())


"""import numpy as np
from sklearn.impute import SimpleImputer

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
non_numeric_cols = df.select_dtypes(exclude=['number']).columns

# Impute numeric columns with mean
imputer = SimpleImputer(strategy="mean")
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Drop rows with any missing values in non-numeric columns
df.dropna(subset=non_numeric_cols, inplace=True)

# Keep only the first 10,000 rows
df = df.head(10000)

# Final check
print("Remaining NaNs:", df.isnull().sum().sum())

"""


Remaining NaNs: 0


'import numpy as np\nfrom sklearn.impute import SimpleImputer\n\n# Replace infinite values with NaN\ndf.replace([np.inf, -np.inf], np.nan, inplace=True)\n\n# Separate numeric and non-numeric columns\nnumeric_cols = df.select_dtypes(include=[\'number\']).columns\nnon_numeric_cols = df.select_dtypes(exclude=[\'number\']).columns\n\n# Impute numeric columns with mean\nimputer = SimpleImputer(strategy="mean")\ndf[numeric_cols] = imputer.fit_transform(df[numeric_cols])\n\n# Drop rows with any missing values in non-numeric columns\ndf.dropna(subset=non_numeric_cols, inplace=True)\n\n# Keep only the first 10,000 rows\ndf = df.head(10000)\n\n# Final check\nprint("Remaining NaNs:", df.isnull().sum().sum())\n\n'

## Split Features/Target & Scale

In [4]:
from sklearn.preprocessing import StandardScaler

X = df.drop("Label", axis=1)
y = df["Label"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

"""X = df.drop("Label", axis=1)
y = df["Label"]


# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
"""

ValueError: could not convert string to float: '172.16.0.5-192.168.50.4-9429-9429-6'

## Correlation-based Feature Selection (Top 30)

In [None]:
# Select top 30 most relevant features
selector = SelectKBest(score_func=f_classif, k=30)
X_selected = selector.fit_transform(X_scaled, y)

selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features.tolist())


## Train/Test Split (Stratified)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)


## Train & Evaluate Helper Function

In [None]:
def evaluate_model(model, name):
    start_train = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_train

    start_pred = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_pred

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name}")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1-Score: {f1:.4f}")
    print(f"Train time: {train_time:.3f}s | Predict time: {predict_time:.4f}s")

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


## Run All Models


In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

for name, model in models.items():
    evaluate_model(model, name)
