In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install cudatree

Collecting cudatree
  Downloading cudatree-0.6.tar.gz (22 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [3]:
# Colab warns and provides remediation steps if the GPUs is not compatible with RAPIDS.

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 582, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 582 (delta 119), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (582/582), 190.86 KiB | 4.06 MiB/s, done.
Resolving deltas: 100% (293/293), done.
Using Python 3.11.11 environment at: /usr
Resolved 2 packages in 137ms
Prepared 2 packages in 24ms
Installed 2 packages in 3ms
 + nvidia-ml-py==12.570.86
 + pynvml==12.0.0
Installing RAPIDS remaining 24.12.* libraries
Using Python 3.11.11 environment at: /usr
Resolved 154 packages in 1.09s
Downloading cuml-cu12 (522.6MiB)
Downloading libcuspatial-cu12 (16.9MiB)
Downloading dask (1.2MiB)
Downloading libucx-cu12 (25.7MiB)
Downloading datashader (17.5MiB)
Downloading scikit-image (14.2MiB)
Downloading cugraph-cu12 (877.5MiB)
Downloading raft-dask-cu12 (187.8MiB)
Downloading cucim-cu12 (5.3MiB)
Downloading cuvs-cu12 (810.3MiB)

In [9]:
import cudf
import cupy as cp
from cuml.naive_bayes import GaussianNB as cuGaussianNB

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd

#######################################
# 1) PyTorch MLP Definition (GPU)
#######################################
class SimpleMLP(nn.Module):
    """
    A basic MLP with one hidden layer of size hidden_dim,
    ReLU activation, output_dim=2 for binary classification.
    """
    def __init__(self, input_dim, hidden_dim=32, output_dim=2):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x  # shape (N, 2) for 2-class output

def train_mlp_pytorch(X_train, y_train, X_test, y_test, epochs=5, lr=0.001):
    """
    Trains a simple MLP in PyTorch on GPU (if available).
    Returns the accuracy on the test set.
    Inputs/Outputs should be NumPy arrays (float32 for X, int for y).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Convert data to PyTorch tensors and move to GPU/CPU
    X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)

    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_t = torch.tensor(y_test, dtype=torch.long).to(device)

    # Define the model
    input_dim = X_train.shape[1]
    model = SimpleMLP(input_dim=input_dim, hidden_dim=32, output_dim=2).to(device)

    # Loss & Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training Loop (simple: all data in one batch)
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_t)
        loss = criterion(outputs, y_train_t)
        loss.backward()
        optimizer.step()

        # Optional: print the training loss
        # print(f"Epoch [{epoch+1}/{epochs}] - Loss: {loss.item():.4f}")

    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        logits = model(X_test_t)               # shape (N, 2)
        _, predicted = torch.max(logits, 1)    # shape (N,)
        correct = (predicted == y_test_t).sum().item()
        accuracy = correct / float(y_test_t.shape[0])

    return accuracy

########################################
# 2) Main Pipeline for Attack Types
########################################
attack_types = [
    "Bot",
    "DDoS",
    "DoS GoldenEye",
    "DoS Hulk",
    "DoS Slowhttptest",
    "DoS slowloris",
    "FTP-Patator",
    "PortScan",
    "SSH-Patator"
]
benign_type = "BENIGN"

results = []

for attack_type in attack_types:
    # --------------------------
    # A) Read top features
    # --------------------------
    importance_file = (
        f"/content/drive/MyDrive/Threat-Detection-in-Cyber-Security-Using-AI-master/"
        f"{attack_type}_importance.csv"
    )
    importance_data = pd.read_csv(importance_file)
    selected_features = importance_data["Feature"][:3].tolist()

    # --------------------------
    # B) Load data on GPU, label -> 0/1
    # --------------------------
    data_file = (
        f"/content/drive/MyDrive/Threat-Detection-in-Cyber-Security-Using-AI-master/"
        f"{attack_type}_vs_{benign_type}.csv"
    )
    df_gpu = cudf.read_csv(data_file)

    # Convert " Label" to 0/1
    df_gpu[" Label"] = (df_gpu[" Label"] != "BENIGN").astype("int32")

    # Subset the features
    X_gpu = df_gpu[selected_features].astype("float32")
    y_gpu = df_gpu[" Label"]  # int32

    # Convert to Pandas for train_test_split
    X_cpu = X_gpu.to_pandas()
    y_cpu = y_gpu.to_pandas()

    # --------------------------
    # C) Train/Test Split (CPU)
    # --------------------------
    X_train_cpu, X_test_cpu, y_train_cpu, y_test_cpu = train_test_split(
        X_cpu, y_cpu, test_size=0.4, random_state=42
    )

    # --------------------------
    # D) GPU-based Naive Bayes
    # --------------------------
    # Convert splits to cuDF -> Cupy
    X_train_gpu = cudf.DataFrame.from_pandas(X_train_cpu)
    X_test_gpu  = cudf.DataFrame.from_pandas(X_test_cpu)
    y_train_gpu = cudf.Series(y_train_cpu)
    y_test_gpu  = cudf.Series(y_test_cpu)

    X_train_cp = X_train_gpu.to_cupy()
    y_train_cp = y_train_gpu.to_cupy()
    X_test_cp  = X_test_gpu.to_cupy()
    y_test_cp  = y_test_gpu.to_cupy()

    nb_model_gpu = cuGaussianNB()
    nb_model_gpu.fit(X_train_cp, y_train_cp)

    nb_preds_cp  = nb_model_gpu.predict(X_test_cp)
    nb_preds_cpu = cp.asnumpy(nb_preds_cp)
    y_test_cpu_nb = cp.asnumpy(y_test_cp)

    nb_accuracy = accuracy_score(y_test_cpu_nb, nb_preds_cpu)

    # --------------------------
    # E) CPU-based QDA
    # --------------------------
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(X_train_cpu, y_train_cpu)
    qda_preds = qda.predict(X_test_cpu)
    qda_accuracy = accuracy_score(y_test_cpu, qda_preds)

    # --------------------------
    # F) GPU-based MLP (PyTorch)
    # --------------------------
    # Our function expects NumPy arrays (float32 for X, int for y).
    # They are already in CPU memory after train_test_split
    X_train_np = X_train_cpu.values.astype("float32")
    y_train_np = y_train_cpu.values.astype("int64")  # for PyTorch cross-entropy
    X_test_np  = X_test_cpu.values.astype("float32")
    y_test_np  = y_test_cpu.values.astype("int64")

    mlp_accuracy = train_mlp_pytorch(X_train_np, y_train_np, X_test_np, y_test_np,
                                     epochs=5, lr=0.001)

    # --------------------------
    # G) Store results
    # --------------------------
    results.append({
        "Attack Type": attack_type,
        "Naive Bayes (GPU)": nb_accuracy,
        "QDA (CPU)": qda_accuracy,
        "MLP (GPU, PyTorch)": mlp_accuracy
    })

# --------------------------------
# 3) Build Results DataFrame
# --------------------------------
df_results = pd.DataFrame(results)
print(df_results)




        Attack Type  Naive Bayes (GPU)  QDA (CPU)  MLP (GPU, PyTorch)
0               Bot           0.638021   0.998870            0.998860
1              DDoS           0.987697   0.979342            0.884113
2     DoS GoldenEye           0.865831   0.984525            0.006007
3          DoS Hulk           0.725127   0.875710            0.729039
4  DoS Slowhttptest           0.853011   0.975157            0.911235
5     DoS slowloris           0.866002   0.980956            0.996625
6       FTP-Patator           0.004545   0.998776            0.184951
7          PortScan           0.916335   0.916335            0.916335
8       SSH-Patator           0.996145   0.998788            0.996560


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

from incident_handler import respond_to_threat  # ⬅️ make sure incident_handler.py is in the same folder

# Simulate IPs just for demo
import random

# List of attack types
attack_types = ["Bot", "DDoS", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest", "DoS slowloris", "FTP-Patator",
                "PortScan", "SSH-Patator"]
benign_type = "BENIGN"

# Initialize an empty list to store results as dictionaries
results = []

# Loop through each attack type
for attack_type in attack_types:
    # Read the feature importance file
    # importance_file = f"{attack_type}_importance.csv"
    importance_file = (
        f"C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/dataset/"
        f"{attack_type}_importance.csv"
    )
    importance_data = pd.read_csv(importance_file)

    # Select the first 4 features
    selected_features = importance_data['Feature'][:3].tolist()

    # Read the data file
    # data_file = f"{attack_type}_vs_{benign_type}.csv"
    data_file = (
        f"C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/dataset/"
        f"{attack_type}_vs_{benign_type}.csv"
    )
    data = pd.read_csv(data_file)

    # Select the selected features and the target column
    selected_data = data[selected_features + [' Label']]

    # Split the data into features (X) and target (y)
    X = selected_data[selected_features]
    y = selected_data[' Label']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    # Train Naive Bayes
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Train Quadratic Discriminant Analysis
    qda_model = QuadraticDiscriminantAnalysis()
    qda_model.fit(X_train, y_train)

    # Train Multi-Layer Perceptron
    mlp_model = MLPClassifier(random_state=42, max_iter=1000, learning_rate_init=0.001)
    mlp_model.fit(X_train, y_train)

    # Predictions
    nb_preds = nb_model.predict(X_test)
    qda_preds = qda_model.predict(X_test)
    mlp_preds = mlp_model.predict(X_test)
    mlp_probs = mlp_model.predict_proba(X_test)  # shape: [n_samples, 2]

    for i in range(len(mlp_preds)):
        if mlp_preds[i] == 1:  # 1 = attack
            confidence = max(mlp_probs[i])  # the model’s certainty
            random_ip = f"192.168.1.{random.randint(1, 255)}"  # fake IP for now
            respond_to_threat(attack_type, random_ip, confidence)


    # Calculate accuracies
    nb_accuracy = accuracy_score(y_test, nb_preds)
    qda_accuracy = accuracy_score(y_test, qda_preds)
    mlp_accuracy = accuracy_score(y_test, mlp_preds)

    # Store the results as a dictionary
    result_dict = {
        'Attack Type': attack_type,
        'Naive Bayes Accuracy': nb_accuracy,
        'QDA Accuracy': qda_accuracy,
        'MLP Accuracy': mlp_accuracy
    }

    # Append the dictionary to the results list
    results.append(result_dict)

# Create a Pandas DataFrame from the results list
results_df = pd.DataFrame(results)

# Display the DataFrame
print(results_df)



        Attack Type  Naive Bayes Accuracy  QDA Accuracy  MLP Accuracy
0               Bot              0.638021      0.832589      0.998870
1              DDoS              0.984610      0.980481      0.992264
2     DoS GoldenEye              0.984251      0.984238      0.996942
3          DoS Hulk              0.977031      0.976215      0.989537
4  DoS Slowhttptest              0.972274      0.974597      0.996782
5     DoS slowloris              0.981212      0.978324      0.997528
6       FTP-Patator              0.998758      0.998777      0.995455
7          PortScan              0.916335      0.916335      0.998157
8       SSH-Patator              0.998276      0.998788      0.998808


In [3]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import joblib

In [1]:
import pandas as pd
import os

attack_types = ["Bot", "DDoS", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest",
                "DoS slowloris", "FTP-Patator", "PortScan", "SSH-Patator"]
benign_type = "BENIGN"

all_data = []

for attack in attack_types:
    path = f"C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/dataset/{attack}_vs_{benign_type}.csv"
    df = pd.read_csv(path)
    df[" Label"] = df[" Label"].apply(lambda v: 0 if v.strip().upper() == "BENIGN" else 1)
    all_data.append(df)

df_all = pd.concat(all_data, ignore_index=True)
print("✅ Combined shape:", df_all.shape)


✅ Combined shape: (16230880, 79)


In [2]:
DATASET_DIR = "C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/dataset"
SAVE_DIR = "C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/src"
combined_csv_path = os.path.join(SAVE_DIR, "df_all.csv")
df_all.to_csv(combined_csv_path, index=False)
print(f"💾 Saved combined dataset to: {combined_csv_path}")

💾 Saved combined dataset to: C:/Users/91701/Thesis/Threat-Detection-in-Cyber-Security-Using-AI-master/src\df_all.csv


In [9]:
# ✅ Step 3: Select top features
# 👉 You can update this list based on your actual importance scores
selected_features = [" Flow Duration", " Bwd Packet Length Mean", " Packet Length Std"]

if not all(f in df_all.columns for f in selected_features):
    missing = [f for f in selected_features if f not in df_all.columns]
    raise ValueError(f"❌ Missing features in dataset: {missing}")

X = df_all[selected_features].astype("float32")
y = df_all[" Label"].astype("int")

# ✅ Step 4: Scale & train
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

mlp_model = MLPClassifier(random_state=42, max_iter=1000, learning_rate_init=0.001)
mlp_model.fit(X_scaled, y)

# ✅ Step 5: Save model and scaler
joblib.dump(mlp_model, os.path.join(SAVE_DIR, "mlp_model.pkl"))
joblib.dump(scaler, os.path.join(SAVE_DIR, "scaler.pkl"))
print("✅ Saved: mlp_model.pkl and scaler.pkl")

✅ Saved: mlp_model.pkl and scaler.pkl


In [7]:
df_all.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.401097,-0.37944,-0.010777,-0.009744,-0.064632,-0.008531,-0.235865,0.479238,-0.067529,-0.312913,...,1.005074,-0.099053,-0.109234,-0.13763,-0.073297,-0.282251,-0.080798,-0.284355,-0.273412,0
1,1.930937,-0.326029,-0.05673,-0.046895,-0.004781,-0.032457,0.690723,-0.53698,1.506978,1.972851,...,1.078168,-0.118177,-0.109978,-0.143077,-0.086475,-0.244274,-0.073967,-0.245803,-0.23629,0
2,-0.333292,-0.655631,-0.010123,-0.010387,-0.088198,-0.007591,-0.377384,-0.17702,-0.34649,-0.366976,...,-1.06835,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149,0
3,-0.428196,-0.390415,-0.011691,-0.012064,-0.07071,-0.009201,-0.350847,-0.439754,-0.445006,-0.332946,...,-0.879153,-0.120297,-0.126787,-0.165393,-0.08944,-0.284261,-0.07563,-0.285737,-0.276032,0
4,-0.358097,-0.65446,-0.007449,-0.008357,-0.065506,-0.007532,-0.324424,0.449044,-0.143664,-0.366976,...,-1.06835,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149,0


In [8]:
df_all.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [6]:
import pandas as pd
import requests
import time

CSV_PATH = "incidents/incident_log.csv"
API_URL = "https://ipinfo.io/{ip}/json"

def get_location(ip):
    try:
        response = requests.get(API_URL.format(ip=ip), timeout=3)
        data = response.json()
        if 'loc' in data:
            lat, lon = data['loc'].split(',')
            return float(lat), float(lon)
    except Exception as e:
        print(f"Failed for {ip}: {e}")
    return None, None

def enrich_csv_with_geo(csv_path):
    df = pd.read_csv(csv_path, on_bad_lines="skip")  # pandas >= 1.3.0
    if 'latitude' not in df.columns:
        df['latitude'] = None
        df['longitude'] = None

    for i, row in df.iterrows():
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            ip = row['ip_address']
            lat, lon = get_location(ip)
            if lat and lon:
                df.at[i, 'latitude'] = lat
                df.at[i, 'longitude'] = lon
                print(f"[{i}] {ip} → ({lat}, {lon})")
            time.sleep(0.5)  # Prevent rate-limiting

    df.to_csv(csv_path, index=False)
    print("✅ Enrichment complete.")

if __name__ == "__main__":
    enrich_csv_with_geo(CSV_PATH)


KeyboardInterrupt: 