# In [None]:
#important libraries 
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, confusion_matrix, precision_recall_curve
)
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import seaborn as sns
import networkx as nx
from matplotlib.backends.backend_pdf import PdfPages

# Brown/beige colour scheme used by the non-topology plots (part 2).
# Keys are referenced by name in the plotting helpers below.
PALETTE = dict(
    dark='#8b4513',    # saddle brown
    med='#deb887',     # burlywood
    light='#f5deb3',   # wheat
    accent='#cd853f',  # peru
)

#label encoding, sampling
def load_attack_type(data_dir, attack_type, nrows=10000, n_attacks=20, amount_of_noise=0.3,
                     sample_frac=0.3, noise_seed=42):
    """Load one attack CSV, binarise its labels and augment it with synthetic attackers.

    The file ``<data_dir>/<attack_type>.csv`` is read (first ``nrows`` rows),
    header whitespace is stripped, and ``Label`` becomes 0 for any value
    containing "benign" (case-insensitive) and 1 otherwise.

    If the file contains attack flows, ``n_attacks`` synthetic sources
    ``10.10.10.1 .. 10.10.10.<n_attacks>`` are generated. Each one is a
    bootstrap sample of the attack flows with Gaussian noise added to every
    numeric column except ``Label``. Every fifth synthetic source is
    relabelled as benign.

    Args:
        data_dir: Directory containing the per-attack CSV files.
        attack_type: File stem of the CSV; also stored in an ``attack_type`` column.
        nrows: Maximum number of CSV rows to read.
        n_attacks: Number of synthetic source IPs to generate.
        amount_of_noise: Noise sigma as a fraction of each column's standard
            deviation within the sampled rows.
        sample_frac: Fraction of the attack flows drawn (with replacement)
            for each synthetic source.
        noise_seed: Seed for the noise generator, so repeated calls yield the
            same frame. Pass ``None`` for non-deterministic noise.

    Returns:
        A shuffled DataFrame (fixed shuffle seed) with a fresh RangeIndex.
    """
    path = os.path.join(data_dir, f"{attack_type}.csv")
    df = pd.read_csv(path, nrows=nrows, low_memory=False)
    df.columns = df.columns.str.strip()   # headers in the CSVs may carry stray spaces
    df['attack_type'] = attack_type
    df['Label'] = df['Label'].map(lambda x: 0 if 'benign' in str(x).lower() else 1)

    attack_df = df[df['Label'] == 1].copy()
    benign_df = df[df['Label'] == 0].copy()

    if not attack_df.empty:
        # Local RNG: the noise no longer depends on (or disturbs) the global
        # NumPy random state, matching the seeded sampling/shuffling around it.
        rng = np.random.RandomState(noise_seed)
        # 'Source IP' is overwritten with a string below, so it is never perturbed.
        num_cols = (
            attack_df.select_dtypes(include=[np.number])
            .columns.difference(['Label', 'Source IP'])
        )

        generated = []
        for i in range(1, n_attacks + 1):
            rows = attack_df.sample(frac=sample_frac, replace=True, random_state=i).copy()
            rows['Source IP'] = f"10.10.10.{i}"

            # Gaussian noise on numeric features (acts as regularisation so the
            # synthetic flows are not exact duplicates of the originals).
            for col in num_cols:
                vals = rows[col].dropna()
                std_dev = vals.std(ddof=0) if len(vals) > 0 else 0
                # Constant or non-finite columns fall back to unit sigma.
                usable = np.isfinite(std_dev) and std_dev > 0
                sigma = amount_of_noise * std_dev if usable else 1
                rows[col] = rows[col] + rng.normal(0, sigma, size=len(rows))

            # Every fifth synthetic source is flipped back to benign.
            if i % 5 == 0:
                rows['Label'] = 0

            generated.append(rows)

        attack_df = pd.concat([attack_df] + generated, ignore_index=True)

    # Recombine and shuffle.
    df = pd.concat([benign_df, attack_df], ignore_index=True)
    return df.sample(frac=1, random_state=42).reset_index(drop=True)

#placeholder for any df cleaning steps
def preprocess_for_graph(df):
    """Hook for DataFrame cleaning steps; currently hands ``df`` back untouched."""
    cleaned = df  # no cleaning is applied yet
    return cleaned

def build_graph(flow_df):
    """Turn a flow table into a torch_geometric ``Data`` graph with one node per IP.

    Every distinct value in ``Source IP`` / ``Destination IP`` becomes a node
    and every flow becomes a directed edge ``src -> dst``.

    * Node features: per-node mean of all numeric columns (excluding
      ``src``, ``dst`` and ``Label``) over the flows where the node is the
      *source*. Nodes that never appear as a source get all-zero features.
      The matrix is then standardised.
    * Edge attribute: standardised ``Flow Bytes/s``.
    * Node label: 1 if any incident flow (as source or destination) has
      ``Label == 1``, else 0.

    Args:
        flow_df: DataFrame with at least ``Source IP``, ``Destination IP``,
            ``Flow Bytes/s`` and ``Label`` columns. It is not modified.

    Returns:
        ``Data(x, edge_index, edge_attr, y)`` with these extra attributes:
        ``node_ip_map`` (node index -> IP), ``raw_flows`` (cleaned flows incl.
        ``src``/``dst``), ``feature_names`` and ``time_series`` (``Timestamp``
        and ``Flow Bytes/s`` columns, or ``None`` without a ``Timestamp`` column).
    """
    # inf/NaN would break the scalers; this also yields a new frame, so the
    # caller's DataFrame is left untouched by the column assignments below.
    flow_df = flow_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Map each IP to a unique node index.
    ip_set = pd.unique(flow_df[['Source IP', 'Destination IP']].values.ravel())
    ip_map = {ip: idx for idx, ip in enumerate(ip_set)}
    rev_map = {idx: ip for ip, idx in ip_map.items()}
    n_nodes = len(ip_set)

    flow_df['src'] = flow_df['Source IP'].map(ip_map)
    flow_df['dst'] = flow_df['Destination IP'].map(ip_map)

    # Node feature matrix: a single groupby pass instead of filtering the
    # whole frame once per node. reindex() adds zero rows for nodes that
    # never act as a source and puts rows in node-index order.
    num_cols = flow_df.select_dtypes(include=[np.number]).columns.difference(['src', 'dst', 'Label'])
    node_df = (
        flow_df.groupby('src')[list(num_cols)]
        .mean()
        .reindex(range(n_nodes), fill_value=0.0)
        .fillna(0)
    )

    # Tensors.
    edge_index = torch.tensor(flow_df[['src', 'dst']].values.T, dtype=torch.long)
    edge_attr = torch.tensor(
        StandardScaler().fit_transform(flow_df['Flow Bytes/s'].values.reshape(-1, 1)),
        dtype=torch.float
    )
    x = torch.tensor(StandardScaler().fit_transform(node_df), dtype=torch.float)

    # A node is an attack node if any incident flow has Label == 1.
    is_attack = flow_df['Label'] == 1
    attack_nodes = np.union1d(
        flow_df.loc[is_attack, 'src'].unique(),
        flow_df.loc[is_attack, 'dst'].unique()
    ).astype(np.int64)
    labels = np.zeros(n_nodes, dtype=np.float32)
    labels[attack_nodes] = 1.0
    y = torch.from_numpy(labels)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    data.node_ip_map   = rev_map             # node index -> IP
    data.raw_flows     = flow_df             # raw flows, used for plotting
    data.feature_names = list(num_cols)      # column names, used for feature importance
    data.time_series   = flow_df[['Timestamp', 'Flow Bytes/s']] \
        if 'Timestamp' in flow_df.columns else None
    return data

#gnn model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing one probability per node.

    Architecture: ``GCNConv -> ReLU -> dropout -> GCNConv -> sigmoid``.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of the hidden layer.
        dropout: Dropout probability applied after the first layer
            (only active in training mode).
    """

    def __init__(self, in_channels, hidden_channels, dropout=0.3):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)
        self.dropout_p = dropout

    def forward(self, data):
        """Return a 1-D tensor of per-node scores in (0, 1) for ``data``.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        # Squeeze only the trailing channel dim: a bare squeeze() would also
        # drop the node dim for a single-node graph and yield a 0-d tensor
        # whose shape no longer matches data.y.
        return torch.sigmoid(x).squeeze(-1)

#full batch training
def train(data, epochs, hidden_channels=32):
    """Fit a fresh GCN on ``data`` with full-batch gradient descent.

    Each epoch is one forward/backward pass over the whole graph using
    binary cross-entropy against ``data.y`` and Adam (lr=0.01).

    Args:
        data: Graph with ``x``, ``edge_index`` and float node labels ``y``.
        epochs: Number of optimisation steps.
        hidden_channels: Hidden width passed to ``GCN``.

    Returns:
        The trained ``GCN`` instance.
    """
    num_features = data.x.shape[1]
    net = GCN(num_features, hidden_channels)
    adam = torch.optim.Adam(net.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss()

    for _epoch in range(epochs):
        net.train()
        adam.zero_grad()
        predictions = net(data)
        criterion(predictions, data.y).backward()
        adam.step()

    return net

#computing metrics!
def evaluate_model(model, data, threshold=0.5):
    """Score ``data`` with ``model``, print classification metrics and return them.

    Args:
        model: Callable module returning one score per node for ``data``.
        data: Graph whose ``y`` holds the 0/1 ground-truth node labels.
        threshold: Scores strictly above this value are predicted as attack (1).

    Returns:
        Tuple ``(y_true, y_scores, y_pred, acc, prec, rec, f1, tp, fp, tn, fn)``
        where the first three are NumPy arrays and the rest are scalars.
    """
    model.eval()
    with torch.no_grad():
        # atleast_1d guards against a 0-d result from a single-node graph.
        y_scores = np.atleast_1d(model(data).detach().cpu().numpy())
        y_true   = np.atleast_1d(data.y.detach().cpu().numpy())
        y_pred   = (y_scores > threshold).astype(int)

    y_true_int = y_true.astype(int)  # integer labels for the sklearn metrics

    acc  = accuracy_score(y_true_int, y_pred)
    prec = precision_score(y_true_int, y_pred, zero_division=0)
    rec  = recall_score(y_true_int, y_pred, zero_division=0)
    f1   = f1_score(y_true_int, y_pred, zero_division=0)
    # labels=[0, 1] forces a 2x2 matrix. Without it a single-class split
    # produces a 1x1 matrix and the four-way unpack below raises.
    tn, fp, fn, tp = confusion_matrix(y_true_int, y_pred, labels=[0, 1]).ravel()
    tpr = tp/(tp+fn) if tp+fn>0 else 0
    fpr = fp/(fp+tn) if fp+tn>0 else 0

    print(f"tp: {tp}, fp: {fp}, tn: {tn}, fn: {fn}")
    print(f"accuracy: {acc:.4f}, precision: {prec:.4f}, recall: {rec:.4f}, f1: {f1:.4f}")
    print(f"tpr: {tpr:.4f}, fpr: {fpr:.4f}")

    return y_true, y_scores, y_pred, acc, prec, rec, f1, tp, fp, tn, fn

#top features
def plot_feature_importance(model, data, pdf):
    """Add a bar chart of the ten most influential node features to ``pdf``.

    Importance is the mean absolute gradient of the model's mean output
    with respect to each input feature column (a saliency measure).

    Args:
        model: Trained model; put into eval mode here.
        data: Graph providing ``x``, ``edge_index`` and ``feature_names``.
        pdf: Open ``PdfPages`` object that receives the figure.
    """
    model.eval()

    # Differentiate w.r.t. a detached copy so data.x itself stays untouched.
    inputs = data.x.clone().detach().requires_grad_(True)
    with torch.enable_grad():
        probe = Data(x=inputs, edge_index=data.edge_index)
        scores = model(probe)
        scores.mean().backward()

    saliency = inputs.grad.abs().mean(dim=0).numpy()
    ranked = sorted(
        zip(data.feature_names, saliency),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_names, top_grads = zip(*ranked[:10])

    # Reverse so the most important feature ends up at the top of the chart.
    plt.figure(figsize=(10, 4))
    plt.barh(top_names[::-1], top_grads[::-1], color=PALETTE['med'])
    plt.title("top10 feature importances")
    plt.xlabel("avg gradient magnitude")
    plt.tight_layout()
    pdf.savefig()
    plt.close()

#ROC and precision recall curves
def plot_roc_pr(y_true, y_scores, pdf):
    """Append a ROC curve page and a precision-recall curve page to ``pdf``.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Predicted scores for the positive class.
        pdf: Open ``PdfPages`` object that receives both figures.
    """
    # Page 1: ROC curve with the AUC in the legend and the chance diagonal.
    fpr_vals, tpr_vals, _ = roc_curve(y_true, y_scores)
    auc_val = auc(fpr_vals, tpr_vals)
    plt.figure()
    plt.plot(fpr_vals, tpr_vals, color=PALETTE['dark'], label=f"auc={auc_val:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color=PALETTE['light'])
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.title("roc curve")
    plt.legend()
    pdf.savefig()
    plt.close()

    # Page 2: precision-recall curve.
    prec_vals, rec_vals, _ = precision_recall_curve(y_true, y_scores)
    plt.figure()
    plt.plot(rec_vals, prec_vals, color=PALETTE['accent'])
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.title("precision recall curve")
    pdf.savefig()
    plt.close()

#threshold vs metrics
def plot_threshold_metrics(y_true, y_scores, pdf):
    """Plot TPR, FPR and precision as functions of the decision threshold.

    100 evenly spaced thresholds in [0, 1] are evaluated; a score counts as
    positive when it is ``>=`` the threshold.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Predicted scores for the positive class.
        pdf: Open ``PdfPages`` object that receives the figure.
    """
    thresholds = np.linspace(0, 1, 100)
    truth = np.asarray(y_true).astype(int)
    scores = np.asarray(y_scores)

    tprs, fprs, precs = [], [], []
    for t in thresholds:
        p = (scores >= t).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when truth and predictions
        # contain a single class, which would otherwise break the unpack.
        tn, fp, fn, tp = confusion_matrix(truth, p, labels=[0, 1]).ravel()
        tprs.append(tp/(tp+fn) if tp+fn>0 else 0)
        fprs.append(fp/(fp+tn) if fp+tn>0 else 0)
        # Precision from the counts already at hand (0 when nothing is predicted positive).
        precs.append(tp/(tp+fp) if tp+fp>0 else 0)

    plt.figure()
    plt.plot(thresholds, tprs, label="tpr", color=PALETTE['dark'])
    plt.plot(thresholds, fprs, label="fpr", color=PALETTE['med'])
    plt.plot(thresholds, precs, label="precision", color=PALETTE['accent'])
    plt.xlabel("threshold"); plt.ylabel("rate"); plt.title("threshold vs metrics"); plt.legend()
    pdf.savefig(); plt.close()

#heatmap (confusion metrics)
def plot_confusion_heatmap(y_true, y_pred, pdf):
    """Append an annotated benign/attack confusion-matrix heatmap to ``pdf``.

    Args:
        y_true: Ground-truth binary labels (0 = benign, 1 = attack).
        y_pred: Predicted binary labels.
        pdf: Open ``PdfPages`` object that receives the figure.
    """
    # Fix the label set so the matrix is always 2x2 and lines up with the two
    # tick labels, even if only one class occurs in y_true/y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    cmap = sns.light_palette(PALETTE['dark'], as_cmap=True)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap=cmap,
                xticklabels=["benign","attack"],
                yticklabels=["benign","attack"])
    plt.title("confusion matrix"); plt.xlabel("predicted"); plt.ylabel("actual")
    pdf.savefig(); plt.close()

#traffic volume over time
def show_time_series(data, pdf):
    """Plot ``Flow Bytes/s`` against time and append the figure to ``pdf``.

    Does nothing when ``data.time_series`` is ``None``. Rows whose timestamp
    cannot be parsed (or that contain any missing value) are dropped.

    Args:
        data: Graph object carrying a ``time_series`` DataFrame or ``None``.
        pdf: Open ``PdfPages`` object that receives the figure.
    """
    if data.time_series is None:
        return

    # Work on a copy so the frame stored on `data` is not modified.
    series = data.time_series.copy()
    series['Timestamp'] = pd.to_datetime(series['Timestamp'], errors='coerce')
    series = series.dropna()
    series = series.sort_values('Timestamp')

    plt.figure(figsize=(10, 4))
    plt.plot(series['Timestamp'], series['Flow Bytes/s'], color=PALETTE['accent'])
    plt.title("ddos behavior over time")
    plt.ylabel("flow bytes/s")
    plt.grid()
    pdf.savefig()
    plt.close()

#displaying topology and highlighting suspicious nodes
def show_graph_topology(data, pdf, scores=None, threshold=0.5, title="graph topology"):
    """Draw the IP graph and append it to ``pdf``.

    Without ``scores`` every node is brown. With ``scores`` a node is red if
    its score exceeds ``threshold``, orange if it exceeds 0.3, and green
    otherwise; nodes with no score entry are treated as score 0.

    Args:
        data: Graph object whose ``raw_flows`` has ``src`` and ``dst`` columns.
        pdf: Open ``PdfPages`` object that receives the figure.
        scores: Optional per-node scores indexed by node id.
        threshold: Score above which a node is drawn red.
        title: Figure title.
    """
    flows = data.raw_flows
    G = nx.Graph()
    G.add_edges_from(zip(flows['src'], flows['dst']))

    def _colour_for(node):
        # Nodes beyond the end of `scores` default to a score of 0.
        s = scores[node] if node < len(scores) else 0
        if s > threshold:
            return 'red'
        if s > 0.3:
            return 'orange'
        return 'green'

    node_colours = 'brown' if scores is None else [_colour_for(node) for node in G]

    plt.figure(figsize=(8, 6))
    nx.draw(G, node_color=node_colours, with_labels=True, font_size=6, node_size=100)
    plt.title(title)
    pdf.savefig()
    plt.close()

#mitigation actions based on node scores
def get_mitigation_actions(data, scores, threshold_blacklist=0.5, threshold_limit=(0.3, 0.5),
                           threshold_sinkhole=0.8):
    """Map per-node scores to mitigation actions, most severe tier first.

    Tiers (first match wins):
        * ``score > threshold_sinkhole``            -> ``'sinkhole'``
        * ``score > threshold_blacklist``           -> ``'blacklist'``
        * ``threshold_limit[0] <= score <= [1]``    -> ``'rate limit 10mbps'``
        * anything else                             -> no action

    The sinkhole tier is checked before the blacklist tier; checking
    blacklist first would make it unreachable with the default thresholds.
    ``threshold_sinkhole`` is expected to be >= ``threshold_blacklist``.

    Args:
        data: Object with a ``node_ip_map`` dict (node index -> IP).
        scores: Iterable of per-node scores, ordered by node index.
        threshold_blacklist: Lower (exclusive) bound of the blacklist tier.
        threshold_limit: Inclusive ``(low, high)`` range for rate limiting.
        threshold_sinkhole: Lower (exclusive) bound of the sinkhole tier.

    Returns:
        List of ``(ip, action)`` tuples in node order; nodes missing from the
        map are reported as ``'node<idx>'``.
    """
    limit_low, limit_high = threshold_limit
    actions = []
    for idx, sc in enumerate(scores):
        if sc > threshold_sinkhole:
            action = 'sinkhole'
        elif sc > threshold_blacklist:
            action = 'blacklist'
        elif limit_low <= sc <= limit_high:
            action = 'rate limit 10mbps'
        else:
            continue
        ip = data.node_ip_map.get(idx, f'node{idx}')
        actions.append((ip, action))
    return actions

#counting as well as printing unique ips
def check_unique_ips(df):
    """Print how many distinct IPs appear as source, as destination and overall.

    Args:
        df: DataFrame with ``Source IP`` and ``Destination IP`` columns.

    Returns:
        The number of distinct IPs across both columns combined.
    """
    src = df['Source IP'].unique().size
    dst = df['Destination IP'].unique().size

    # Flatten both columns into one array to count IPs across the two roles.
    both_columns = df[['Source IP', 'Destination IP']].values.ravel()
    all_ips = len(pd.unique(both_columns))

    print(f"unique ips in source ip       : {src}")
    print(f"unique ips in dest ip         : {dst}")
    print(f"total unique ips              : {all_ips}")
    return all_ips

#flipping a fraction of attack labels to benign
def inject_label_noise(df, flip_fraction=0.1, seed=42):
    """Flip a random fraction of attack labels (1) to benign (0), in place.

    Args:
        df: DataFrame with a ``Label`` column; it is modified in place.
        flip_fraction: Fraction of the attack rows to relabel, in [0, 1].
        seed: Seed for the private random generator that picks the rows.

    Returns:
        The same ``df`` object, for call chaining.

    Raises:
        ValueError: If ``flip_fraction`` is outside [0, 1].
    """
    if not 0 <= flip_fraction <= 1:
        raise ValueError(f"flip_fraction must be within [0, 1], got {flip_fraction}")

    # Private generator: same draws as seeding the global RNG with `seed`,
    # but without resetting the global NumPy random state for other code.
    rng = np.random.RandomState(seed)

    # Row positions rather than index labels, so a frame with duplicate
    # index labels does not get extra rows flipped.
    attack_pos = np.flatnonzero((df['Label'] == 1).to_numpy())
    n_flip = int(len(attack_pos) * flip_fraction)
    if n_flip:
        chosen = rng.choice(attack_pos, n_flip, replace=False)
        df.iloc[chosen, df.columns.get_loc('Label')] = 0
    return df

# Data location: "ddos1" is the directory holding the next-day captures (a new
# set of attacks: LDAP, MSSQL, NetBIOS, ...). Each entry in `all_attacks` is
# the stem of one CSV file inside it.
data_dir = "ddos1"
all_attacks = [
    "LDAP", "MSSQL", "NetBIOS", "Portmap",
    "Syn", "UDP", "UDPLag"
]

# Every figure for every attack type is appended to one multi-page PDF report.
with PdfPages("gnngraphs(second half).pdf") as pdf:
    for atk in all_attacks:
        print(f"\n{atk}\n")
        # Load and augment the flows, report IP counts, then flip 10% of the
        # attack labels to benign as label noise.
        df = load_attack_type(data_dir, atk, nrows=100000, n_attacks=20, amount_of_noise=0.1)
        check_unique_ips(df)
        df = preprocess_for_graph(df)
        df = inject_label_noise(df, flip_fraction=0.1)

        # 70/30 train/test split of the flows, stratified on the label.
        # Each split is turned into its own graph by a separate build_graph call.
        train_flows, test_flows = train_test_split(
            df, test_size=0.3, stratify=df['Label'], random_state=42
        )
        train_data = build_graph(train_flows)
        test_data  = build_graph(test_flows)

        # Visualisation before scoring: training graph and its traffic volume.
        show_graph_topology(train_data, pdf, title=f"{atk}: pre attack topology")
        show_time_series(train_data, pdf)

        # Train on the training graph, evaluate on the held-out test graph.
        model = train(train_data, epochs=50, hidden_channels=32)
        y_true, y_scores, y_pred, acc, prec, rec, f1, tp, fp, tn, fn = evaluate_model(model, test_data)

        # Diagnostic plots.
        plot_roc_pr(y_true, y_scores, pdf)
        plot_threshold_metrics(y_true, y_scores, pdf)
        plot_confusion_heatmap(y_true, y_pred, pdf)
        plot_feature_importance(model, train_data, pdf)

        # Post-attack visualisation: test graph coloured by predicted score.
        show_graph_topology(test_data, pdf, scores=y_scores, title=f"{atk}: post attack topology")

        # Text-only summary page with the headline metrics.
        plt.figure(); plt.axis('off')
        summary = (
            f"attack: {atk}\n\n"
            f"accuracy: {acc:.4f}  precision: {prec:.4f}\n"
            f"recall: {rec:.4f}  f1: {f1:.4f}\n"
            f"tp: {tp}  fp: {fp}  tn: {tn}  fn: {fn}"
        )
        plt.text(0, 0.5, summary, fontsize=12, fontfamily="monospace")
        pdf.savefig(); plt.close()

        # Mitigation actions derived from the test-graph scores.
        actions = get_mitigation_actions(test_data, y_scores)
        print("\nmitigation actions:")
        for ip, act in actions:
            print(f"{ip}: {act}")
