# In [None]:  (Jupyter cell marker left over from the notebook export)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# ----------------------------------------------------------------------
# Settings
# ----------------------------------------------------------------------
EPOCHS = 100  # passed to model.fit as the maximum number of epochs
BATCH_SIZE = 256  # mini-batch size passed to model.fit
RANDOM_STATE = 42  # seed for the train/val/test splits

# CSV that load_dataset() reads with pandas.
DATA_URL = "https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/pcap_data.csv"
# Directory where the weight checkpoints are written; created at import time
# if it does not exist yet.
SAVE_DIR = "./savemodels_pcap"
os.makedirs(SAVE_DIR, exist_ok=True)

# ----------------------------------------------------------------------
# Data loading & preprocessing
# ----------------------------------------------------------------------
def _fit_min_max(reference):
    """Return the per-column ``(minimum, range)`` of *reference*.

    Columns that are constant in *reference* get a range of 1.0 so that the
    later division is always well defined.
    """
    col_min = reference.min(axis=0)
    col_range = reference.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0
    return col_min, col_range


def load_dataset(data_path=None, label_col="target"):
    """Load the CSV, binarise the label and build scaled train/val/test splits.

    Args:
        data_path: Path or URL of the CSV file. ``None`` (the default) reads
            the module-level ``DATA_URL``.
        label_col: Name of the label column. Values greater than 0 become
            class 1, everything else class 0.

    Returns:
        An 11-tuple ``(X_train, y_train, y_train_int, X_val, y_val,
        y_val_int, X_test, y_test, y_test_int, le, n_classes)`` where the
        ``X_*`` arrays are float32 feature matrices, ``y_*`` are one-hot
        labels, ``y_*_int`` are the integer-encoded labels, ``le`` is the
        fitted ``LabelEncoder`` and ``n_classes`` the number of classes.

    Raises:
        ValueError: If *label_col* is not a column of the CSV.
    """
    source = DATA_URL if data_path is None else data_path
    df = pd.read_csv(source)

    print("Columns:", df.columns.tolist())
    print("Shape:", df.shape)

    # -----------------------------
    # 1. Split features / label
    # -----------------------------
    if label_col not in df.columns:
        raise ValueError(f"Δεν βρέθηκε στήλη '{label_col}'. Έλεγξε τα ονόματα στηλών.")

    # Collapse the label to a binary problem: 0 stays 0, any positive value -> 1.
    y_raw = (df[label_col] > 0).astype(int)

    # For simplicity only numeric feature columns are kept; NaNs become 0.
    X = df.drop(columns=[label_col]).select_dtypes(include=[np.number]).fillna(0)

    # -----------------------------
    # 2. Label encoding
    # -----------------------------
    le = LabelEncoder()
    y_int = le.fit_transform(y_raw)
    n_classes = len(le.classes_)
    print("Classes:", le.classes_)
    print("n_classes:", n_classes)

    X = X.to_numpy().astype("float32")

    # -----------------------------
    # 3. Train / Val / Test split
    # -----------------------------
    X_train, X_test, y_train_int, y_test_int = train_test_split(
        X, y_int, test_size=0.20, stratify=y_int, random_state=RANDOM_STATE
    )

    # 0.8 * 0.125 = 0.10 -> 70/10/20 overall.
    X_train, X_val, y_train_int, y_val_int = train_test_split(
        X_train, y_train_int, test_size=0.125, stratify=y_train_int, random_state=RANDOM_STATE
    )

    # -----------------------------
    # 4. Feature normalisation (min-max)
    # -----------------------------
    # The statistics come from the training split only, so nothing about the
    # validation/test distributions leaks into preprocessing. Held-out values
    # may therefore fall slightly outside [0, 1].
    col_min, col_range = _fit_min_max(X_train)
    X_train = (X_train - col_min) / col_range
    X_val = (X_val - col_min) / col_range
    X_test = (X_test - col_min) / col_range

    # One-hot targets for the categorical cross-entropy loss.
    y_train = to_categorical(y_train_int, n_classes)
    y_val = to_categorical(y_val_int, n_classes)
    y_test = to_categorical(y_test_int, n_classes)

    return (X_train, y_train, y_train_int,
            X_val, y_val, y_val_int,
            X_test, y_test, y_test_int,
            le, n_classes)


# ----------------------------------------------------------------------
# Model definition (a simple MLP)
# ----------------------------------------------------------------------
def build_model(input_dim, n_classes, lr=1e-3, hidden_units=(128, 64), dropout_rate=0.3):
    """Build and compile a fully connected softmax classifier.

    Args:
        input_dim: Number of input features.
        n_classes: Number of output classes (width of the softmax layer).
        lr: Learning rate for the Adam optimizer.
        hidden_units: Width of each hidden ReLU layer, in order. The default
            reproduces the original 128 -> 64 architecture.
        dropout_rate: Dropout rate applied after every hidden layer.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and reporting accuracy.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing ``input_shape`` to the
    # first Dense layer, which newer Keras versions flag as deprecated.
    model.add(tf.keras.Input(shape=(input_dim,)))
    for units in hidden_units:
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, activation="softmax"))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# ----------------------------------------------------------------------
# Main program
# ----------------------------------------------------------------------
def _list_checkpoints(save_dir, prefix, suffix):
    """Return ``{path: mtime}`` for files in *save_dir* matching prefix/suffix."""
    found = {}
    for name in os.listdir(save_dir):
        if name.startswith(prefix) and name.endswith(suffix):
            path = os.path.join(save_dir, name)
            found[path] = os.path.getmtime(path)
    return found


def _plot_curves(train_values, val_values, train_label, val_label, title, ylabel):
    """Draw one training-vs-validation curve on a new figure."""
    epochs_range = range(len(train_values))
    plt.figure()
    plt.plot(epochs_range, train_values, "b", label=train_label)
    plt.plot(epochs_range, val_values, "r.", label=val_label)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(ylabel)
    plt.legend()


def main():
    """Train the MLP on the PCAP dataset, evaluate it and plot the results.

    Steps: load the splits, compute balanced class weights, train with early
    stopping and checkpointing, reload the best checkpoint written by this
    run, report test loss/accuracy and the confusion matrix, save the
    training history to ``history_pcap.npy`` and show the plots.
    """
    # Load data
    (X_train, y_train, y_train_int,
     X_val, y_val, y_val_int,
     X_test, y_test, y_test_int,
     label_encoder, n_classes) = load_dataset()

    inshape = X_train.shape[1]
    print("Train shape:", X_train.shape)
    print("Val shape:", X_val.shape)
    print("Test shape:", X_test.shape)

    # Balanced class weights, keyed by the class label itself rather than by
    # its position in the weight array.
    present_classes = np.unique(y_train_int)
    class_weights_arr = class_weight.compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=y_train_int,
    )
    class_weights = {
        int(cls): float(weight)
        for cls, weight in zip(present_classes, class_weights_arr)
    }
    print("Class weights:", class_weights)

    # Model
    model = build_model(inshape, n_classes, lr=1e-3)
    model.summary()

    # Callbacks
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=15,
        mode="min",
        restore_best_weights=True,
        verbose=1,
    )

    ckpt_prefix = "pcap_ids.weights."
    ckpt_suffix = ".weights.h5"
    checkpoint_path = os.path.join(
        SAVE_DIR, "pcap_ids.weights.{epoch:03d}-{val_accuracy:.4f}.weights.h5"
    )
    model_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor="val_accuracy",
        mode="max",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )

    # Snapshot the checkpoints that already exist so files left behind by
    # earlier runs are never mistaken for this run's best model.
    stale = _list_checkpoints(SAVE_DIR, ckpt_prefix, ckpt_suffix)

    # Training
    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        class_weight=class_weights,
        shuffle=True,
        callbacks=[early_stopping, model_checkpoint],
    )

    # With save_best_only=True a checkpoint is written only when val_accuracy
    # improves, so the most recently written file of this run is the best one.
    # NOTE: loading it replaces the best-val_loss weights that EarlyStopping
    # may have restored.
    current = _list_checkpoints(SAVE_DIR, ckpt_prefix, ckpt_suffix)
    fresh = [path for path, mtime in current.items() if stale.get(path) != mtime]
    if fresh:
        best_model_path = max(fresh, key=current.get)
        print("Best model file:", os.path.basename(best_model_path))
        model.load_weights(best_model_path)

    # Evaluation on the test set
    print("Evaluating on test set...")
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

    # Confusion matrix; ``labels`` pins row/column order to the encoder's
    # class order so it always lines up with the display labels below.
    y_pred_proba = model.predict(X_test)
    y_pred_int = y_pred_proba.argmax(axis=-1)

    cm = confusion_matrix(y_test_int, y_pred_int, labels=np.arange(n_classes))
    print("Confusion matrix:\n", cm)

    # Per-class accuracy = correctly classified / all true samples of the class.
    labels = label_encoder.classes_
    row_totals = cm.sum(axis=1)
    print("Accuracy per class:")
    for i, cls in enumerate(labels):
        cls_acc = cm[i, i] / row_totals[i] if row_totals[i] > 0 else 0.0
        print(f"{cls:<20} = {cls_acc:.4f}")

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    fig, ax = plt.subplots(figsize=(10, 10))
    disp.plot(ax=ax, xticks_rotation=45)
    plt.title("Confusion Matrix - PCAP IDS")
    plt.tight_layout()

    # Training history
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    # Persist the history (optional)
    np.save("history_pcap.npy", [acc, val_acc, loss, val_loss])

    _plot_curves(acc, val_acc, "Training acc", "Validation acc",
                 "Training and validation accuracy", "Accuracy")
    _plot_curves(loss, val_loss, "Training loss", "Validation loss",
                 "Training and validation loss", "Loss")

    plt.show()


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
