# NSK-KDD

## Logistic Regression (no SMOTE) 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# === 0. Fix random seed for reproducibility ===
np.random.seed(42)

# === 1. Load binary-labeled train and test datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

# === 2. Split features and labels ===
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 3. One-hot encoding of categorical variables ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 4. Normalize features ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 5. Train logistic regression model ===
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

# === 6. Predict and calculate metrics ===
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]  # For AUC

# === 7. Confusion matrix and metrics ===
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
far = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Alarm Rate

# === 8. Print results ===
print("=== Classification Metrics ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR      : {far:.4f}")
print(f"AUC      : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics ===
Accuracy      : 0.7443
Precision     : 0.9142
Recall        : 0.6079
F1-Score      : 0.7302
FAR      : 0.0754
AUC      : 0.8276

=== Confusion Matrix ===
TN: 8978, FP: 732
FN: 5032, TP: 7800


## Logistic Regression (with SMOTE) 

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# === 0. Fix random seed for reproducibility ===
np.random.seed(42)

from imblearn.over_sampling import SMOTE

# === 1. Load binary-labeled datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

# === 2. Split features and labels ===
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 3. One-hot encode categorical columns ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 4. Normalize features ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 5. Apply SMOTE to training data only ===
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_scaled, y_train)

# === 6. Train logistic regression model ===
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_smote, y_train_smote)

# === 7. Predict and calculate metrics ===
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
far = fp / (fp + tn) if (fp + tn) > 0 else 0

# === 8. Print results ===
print("=== Classification Metrics (with SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"AUC           : {auc:.4f}")
print(f"FAR (False Alarm Rate): {far:.4f}")

print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (with SMOTE) ===
Accuracy      : 0.7458
Precision     : 0.9137
Recall        : 0.6112
F1-Score      : 0.7324
AUC           : 0.8227
FAR (False Alarm Rate): 0.0763

=== Confusion Matrix ===
TN: 8969, FP: 741
FN: 4989, TP: 7843


## Naive Bayes (no SMOTE) 

In [13]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# === 0. Fix random seed for reproducibility ===
np.random.seed(42)

# === 1. Load binary-labeled datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

# === 2. Split features and labels ===
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 3. One-hot encode categorical columns ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 4. Normalize features ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 5. Train Naive Bayes model ===
model = GaussianNB()
model.fit(X_train_scaled, y_train)

# === 6. Predict and evaluate ===
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
far = fp / (fp + tn) if (fp + tn) > 0 else 0

# === 7. Print results ===
print("=== Classification Metrics (Naive Bayes, No SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"AUC           : {auc:.4f}")
print(f"FAR (False Alarm Rate): {far:.4f}")

print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (Naive Bayes, No SMOTE) ===
Accuracy      : 0.5649
Precision     : 0.9800
Recall        : 0.2406
F1-Score      : 0.3864
AUC           : 0.7987
FAR (False Alarm Rate): 0.0065

=== Confusion Matrix ===
TN: 9647, FP: 63
FN: 9744, TP: 3088


## Naive Bayes (with SMOTE) 

In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# === 0. Fix random seed for reproducibility ===
np.random.seed(42)

from imblearn.over_sampling import SMOTE

# === 1. Load binary-labeled datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

# === 2. Split features and labels ===
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 3. One-hot encode categorical columns ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 4. Normalize features ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 5. Apply SMOTE to training data only ===
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_scaled, y_train)

# === 6. Train Naive Bayes model ===
model = GaussianNB()
model.fit(X_train_smote, y_train_smote)

# === 7. Predict and evaluate ===
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
far = fp / (fp + tn) if (fp + tn) > 0 else 0

# === 8. Print results ===
print("=== Classification Metrics (Naive Bayes with SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"AUC           : {auc:.4f}")
print(f"FAR (False Alarm Rate): {far:.4f}")

print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (Naive Bayes with SMOTE) ===
Accuracy      : 0.5650
Precision     : 0.9800
Recall        : 0.2408
F1-Score      : 0.3866
AUC           : 0.7978
FAR (False Alarm Rate): 0.0065

=== Confusion Matrix ===
TN: 9647, FP: 63
FN: 9742, TP: 3090


## Autoencoder (no SMOTE) 

In [2]:
# === Reproducibility setup (set BEFORE importing tensorflow) ===
import os
os.environ["PYTHONHASHSEED"] = "42"        # stable hashing in Python
os.environ["TF_DETERMINISTIC_OPS"] = "1"   # deterministic TF ops (CPU)
# Optional: force CPU only for strict determinism across machines
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import random
import numpy as np
random.seed(42)
np.random.seed(42)

import tensorflow as tf
tf.random.set_seed(42)
# If available in your TF version, also enforce global determinism:
try:
    tf.config.experimental.enable_op_determinism(True)
except Exception:
    pass

# === Libraries ===
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# === 1. Load datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 2. One-hot encoding ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 3. Normalize ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 4. Build and train autoencoder (deterministic) ===
input_dim = X_train_scaled.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=encoded)

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Nota: shuffle=False + validation_split produce un split determinista (toma el último 10%).
autoencoder.fit(
    X_train_scaled, X_train_scaled,
    epochs=20,
    batch_size=256,
    shuffle=False,          # <- clave para reproducibilidad
    validation_split=0.1,
    verbose=0
)

# === 5. Generate latent embeddings ===
X_train_embedded = encoder.predict(X_train_scaled, verbose=0)
X_test_embedded = encoder.predict(X_test_scaled, verbose=0)

# === 6. Define evaluation function ===
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1-Score": f1_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_proba),
        "FAR": fp / (fp + tn) if (fp + tn) > 0 else 0,
        "ConfusionMatrix": (tn, fp, fn, tp)
    }
    return metrics

# === 7. Train and evaluate classifiers on embeddings ===
logreg = LogisticRegression(random_state=42, max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=1)  # n_jobs=1 para reproducibilidad

results = []
results.append(evaluate_model("Logistic Regression on AE Embedding", logreg,
                              X_train_embedded, y_train, X_test_embedded, y_test))
results.append(evaluate_model("Random Forest on AE Embedding", rf,
                              X_train_embedded, y_train, X_test_embedded, y_test))

# === 8. Print comparison ===
for res in results:
    print(f"\n=== {res['Model']} ===")
    print(f"Accuracy      : {res['Accuracy']:.4f}")
    print(f"Precision     : {res['Precision']:.4f}")
    print(f"Recall        : {res['Recall']:.4f}")
    print(f"F1-Score      : {res['F1-Score']:.4f}")
    print(f"AUC           : {res['AUC']:.4f}")
    print(f"FAR           : {res['FAR']:.4f}")
    tn, fp, fn, tp = res['ConfusionMatrix']
    print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")


=== Logistic Regression on AE Embedding ===
Accuracy      : 0.7617
Precision     : 0.9165
Recall        : 0.6397
F1-Score      : 0.7534
AUC           : 0.9040
FAR           : 0.0770
Confusion Matrix: TN=8962, FP=748, FN=4624, TP=8208

=== Random Forest on AE Embedding ===
Accuracy      : 0.7661
Precision     : 0.9229
Recall        : 0.6428
F1-Score      : 0.7578
AUC           : 0.9350
FAR           : 0.0710
Confusion Matrix: TN=9021, FP=689, FN=4583, TP=8249


## Autoencoder (with SMOTE)

In [23]:
# === 1. Reproducibility setup ===
import os, random
os.environ["PYTHONHASHSEED"] = "42"
os.environ["TF_DETERMINISTIC_OPS"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Opcional: fuerza CPU

random.seed(42)
import numpy as np
np.random.seed(42)

import tensorflow as tf
tf.random.set_seed(42)
try:
    tf.config.experimental.enable_op_determinism(True)
except Exception:
    pass

# === 2. Libraries ===
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# === 3. Load datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 4. One-hot encoding ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 5. Normalize ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 6. Autoencoder definition ===
input_dim = X_train_scaled.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=encoded)

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# 🔑 shuffle=False para reproducibilidad (validation_split será determinista: último 10%)
autoencoder.fit(
    X_train_scaled, X_train_scaled,
    epochs=20,
    batch_size=256,
    shuffle=False,
    validation_split=0.1,
    verbose=0
)

# === 7. Embedding extraction ===
X_train_embedded = encoder.predict(X_train_scaled, verbose=0)
X_test_embedded = encoder.predict(X_test_scaled, verbose=0)

# === 8. Apply SMOTE to embeddings ===
sm = SMOTE(random_state=42)
X_train_embedded_smote, y_train_smote = sm.fit_resample(X_train_embedded, y_train)

# === 9. Evaluation function ===
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1-Score": f1_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_proba),
        "FAR": fp / (fp + tn) if (fp + tn) > 0 else 0,
        "ConfusionMatrix": (tn, fp, fn, tp)
    }

# === 10. Classifiers ===
logreg = LogisticRegression(random_state=42, max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=1)  # n_jobs=1 -> reproducible

results = []
results.append(evaluate_model("Logistic Regression + AE + SMOTE", logreg,
                              X_train_embedded_smote, y_train_smote,
                              X_test_embedded, y_test))

results.append(evaluate_model("Random Forest + AE + SMOTE", rf,
                              X_train_embedded_smote, y_train_smote,
                              X_test_embedded, y_test))

# === 11. Print results ===
for res in results:
    print(f"\n=== {res['Model']} ===")
    print(f"Accuracy      : {res['Accuracy']:.4f}")
    print(f"Precision     : {res['Precision']:.4f}")
    print(f"Recall        : {res['Recall']:.4f}")
    print(f"F1-Score      : {res['F1-Score']:.4f}")
    print(f"AUC           : {res['AUC']:.4f}")
    print(f"FAR           : {res['FAR']:.4f}")
    tn, fp, fn, tp = res["ConfusionMatrix"]
    print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")



=== Logistic Regression + AE + SMOTE ===
Accuracy      : 0.7654
Precision     : 0.9166
Recall        : 0.6467
F1-Score      : 0.7583
AUC           : 0.9043
FAR           : 0.0778
Confusion Matrix: TN=8955, FP=755, FN=4534, TP=8298

=== Random Forest + AE + SMOTE ===
Accuracy      : 0.7822
Precision     : 0.9652
Recall        : 0.6405
F1-Score      : 0.7700
AUC           : 0.9355
FAR           : 0.0305
Confusion Matrix: TN=9414, FP=296, FN=4613, TP=8219


### LDA (no SMOTE) 

In [24]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import numpy as np

# === 1. Reproducibility ===
np.random.seed(42)

# === 2. Load datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")

# === 3. Split features and labels ===
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 4. One-hot encode categorical columns ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)

X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 5. Normalize features ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 6. Train LDA classifier ===
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train_scaled, y_train)

# === 7. Predict and evaluate ===
y_pred = lda_model.predict(X_test_scaled)
y_proba = lda_model.predict_proba(X_test_scaled)[:, 1]

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# === 8. Compute metrics ===
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
far = fp / (fp + tn) if (fp + tn) > 0 else 0

# === 9. Print results ===
print("=== Classification Metrics (LDA as Classifier) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"AUC           : {auc:.4f}")
print(f"FAR (False Alarm Rate): {far:.4f}")

print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")


=== Classification Metrics (LDA as Classifier) ===
Accuracy      : 0.7616
Precision     : 0.9249
Recall        : 0.6326
F1-Score      : 0.7514
AUC           : 0.8484
FAR (False Alarm Rate): 0.0679

=== Confusion Matrix ===
TN: 9051, FP: 659, FN: 4714, TP: 8118


## LDA (with SMOTE) 

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# === 1. Reproducibility ===
np.random.seed(42)
tf.random.set_seed(42)

# === 2. Load datasets ===
train_df = pd.read_csv("KDDTrain_cleaned_binary.csv")
test_df = pd.read_csv("KDDTest_cleaned_binary.csv")
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# === 3. One-hot encode categorical features ===
X_all = pd.concat([X_train, X_test])
X_all_encoded = pd.get_dummies(X_all, drop_first=True)
X_train_encoded = X_all_encoded.iloc[:len(X_train), :]
X_test_encoded = X_all_encoded.iloc[len(X_train):, :]

# === 4. Normalize ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# === 5. Scenario A: SMOTE + LDA as classifier ===
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_scaled, y_train)

lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train_smote, y_train_smote)
y_pred_lda = lda_clf.predict(X_test_scaled)
y_proba_lda = lda_clf.predict_proba(X_test_scaled)[:, 1]

# === 6. Scenario B: SMOTE + AE + LDA + LR ===
# Train autoencoder
input_dim = X_train_scaled.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
encoder = Model(inputs=input_layer, outputs=encoded)

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=20, batch_size=256,
                shuffle=True, validation_split=0.1, verbose=0)

X_train_embedded = encoder.predict(X_train_scaled)
X_test_embedded = encoder.predict(X_test_scaled)

# Apply SMOTE to embedding
X_embed_smote, y_embed_smote = sm.fit_resample(X_train_embedded, y_train)

# Apply LDA on embedding after SMOTE
lda_proj = LinearDiscriminantAnalysis(n_components=1)
X_embed_lda_train = lda_proj.fit_transform(X_embed_smote, y_embed_smote)
X_embed_lda_test = lda_proj.transform(X_test_embedded)

# Train classifier (e.g., LR) on reduced embedding
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_embed_lda_train, y_embed_smote)
y_pred_proj = lr.predict(X_embed_lda_test)
y_proba_proj = lr.predict_proba(X_embed_lda_test)[:, 1]

# === 7. Evaluation function ===
def evaluate(name, y_true, y_pred, y_proba):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_true, y_proba),
        "FAR": fp / (fp + tn) if (fp + tn) > 0 else 0,
        "ConfusionMatrix": (tn, fp, fn, tp)
    }

results = [
    evaluate("SMOTE + LDA Classifier", y_test, y_pred_lda, y_proba_lda),
    evaluate("SMOTE + AE + LDA + LogisticRegression", y_test, y_pred_proj, y_proba_proj)
]

# === 8. Print results ===
for res in results:
    print(f"\n=== {res['Model']} ===")
    print(f"Accuracy      : {res['Accuracy']:.4f}")
    print(f"Precision     : {res['Precision']:.4f}")
    print(f"Recall        : {res['Recall']:.4f}")
    print(f"F1-Score      : {res['F1-Score']:.4f}")
    print(f"AUC           : {res['AUC']:.4f}")
    print(f"FAR           : {res['FAR']:.4f}")
    tn, fp, fn, tp = res["ConfusionMatrix"]
    print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")


[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 326us/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350us/step

=== SMOTE + LDA Classifier ===
Accuracy      : 0.7632
Precision     : 0.9238
Recall        : 0.6365
F1-Score      : 0.7537
AUC           : 0.8525
FAR           : 0.0694
Confusion Matrix: TN=9036, FP=674, FN=4664, TP=8168

=== SMOTE + AE + LDA + LogisticRegression ===
Accuracy      : 0.7451
Precision     : 0.9117
Recall        : 0.6114
F1-Score      : 0.7319
AUC           : 0.8669
FAR           : 0.0783
Confusion Matrix: TN=8950, FP=760, FN=4987, TP=7845
