<a href="https://colab.research.google.com/github/osahonokoro/anomaly-detection-isolationforest-cicids2017/blob/main/notebooks/04_features_engineering_and_selection2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Google Drive must be attached before anything under /content/drive can be
# read (run this only once after reconnecting).
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

# Read the anomaly-labelled dataset from Drive, trim stray whitespace from the
# header names, and derive a 0/1 column from the textual 'anomaly' labels.
df = pd.read_csv('/content/drive/MyDrive/anomaly_detection_output.csv')
df.columns = [column_name.strip() for column_name in df.columns]

label_to_binary = {'Normal': 0, 'Anomaly': 1}
df['binary_anomaly'] = df['anomaly'].map(label_to_binary)

# Quick sanity check: overall shape and the class balance of the 0/1 labels.
print("âœ… Reloaded dataset. Shape:", df.shape)
print(df['binary_anomaly'].value_counts())


Mounted at /content/drive
âœ… Reloaded dataset. Shape: (2827876, 82)
binary_anomaly
0    2771314
1      56562
Name: count, dtype: int64


In [3]:
#  One-Class SVM with Chunked Processing (100,000 rows per batch)
#
# Each batch is standardised and modelled on its own: a fresh StandardScaler
# and a fresh OneClassSVM are fitted per batch, so a prediction is only
# relative to the other rows of that same batch.
# Expects `df` (with 'anomaly' and 'binary_anomaly' columns) from the loading cell.

from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas as pd

# Parameters
chunk_size = 100_000
svm_preds = []       # 'Normal' / 'Anomaly' per row, in dataset order
binary_labels = []   # matching 0/1 reference labels taken from df['binary_anomaly']

# Drop non-feature columns once up front; batches are sliced from this frame
# instead of copying and re-dropping the label columns on every iteration.
base_features = df.drop(columns=['anomaly', 'binary_anomaly'], errors='ignore')
n_rows = len(df)

# Loop over chunks
for i in range(0, n_rows, chunk_size):
    # Clamp the slice end so the final (shorter) batch is reported with its
    # real last row rather than an index past the end of the dataset.
    stop = min(i + chunk_size, n_rows)
    features = base_features.iloc[i:stop]

    # Scale features (statistics are computed from this batch only)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)

    # Train One-Class SVM on the chunk
    svm = OneClassSVM(nu=0.02, kernel='rbf', gamma='scale')
    pred = svm.fit_predict(scaled)

    # fit_predict yields +1 for inliers and -1 for outliers; convert to readable labels
    pred = pd.Series(pred).map({1: 'Normal', -1: 'Anomaly'})
    svm_preds.extend(pred)

    # Store true labels for the same row range
    binary_labels.extend(df['binary_anomaly'].iloc[i:stop])

    print(f"âœ… Processed rows {i} to {stop - 1}")

# Final Evaluation: build the prediction Series once and reuse it for both
# the label counts and the 0/1 encoding passed to classification_report.
svm_pred_series = pd.Series(svm_preds)
print("\nðŸ“Š One-Class SVM Chunked Evaluation:")
print(svm_pred_series.value_counts())
print("\nðŸ“ˆ Classification Report:")
print(classification_report(binary_labels, svm_pred_series.map({'Normal': 0, 'Anomaly': 1})))


âœ… Processed rows 0 to 99999
âœ… Processed rows 100000 to 199999
âœ… Processed rows 200000 to 299999
âœ… Processed rows 300000 to 399999
âœ… Processed rows 400000 to 499999
âœ… Processed rows 500000 to 599999
âœ… Processed rows 600000 to 699999
âœ… Processed rows 700000 to 799999
âœ… Processed rows 800000 to 899999
âœ… Processed rows 900000 to 999999
âœ… Processed rows 1000000 to 1099999
âœ… Processed rows 1100000 to 1199999
âœ… Processed rows 1200000 to 1299999
âœ… Processed rows 1300000 to 1399999
âœ… Processed rows 1400000 to 1499999
âœ… Processed rows 1500000 to 1599999
âœ… Processed rows 1600000 to 1699999
âœ… Processed rows 1700000 to 1799999
âœ… Processed rows 1800000 to 1899999
âœ… Processed rows 1900000 to 1999999
âœ… Processed rows 2000000 to 2099999
âœ… Processed rows 2100000 to 2199999
âœ… Processed rows 2200000 to 2299999
âœ… Processed rows 2300000 to 2399999
âœ… Processed rows 2400000 to 2499999
âœ… Processed rows 2500000 to 2599999
âœ… Processed rows 2600000 to 2699999


In [4]:
# K-Means with Chunked Processing (500,000 rows per batch)
#
# The scaler and a 3-cluster K-Means model are fitted on the first 100,000
# rows of the CSV only. The file is then streamed in batches; every row is
# assigned to its nearest centroid, and rows that land in the cluster that
# was least populated on the fitting subset are labelled 'Anomaly'.
# NOTE(review): the fitting subset is the head of the file, not a random
# sample - confirm that it is representative of the whole dataset.

from sklearn.cluster import KMeans
from collections import Counter
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Input location and batch size
input_file = '/content/drive/MyDrive/anomaly_detection_output.csv'
chunk_size = 500_000

# Labelled batches are collected here and concatenated at the end
kmeans_results = []

# Fitting subset: first 100,000 rows with the label columns removed
sample_df = pd.read_csv(input_file, nrows=100_000)
sample_df.columns = sample_df.columns.str.strip()
base_sample = sample_df.drop(columns=['anomaly', 'binary_anomaly'], errors='ignore')

# Fit the scaler, then fit K-Means on the scaled subset
scaler = StandardScaler().fit(base_sample)
scaled_sample = scaler.transform(base_sample)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(scaled_sample)

# The cluster with the fewest members on the fitting subset is treated as the anomaly class
cluster_counts = Counter(kmeans.labels_)
smallest_cluster = min(cluster_counts.items(), key=lambda item: item[1])[0]
print(f"ðŸ“Œ Smallest cluster assumed to be anomaly: Cluster {smallest_cluster}")

# Stream the full file and label each batch with the already-fitted scaler and model
for chunk in pd.read_csv(input_file, chunksize=chunk_size):
    chunk.columns = chunk.columns.str.strip()
    features = chunk.drop(columns=['anomaly', 'binary_anomaly'], errors='ignore')

    scaled_chunk = scaler.transform(features)
    chunk_clusters = kmeans.predict(scaled_chunk)

    # Vectorised labelling: membership of the smallest cluster means 'Anomaly'
    chunk['kmeans_anomaly'] = np.where(chunk_clusters == smallest_cluster, 'Anomaly', 'Normal')

    kmeans_results.append(chunk)

# Stitch the batches back together and persist the labelled dataset
final_kmeans_df = pd.concat(kmeans_results, ignore_index=True)
final_kmeans_df.to_csv('/content/drive/MyDrive/kmeans_anomaly_output.csv', index=False)

# Summary of how many rows received each label
print("\nðŸ“Š K-Means Anomaly Detection Results:")
print(final_kmeans_df['kmeans_anomaly'].value_counts())


ðŸ“Œ Smallest cluster assumed to be anomaly: Cluster 1

ðŸ“Š K-Means Anomaly Detection Results:
kmeans_anomaly
Normal     2507971
Anomaly     319905
Name: count, dtype: int64


In [5]:
from sklearn.metrics import classification_report

# Both the reference labels ('anomaly') and the K-Means labels ('kmeans_anomaly')
# are 'Normal'/'Anomaly' strings; encode each as 0/1 so they can be compared.
label_to_binary = {'Normal': 0, 'Anomaly': 1}
for source_col, target_col in (('anomaly', 'binary_anomaly'),
                               ('kmeans_anomaly', 'kmeans_binary')):
    final_kmeans_df[target_col] = final_kmeans_df[source_col].map(label_to_binary)

# Per-class precision / recall / F1 of the K-Means labels against the reference labels
print("ðŸ“ˆ Classification Report: K-Means vs Ground Truth")
kmeans_report = classification_report(
    final_kmeans_df['binary_anomaly'],
    final_kmeans_df['kmeans_binary'],
)
print(kmeans_report)


ðŸ“ˆ Classification Report: K-Means vs Ground Truth
              precision    recall  f1-score   support

           0       0.99      0.90      0.95   2771314
           1       0.14      0.77      0.23     56562

    accuracy                           0.90   2827876
   macro avg       0.57      0.83      0.59   2827876
weighted avg       0.98      0.90      0.93   2827876



In [None]:
# Chunked LOF Implementation (500k per batch)
#
# Local Outlier Factor is fitted separately on every 500,000-row slice of the
# dataset, with features standardised per slice, and the per-slice 0/1
# predictions are stitched back together in row order.

# Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

# Reload the dataset if needed, trimming header whitespace and deriving 0/1 labels
df = pd.read_csv('/content/drive/MyDrive/anomaly_detection_output.csv')
df.columns = [column_name.strip() for column_name in df.columns]
df['binary_anomaly'] = df['anomaly'].map({'Normal': 0, 'Anomaly': 1})

# Parameters
chunk_size = 500_000
n_neighbors = 20
contamination = 0.02

# One 0/1 prediction per row, appended slice by slice
lof_preds = []

for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i+chunk_size].copy()
    features = chunk.drop(columns=['anomaly', 'binary_anomaly'], errors='ignore')

    # Standardise using statistics from this slice only
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    # fit_predict marks outliers with -1; recode so that 1 = anomaly, 0 = normal
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    pred = (lof.fit_predict(X_scaled) == -1).astype(int)

    lof_preds.extend(pred)

# Attach the predictions to the frame (same row order as the slices above)
df['lof_anomaly'] = lof_preds

# Evaluate: label counts first, then the per-class report against the reference labels
print("ðŸ“Š LOF Chunked Evaluation:")
print(df['lof_anomaly'].value_counts())

print("\nðŸ“ˆ Classification Report: LOF vs Ground Truth")
lof_report = classification_report(df['binary_anomaly'], df['lof_anomaly'])
print(lof_report)
