<a href="https://colab.research.google.com/github/manvichand/Anomaly-Detection-on-Security-Logs/blob/main/log_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Dataset: Loaded from `/content/drive/My Drive/CICIDS2017/MachineLearningCVE/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv`.

2. Model: Isolation Forest with 100 estimators and 5% contamination, trained on 78 numeric features.

3. Output: Predictions (`anomaly_score`: 1 = normal, -1 = anomaly; `is_anomaly`: 0 = normal, 1 = anomaly) alongside the original `" Label"` column.

In [None]:
!pip install pandas scikit-learn



In [None]:
import os
import pandas as pd
from sklearn.ensemble import IsolationForest

In [None]:
# Global configuration values (the contents of a would-be config.py).
# Not needed by the rest of the notebook at the moment.

import os

# --- Filesystem locations ---------------------------------------------------
# os.path.join picks the separator of the host OS, so MALWARE_IMAGE_PATH is
# "datasets/malware_images" on Unix and "datasets\malware_images" on Windows.
DATASET_PATH = os.path.join("datasets", "logs.csv")
# Location that could be used to read or save image files while the program runs.
MALWARE_IMAGE_PATH = os.path.join("datasets", "malware_images")
# Folder intended to hold trained machine-learning models (e.g. network
# weights or other model files) once they have been built or trained.
MODEL_SAVE_PATH = "models"

# --- ML hyperparameters -----------------------------------------------------
ANOMALY_DETECTION_PARAMS = dict(n_estimators=100, contamination=0.05)
VISION_TRANSFORMER_MODEL = "google/vit-base-patch16-224"
LLM_MODEL = "gpt-4"


`ANOMALY_DETECTION_PARAMS = {"n_estimators": 100, "contamination": 0.05}`
Here, the `contamination` parameter is the expected proportion of anomalies (outliers) in the dataset, set to 5% (0.05). Anomaly detection algorithms such as Isolation Forest use it to determine the threshold for classifying data points as anomalies.

In [None]:
# Anomaly Detection on Security Logs in Google Colab

# Install libraries
!pip install pandas scikit-learn

# Import libraries
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from google.colab import drive

# Mount Google Drive
drive.mount("/content/drive")

# Define paths and hyperparameters
ANOMALY_DETECTION_PARAMS = {"n_estimators": 100, "contamination": 0.05}
DATASET_DIR = "/content/drive/My Drive/CICIDS2017/MachineLearningCVE"
DATASET_PATH = os.path.join(DATASET_DIR, "Monday-WorkingHours.pcap_ISCX.csv")
ZIP_PATH = "/content/MachineLearningCVE.zip"
URL = "http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip"

# Download and prepare dataset
if not os.path.exists(DATASET_PATH):
    print("Dataset not found, downloading and extracting...")
    !mkdir -p "{DATASET_DIR}"
    if os.path.exists(ZIP_PATH):
        print("Removing old ZIP file...")
        !rm "{ZIP_PATH}"
    print(f"Downloading from: {URL}")
    !wget -O "{ZIP_PATH}" "{URL}"
    if os.path.exists(ZIP_PATH) and os.path.getsize(ZIP_PATH) > 200000000:
        print("Download successful, verifying size...")
        !ls -lh "{ZIP_PATH}"
        print("Extracting files...")
        !unzip -o "{ZIP_PATH}" -d "{DATASET_DIR}"
        !ls "{DATASET_DIR}"
    else:
        raise FileNotFoundError("Failed to download a valid ZIP file")
    if not os.path.exists(DATASET_PATH):
        print("Searching for Monday-WorkingHours.pcap_ISCX.csv...")
        for root, dirs, files in os.walk(DATASET_DIR):
            if "Monday-WorkingHours.pcap_ISCX.csv" in files:
                DATASET_PATH = os.path.join(root, "Monday-WorkingHours.pcap_ISCX.csv")
                print(f"Found at: {DATASET_PATH}")
                break
        else:
            raise FileNotFoundError("Could not find Monday-WorkingHours.pcap_ISCX.csv")

class LogAnalyzer:
    """Analyzes security logs to detect anomalies using Isolation Forest.

    Typical use: ``load_data()`` -> ``train_model(df)`` -> ``detect_anomalies(df)``.
    The numeric columns used for training are remembered so that prediction
    always sees exactly the same features, even after result columns have
    been added to the frame.
    """

    # Columns written by detect_anomalies(); they are never model inputs.
    _RESULT_COLUMNS = ("anomaly_score", "is_anomaly")

    # Feature columns recorded by train_model(); None until a model is trained.
    feature_columns_ = None

    def __init__(self, random_state=42):
        """Initializes the LogAnalyzer with an Isolation Forest model.

        Args:
            random_state: Seed passed to IsolationForest so repeated runs give
                the same result. A ``random_state`` entry in
                ANOMALY_DETECTION_PARAMS takes precedence over this argument.
        """
        params = {"random_state": random_state, **ANOMALY_DETECTION_PARAMS}
        self.model = IsolationForest(**params)
        self.feature_columns_ = None

    def load_data(self, path=None):
        """Loads security log data from a CSV file.

        Args:
            path: CSV file to read. Defaults to the module-level DATASET_PATH.

        Returns:
            The loaded DataFrame, with column names exactly as in the file.
        """
        source = DATASET_PATH if path is None else path
        print(f"Loading data from: {source}")
        df = pd.read_csv(source)
        print("Columns in loaded DataFrame:", df.columns.tolist())
        return df

    def _numeric_feature_columns(self, df):
        """Returns the float64/int64 column names of df, minus result columns."""
        numeric_df = df.select_dtypes(include=['float64', 'int64'])
        return [col for col in numeric_df.columns if col not in self._RESULT_COLUMNS]

    def train_model(self, df):
        """Trains the Isolation Forest model on the numeric columns of df.

        The selected column names are stored in ``feature_columns_`` for use
        by detect_anomalies().
        """
        # NOTE(review): the model is fitted on the raw numeric values; if the
        # data can contain NaN or +/-inf, confirm the estimator accepts them
        # or sanitize the frame before calling this method.
        self.feature_columns_ = self._numeric_feature_columns(df)
        self.model.fit(df[self.feature_columns_])

    def detect_anomalies(self, df):
        """Detects anomalies in log data using Isolation Forest.

        Adds two columns to df IN PLACE and returns the same object:
        ``anomaly_score`` (the model's predict() output, -1 marking an
        anomaly) and ``is_anomaly`` (1 where anomaly_score == -1, else 0).

        Raises:
            ValueError: if df lacks a column the model was trained on.
        """
        columns = self.feature_columns_
        if columns is None:
            # No training call was recorded (e.g. a pre-fitted model was
            # assigned to self.model); fall back to dtype-based selection.
            columns = self._numeric_feature_columns(df)
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise ValueError(f"Input is missing columns used for training: {missing}")
        df["anomaly_score"] = self.model.predict(df[columns])
        df["is_anomaly"] = (df["anomaly_score"] == -1).astype("int64")
        return df

if __name__ == "__main__":
    # End-to-end run: load the CSV, fit the model, flag anomalies, then (when
    # ground-truth labels are available) compare the flags with the labels.
    log_analyzer = LogAnalyzer()
    df = log_analyzer.load_data()
    log_analyzer.train_model(df)
    anomalies = log_analyzer.detect_anomalies(df)

    # The label column name carries a leading space in this dataset's header.
    has_labels = " Label" in anomalies.columns

    print("\nFirst 5 rows with anomaly detection results:")
    if has_labels:
        print(anomalies[[" Label", "anomaly_score", "is_anomaly"]].head())
    else:
        print("Label column not found, printing without it:")
        print(anomalies[["anomaly_score", "is_anomaly"]].head())
    print(f"\nNumber of detected anomalies: {anomalies['is_anomaly'].sum()}")

    # Validation against the ground-truth labels. This is only possible when
    # the label column exists and there is at least one row to compare.
    if has_labels and len(anomalies) > 0:
        # Step 1: "true_anomaly" is 0 for "BENIGN" rows and 1 for everything else.
        anomalies["true_anomaly"] = (anomalies[" Label"] != "BENIGN").astype("int64")
        # Step 2: side-by-side view of label, prediction and truth for 10 rows.
        print("\nComparison with true labels:")
        print(anomalies[[" Label", "is_anomaly", "true_anomaly"]].head(10))
        # Step 3: count the rows where the prediction equals the truth.
        matches = (anomalies["is_anomaly"] == anomalies["true_anomaly"]).sum()
        # Step 4: report the agreement as a fraction and a percentage.
        # NOTE(review): this is a raw agreement rate; when almost all rows
        # share one label it mostly reflects that majority class.
        print(f"\nAgreement with true labels: {matches}/{len(anomalies)} ({matches/len(anomalies)*100:.2f}%)")
    else:
        print("\nSkipping comparison with true labels: no ' Label' column or no rows.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset not found, downloading and extracting...
Removing old ZIP file...
Downloading from: http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
--2025-03-22 15:35:43--  http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
Resolving cicresearch.ca (cicresearch.ca)... 205.174.165.80
Connecting to cicresearch.ca (cicresearch.ca)|205.174.165.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 235102953 (224M) [application/zip]
Saving to: ‘/content/MachineLearningCVE.zip’


2025-03-22 15:36:02 (12.1 MB/s) - ‘/content/MachineLearningCVE.zip’ saved [235102953/235102953]

Download successful, verifying size...
-rw-r--r-- 1 root root 225M Feb  1  2024 /content/MachineLearningCVE.zip
Extracting files...
Archive:  /content/MachineLearningCVE.zip
  inflating: /con

In [None]:
# Probe the dataset URL with wget's --spider mode, which checks that the remote file exists instead of saving it.
!wget --spider "http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip" #checked if the remote download file exists.

Spider mode enabled. Check if remote file exists.
--2025-03-22 14:59:57--  http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
Resolving cicresearch.ca (cicresearch.ca)... 205.174.165.80
Connecting to cicresearch.ca (cicresearch.ca)|205.174.165.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 235102953 (224M) [application/zip]
Remote file exists.

