In [1]:
# Install everything the notebook imports. `%pip` targets the running kernel's environment
# explicitly (no reliance on automagic) and `-q` keeps the download log out of the output.
%pip install -q torch transformers scikit-learn numpy pandas tensorflow joblib


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# NOTE(review): `docker` is not imported by any cell visible in this notebook — confirm it is needed.
# `%pip` targets the running kernel's environment; `-q` suppresses the progress-bar output.
%pip install -q docker

Collecting docker
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/147.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: docker
Successfully installed docker-7.1.0


In [1]:
import random
import re
import torch
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed

# ========================================================
# 1️⃣ Generate Synthetic Docker Log Dataset
# ========================================================
def generate_synthetic_logs(num_entries=500, anomaly_ratio=0.2, seed=None):
    """
    Generates a synthetic, shuffled list of Docker log entries.

    Each entry has the form "<YYYY-MM-DD HH:MM:SS> <container> | <LEVEL>: <message>",
    with a timestamp drawn from the two hours following 2025-03-27 12:00:00.

    - num_entries: Total number of log entries.
    - anomaly_ratio: Fraction of logs that are anomalous; must be within [0, 1].
      The anomaly count is int(num_entries * anomaly_ratio); the rest are normal.
    - seed: Optional seed. When given, a private random generator is used so the
      output is reproducible and the global `random` state is left untouched.
      When None (default), the global `random` module is used as before.

    Returns a list of log strings.
    Raises ValueError if anomaly_ratio is outside [0, 1].
    """
    # Without this check a ratio > 1 (or < 0) silently yields more than num_entries logs.
    if not 0 <= anomaly_ratio <= 1:
        raise ValueError(f"anomaly_ratio must be between 0 and 1, got {anomaly_ratio!r}")

    # The `random` module exposes the same randint/choice/shuffle API as a Random instance.
    rng = random.Random(seed) if seed is not None else random

    normal_templates = [
        "web-container | INFO: Server started successfully",
        "db-container | INFO: Connection established",
        "cache-container | INFO: Cache hit for key 'user:1234'",
        "worker-container | INFO: Task completed in 5.2 seconds",
        "api-container | INFO: Received request from 192.168.1.5",
        "proxy-container | INFO: Forwarding request to backend service",
        "scheduler-container | INFO: Scheduled job executed",
        "auth-container | INFO: User login successful",
        "monitoring-container | INFO: Health check passed",
        "service-container | INFO: Updated configuration loaded"
    ]

    anomalous_templates = [
        "web-container | ERROR: Unexpected shutdown detected",
        "db-container | CRITICAL: Connection timeout. Unable to connect to database.",
        "api-container | ERROR: 500 Internal Server Error encountered",
        "auth-container | WARNING: Unauthorized access attempt detected",
        "proxy-container | ERROR: Failed to forward request. Connection refused.",
        "worker-container | CRITICAL: Task execution failed due to unexpected exception",
        "vulnerable-web-container | ERROR: python3: can't open file '/opt/vulnerable-web-app/http_server.py': [Errno 2] No such file or directory",
        "logspout | ERROR: Connection refused when attempting to write to syslog",
        "cache-container | WARNING: Cache miss encountered for key 'session:9876'",
        "monitoring-container | CRITICAL: Health check failed for container, status: down"
    ]

    base_date = datetime(2025, 3, 27, 12, 0, 0)
    num_anomalies = int(num_entries * anomaly_ratio)
    num_normal = num_entries - num_anomalies

    def make_entry(templates):
        # Timestamp is drawn before the template, matching the original draw order.
        offset = timedelta(minutes=rng.randint(0, 120))
        ts = (base_date + offset).strftime("%Y-%m-%d %H:%M:%S")
        template = rng.choice(templates)
        return f"{ts} {template}"

    # Normal entries first, then anomalous ones; the shuffle below mixes them.
    logs = [make_entry(normal_templates) for _ in range(num_normal)]
    logs.extend(make_entry(anomalous_templates) for _ in range(num_anomalies))

    rng.shuffle(logs)
    return logs

# Seed Python's RNG so that "Restart & Run All" regenerates the same synthetic dataset
random.seed(42)

# Use the synthetic data function to generate 500 log entries
logs = generate_synthetic_logs(num_entries=500, anomaly_ratio=0.2)
print(f"✅ Generated {len(logs)} synthetic log entries.")

# ========================================================
# 2️⃣ PREPROCESS LOGS
# ========================================================
def preprocess_logs(logs):
    """
    Strips the leading "<date> <time> <container> | " prefix from every log entry.

    The pattern matches three whitespace-separated tokens followed by a pipe
    separator at the start of the line; entries that do not start that way are
    only whitespace-trimmed. Adjust the regex if your logs use another layout.
    Returns a new list of cleaned strings in the same order as the input.
    """
    # Three non-space groups (date, time, container name), then the " | " separator
    prefix_pattern = r'^\S+\s+\S+\s+\S+\s+\|\s+'
    return [re.sub(prefix_pattern, '', entry).strip() for entry in logs]

cleaned_logs = preprocess_logs(logs)
# Show a small sample instead of dumping every cleaned entry into the cell output
print(cleaned_logs[:5])

# ========================================================
# 3️⃣ CONVERT LOGS TO BERT EMBEDDINGS
# ========================================================
# Load the pretrained tokenizer and encoder; both come from the same checkpoint name.
print("⏳ Loading BERT model...")
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
print("✅ BERT model loaded!")

def get_bert_embeddings(logs, model, tokenizer):
    """
    Embeds each log entry with the given model and returns the vectors as one array.

    Every entry is tokenized and run through the model on its own (no batching);
    its embedding is the mean of the last hidden state over the token axis.
    The model is switched to eval mode first and gradients are disabled.
    Returns a NumPy array built from the per-log vectors, in input order.
    """
    model.eval()

    def embed_one(text):
        tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            hidden = model(**tokens).last_hidden_state
        # Average across tokens (dim=1), then drop the singleton dimensions
        pooled = hidden.mean(dim=1)
        return pooled.squeeze().numpy()

    return np.array([embed_one(text) for text in logs])

# Embed the cleaned log messages; X holds one mean-pooled embedding vector per log entry.
print("⏳ Extracting BERT embeddings...")
X = get_bert_embeddings(cleaned_logs, bert_model, tokenizer)
print(f"✅ Extracted embeddings for {len(X)} logs.")

# ========================================================
# 4️⃣ TRAIN LSTM AUTOENCODER
# ========================================================
# Percentile of the reconstruction errors above which a log is reported as anomalous
TRAIN_ANOMALY_PERCENTILE = 90

# Normalize the embeddings
# NOTE(review): the scaler is fit on all samples before the train/validation split below,
# so the validation samples influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Reshape for LSTM input: (samples, time steps, features); each log is a sequence of length 1
n_samples, n_features = X_scaled.shape
X_reshaped = X_scaled.reshape((n_samples, 1, n_features))

# Build the LSTM Autoencoder model:
# encoder (128 -> 64 units), RepeatVector restores the single time step,
# decoder (64 -> 128 units), and a per-time-step Dense layer maps back to n_features.
autoencoder = Sequential([
    LSTM(128, activation='relu', input_shape=(1, n_features), return_sequences=True),
    LSTM(64, activation='relu', return_sequences=False),
    RepeatVector(1),
    LSTM(64, activation='relu', return_sequences=True),
    LSTM(128, activation='relu', return_sequences=True),
    TimeDistributed(Dense(n_features))
])

autoencoder.compile(optimizer='adam', loss='mse')

print("⏳ Training LSTM autoencoder...")
# The autoencoder is trained to reproduce its own input, so inputs double as targets
X_train, X_test = train_test_split(X_reshaped, test_size=0.2, random_state=42)
autoencoder.fit(X_train, X_train, epochs=10, batch_size=16, validation_data=(X_test, X_test))
print("✅ Training complete!")

# ========================================================
# 5️⃣ DETECT ANOMALIES
# ========================================================
print("⏳ Detecting anomalies...")
X_pred = autoencoder.predict(X_reshaped)
# Mean absolute reconstruction error per sample (averaged over time step and feature axes)
errors = np.mean(np.abs(X_pred - X_reshaped), axis=(1, 2))

# Set anomaly threshold as the TRAIN_ANOMALY_PERCENTILE-th (90th) percentile of the
# reconstruction errors; logs whose error exceeds it are reported
threshold = np.percentile(errors, TRAIN_ANOMALY_PERCENTILE)
anomalies = [log for log, error in zip(cleaned_logs, errors) if error > threshold]

print("\n🚨 Detected Anomalies:")
for anomaly in anomalies:
    print(anomaly)

import joblib
from tensorflow.keras.models import load_model

# Save the trained autoencoder model
autoencoder.save("lstm_autoencoder.h5")
print("✅ Model saved as lstm_autoencoder.h5")

# Save the MinMaxScaler
joblib.dump(scaler, "scaler.pkl")
print("✅ Scaler saved as scaler.pkl")



✅ Generated 500 synthetic log entries.
⏳ Loading BERT model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

✅ BERT model loaded!
⏳ Extracting BERT embeddings...
✅ Extracted embeddings for 500 logs.


  super().__init__(**kwargs)


⏳ Training LSTM autoencoder...
Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 58ms/step - loss: 0.3170 - val_loss: 0.1131
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0878 - val_loss: 0.0680
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0688 - val_loss: 0.0673
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - loss: 0.0685 - val_loss: 0.0661
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 0.0653 - val_loss: 0.0592
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - loss: 0.0588 - val_loss: 0.0546
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.0542 - val_loss: 0.0483
Epoch 8/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0449 - val_loss: 0.0428
Epoch 9/10
[1m25




🚨 Detected Anomalies:
CRITICAL: Health check failed for container, status: down
ERROR: 500 Internal Server Error encountered
ERROR: Connection refused when attempting to write to syslog
ERROR: Connection refused when attempting to write to syslog
ERROR: 500 Internal Server Error encountered
ERROR: python3: can't open file '/opt/vulnerable-web-app/http_server.py': [Errno 2] No such file or directory
ERROR: python3: can't open file '/opt/vulnerable-web-app/http_server.py': [Errno 2] No such file or directory
ERROR: 500 Internal Server Error encountered
CRITICAL: Health check failed for container, status: down
CRITICAL: Connection timeout. Unable to connect to database.
CRITICAL: Health check failed for container, status: down
ERROR: Connection refused when attempting to write to syslog
CRITICAL: Health check failed for container, status: down
CRITICAL: Health check failed for container, status: down
ERROR: Connection refused when attempting to write to syslog
ERROR: 500 Internal Server 

In [3]:
# Load required libraries
import torch
import numpy as np
import joblib
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model

# Load the saved model and scaler
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K

# Explicit mean-squared-error loss, passed to load_model via custom_objects
def mse(y_true, y_pred):
    """Returns the mean of the squared element-wise difference between y_true and y_pred."""
    squared_diff = K.square(y_true - y_pred)
    return K.mean(squared_diff)

# Load the model with custom objects
# NOTE(review): the 'mse' entry is presumably needed because the saved file refers to its
# loss by that name — confirm; it maps the name to the mse() function defined above.
autoencoder = load_model("lstm_autoencoder.h5", custom_objects={'mse': mse})

print("✅ Model loaded successfully!")
# NOTE(review): joblib.load unpickles the file — only load scaler.pkl from a trusted source.
scaler = joblib.load("scaler.pkl")
print("✅ Model and scaler loaded!")

# Reload BERT model for embedding extraction
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
# Put the encoder in eval mode before it is used for embedding extraction
bert_model.eval()
print("✅ BERT model loaded!")

# Function to preprocess new logs
def preprocess_logs(logs):
    """
    Cleans each log entry by removing the timestamp and container name prefix.

    The prefix is three whitespace-separated tokens (date, time, container name)
    followed by a " | " separator at the start of the line. Entries without that
    prefix are only stripped of surrounding whitespace.
    Returns a new list of cleaned strings in the same order as the input.
    """
    # Imported here because this cell's import block does not import `re`;
    # without it the function raises NameError on a fresh kernel.
    import re

    # Compiled once instead of being re-parsed for every log line
    prefix_pattern = re.compile(r'^\S+\s+\S+\s+\S+\s+\|\s+')
    return [prefix_pattern.sub('', log).strip() for log in logs]

# Function to convert logs to BERT embeddings
def get_bert_embeddings(logs, model, tokenizer):
    """
    Converts a list of log entries into BERT embeddings.
    Uses mean pooling on the last hidden state to obtain a single vector per log.

    - logs: Iterable of log strings; each one is tokenized and encoded on its own.
    - model: Encoder whose output exposes `last_hidden_state`.
    - tokenizer: Callable returning the keyword inputs for `model`.

    Returns a NumPy array built from the per-log vectors, in input order.
    """
    # Switch to eval mode here rather than relying on the caller having done so,
    # matching the definition used in the training cell.
    model.eval()
    embeddings = []
    for log in logs:
        encoded = tokenizer(log, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            output = model(**encoded)
        # Mean over the token axis gives (1, hidden); index 0 drops only the batch axis
        pooled = output.last_hidden_state.mean(dim=1)
        embeddings.append(pooled[0].numpy())
    return np.array(embeddings)

file_path = "/content/docker_logs.txt"  # Change this to your actual file name
MAX_NEW_LOGS = 10000  # upper bound on the number of lines scored in one run
NEW_DATA_ANOMALY_PERCENTILE = 93  # errors above this percentile of the new batch are flagged

# Read only the first MAX_NEW_LOGS lines (zip stops as soon as the range is exhausted, so
# the remainder of the file is never loaded) and strip surrounding whitespace / newlines.
# NOTE: despite its name, logs_50000 holds at most MAX_NEW_LOGS entries.
with open(file_path, "r") as file:
    logs_50000 = [line.strip() for _, line in zip(range(MAX_NEW_LOGS), file)]

# Fail early with a clear message instead of an obscure shape error further down
if not logs_50000:
    raise ValueError(f"No log lines could be read from {file_path}")

print(f"✅ Loaded {len(logs_50000)} log entries.")



print(f"⏳ Processing {len(logs_50000)} logs...")

# Preprocess logs
cleaned_logs_50000 = preprocess_logs(logs_50000)

# Get BERT embeddings
X_new = get_bert_embeddings(cleaned_logs_50000, bert_model, tokenizer)

# Normalize using the saved scaler
X_new_scaled = scaler.transform(X_new)

# Reshape for LSTM input: (samples, 1 time step, features)
X_new_reshaped = X_new_scaled.reshape((X_new_scaled.shape[0], 1, X_new_scaled.shape[1]))

# Predict with trained model
X_pred_new = autoencoder.predict(X_new_reshaped)

# Compute reconstruction error (mean absolute error per sample)
errors_new = np.mean(np.abs(X_pred_new - X_new_reshaped), axis=(1, 2))

# The threshold is the NEW_DATA_ANOMALY_PERCENTILE-th percentile of THIS batch's errors,
# not the threshold computed at training time, so roughly 7% of the batch is always flagged.
# NOTE(review): reusing the training threshold would require saving it alongside the model.
threshold = np.percentile(errors_new, NEW_DATA_ANOMALY_PERCENTILE)
anomalies_new = [log for log, error in zip(cleaned_logs_50000, errors_new) if error > threshold]

print("\n🚨 Detected Anomalies in New Data:")
for anomaly in anomalies_new[:50]:  # Print first 50 anomalies
    print(anomaly)

print(f"\n✅ Total anomalies detected: {len(anomalies_new)}")




✅ Model loaded successfully!
✅ Model and scaler loaded!
✅ BERT model loaded!
✅ Loaded 10000 log entries.
⏳ Processing 10000 logs...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step

🚨 Detected Anomalies in New Data:
DEBUG: Connection established with backend at 57.134.171.6:38894
DEBUG: Connection established with backend at 57.134.171.6:38894
2025/03/26 19:32:52 # logspout v3.2.14 by gliderlabs
2025/03/26 19:32:52 # logspout v3.2.14 by gliderlabs
DEBUG: Connection established with backend at 57.134.171.6:38894
2025/03/26 19:32:52 # jobs    : http[logs,routes,health]:80 pump routes
2025/03/26 19:32:52 # logspout v3.2.14 by gliderlabs
2025/03/26 19:32:52 # logspout v3.2.14 by gliderlabs
DEBUG: Connection established with backend at 57.134.171.6:38894
ALERT: Potential DDoS attack detected from multiple IPs, e.g., 81.133.20.56:61877
DEBUG: Connection established with backend at 57.134.171.6:38894
2025/03/26 19:32:52 # logspout v3.2.14 by gliderlabs
2025/03/26 19: