<a href="https://colab.research.google.com/github/michellechen202212/journalpaper/blob/main/journalpaper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Increased dataset size → Now 50,000+ authentication events.
Explicitly includes → Brute Force, Token Hijacking, MFA Bypass, Impossible Travel, and Pass-the-Hash (PtH) attacks to match the abstract.
Balanced anomaly ratio → 75% normal logins, 25% attack scenarios.

# New Section

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set a random seed for reproducibility
np.random.seed(42)

# Define synthetic data parameters
num_samples = 50000  # Increase dataset size to 50,000+ authentication events
users = [f"user_{i}@company.com" for i in range(1000)]  # 1000 unique users
devices = ["Workstation", "Mobile", "VPN"]
locations = ["New York", "San Francisco", "London", "Tokyo", "Berlin"]
auth_methods = ["OAuth2", "SAML", "MFA", "NTLM"]
anomaly_types = ["Brute Force", "Token Hijacking", "MFA Bypass", "Impossible Travel", "Pass-the-Hash", "None"]

# Generate timestamps over 90 days
start_date = datetime.now() - timedelta(days=90)
timestamps = [start_date + timedelta(minutes=random.randint(0, 129600)) for _ in range(num_samples)]

# Create the synthetic dataset
df = pd.DataFrame({
    "timestamp": timestamps,
    "user_principal_name": np.random.choice(users, size=num_samples),
    "device_type": np.random.choice(devices, size=num_samples),
    "location": np.random.choice(locations, size=num_samples),
    "authentication_type": np.random.choice(auth_methods, size=num_samples),
    "anomaly_type": np.random.choice(anomaly_types, size=num_samples, p=[0.05, 0.05, 0.05, 0.05, 0.05, 0.75])  # 75% normal logins
})

# Save the dataset
df.to_csv("sample_data/synthetic_azure_ad_logs.csv", index=False)
print("Updated Synthetic Azure AD authentication dataset generated successfully!")


Updated Synthetic Azure AD authentication dataset generated successfully!


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the synthetic dataset
df = pd.read_csv("sample_data/synthetic_azure_ad_logs.csv")

# Extract time-based features
df["hour"] = pd.to_datetime(df["timestamp"]).dt.hour
df["day_of_week"] = pd.to_datetime(df["timestamp"]).dt.dayofweek

# Encode categorical features
label_encoders = {}
for col in ["user_principal_name", "device_type", "location", "authentication_type"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = MinMaxScaler()
df[["user_principal_name", "device_type", "location", "authentication_type", "hour", "day_of_week"]] = scaler.fit_transform(
    df[["user_principal_name", "device_type", "location", "authentication_type", "hour", "day_of_week"]])

# Clean dataset: Adjust DBSCAN to avoid removing true anomalies
dbscan = DBSCAN(eps=0.3, min_samples=10).fit(df.drop(columns=["label"], errors="ignore"))
df["cluster"] = dbscan.labels_
df = df[df["cluster"] != -1]  # Keep only well-defined clusters (remove excessive filtering)

# Define labels for anomaly detection (1 = anomaly, 0 = normal)
df["label"] = df["anomaly_type"].apply(lambda x: 1 if x != "None" else 0)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["anomaly_type", "timestamp", "label", "cluster"]), df["label"], test_size=0.2, random_state=42)

# Define LSTM Autoencoder Model
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.3)  # Ensure dropout is applied
        self.decoder = nn.LSTM(hidden_dim, input_dim, num_layers=2, batch_first=True)

    def forward(self, x):
        _, (hidden, _) = self.encoder(x)
        decoded, _ = self.decoder(hidden.repeat(x.shape[1], 1, 1))
        return decoded.squeeze()  # Ensure correct output shape

# Train LSTM Autoencoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[1]
model = LSTMAutoencoder(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)

# Training loop with final shape fix
for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_train_tensor.unsqueeze(1)).squeeze()  # Ensure input shape is correct
    loss = criterion(output, X_train_tensor)  # Ensure target and input shapes match
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/10], Loss: {loss.item()}")

# Save the trained LSTM Autoencoder
torch.save(model.state_dict(), "lstm_autoencoder.pth")

# Train Isolation Forest with increased contamination to improve recall
iso_forest = IsolationForest(contamination=0.45, random_state=42)
iso_forest.fit(X_train)
iso_forest_preds = iso_forest.predict(X_test)
iso_forest_preds = np.where(iso_forest_preds == -1, 1, 0)

# Train One-Class SVM with improved `nu` parameter
svm_model = OneClassSVM(kernel="rbf", gamma="scale", nu=0.50)
svm_model.fit(X_train)
svm_preds = svm_model.predict(X_test)
svm_preds = np.where(svm_preds == -1, 1, 0)

# Train Local Outlier Factor (LOF) for additional anomaly detection
lof_model = LocalOutlierFactor(n_neighbors=10, contamination=0.40)
lof_preds = lof_model.fit_predict(X_test)
lof_preds = np.where(lof_preds == -1, 1, 0)

# Save the trained Isolation Forest, One-Class SVM, and LOF models
joblib.dump(iso_forest, "isolation_forest.pkl")
joblib.dump(svm_model, "one_class_svm.pkl")
joblib.dump(lof_model, "local_outlier_factor.pkl")

# Compute evaluation metrics for LSTM Autoencoder
with torch.no_grad():
    reconstructed = model(X_test_tensor.unsqueeze(1)).squeeze()
    reconstruction_errors = torch.mean((X_test_tensor - reconstructed) ** 2, dim=1).cpu().numpy()

# Ensure reconstruction_errors is 1D (length = num_samples)
reconstruction_errors = reconstruction_errors.flatten()

# Define a dynamic threshold for anomaly detection using LSTM Autoencoder (adaptive percentile)
lstm_dynamic_thresholds = np.percentile(reconstruction_errors, np.linspace(50, 95, num=5))
best_threshold = min(lstm_dynamic_thresholds, key=lambda x: abs(x - np.median(reconstruction_errors)))
lstm_preds = (reconstruction_errors > best_threshold).astype(int)

# Compute evaluation metrics
lstm_precision = precision_score(y_test, lstm_preds)
lstm_recall = recall_score(y_test, lstm_preds)
lstm_f1 = f1_score(y_test, lstm_preds)

iso_precision = precision_score(y_test, iso_forest_preds)
iso_recall = recall_score(y_test, iso_forest_preds)
iso_f1 = f1_score(y_test, iso_forest_preds)

svm_precision = precision_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_f1 = f1_score(y_test, svm_preds)

lof_precision = precision_score(y_test, lof_preds)
lof_recall = recall_score(y_test, lof_preds)
lof_f1 = f1_score(y_test, lof_preds)

# Combine all models using weighted voting with lower threshold to improve recall
final_preds = ((0.5 * lstm_preds) + (0.3 * iso_forest_preds) + (0.2 * svm_preds)) > 0.3
final_precision = precision_score(y_test, final_preds)
final_recall = recall_score(y_test, final_preds)
final_f1 = f1_score(y_test, final_preds)

# Print model comparison results
print("Model Comparison Results:")
print(f"LSTM Autoencoder - Precision: {lstm_precision:.4f}, Recall: {lstm_recall:.4f}, F1 Score: {lstm_f1:.4f}")
print(f"Isolation Forest - Precision: {iso_precision:.4f}, Recall: {iso_recall:.4f}, F1 Score: {iso_f1:.4f}")
print(f"One-Class SVM - Precision: {svm_precision:.4f}, Recall: {svm_recall:.4f}, F1 Score: {svm_f1:.4f}")
print(f"Local Outlier Factor - Precision: {lof_precision:.4f}, Recall: {lof_recall:.4f}, F1 Score: {lof_f1:.4f}")
print(f"Hybrid Model (Weighted, Lower Threshold) - Precision: {final_precision:.4f}, Recall: {final_recall:.4f}, F1 Score: {final_f1:.4f}")

print("Final optimized models trained and saved successfully!")


ValueError: could not convert string to float: '2025-02-08 05:37:01.546913'

# train LSTM Autoencoder & Isolation Forest for anomaly detection:

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the synthetic dataset
df = pd.read_csv("sample_data/synthetic_azure_ad_logs.csv")

# Extract time-based features
df["hour"] = pd.to_datetime(df["timestamp"]).dt.hour
df["day_of_week"] = pd.to_datetime(df["timestamp"]).dt.dayofweek

# Encode categorical features
label_encoders = {}
for col in ["user_principal_name", "device_type", "location", "authentication_type"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = MinMaxScaler()
df[["user_principal_name", "device_type", "location", "authentication_type", "hour", "day_of_week"]] = scaler.fit_transform(
    df[["user_principal_name", "device_type", "location", "authentication_type", "hour", "day_of_week"]])


# Ensure 'label' column exists before DBSCAN
if "label" not in df.columns:
    df["label"] = df["anomaly_type"].apply(lambda x: 1 if x != "None" else 0)

# Convert categorical values to numeric
for col in ["user_principal_name", "device_type", "location", "authentication_type"]:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Drop non-numeric columns before DBSCAN
dbscan_features = df.drop(columns=["timestamp", "label", "anomaly_type"], errors="ignore")

# Debugging step: Print available columns before DBSCAN
print("Available columns before DBSCAN:", dbscan_features.columns)

# Apply DBSCAN only on numeric features
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(dbscan_features)
df["cluster"] = dbscan.labels_

# Remove noisy data (DBSCAN assigns noise as -1)
df = df[df["cluster"] != -1]


# Define labels for anomaly detection (1 = anomaly, 0 = normal)
df["label"] = df["anomaly_type"].apply(lambda x: 1 if x != "None" else 0)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["anomaly_type", "timestamp", "label", "cluster"]), df["label"], test_size=0.2, random_state=42)

# Define LSTM Autoencoder Model
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True, dropout=0.3)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (hidden, _) = self.encoder(x)
        decoded, _ = self.decoder(hidden.repeat(x.shape[1], 1, 1))
        return decoded.squeeze()  # Ensure correct output shape

# Train LSTM Autoencoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[1]
model = LSTMAutoencoder(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)

# Training loop with final shape fix
for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_train_tensor.unsqueeze(1)).squeeze()  # Ensure input shape is correct
    loss = criterion(output, X_train_tensor)  # Ensure target and input shapes match
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/10], Loss: {loss.item()}")

# Save the trained LSTM Autoencoder
torch.save(model.state_dict(), "lstm_autoencoder.pth")

# Train Isolation Forest with increased contamination to improve recall
iso_forest = IsolationForest(contamination=0.40, random_state=42)
iso_forest.fit(X_train)
iso_forest_preds = iso_forest.predict(X_test)
iso_forest_preds = np.where(iso_forest_preds == -1, 1, 0)

# Train One-Class SVM with improved `nu` parameter
svm_model = OneClassSVM(kernel="rbf", gamma="scale", nu=0.40)
svm_model.fit(X_train)
svm_preds = svm_model.predict(X_test)
svm_preds = np.where(svm_preds == -1, 1, 0)

# Train Local Outlier Factor (LOF) for additional anomaly detection
lof_model = LocalOutlierFactor(n_neighbors=10, contamination=0.40)
lof_preds = lof_model.fit_predict(X_test)
lof_preds = np.where(lof_preds == -1, 1, 0)

# Save the trained Isolation Forest, One-Class SVM, and LOF models
joblib.dump(iso_forest, "isolation_forest.pkl")
joblib.dump(svm_model, "one_class_svm.pkl")
joblib.dump(lof_model, "local_outlier_factor.pkl")

# Compute evaluation metrics for LSTM Autoencoder
with torch.no_grad():
    reconstructed = model(X_test_tensor.unsqueeze(1)).squeeze()
    reconstruction_errors = torch.mean((X_test_tensor - reconstructed) ** 2, dim=1).cpu().numpy()

# Ensure reconstruction_errors is 1D (length = num_samples)
reconstruction_errors = reconstruction_errors.flatten()

# Define a dynamic threshold for anomaly detection using LSTM Autoencoder (adaptive percentile)
lstm_dynamic_thresholds = np.percentile(reconstruction_errors, np.linspace(50, 95, num=5))
best_threshold = min(lstm_dynamic_thresholds, key=lambda x: abs(x - np.median(reconstruction_errors)))
lstm_preds = (reconstruction_errors > best_threshold).astype(int)

# Compute evaluation metrics
lstm_precision = precision_score(y_test, lstm_preds)
lstm_recall = recall_score(y_test, lstm_preds)
lstm_f1 = f1_score(y_test, lstm_preds)

iso_precision = precision_score(y_test, iso_forest_preds)
iso_recall = recall_score(y_test, iso_forest_preds)
iso_f1 = f1_score(y_test, iso_forest_preds)

svm_precision = precision_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_f1 = f1_score(y_test, svm_preds)

lof_precision = precision_score(y_test, lof_preds)
lof_recall = recall_score(y_test, lof_preds)
lof_f1 = f1_score(y_test, lof_preds)

# Combine all models using weighted voting
final_preds = ((0.5 * lstm_preds) + (0.3 * iso_forest_preds) + (0.2 * svm_preds)) > 0.5
final_precision = precision_score(y_test, final_preds)
final_recall = recall_score(y_test, final_preds)
final_f1 = f1_score(y_test, final_preds)

# Print model comparison results
print("Model Comparison Results:")
print(f"LSTM Autoencoder - Precision: {lstm_precision:.4f}, Recall: {lstm_recall:.4f}, F1 Score: {lstm_f1:.4f}")
print(f"Isolation Forest - Precision: {iso_precision:.4f}, Recall: {iso_recall:.4f}, F1 Score: {iso_f1:.4f}")
print(f"One-Class SVM - Precision: {svm_precision:.4f}, Recall: {svm_recall:.4f}, F1 Score: {svm_f1:.4f}")
print(f"Local Outlier Factor - Precision: {lof_precision:.4f}, Recall: {lof_recall:.4f}, F1 Score: {lof_f1:.4f}")
print(f"Hybrid Model (Weighted) - Precision: {final_precision:.4f}, Recall: {final_recall:.4f}, F1 Score: {final_f1:.4f}")

print("Final optimized models trained and saved successfully!")


Available columns before DBSCAN: Index(['user_principal_name', 'device_type', 'location', 'authentication_type',
       'hour', 'day_of_week'],
      dtype='object')




Epoch [1/10], Loss: 0.3550710380077362
Epoch [2/10], Loss: 0.34819334745407104
Epoch [3/10], Loss: 0.3412421643733978
Epoch [4/10], Loss: 0.3342207968235016
Epoch [5/10], Loss: 0.3271317481994629
Epoch [6/10], Loss: 0.3199799656867981
Epoch [7/10], Loss: 0.3127730190753937
Epoch [8/10], Loss: 0.30552050471305847
Epoch [9/10], Loss: 0.29823482036590576
Epoch [10/10], Loss: 0.29093116521835327
Model Comparison Results:
LSTM Autoencoder - Precision: 1.0000, Recall: 0.5000, F1 Score: 0.6667
Isolation Forest - Precision: 1.0000, Recall: 0.4074, F1 Score: 0.5789
One-Class SVM - Precision: 1.0000, Recall: 0.4066, F1 Score: 0.5781
Local Outlier Factor - Precision: 1.0000, Recall: 0.4000, F1 Score: 0.5714
Hybrid Model (Weighted) - Precision: 1.0000, Recall: 0.2777, F1 Score: 0.4347
Final optimized models trained and saved successfully!


# FAST API

In [None]:
from fastapi import FastAPI, HTTPException
import pandas as pd
import numpy as np
import torch
import joblib
from pydantic import BaseModel
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Initialize FastAPI app
app = FastAPI()

# Load trained models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMAutoencoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = torch.nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = torch.nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (hidden, _) = self.encoder(x)
        decoded, _ = self.decoder(hidden.repeat(x.shape[1], 1, 1))
        return decoded

# Load LSTM Autoencoder
input_dim = 4  # Assuming 4 input features: user, device, location, authentication type
model = LSTMAutoencoder(input_dim).to(device)
model.load_state_dict(torch.load("lstm_autoencoder.pth", map_location=device))
model.eval()

# Load Isolation Forest model
iso_forest = joblib.load("isolation_forest.pkl")

# Define the input model for FastAPI request
class LoginEvent(BaseModel):
    user_principal_name: str
    device_type: str
    location: str
    authentication_type: str

# Predefined encoders for categorical variables (must match training data)
label_encoders = {
    "user_principal_name": LabelEncoder(),
    "device_type": LabelEncoder(),
    "location": LabelEncoder(),
    "authentication_type": LabelEncoder(),
}

# Placeholder categories for encoding (should be updated based on trained model categories)
categories = {
    "user_principal_name": [f"user_{i}@company.com" for i in range(500)],
    "device_type": ["Workstation", "Mobile", "VPN"],
    "location": ["New York", "San Francisco", "London", "Tokyo", "Berlin"],
    "authentication_type": ["OAuth2", "SAML", "MFA", "NTLM"]
}

# Fit encoders with predefined categories
for col, values in categories.items():
    label_encoders[col].fit(values)

# MinMax Scaler (same scaling as training data)
scaler = MinMaxScaler()
scaler.fit(np.array([[0, 0, 0, 0], [len(categories["user_principal_name"]) - 1,
                                    len(categories["device_type"]) - 1,
                                    len(categories["location"]) - 1,
                                    len(categories["authentication_type"]) - 1]]))

@app.post("/detect_anomaly/")
async def detect_anomaly(event: LoginEvent):
    try:
        # Encode categorical values
        encoded_data = np.array([
            label_encoders["user_principal_name"].transform([event.user_principal_name])[0],
            label_encoders["device_type"].transform([event.device_type])[0],
            label_encoders["location"].transform([event.location])[0],
            label_encoders["authentication_type"].transform([event.authentication_type])[0],
        ]).reshape(1, -1)

        # Scale input data
        encoded_data = scaler.transform(encoded_data)

        # Convert to tensor for LSTM Autoencoder
        X_tensor = torch.tensor(encoded_data, dtype=torch.float32).to(device)

        # Compute reconstruction error
        with torch.no_grad():
            reconstructed = model(X_tensor.unsqueeze(1))
            reconstruction_error = torch.mean((X_tensor.unsqueeze(1) - reconstructed) ** 2, dim=[1, 2]).cpu().numpy()

        # Predict anomaly using Isolation Forest
        iso_pred = iso_forest.predict(encoded_data)
        iso_pred = 1 if iso_pred[0] == -1 else 0

        # Define anomaly detection threshold (adjustable)
        threshold = 0.02
        lstm_anomaly_flag = 1 if reconstruction_error[0] > threshold else 0

        # Final anomaly decision (if either model flags an anomaly)
        final_anomaly = 1 if lstm_anomaly_flag or iso_pred else 0

        return {
            "user_principal_name": event.user_principal_name,
            "device_type": event.device_type,
            "location": event.location,
            "authentication_type": event.authentication_type,
            "LSTM_Anomaly_Score": float(reconstruction_error[0]),
            "Isolation_Forest_Anomaly": iso_pred,
            "LSTM_Anomaly_Flag": lstm_anomaly_flag,
            "Final_Anomaly": final_anomaly
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run the API with: uvicorn fastapi_anomaly_detection:app --host 0.0.0.0 --port 8000
