# 🧠 ML Notebook: Anomaly Detection from Security Logs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Load the sample Azure and AWS logs from their JSON files.
azure_logs = pd.read_json("../logs/azure_sample.json")
aws_logs = pd.read_json("../logs/aws_sample.json")


def _aws_account(identity):
    """Return the ``userName`` of an AWS ``userIdentity`` entry, or "unknown".

    The placeholder "unknown" is used both when the entry is not a dict and
    when the dict has no (or a null) ``userName``, so every missing account
    ends up in a single category instead of a mix of ``None`` and "unknown".
    """
    if isinstance(identity, dict):
        user_name = identity.get("userName")
        if user_name is not None:
            return user_name
    return "unknown"


# Flatten the AWS log into the flat columns used for the merge below
# (account name, event name, source IP, timestamp, platform tag).
aws_logs["Account"] = aws_logs["userIdentity"].apply(_aws_account)
aws_logs["EventName"] = aws_logs["eventName"]
aws_logs["SourceIP"] = aws_logs["sourceIPAddress"]
aws_logs["Time"] = pd.to_datetime(aws_logs["eventTime"])
aws_logs["Platform"] = "AWS"

# Tag the Azure rows and parse their timestamps.
# NOTE(review): the two sources may differ in timezone awareness; confirm
# before comparing "Time" values across platforms.
azure_logs["Platform"] = "Azure"
azure_logs["Time"] = pd.to_datetime(azure_logs["Time"])

# Project both sources onto one shared set of columns so they can be stacked.
shared_columns = ["Account", "EventName", "SourceIP", "Time", "Platform"]
aws_df = aws_logs[shared_columns]
# The Azure frame carries its event identifier under "EventID"; rename it
# to the shared "EventName" before selecting the common columns.
azure_renamed = azure_logs.rename(columns={"EventID": "EventName"})
azure_df = azure_renamed[shared_columns]

# Stack Azure rows first, then AWS rows, under a fresh 0..n-1 index.
df = pd.concat([azure_df, aws_df], ignore_index=True)
# Represent every event identifier as a string.
df["EventName"] = df["EventName"].astype(str)

# Give each categorical column its own encoder and an integer-coded twin
# column; the encoders stay available by name for later inverse lookups.
le_account, le_event, le_ip = LabelEncoder(), LabelEncoder(), LabelEncoder()
encoding_plan = (
    (le_account, "Account", "Account_enc"),
    (le_event, "EventName", "Event_enc"),
    (le_ip, "SourceIP", "IP_enc"),
)
for encoder, raw_col, encoded_col in encoding_plan:
    df[encoded_col] = encoder.fit_transform(df[raw_col])

# Model input: only the three integer-coded columns.
features = df[["Account_enc", "Event_enc", "IP_enc"]]

# Isolation Forest for anomaly detection: fit on the encoded features, then
# label every row (1: normal, -1: anomaly). The seed is fixed for
# reproducible results.
clf = IsolationForest(random_state=42, contamination=0.1)
clf.fit(features)
df["Anomaly"] = clf.predict(features)

# Visualize each row's prediction against its position in the combined log.
plt.figure(figsize=(8, 4))
plt.title("Anomaly Detection (1: Normal, -1: Anomaly)")
plt.scatter(df.index, df["Anomaly"], c=df["Anomaly"])
plt.xlabel("Log Index")
# The plotted values are the discrete 1 / -1 labels stored in df["Anomaly"],
# not a continuous score, so the axis is labelled accordingly.
plt.ylabel("Anomaly Label")
plt.show()

# Show only the rows flagged as anomalous (rendered as the cell's output).
df[df["Anomaly"] == -1]
