In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import re

# File path
file_path = '/Users/vatsal/Desktop/consolidated_trace_data_final2.csv'  # Adjust as needed

# Function to extract features from the 'Path' column
def extract_path_features(path):
    features = {
        'path_length': len(str(path)),
        'special_chars': len(re.findall(r'[^a-zA-Z0-9/]', str(path))),
        'has_sql_keywords': 1 if re.search(r'(select|union|delete|drop|insert|exec|update)', str(path).lower()) else 0,
        'has_path_traversal': 1 if '..' in str(path) or '//' in str(path) else 0,
        'has_suspicious_extensions': 1 if re.search(r'\.(php|asp|aspx|exe|bat|cmd)$', str(path).lower()) else 0
    }
    return pd.Series(features)

# Initialize results storage for anomalies
chunk_results = []

# Read and process the dataset in chunks
chunk_size = 10000
chunks = pd.read_csv(file_path, chunksize=chunk_size)

for chunk in chunks:
    # Fill missing values
    chunk['HTTP Status Code'].fillna(-1, inplace=True)
    chunk['User Agent'].fillna('Unknown', inplace=True)

    # Extract path features
    path_features = chunk['Path'].apply(extract_path_features)
    
    # Add frequency of paths as a feature
    chunk['path_frequency'] = chunk['Path'].map(chunk['Path'].value_counts())
    
    # Combine features
    X = pd.concat([
        path_features,
        pd.get_dummies(chunk['HTTP Status Code'].astype(int), prefix='status'),
        chunk[['path_frequency']]
    ], axis=1)

    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Anomaly Detection: Isolation Forest
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    chunk['is_anomaly_iso'] = iso_forest.fit_predict(X_scaled)

    # Anomaly Detection: DBSCAN
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    chunk['is_anomaly_dbscan'] = dbscan.fit_predict(X_scaled)

    # Combine results from both models
    chunk['is_anomaly_combined'] = (chunk['is_anomaly_iso'] == -1) | (chunk['is_anomaly_dbscan'] == -1)

    # Store anomalies from this chunk
    chunk_anomalies = chunk[chunk['is_anomaly_combined']]
    chunk_results.append(chunk_anomalies)

# Combine anomalies from all chunks
anomalies = pd.concat(chunk_results, ignore_index=True)

# Save anomalies to a CSV file
output_file = "anomalies_detected2.csv"
anomalies.to_csv(output_file, index=False)

# Display summary
print(f"Total number of rows processed: {chunk_size * len(chunk_results)}")
print(f"Total anomalies detected: {len(anomalies)}")
print("\nSample of detected anomalies:")
print(anomalies[['Path', 'HTTP Status Code', 'User Agent']].head())

# Analyze HTTP Status Code distribution in anomalies
print("\nHTTP Status Code distribution in anomalies:")
print(anomalies['HTTP Status Code'].value_counts())

Total number of rows processed: 190000
Total anomalies detected: 9304

Sample of detected anomalies:
                                                Path  HTTP Status Code  \
0                                       /api/healthz             503.0   
1  /api/v1/references/HTTPS:%2F%2FOWDS.ORG%2FAF.p...             200.0   
2  /api/v1/documents/33c6ab11-ed75-46b1-aeb9-58f2...               0.0   
3                                              /owa/             404.0   
4                                       /api/healthz             503.0   

                                          User Agent  
0                                         curl/8.5.0  
1  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  
2  Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; ...  
3                                        curl/7.54.0  
4                                         curl/8.5.0  

HTTP Status Code distribution in anomalies:
HTTP Status Code
 404.0    3000
 201.0    2824
 503.0    1784
 204.0     609
 2