In [30]:
'''
Nikhil Patil
Project 5
Anomaly Detection
iForest Implementation
'''
import ast
import random

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
'''Load and Preprocess Data'''


def _literal_length(cell):
    """Return the item count of a container stored as a Python-literal string.

    Parameters
    ----------
    cell : str or missing value
        One raw cell from the CSV, e.g. ``"[1, 2, 3]"``.

    Returns
    -------
    int
        ``len()`` of the parsed literal, or 0 when the cell is missing.

    Raises
    ------
    ValueError, SyntaxError
        If the cell is not a valid Python literal.
    """
    if pd.notnull(cell):
        # SECURITY: the original code used eval() here, which executes any
        # expression found in the CSV. ast.literal_eval only accepts plain
        # literals (lists, dicts, strings, numbers, ...), so malformed or
        # malicious cells raise instead of running code.
        return len(ast.literal_eval(cell))
    return 0


df = pd.read_csv('datasets/labelled_training_data.csv')

# Collapse the two serialized-container columns into numeric length features
# so they can be fed to the model alongside the other numeric columns.
df['stackAddresses_length'] = df['stackAddresses'].apply(_literal_length)
df['args_length'] = df['args'].apply(_literal_length)

# Columns used as model input.
features = [
    'timestamp', 'processId', 'threadId', 'parentProcessId',
    'userId', 'mountNamespace', 'returnValue', 'stackAddresses_length', 'args_length'
]
X = df[features]

# Standardize the selected columns before fitting the model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fixed seed: the original drew random_state from random.randint(1, 100), so
# every run built a different forest and the reported metrics could not be
# reproduced. Change this constant deliberately to try another seed.
RANDOM_STATE = 42

iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=RANDOM_STATE)
iso_forest.fit(X_scaled)

# Continuous score from the fitted forest (per scikit-learn: lower means more
# abnormal, negative values correspond to predicted outliers).
df['anomaly_score'] = iso_forest.decision_function(X_scaled)

# predict() labels inliers as 1 and outliers as -1; recode to 0 = normal,
# 1 = anomaly so the column is comparable with the dataset's 0/1 label columns.
df['is_anomaly'] = pd.Series(iso_forest.predict(X_scaled), index=df.index).map({1: 0, -1: 1})

# Quick side-by-side look at the provided labels and the model's flag.
print(df[['sus', 'evil', 'is_anomaly']].head())


In [None]:
'''Train and Obtain Results'''
# No fitting happens in this cell: it only scores the flags produced earlier
# (`is_anomaly`) against the dataset's `sus` column, used here as the reference.
y_true, y_pred = df['sus'], df['is_anomaly']

# Compute every metric first ...
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
report = classification_report(
    y_true,
    y_pred,
    target_names=["Normal", "Anomaly"],
)

# ... then report them: headline numbers, confusion matrix, per-class table.
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:\n", cm)
print(report)