In [9]:
import sys
import os

# Add the project root (one level up from 'notebooks') to the Python path
sys.path.append(os.path.abspath(".."))

# Now import your function
from app.utils import parse_log_file
import pandas as pd

# Load the parsed log data
df = parse_log_file('../data/auth.log')
df.head()


#Second Cell
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical features: user, IP, port
df['user'] = LabelEncoder().fit_transform(df['user'])
df['ip'] = LabelEncoder().fit_transform(df['ip'])
df['port'] = df['port'].astype(int)

# Features and label
X = df[['user', 'ip', 'port']]
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = X, X, y, y

#Train Model A - random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Train model B - isolation forest (anomaly detection)
from sklearn.ensemble import IsolationForest

# Isolation Forest works on unlabeled data
iso_model = IsolationForest(contamination=0.3, random_state=42)
iso_model.fit(X_train)

# Predict: -1 = anomaly, 1 = normal
iso_preds = iso_model.predict(X_test)

# Convert to 0/1: 1 (normal) -> 0 (not threat), -1 (anomaly) -> 1 (threat)
iso_preds = [1 if p == -1 else 0 for p in iso_preds]

print(classification_report(y_test, iso_preds))


[[0 0]
 [1 2]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.67      0.80         3

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.33      0.50         3

    accuracy                           0.33         3
   macro avg       0.50      0.17      0.25         3
weighted avg       1.00      0.33      0.50         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
