In [2]:
import os
import sys

import pandas as pd

# Make the project root (one level up from 'notebooks') importable.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path again and again.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Project-local import: must come after the sys.path tweak above.
from app.utils import parse_log_file

# Load the parsed log data. The relative path is resolved against the current
# working directory (assumed to be the 'notebooks' folder -- TODO confirm).
df = parse_log_file('../data/auth.log')
df.head()

Unnamed: 0,timestamp,status,user,ip,port,label
0,Jul 23 21:00:01,Accepted,devops,192.168.1.10,2244,0
1,Jul 23 21:00:05,Failed,hacker,45.23.56.12,5577,1
2,Jul 23 21:00:10,Failed,root,45.23.56.12,5580,1
3,Jul 23 21:00:12,Failed,root,45.23.56.12,5582,1
4,Jul 23 21:00:14,Failed,test,45.23.56.12,5585,1


In [3]:
# Feature preparation and train/test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns (user, ip) as integer codes.
# The fitted encoders are kept in variables instead of being discarded, so the
# same string -> code mapping can be re-applied with .transform() or reversed
# with .inverse_transform() later (e.g. when scoring new log lines).
# NOTE: df['user'] / df['ip'] are overwritten in place; re-running this cell
# without reloading df refits the encoders on the integer codes rather than on
# the original strings.
user_encoder = LabelEncoder()
ip_encoder = LabelEncoder()
df['user'] = user_encoder.fit_transform(df['user'])
df['ip'] = ip_encoder.fit_transform(df['ip'])

# port is not label-encoded; it is only cast to an integer dtype
df['port'] = df['port'].astype(int)

# Features and label
X = df[['user', 'ip', 'port']]
y = df['label']

# Train/test split: 30% held out, fixed seed for reproducibility.
# NOTE(review): the split is not stratified; if the label is heavily
# imbalanced, consider passing stratify=y so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Model A: random forest classifier (supervised)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build the forest and fit it on the training split in one expression
# (fit() returns the estimator itself).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = clf.predict(X_test)

# Report: confusion matrix first, then per-class precision / recall / F1
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[ 1  0]
 [ 0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00        14

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



In [5]:
# Model B: isolation forest (unsupervised anomaly detection)
from sklearn.ensemble import IsolationForest

# The forest is fitted on the features only; y_train is not used here.
# NOTE(review): contamination=0.3 presumes roughly 30% of the samples are
# outliers -- confirm this matches the actual share of threats in the data.
iso_model = IsolationForest(contamination=0.3, random_state=42).fit(X_train)

# predict() yields -1 for anomalies and 1 for normal points; remap that onto
# the label convention of y_test: anomaly -> 1 (threat), normal -> 0 (not threat).
iso_preds = [int(p == -1) for p in iso_model.predict(X_test)]

print(classification_report(y_test, iso_preds))


              precision    recall  f1-score   support

           0       0.10      1.00      0.18         1
           1       1.00      0.36      0.53        14

    accuracy                           0.40        15
   macro avg       0.55      0.68      0.35        15
weighted avg       0.94      0.40      0.50        15



In [6]:
import os

import joblib

# Persist the trained random forest (`clf`) to disk.
# The target directory is created first: joblib.dump does not create missing
# parent directories and would otherwise fail when '../models' is absent.
MODEL_PATH = '../models/best_model.pkl'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# NOTE: only the classifier is written here; the integer encodings applied to
# `user` and `ip` before training are not stored in this file.
joblib.dump(clf, MODEL_PATH)


['../models/best_model.pkl']