In [2]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from scipy.sparse import save_npz, load_npz
import joblib


In [None]:

# Path of the log file that a later cell reads line by line.
log_file = "cleaned_logs.txt"
# Directory that receives every artifact this notebook saves
# (TF-IDF matrix, feature names, vectorizer, Isolation Forest model).
output_folder = "tfidf_output_sklearn"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(output_folder, exist_ok=True)


In [None]:

print("Loading log file...")
# Decode explicitly so the result does not depend on the platform's default
# encoding; undecodable bytes become U+FFFD instead of aborting the whole load.
with open(log_file, "r", encoding="utf-8", errors="replace") as f:
    stripped_lines = (line.strip() for line in f)
    # Keep only lines that still have content after stripping.
    all_lines = [line for line in stripped_lines if line]

# Fail early with a clear message instead of handing the vectorizer an empty corpus.
if not all_lines:
    raise ValueError(f"No non-empty lines found in {log_file!r}; nothing to vectorize.")

# Each document is 5 consecutive log lines joined by single spaces; the last
# document holds fewer lines when the total is not a multiple of 5.
grouped_logs = [' '.join(all_lines[i:i+5]) for i in range(0, len(all_lines), 5)]
print(f"Total grouped logs: {len(grouped_logs)}")

print("Performing TF-IDF on CPU...")
# max_features=5000 caps the vocabulary at the 5000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = vectorizer.fit_transform(grouped_logs)

print(f"TF-IDF vector shape: {tfidf_vectors.shape}")


In [None]:

# Persist the TF-IDF artifacts: the sparse matrix, the vocabulary terms and the
# fitted vectorizer itself.
tfidf_matrix_path = os.path.join(output_folder, "tfidf_vectors_sklearn.npz")
feature_names_path = os.path.join(output_folder, "feature_names_sklearn.npy")
vectorizer_path = os.path.join(output_folder, "tfidf_vectorizer_sklearn.pkl")

save_npz(tfidf_matrix_path, tfidf_vectors)

# get_feature_names_out() returns an array of Python str objects (object dtype).
# Saving that as-is makes np.save pickle the array, and np.load rejects pickled
# arrays unless allow_pickle=True is passed. Casting to a unicode dtype stores
# plain data that a default np.load can read; allow_pickle=False enforces it.
feature_names = np.asarray(vectorizer.get_feature_names_out(), dtype=str)
np.save(feature_names_path, feature_names, allow_pickle=False)

joblib.dump(vectorizer, vectorizer_path)
print("Saved TF-IDF data and vectorizer.")



In [None]:

print("\nTraining Isolation Forest on CPU...")
# Build the detector and fit it in one expression; fit() returns the estimator
# itself, so `iso_forest` ends up bound to the fitted model.
#   n_estimators  - number of trees in the ensemble
#   contamination - assumed share of anomalous documents (5%)
#   random_state  - fixed seed so repeated runs build the same forest
iso_forest = IsolationForest(
    random_state=42,
    contamination=0.05,
    n_estimators=100,
).fit(tfidf_vectors)
print("Isolation Forest training complete.")


In [None]:

# Write the fitted Isolation Forest into the artifact folder so that it can be
# reloaded with joblib.load without retraining.
iso_forest_path = os.path.join(output_folder, "isolation_forest_sklearn.pkl")
joblib.dump(iso_forest, iso_forest_path)
print("Saved Isolation Forest model → isolation_forest_sklearn.pkl")


In [None]:

def test_model(new_logs):
    vectorizer = joblib.load(os.path.join(output_folder, "tfidf_vectorizer_sklearn.pkl"))
    iso_forest = joblib.load(os.path.join(output_folder, "isolation_forest_sklearn.pkl"))

    grouped_new = [' '.join(new_logs[i:i+5]) for i in range(0, len(new_logs), 5)]
    new_tfidf = vectorizer.transform(grouped_new)

    preds = iso_forest.predict(new_tfidf)
    scores = iso_forest.decision_function(new_tfidf)

    for i, (p, s) in enumerate(zip(preds, scores)):
        status = "⚠️ Anomaly" if p == -1 else "Normal"
        print(f"Doc {i+1}: {status} (score={s:.4f})")

# Example test: five lines resembling a Java/Spark exception stack trace.
# Five lines form exactly one 5-line document inside test_model, so a single
# "Doc 1" result is printed.
# NOTE(review): the first entry starts with "eException" — possibly a typo for
# "Exception". Left untouched because it changes the tokens fed to the model; confirm.
sample_logs = [
    "eException in thread java.lang.Error: org.apache.spark.SparkException: Exception while starting container container_1485248649253_0073_02_000009 on host mesos-slave-09",
    "at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1151)",
    "at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)",
    "at java.lang.Thread.run(Thread.java:745)",
    "at java.lang.Thread.run(Thread.java:745)"
]
test_model(sample_logs)


In [None]:

def test_model(new_logs):

    vectorizer = joblib.load(os.path.join(output_folder, "tfidf_vectorizer_sklearn.pkl"))
    iso_forest = joblib.load(os.path.join(output_folder, "isolation_forest_sklearn.pkl"))

    grouped_new = [' '.join(new_logs[i:i+5]) for i in range(0, len(new_logs), 5)]
    new_tfidf = vectorizer.transform(grouped_new)

    preds = iso_forest.predict(new_tfidf)
    scores = iso_forest.decision_function(new_tfidf)

    for i, (p, s) in enumerate(zip(preds, scores)):
        status = "⚠️ Anomaly" if p == -1 else "Normal"
        print(f"Doc {i+1}: {status} (score={s:.4f})")

# Second example: three Spark INFO lines followed by two Python traceback
# "File ..." lines. Exactly five entries are active, so test_model sees one
# 5-line document. The commented-out entries are alternative lines that can be
# swapped in; keep five active entries to stay at a single document.
sample_logs = [
    "INFO ApplicationMaster: Registered signal handlers for [TERM, HUP, INT]",
    "INFO ApplicationMaster: ApplicationAttemptId: appattempt_1485248649253_0073_000002",
    # Alternative entries (disabled): Java/Spark exception lines.
    #"eException in thread java.lang.Error: org.apache.spark.SparkException: Exception while starting container container_1485248649253_0073_02_000009 on host mesos-slave-09",
    #"at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1151)",
    #"at java.lang.Thread.run(Thread.java:745)",
    #"INFO SecurityManager: Changing modify acls to: yarn,curi",
    "INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(yarn, curi); users with modify permissions: Set(yarn, curi)}'",
    "File /opt/bitnami/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py, line 204, in load",
    "File /opt/bitnami/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py, line 1304, in __call__",
    # Alternative entries (disabled): rest of the PySpark traceback and a BlockManager line.
    #"File /opt/bitnami/spark/python/lib/pyspark.zip/pyspark/sql/utils.py, line 117, in deco",
    #"pyspark.sql.utils.AnalysisException: Path does not exist: hdfs://namenode:8020/analytics/recommendation/model_report/22-10-2025",
    #"INFO BlockManagerMasterEndpoint: Registering block manager 10.4.5.58:34635 with 2004.6 MiB RAM, BlockManagerId(1, 10.4.5.58, 34635, None)",
]
test_model(sample_logs)
