In [None]:
import os
import pickle
import random
import json
import mlflow
import scipy.sparse as sp
from joblib import load
from sklearn.datasets import fetch_rcv1
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Directory layout (relative to the notebook's working directory):
# dataset pickle cache, serialized model artifacts, and evaluation metrics output.
DATA_DIR, MODEL_DIR, METRICS_DIR = "data", "models", "metrics"


In [None]:
# Load the RCV1 "train" subset, preferring the local pickle cache over a download.
data_cache = os.path.join(DATA_DIR, "data.pickle")
target_cache = os.path.join(DATA_DIR, "target.pickle")

# Require BOTH cache files: a run interrupted between the two dumps would leave
# data.pickle without target.pickle, and the cached branch would then crash.
if os.path.exists(data_cache) and os.path.exists(target_cache):
    print(" Loading cached RCV1 dataset...")
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # cache files that this notebook wrote itself.
    with open(data_cache, "rb") as cache_file:
        X = pickle.load(cache_file)
    with open(target_cache, "rb") as cache_file:
        y = pickle.load(cache_file)
else:
    print("⬇ Fetching RCV1 dataset (first time, may take a few mins)...")
    rcv1 = fetch_rcv1(subset="train")
    X = rcv1.data
    y = rcv1.target
    os.makedirs(DATA_DIR, exist_ok=True)
    # Context managers guarantee the pickles are flushed and closed before any
    # later run (or another process) tries to read them back.
    with open(data_cache, "wb") as cache_file:
        pickle.dump(X, cache_file)
    with open(target_cache, "wb") as cache_file:
        pickle.dump(y, cache_file)

# Densify a sparse label matrix so a single column can be sliced out below.
if sp.issparse(y):
    y = y.toarray()
# NOTE(review): the label column is drawn at random with no seed, so every run
# may evaluate a different target and the reported metrics are not reproducible.
# Presumably this should be the same column the model was trained on — confirm
# against the training code and pin it (or seed `random`) accordingly.
y = y[:, random.randint(0, y.shape[1] - 1)]
print(f" Dataset ready: X={X.shape}, y={y.shape}")


In [None]:
# Pick the most recently modified .joblib artifact in MODEL_DIR and load it.
model_files = [name for name in os.listdir(MODEL_DIR) if name.endswith(".joblib")]
# Newest first, ordered by file modification time.
model_files.sort(
    key=lambda name: os.path.getmtime(os.path.join(MODEL_DIR, name)),
    reverse=True,
)
if len(model_files) == 0:
    raise FileNotFoundError("❌ No model found in 'models/' directory!")

model_path = os.path.join(MODEL_DIR, model_files[0])
print(f"🧠 Loading model: {model_path}")
model = load(model_path)


In [None]:

# Hold out 20% of the samples (fixed seed, shuffled) and score the loaded model on them.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
y_pred = model.predict(test_X)

# Both metrics are rounded to three decimals before being reported.
acc = round(accuracy_score(y_true=test_y, y_pred=y_pred), ndigits=3)
f1 = round(f1_score(y_true=test_y, y_pred=y_pred), ndigits=3)

print(
    "✅ Model Evaluation Complete",
    f"Accuracy: {acc}",
    f"F1 Score: {f1}",
    sep="\n",
)


In [None]:
# Render the confusion matrix of the held-out predictions as an annotated heatmap.
cm = confusion_matrix(test_y, y_pred)
fig, ax = plt.subplots(figsize=(5, 4))
# Integer cell annotations ('d') on a blue colour scale, drawn on the explicit axes.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f"Confusion Matrix\nAcc={acc}, F1={f1}")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Persist the rounded accuracy and F1 values as a JSON file under METRICS_DIR.
os.makedirs(METRICS_DIR, exist_ok=True)
metrics_path = os.path.join(METRICS_DIR, "test_metrics.json")
metrics_payload = {"accuracy": acc, "f1_score": f1}
with open(metrics_path, "w") as metrics_file:
    json.dump(metrics_payload, metrics_file, indent=4)
print(f"📊 Metrics saved to {metrics_path}")
