# CATastrophe Model Evaluation

Simple notebook to evaluate the model accuracy on test datasets.

In [1]:
# Import libraries
import json
import torch
import numpy as np
from huggingface_hub import hf_hub_download
import pickle
import sys
import os
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Add src to path
sys.path.insert(0, 'src')
from catastrophe.model.autoencoder import Autoencoder

In [2]:
# Load test data
with open('tests/test_data/safe_c_commits.json', 'r') as f:
    safe_commits = json.load(f)
    
with open('tests/test_data/vulnerable_c_commits.json', 'r') as f:
    vulnerable_commits = json.load(f)

print(f"Loaded {len(safe_commits)} safe commits")
print(f"Loaded {len(vulnerable_commits)} vulnerable commits")

Loaded 10 safe commits
Loaded 10 vulnerable commits


In [3]:
# Download model from Hugging Face
model_repo = "ewhk9887/CATastrophe"
print("Downloading model...")

model_path = hf_hub_download(repo_id=model_repo, filename="catastrophe_model.pth")
vectorizer_path = hf_hub_download(repo_id=model_repo, filename="vectorizer.pkl")

print("Model downloaded!")

Downloading model...
Model downloaded!


In [4]:
# Load vectorizer and model
with open(vectorizer_path, 'rb') as f:
    vectorizer = pickle.load(f)

feature_dim = vectorizer.get_feature_names_out().shape[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Autoencoder(feature_dim)
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)
model.eval()

print(f"Model loaded! Feature dimension: {feature_dim}, Device: {device}")

Model loaded! Feature dimension: 2000, Device: cpu


In [5]:
# Define prediction function
def predict_vulnerability(message, func, threshold=0.5):
    text = f"{message} {func}"
    features = vectorizer.transform([text]).toarray()
    features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        reconstruction = model(features_tensor)
    
    mse = torch.nn.functional.mse_loss(reconstruction, features_tensor, reduction='none')
    reconstruction_error = mse.mean(dim=1).item()
    
    return reconstruction_error > threshold, reconstruction_error

In [6]:
# Evaluate model
y_true = []
y_pred = []
y_scores = []

# Test safe commits (label = 0)
for commit in safe_commits:
    is_vuln, score = predict_vulnerability(commit['message'], commit['func'])
    y_true.append(0)
    y_pred.append(1 if is_vuln else 0)
    y_scores.append(score)

# Test vulnerable commits (label = 1)
for commit in vulnerable_commits:
    is_vuln, score = predict_vulnerability(commit['message'], commit['func'])
    y_true.append(1)
    y_pred.append(1 if is_vuln else 0)
    y_scores.append(score)

print("Evaluation complete!")

Evaluation complete!


In [7]:
# Display results
accuracy = accuracy_score(y_true, y_pred)
print(f"\n🎯 Model Accuracy: {accuracy:.1%}\n")

print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['Safe', 'Vulnerable']))

# ROC AUC
roc_auc = roc_auc_score(y_true, y_scores)
print(f"\n📊 ROC AUC Score: {roc_auc:.3f}")


🎯 Model Accuracy: 50.0%

Classification Report:
              precision    recall  f1-score   support

        Safe       0.50      1.00      0.67        10
  Vulnerable       0.00      0.00      0.00        10

    accuracy                           0.50        20
   macro avg       0.25      0.50      0.33        20
weighted avg       0.25      0.50      0.33        20


📊 ROC AUC Score: 0.780


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Show example predictions
print("\n🔍 Example Predictions:\n")
print("Vulnerable Code Examples:")
for i in range(min(3, len(vulnerable_commits))):
    commit = vulnerable_commits[i]
    is_vuln, score = predict_vulnerability(commit['message'], commit['func'])
    print(f"\n{i+1}. {commit['message']}")
    print(f"   Score: {score:.4f}")
    print(f"   Predicted: {'❌ Vulnerable' if is_vuln else '✅ Safe'}")
    print(f"   Actual: ❌ Vulnerable")


🔍 Example Predictions:

Vulnerable Code Examples:

1. fix buffer overflow in string copy
   Score: 0.0005
   Predicted: ✅ Safe
   Actual: ❌ Vulnerable

2. add user authentication
   Score: 0.0005
   Predicted: ✅ Safe
   Actual: ❌ Vulnerable

3. implement sql query function
   Score: 0.0005
   Predicted: ✅ Safe
   Actual: ❌ Vulnerable


In [9]:
# Summary
print("\n📋 Summary:")
print(f"- Model tested on {len(y_true)} code samples")
print(f"- Accuracy: {accuracy:.1%}")
print(f"- ROC AUC: {roc_auc:.3f}")
print(f"\nNote: The model shows {accuracy:.0%} accuracy, which indicates it needs improvement.")
print("The model appears to be classifying all samples as 'Safe', missing all vulnerable code.")


📋 Summary:
- Model tested on 20 code samples
- Accuracy: 50.0%
- ROC AUC: 0.780

Note: The model shows 50% accuracy, which indicates it needs improvement.
The model appears to be classifying all samples as 'Safe', missing all vulnerable code.
