# MACV Evaluation Analysis

This notebook analyzes the results in `msv_evaluation_results.csv` and generates the plots for the paper. MACV results are stored under the `msv` prefix in that file (for example, the `msv_Full_MSV_*` columns).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

# Read the evaluation results CSV into `df` and preview the first rows.
RESULTS_CSV = '../data/msv_evaluation_results.csv'
df = pd.read_csv(RESULTS_CSV)
df.head()

## 1. Accuracy Comparison

In [None]:
# Legend label -> correctness column that the label's accuracy is computed from.
correct_columns = {
    'Single LLM': 'single_llm_correct',
    'RAG': 'rag_correct',
    'Self-Correction': 'self_correction_correct',
    'MACV': 'msv_Full_MSV_correct',
}

# Accuracy is the mean of each correctness column within each dataset.
by_dataset = df.groupby('dataset')
accuracy_data = {
    label: by_dataset[column].mean()
    for label, column in correct_columns.items()
}

# One row per dataset, one column per approach.
accuracy_df = pd.DataFrame(accuracy_data)

# Grouped bar chart: datasets along the x-axis, one bar per approach.
ax = accuracy_df.plot(kind='bar', figsize=(12, 7))
ax.set_title('Accuracy Comparison Across Models and Datasets')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Dataset')
ax.tick_params(axis='x', labelrotation=0)  # keep dataset names horizontal
ax.grid(axis='y', linestyle='--')
ax.figure.savefig('../images/1_accuracy.png')
plt.show()

## 2. Hallucination Rate
The hallucination rate is defined as `1 - accuracy`, computed per dataset and per approach from the accuracy table above.

In [None]:
# Complement of the accuracy table: 1 - accuracy for every dataset/approach cell.
hallucination_df = accuracy_df.rsub(1)

# Same grouped-bar layout as the accuracy figure.
ax = hallucination_df.plot(kind='bar', figsize=(12, 7))
ax.set_title('Hallucination Rate Comparison')
ax.set_ylabel('Hallucination Rate')
ax.set_xlabel('Dataset')
ax.tick_params(axis='x', labelrotation=0)  # keep dataset names horizontal
ax.grid(axis='y', linestyle='--')
ax.figure.savefig('../images/2_hallucination.png')
plt.show()

## 3. Average Response Time

In [None]:
# Legend label -> timing column for that approach.
time_columns = {
    'Single LLM': 'single_llm_time',
    'RAG': 'rag_time',
    'Self-Correction': 'self_correction_time',
    'MACV': 'msv_Full_MSV_time',
}

# Mean of each timing column over all rows (not split by dataset).
time_data = {label: df[column].mean() for label, column in time_columns.items()}
time_series = pd.Series(time_data)

# One bar per approach, each with its own colour.
bar_colors = ['skyblue', 'lightgreen', 'salmon', 'plum']
ax = time_series.plot(kind='bar', figsize=(10, 6), color=bar_colors)
ax.set_title('Average Response Time (Seconds)')
ax.set_ylabel('Time (s)')
ax.tick_params(axis='x', labelrotation=0)  # keep approach names horizontal
ax.grid(axis='y', linestyle='--')
ax.figure.savefig('../images/3_avg_time.png')
plt.show()

## 4. MACV Confusion Matrix

This shows how well MACV's decision to `ACCEPT` or `REJECT` aligns with whether the response was actually correct.

In [None]:
# Ground truth: 1 for correct, 0 for incorrect.
y_true = df['msv_Full_MSV_correct'].astype(int)
# Decision: 1 when the decision string is exactly 'ACCEPT'; every other value counts as 0 (REJECT).
y_pred = (df['msv_Full_MSV_decision'] == 'ACCEPT').astype(int)

# Pin the label order to [0, 1]. Without `labels`, scikit-learn only uses the
# classes that actually occur, so a run where every row falls in one class would
# yield a 1x1 matrix that no longer lines up with the 2x2 tick labels below.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Rows are actual correctness, columns are the decision; cells hold raw counts.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Reject', 'Accept'], yticklabels=['Incorrect', 'Correct'])
plt.title('MACV Decision Confusion Matrix')
plt.xlabel('Predicted Decision')
plt.ylabel('Actual Correctness')
plt.savefig('../images/4_msv_confusion_matrix.png')
plt.show()

## 5. MACV Rejection Reasons

In [None]:
# Keep only the rows where the decision was REJECT.
rejection_df = df[df['msv_Full_MSV_decision'] == 'REJECT']

# Count how often each reason occurs; value_counts() skips missing reasons
# and returns the most frequent reason first.
reason_counts = rejection_df['msv_Full_MSV_reason'].value_counts()

if reason_counts.empty:
    # Nothing to draw: pandas' bar plotter fails on an empty Series, which
    # would abort a full notebook run. Report the situation instead.
    print('No REJECT decisions with a recorded reason; skipping the rejection-reasons plot.')
else:
    plt.figure(figsize=(10, 6))
    reason_counts.plot(kind='barh')
    plt.title('Reasons for MACV Rejection')
    plt.xlabel('Count')
    # Horizontal bars are drawn bottom-up; invert so the most frequent reason is on top.
    plt.gca().invert_yaxis()
    plt.savefig('../images/5_msv_rejection_reasons.png')
    plt.show()