# Multi-File MCQ Dataset Evaluation

This notebook evaluates **all MCQ CSV files** in the `results/without_image` directory on 5 key metrics:
1. **Coverage**: Semantic match to source PDF.
2. **Diversity**: Uniqueness of questions.
3. **Topic Depth**: Fraction of distinct source topics/pages covered.
4. **Correctness**: Accuracy check (Requires LLM).
5. **Quality**: Question quality rating (Requires LLM).

In [36]:
import pandas as pd
import numpy as np
import glob
import os
import sys
import importlib

# Add current directory to path
sys.path.append(os.getcwd())

import evaluation_metrics
importlib.reload(evaluation_metrics)

from utils.pdf_extractor import extract_text_from_pdf
from evaluation_metrics import calculate_coverage_score, calculate_diversity_score, calculate_topic_depth


In [37]:
# 1. Load Source Text (Assuming one common source for now)
pdf_path = 'notes/LLM_cs124_week7_2025.pdf'
pages_text = extract_text_from_pdf(pdf_path)
print(f"Source PDF Extracted: {len(pages_text)} pages/chunks.")

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 62 0 (offset 0)
Ignoring wrong 

Source PDF Extracted: 114 pages/chunks.


In [38]:
# 2. Find All MCQ Datasets
all_csv_files = glob.glob('results/without_image/*.csv')
datasets = [f for f in all_csv_files if 'evaluation_summary.csv' not in f]
print(f"Found {len(datasets)} datasets to evaluate: {datasets}")

Found 5 datasets to evaluate: ['results/without_image/mistral_mcq_dataset.csv', 'results/without_image/falcon_mcq_dataset.csv', 'results/without_image/phi_mcq_dataset.csv', 'results/without_image/qween_mcq_dataset.csv', 'results/without_image/llama_mcq_dataset.csv']


## 3. Run Evaluation Loop

In [39]:
evaluation_results = []

for csv_file in datasets:
    print(f"Evaluating {csv_file}...")
    try:
        df = pd.read_csv(csv_file)
        # Check for column variations (case insensitive or different names)
        quest_col = None
        for col in df.columns:
            if 'question' in col.lower():
                quest_col = col
                break
        
        if not quest_col:
             print(f"Skipping {csv_file}: 'question' column not found.")
             continue

        questions = df[quest_col].astype(str).tolist()
        
        # 1. Coverage Score
        cov_score, _ = calculate_coverage_score(questions, pages_text)
        
        # 2. Diversity Score
        div_score = calculate_diversity_score(questions)
        
        # 3. Topic Depth/Count
        topic_frac, topic_count = calculate_topic_depth(questions, pages_text)
        
        # 4 & 5. Correctness and Quality (Placeholder for LLM)
        correctness_score = "Requires LLM"
        quality_score = "Requires LLM"
        
        result = {
            'Filename': os.path.basename(csv_file),
            'Num_Questions': len(questions),
            'Coverage_Score': round(cov_score * 100, 2),
            'Diversity_Score': round(div_score * 100, 2),
            'Topic_Depth (Fraction)': round(topic_frac * 100, 2),
            'Topic_Covered_Count': topic_count,
            'Correctness_Score': correctness_score,
            'Quality_Score': quality_score
        }
        evaluation_results.append(result)
        
    except Exception as e:
        print(f"Error evaluating {csv_file}: {e}")

# Create Summary DataFrame
summary_df = pd.DataFrame(evaluation_results)
summary_df

Evaluating results/without_image/mistral_mcq_dataset.csv...
Evaluating results/without_image/falcon_mcq_dataset.csv...
Evaluating results/without_image/phi_mcq_dataset.csv...
Evaluating results/without_image/qween_mcq_dataset.csv...
Evaluating results/without_image/llama_mcq_dataset.csv...


Unnamed: 0,Filename,Num_Questions,Coverage_Score,Diversity_Score,Topic_Depth (Fraction),Topic_Covered_Count,Correctness_Score,Quality_Score
0,mistral_mcq_dataset.csv,211,63.0,76.88,96.49,110,Requires LLM,Requires LLM
1,falcon_mcq_dataset.csv,10,8.28,41.91,0.0,0,Requires LLM,Requires LLM
2,phi_mcq_dataset.csv,199,55.29,86.84,94.74,108,Requires LLM,Requires LLM
3,qween_mcq_dataset.csv,176,52.18,85.59,85.09,97,Requires LLM,Requires LLM
4,llama_mcq_dataset.csv,262,58.73,87.84,96.49,110,Requires LLM,Requires LLM


In [40]:
# 4. Save to CSV
summary_df.to_csv('evaluation_summary.csv', index=False)
print("Saved 5-metric evaluation summary to evaluation_summary.csv")

Saved 5-metric evaluation summary to evaluation_summary.csv
