# Claude

In [None]:
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import nltk
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Download necessary NLTK packages (run once).
# Each resource is fetched only when it is not already installed locally.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the word_tokenize models from 'tokenizers/punkt_tab'; on older releases that
# do not know the package, nltk.download() just reports the failure.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load a more powerful SBERT model
# (the weights are downloaded on first use; these objects are module-level
# globals shared by the scoring functions defined below)
sbert_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')  # Larger, more accurate model
# ROUGE scorer used for the ROUGE-L component in evaluate_answer
rouge = Rouge()
# Smoothing function passed to sentence_bleu in evaluate_answer
smoothie = SmoothingFunction().method4
# English stop words, kept in a set for fast membership tests in extract_keywords
stop_words = set(stopwords.words('english'))

# Sample answer key (with Bloom's level)
# Each entry holds:
#   id       - joins the question to the student answer with the same id
#   question - question text (used for Bloom's level classification)
#   answer   - reference answer the student answer is compared against
#   bloom    - expected Bloom's taxonomy level for the question
#   keywords - domain terms checked for keyword coverage
answer_key = {
    "questions": [
        {
            "id": 1,
            "question": "Explain the architecture of ARM processor.",
            "answer": "The ARM architecture includes components like the processor core, interrupt controller, memory controller, and peripheral controllers connected through AHB and APB buses.",
            "bloom": "Understand",
            "keywords": ["processor core", "interrupt controller", "memory controller", "peripheral", "AHB", "APB", "buses"]
        },
        {
            "id": 2,
            "question": "Design a simple block diagram for embedded systems.",
            "answer": "The design includes components such as CPU, memory, I/O devices, timers, and buses. Communication occurs through system buses like AHB and APB.",
            "bloom": "Apply",
            "keywords": ["CPU", "memory", "I/O", "timers", "buses", "AHB", "APB"]
        },
        {
            "id": 3,
            "question": "Analyze how AHB and APB improve embedded system performance.",
            "answer": "AHB enables high-speed communication between major components, while APB connects low-speed peripherals. This separation improves efficiency and reduces bottlenecks.",
            "bloom": "Analyze",
            "keywords": ["AHB", "APB", "high-speed", "low-speed", "efficiency", "bottlenecks", "separation"]
        }
    ]
}

# Sample student answers
# Each entry's "id" refers to the question with the same id in answer_key;
# "answer" is the free-text response that gets scored.
student_answers = {
    "answers": [
        {
            "id": 1,
            "answer": "ARM includes the processor core, memory controller, interrupt controller and uses AHB and APB buses to connect peripherals."
        },
        {
            "id": 2,
            "answer": "A diagram has CPU, memory, timers, I/O devices all connected using buses like AHB and APB."
        },
        {
            "id": 3,
            "answer": "AHB connects fast components and APB connects slow devices, so performance is better."
        }
    ]
}

# Extract domain-specific keywords from text
def extract_keywords(text, domain_keywords=None):
    """Return the keywords found in ``text``.

    Args:
        text: The text to scan.
        domain_keywords: Optional list of domain terms. When non-empty, only
            these terms are looked for, using a case-insensitive substring
            match, and the matching terms are returned in their original
            spelling and order.

    Returns:
        A list of matching domain keywords when ``domain_keywords`` is given
        and non-empty; otherwise the lower-cased alphanumeric tokens of
        ``text`` that are not English stop words.
    """
    lowered = text.lower()

    # Prioritize domain keywords if provided. This path needs no tokenization,
    # so the tokenizer is only invoked for the generic fallback below.
    if domain_keywords:
        return [kw for kw in domain_keywords if kw.lower() in lowered]

    return [
        token
        for token in word_tokenize(lowered)
        if token.isalnum() and token not in stop_words
    ]

# More sophisticated Bloom's taxonomy classifier
def classify_bloom(question, answer):
    """Estimate the Bloom's taxonomy level shown by a question/answer pair.

    Both texts are lower-cased and matched against keyword patterns, one
    pattern per level, checked from the lowest level upwards; the first
    pattern that matches decides the level for that text.

    Args:
        question: The question text.
        answer: The answer text to classify.

    Returns:
        One of "Remember", "Understand", "Apply", "Analyze", "Evaluate",
        "Create". When the question matches a pattern, the higher of the
        question level and the answer level is returned; otherwise the answer
        level alone (which defaults to "Understand").
    """
    bloom_hierarchy = ("Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create")

    # Question verbs, lowest level first. NOTE(review): because the first
    # match wins, 'design' in the "Create" pattern can never decide the level
    # (the "Apply" pattern claims it first).
    question_rules = (
        ("Remember", r'\b(define|list|name|identify|recall|state|who|what|when|where)\b'),
        ("Understand", r'\b(explain|describe|compare|contrast|summarize|interpret|paraphrase)\b'),
        ("Apply", r'\b(apply|use|demonstrate|illustrate|solve|implement|design)\b'),
        ("Analyze", r'\b(analyze|differentiate|organize|attribute|distinguish|examine)\b'),
        ("Evaluate", r'\b(evaluate|assess|critique|judge|justify|recommend)\b'),
        ("Create", r'\b(create|design|construct|plan|produce|develop|formulate)\b'),
    )

    # Answer phrasing, lowest level first. "Remember" is handled separately
    # below because it also depends on the answer length.
    answer_rules = (
        ("Understand", r'\b(because|since|as|consists of|includes)\b'),
        ("Apply", r'\b(can be used to|applied|implemented|designed|built)\b'),
        ("Analyze", r'\b(compared to|differs from|analysis|relationship|impact of|effect of)\b'),
        ("Evaluate", r'\b(better|worse|more effective|less efficient|advantages|disadvantages|pros|cons)\b'),
        ("Create", r'\b(new|novel|innovative|created|designed|developed|proposed)\b'),
    )

    question = question.lower()
    answer = answer.lower()

    # Analyze question indicators (None when no verb pattern matches)
    question_level = next(
        (level for level, pattern in question_rules if re.search(pattern, question)),
        None,
    )

    # Analyze answer patterns: a short definitional sentence counts as recall;
    # anything else falls through the remaining rules, defaulting to "Understand".
    if re.search(r'\b(is|are|was|were|means)\b', answer) and len(answer.split()) < 20:
        answer_level = "Remember"
    else:
        answer_level = next(
            (level for level, pattern in answer_rules if re.search(pattern, answer)),
            "Understand",
        )

    # The answer level is always set, so only the question level can be missing.
    if question_level is None:
        return answer_level

    # Return the highest bloom level demonstrated
    return bloom_hierarchy[
        max(bloom_hierarchy.index(question_level), bloom_hierarchy.index(answer_level))
    ]

# Calculate keyword coverage score
def keyword_coverage_score(student_text, reference_keywords):
    """Return the fraction of reference keywords that appear in the student text.

    Matching is a case-insensitive substring test, one per keyword.

    Args:
        student_text: The student's answer.
        reference_keywords: Keywords expected in the answer.

    Returns:
        A value between 0 and 1; 1.0 when no keywords are defined.
    """
    # Nothing to look for counts as full coverage.
    if not reference_keywords:
        return 1.0

    haystack = student_text.lower()
    hits = sum(1 for kw in reference_keywords if kw.lower() in haystack)
    return hits / len(reference_keywords)

# Enhanced evaluation function
def evaluate_answer(gt_question, gt_answer, stu_answer, bloom_gt, keywords=None):
    """Score a student answer against the reference answer.

    Combines SBERT cosine similarity, BLEU, ROUGE-L, keyword coverage and a
    Bloom's-level adjustment into one weighted score, clamps it to [0, 1] and
    applies a power curve.

    Args:
        gt_question: The question text (used for Bloom's classification).
        gt_answer: The reference answer.
        stu_answer: The student's answer.
        bloom_gt: Expected Bloom's level for the question.
        keywords: Optional domain keywords for the coverage component. When
            empty or None, the coverage component is fixed at 0.5.

    Returns:
        A dict with the rounded component scores ("semantic_score", "bleu",
        "rouge_l", "keyword_coverage"), the Bloom's levels
        ("bloom_classified", "bloom_expected"), "bloom_penalty", the clamped
        weighted score ("raw_score") and the curved score ("final_score").
    """
    # 1. Semantic similarity using better model
    emb_gt = sbert_model.encode(gt_answer, convert_to_tensor=True)
    emb_stu = sbert_model.encode(stu_answer, convert_to_tensor=True)
    sem_score = util.cos_sim(emb_gt, emb_stu).item()

    # 2. BLEU score with smoothing (whitespace tokens, case-sensitive)
    reference = [gt_answer.split()]
    candidate = stu_answer.split()
    bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)

    # 3. ROUGE-L score. Scoring failures (presumably e.g. an empty answer --
    # confirm against the rouge package) fall back to 0.0. Only Exception is
    # caught so that KeyboardInterrupt / SystemExit still propagate.
    try:
        rouge_score = rouge.get_scores(stu_answer, gt_answer)[0]['rouge-l']['f']
    except Exception:
        rouge_score = 0.0

    # 4. Keyword coverage (domain-specific)
    # NOTE(review): the 0.5 fallback differs from keyword_coverage_score,
    # which returns 1.0 when no keywords are defined.
    kw_coverage = keyword_coverage_score(stu_answer, keywords) if keywords else 0.5

    # 5. Bloom's taxonomy classification
    classified = classify_bloom(gt_question, stu_answer)

    # Calculate Bloom's penalty with a sliding scale; unknown levels are
    # treated as "Understand" (index 1).
    bloom_hierarchy = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
    expected_idx = bloom_hierarchy.index(bloom_gt) if bloom_gt in bloom_hierarchy else 1
    actual_idx = bloom_hierarchy.index(classified) if classified in bloom_hierarchy else 1

    # Sliding scale penalty - higher penalty for significant discrepancies
    bloom_diff = expected_idx - actual_idx
    if bloom_diff == 0:  # Perfect match
        penalty = 0.0
    elif bloom_diff > 0:  # Student answered at a lower cognitive level than expected
        penalty = min(0.05 * bloom_diff, 0.15)
    else:  # Student exceeded expectations (minor bonus)
        penalty = -0.02  # Negative penalty, i.e. a small bonus once subtracted

    # Combine scores with adjusted weights
    # NOTE(review): alpha + beta + gamma = 0.9, so with component scores of at
    # most 1 the raw score tops out at 0.902 even for an identical answer.
    alpha = 0.4  # Semantic similarity weight
    beta = 0.2   # BLEU + ROUGE weight
    gamma = 0.3  # Keyword coverage weight
    delta = 0.1  # Bloom penalty/bonus weight

    # Mix traditional metrics
    traditional_score = (bleu_score + rouge_score) / 2

    # Final weighted score
    final_score = (alpha * sem_score) + (beta * traditional_score) + (gamma * kw_coverage) - (delta * penalty)

    # Clamp the score to the 0-1 range
    final_score = min(max(final_score, 0.0), 1.0)

    # Apply a gentle curve that lifts mid-range scores (e.g., 0.6 becomes ~0.66)
    curved_score = pow(final_score, 0.8)

    return {
        "semantic_score": round(sem_score, 4),
        "bleu": round(bleu_score, 4),
        "rouge_l": round(rouge_score, 4),
        "keyword_coverage": round(kw_coverage, 4),
        "bloom_classified": classified,
        "bloom_expected": bloom_gt,
        "bloom_penalty": round(penalty, 4),
        "raw_score": round(final_score, 4),
        "final_score": round(curved_score, 4)
    }

# Evaluate all answers with improved system
results = []
for gt_q in answer_key["questions"]:
    # Pick the first student answer whose id matches this question, if any.
    stu_ans = next(
        (entry for entry in student_answers["answers"] if entry["id"] == gt_q["id"]),
        None,
    )
    if not stu_ans:
        # No answer submitted for this question: it gets no result entry.
        continue

    keywords = gt_q.get("keywords", [])
    evaluation = evaluate_answer(
        gt_q["question"],
        gt_q["answer"],
        stu_ans["answer"],
        gt_q["bloom"],
        keywords,
    )

    # Convert score to percentage for easier interpretation
    percentage_score = round(evaluation["final_score"] * 100, 1)

    results.append({
        "question_id": gt_q["id"],
        "question": gt_q["question"],
        "model_answer": gt_q["answer"],
        "student_answer": stu_ans["answer"],
        "evaluation": evaluation,
        "percentage_score": percentage_score,
    })

# Print results in a readable format: one block per question, separated by a rule
for result in results:
    print(f"Question {result['question_id']}: {result['question']}")
    print(f"Model answer: {result['model_answer']}")
    print(f"Student answer: {result['student_answer']}")
    print(f"Score: {result['percentage_score']}%")
    print(
        f"Bloom's level: Expected '{result['evaluation']['bloom_expected']}', "
        f"Classified as '{result['evaluation']['bloom_classified']}'"
    )
    print("-" * 80)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Question 1: Explain the architecture of ARM processor.
Model answer: The ARM architecture includes components like the processor core, interrupt controller, memory controller, and peripheral controllers connected through AHB and APB buses.
Student answer: ARM includes the processor core, memory controller, interrupt controller and uses AHB and APB buses to connect peripherals.
Score: 79.0%
Bloom's level: Expected 'Understand', Classified as 'Understand'
--------------------------------------------------------------------------------
Question 2: Design a simple block diagram for embedded systems.
Model answer: The design includes components such as CPU, memory, I/O devices, timers, and buses. Communication occurs through system buses like AHB and APB.
Student answer: A diagram has CPU, memory, timers, I/O devices all connected using buses like AHB and APB.
Score: 74.4%
Bloom's level: Expected 'Apply', Classified as 'Apply'
----------------------------------------------------------------

In [None]:
# Display the structured results (the bare expression is shown as the cell's output)
results

[{'question_id': 1,
  'question': 'Explain the architecture of ARM processor.',
  'model_answer': 'The ARM architecture includes components like the processor core, interrupt controller, memory controller, and peripheral controllers connected through AHB and APB buses.',
  'student_answer': 'ARM includes the processor core, memory controller, interrupt controller and uses AHB and APB buses to connect peripherals.',
  'evaluation': {'semantic_score': 0.9343,
   'bleu': 0.118,
   'rouge_l': 0.5946,
   'keyword_coverage': 1.0,
   'bloom_classified': 'Understand',
   'bloom_expected': 'Understand',
   'bloom_penalty': 0.0,
   'raw_score': 0.745,
   'final_score': 0.7902},
  'percentage_score': 79.0},
 {'question_id': 2,
  'question': 'Design a simple block diagram for embedded systems.',
  'model_answer': 'The design includes components such as CPU, memory, I/O devices, timers, and buses. Communication occurs through system buses like AHB and APB.',
  'student_answer': 'A diagram has CPU, 