# Rule-based Baseline

First, set up the imports.

In [1]:
import json
import csv
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from functools import lru_cache
from tqdm import tqdm

Download necessary NLTK data

In [2]:
# Ensure necessary NLTK data is downloaded.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load inside
# word_tokenize / sent_tokenize; 'punkt' is kept so older releases keep working.
# nltk.download() is a no-op for packages that are already up to date.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Load Data

In [3]:
def load_data(filepath):
    """Load JSON data from a file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.
    """
    # Read as UTF-8 explicitly: without it, open() uses the platform's locale
    # encoding (e.g. cp1252 on Windows), which mis-decodes or rejects the
    # non-ASCII characters found in UTF-8 encoded JSON.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

WordNet Semantic Similarity function setup

In [4]:
@lru_cache(maxsize=10000)
def get_synsets(word):
    """Look up the WordNet synsets of ``word``, memoising the result.

    Repeated lookups of the same word are served from the LRU cache instead
    of querying WordNet again.
    """
    synsets = wn.synsets(word)
    return synsets

@lru_cache(maxsize=50000)
def word_similarity(word1, word2):
    """Return the best WordNet path similarity over all synset pairs of two words.

    The result is cached per (word1, word2) pair. If either word has no
    synsets the score is 0; synset pairs for which WordNet reports no
    similarity (``None``) also count as 0.
    """
    first_senses = get_synsets(word1)
    second_senses = get_synsets(word2)
    if not (first_senses and second_senses):
        return 0

    best = None
    for sense_a in first_senses:
        for sense_b in second_senses:
            # path_similarity may return None when no path exists; treat as 0
            score = wn.path_similarity(sense_a, sense_b) or 0
            if best is None or score > best:
                best = score
    return best

def sentence_similarity(sentence1, sentence2):
    """Score the semantic similarity of two sentences with WordNet.

    Both sentences are lower-cased and tokenized; every cross pair of tokens
    is scored with ``word_similarity``. The result is the mean of the strictly
    positive pair scores, or 0 when no pair scores above zero.
    """
    tokens_a = word_tokenize(sentence1.lower())
    tokens_b = word_tokenize(sentence2.lower())

    running_total = 0
    matched_pairs = 0
    for token_a in tokens_a:
        for token_b in tokens_b:
            pair_score = word_similarity(token_a, token_b)
            if not pair_score > 0:
                # Pairs with no measurable similarity are left out of the mean
                continue
            running_total += pair_score
            matched_pairs += 1

    if not matched_pairs:
        return 0
    return running_total / matched_pairs

Evaluate Baseline



**1. Baseline 1: Direct Word Overlap**

This method tries to determine whether the question is answerable based on the direct presence of its words in the context.

This baseline is straightforward: it assumes that if a question's vocabulary substantially overlaps with the context, then the context likely contains the information needed to answer the question.

**2. Sentence-Level Word Overlap**

This method considers the distribution of question words across individual sentences in the context.

**3. WordNet Semantic Similarity**

This method checks the answerability of a question based on the semantic similarity between the words in the question and those in the context, using the relationships defined in WordNet.


In [5]:
def _overlap_ratio(question_words, vocabulary):
    """Return the fraction of question tokens found in ``vocabulary`` (0 for an empty question)."""
    if not question_words:
        return 0
    hits = sum(1 for word in question_words if word in vocabulary)
    return hits / len(question_words)


def evaluate_baseline(test_data, output_csv, overlap_threshold=0.5, similarity_threshold=0.5):
    """Evaluate three rule-based answerability baselines and write the results to CSV.

    For every question the following predictions are made (1 = answerable,
    0 = not answerable):

    * Baseline 1 - share of question tokens present anywhere in the context.
    * Baseline 2 - best share of question tokens present in a single context sentence.
    * Baseline 3 - WordNet ``sentence_similarity`` between question and context.

    A prediction counts as correct when it is 1 for a question whose
    ``is_impossible`` flag is false, or 0 for one whose flag is true.

    Args:
        test_data: Parsed dataset; read as
            ``test_data['data'][i]['paragraphs'][j]`` with keys ``'context'``
            and ``'qas'``, each qa having ``'id'``, ``'question'`` and
            ``'is_impossible'``.
        output_csv: Path of the CSV file to (over)write. One row per question
            is written, followed by a final ``'Correctness %'`` summary row.
        overlap_threshold: Minimum token-overlap ratio for Baselines 1 and 2
            to predict "answerable".
        similarity_threshold: Minimum similarity score for Baseline 3 to
            predict "answerable".

    Returns:
        None. Results are written to ``output_csv``.
    """
    correct_counts = {'Baseline 1': 0, 'Baseline 2': 0, 'Baseline 3': 0}
    total_questions = 0

    # Pair each question with the paragraph it belongs to, so it is scored
    # against its own context rather than the article's first paragraph.
    questions = [
        (qa, paragraph)
        for article in test_data['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]

    with open(output_csv, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'Baseline 1 Category', 'Baseline 2 Category', 'Baseline 3 Category', 'Is Impossible'])

        # Tokenization of the current paragraph, reused by all of its questions
        tokenized_paragraph = None
        context = ''
        context_vocab = set()
        sentence_vocabs = []

        # Process each question with a progress bar
        for qa, paragraph in tqdm(questions, desc="Evaluating questions"):
            if paragraph is not tokenized_paragraph:
                # Questions of one paragraph are consecutive, so the context
                # only has to be tokenized when the paragraph changes.
                context = paragraph['context'].lower()
                context_vocab = set(word_tokenize(context))
                sentence_vocabs = [
                    set(word_tokenize(sentence))
                    for sentence in nltk.sent_tokenize(context)
                ]
                tokenized_paragraph = paragraph

            question_id = qa['id']
            question = qa['question'].lower()
            is_impossible = qa['is_impossible']
            question_words = word_tokenize(question)

            # Baseline 1: Direct Word Overlap
            category_b1 = 1 if _overlap_ratio(question_words, context_vocab) >= overlap_threshold else 0

            # Baseline 2: Sentence-Level Word Overlap (best single sentence)
            max_overlap = max(
                (_overlap_ratio(question_words, vocab) for vocab in sentence_vocabs),
                default=0,
            )
            category_b2 = 1 if max_overlap >= overlap_threshold else 0

            # Baseline 3: WordNet Semantic Similarity
            semantic_similarity = sentence_similarity(question, context)
            category_b3 = 1 if semantic_similarity >= similarity_threshold else 0

            # Write all baseline results and the is_impossible status in a single row for each question
            writer.writerow([question_id, category_b1, category_b2, category_b3, is_impossible])

            # Update correctness counters
            answerable = not is_impossible
            for name, category in (
                ('Baseline 1', category_b1),
                ('Baseline 2', category_b2),
                ('Baseline 3', category_b3),
            ):
                if (category == 1) == answerable:
                    correct_counts[name] += 1

            total_questions += 1

        # Calculate and write correctness percentages (0.0 when there were no questions)
        correctness = {
            key: (value / total_questions) * 100 if total_questions else 0.0
            for key, value in correct_counts.items()
        }
        writer.writerow(['Correctness %', correctness['Baseline 1'], correctness['Baseline 2'], correctness['Baseline 3'], ''])

The main program runs on the dev set of SQuAD 2.0 (https://rajpurkar.github.io/SQuAD-explorer/).

In [6]:
# Read the dataset from its location relative to this notebook
dev_set_path = '../MISC/Dataset/dev-v2.0.json'
test_json = load_data(dev_set_path)

# Run the baselines and write per-question results plus the accuracy row to CSV
results_csv = 'combined_baseline_results.csv'
evaluate_baseline(test_json, results_csv)

Evaluating questions: 100%|██████████| 11873/11873 [41:05<00:00,  4.82it/s] 
