In [1]:
import json
import re


In [2]:
import nltk
from nltk import pos_tag, word_tokenize
# pos_tag labels each word in a sentence with its part of speech (noun, verb, adjective, etc.), which helps determine the sentence's syntactic structure.
# word_tokenize splits a sentence into individual words.

In [3]:
# Fetch the 'punkt_tab' tokenizer data from NLTK; word_tokenize (imported above) relies on it
# to split text into sentences before splitting those sentences into words.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prisha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Download the English averaged-perceptron POS tagger model that pos_tag relies on.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/prisha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Helper Functions for Context Validation

In [5]:
def is_numeric_heavy(text, threshold=0.8):
    """Return True when the text consists mostly of all-digit tokens.

    Args:
        text: Candidate context string; it is split on whitespace.
        threshold: Fraction of all-digit tokens that must be strictly
            exceeded for the text to count as numeric-heavy. Defaults to 0.8.

    Returns:
        bool: True if the share of tokens for which ``str.isdigit()`` holds is
        greater than ``threshold``. Empty or whitespace-only text returns False.
    """
    tokens = text.split()
    if not tokens:
        # No tokens to measure; without this guard the ratio below divides by zero.
        return False
    # str.isdigit() is True only for tokens made up entirely of digits,
    # so values such as "3.14" or "1,000" are not counted as numeric here.
    numeric_count = sum(1 for token in tokens if token.isdigit())
    return numeric_count / len(tokens) > threshold

def is_too_short(text):
    """Tell whether the context has fewer than 10 whitespace-separated words."""
    word_count = len(text.split())
    # A context must contain at least 10 words to be considered usable.
    return not word_count >= 10

def lacks_verbs(text):
    """Return True when POS tagging finds no verb in the text.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    any tag starting with 'VB' counts as a verb. A context without a single
    verb is treated as invalid by the caller.
    """
    for _word, tag in pos_tag(word_tokenize(text)):
        if tag.startswith('VB'):
            # One verb is enough to say the text does not lack verbs.
            return False
    return True

def is_repetitive(text, min_unique_ratio=0.5):
    """Return True when the text is dominated by repeated words.

    Args:
        text: Candidate context string; it is split on whitespace.
        min_unique_ratio: Minimum share of distinct tokens required. The text
            is repetitive when unique/total is strictly below this value.
            Defaults to 0.5.

    Returns:
        bool: True if fewer than ``min_unique_ratio`` of the tokens are
        distinct. Empty or whitespace-only text returns False.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to compare; without this guard the ratio below divides by zero.
        return False
    return len(set(tokens)) / len(tokens) < min_unique_ratio

def is_valid_context(context):
    """Return True when the context passes every context-quality check.

    A context is rejected if it is too short, numeric-heavy, repetitive, or
    contains no verbs. The outcome for non-empty text is the same whatever
    order the checks run in; the order below is chosen for safety and cost.
    """
    # The length check runs first: it is cheap, and it rejects empty or very short
    # text before the ratio-based checks (which divide by the token count) see it.
    if is_too_short(context):
        return False
    if is_numeric_heavy(context) or is_repetitive(context):
        return False
    # POS tagging is the most expensive check, so it only runs on contexts
    # that survived the cheap ones.
    return not lacks_verbs(context)


## Helper Functions for Question Validation

In [8]:

def is_question_generic(question):
    """Return True for vague questions that refer to "the context" itself.

    The match is case-insensitive: both the question and every phrase are
    lower-cased before comparison. (Comparing the mixed-case phrase
    "What is mentioned" against a lower-cased question could never match.)

    Args:
        question: Question text.

    Returns:
        bool: True if any generic phrase occurs in the question.
    """
    generic_phrases = ["What is mentioned", "from this context", "according to the context", "in the given"]
    lowered_question = question.lower()
    return any(phrase.lower() in lowered_question for phrase in generic_phrases)

def question_mismatch_with_context(question, context):
    """Return True when question and context share fewer than two distinct words.

    Both strings are lower-cased and split into word tokens with a regex; the
    question is considered unrelated to the context unless at least two
    distinct tokens appear in both.
    """
    word_pattern = r'\b\w+\b'
    question_vocab, context_vocab = (
        set(re.findall(word_pattern, text.lower())) for text in (question, context)
    )
    shared_count = len(question_vocab.intersection(context_vocab))
    return not shared_count >= 2

def is_question_incomplete(question):
    """Return True when the question trails off with an ellipsis.

    Trailing whitespace is ignored, and both the three-dot form ('...') and
    the single-character ellipsis (U+2026) are recognised.

    Args:
        question: Question text.

    Returns:
        bool: True if the question, with trailing whitespace removed, ends in
        an ellipsis.
    """
    trimmed = question.rstrip()
    return trimmed.endswith(('...', '\u2026'))

def is_question_only_numbers(question):
    """Return True when the question consists solely of digits and whitespace.

    Args:
        question: Question text.

    Returns:
        bool: True if the whole string matches one or more digit/whitespace
        characters (a whitespace-only string therefore also matches); False
        otherwise, including for the empty string.
    """
    # Return a real bool, not the re.Match/None pair, so every validator
    # predicate has the same return type.
    return re.fullmatch(r'[\d\s]+', question) is not None

def is_valid_question(question, context):
    """Return True when none of the question-rejection checks fires.

    A question is rejected if it is generic, shares too few words with the
    context, is incomplete, or consists only of numbers. The checks run in
    that order and stop at the first one that fires.
    """
    rejected = (
        is_question_generic(question)
        or question_mismatch_with_context(question, context)
        or is_question_incomplete(question)
        or is_question_only_numbers(question)
    )
    return not rejected


In [10]:
def filter_json(input_file, output_file):
    """Filter a QA dataset file, keeping only valid contexts and questions.

    The input JSON is read as ``{'data': [doc, ...]}``, where each doc has a
    'paragraphs' list and each paragraph has a 'context' string and a 'qas'
    list whose items carry a 'question'. A paragraph is kept only if its
    context passes ``is_valid_context`` and at least one of its questions
    passes ``is_valid_question``; a document is kept only if at least one of
    its paragraphs is kept.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write, as ``{'data': [...]}``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_data = []

    for doc in data['data']:
        valid_paragraphs = []
        for paragraph in doc['paragraphs']:
            context = paragraph['context']
            if not is_valid_context(context):
                continue

            valid_qas = [
                qa for qa in paragraph['qas']
                if is_valid_question(qa['question'], context)
            ]

            # Keep the paragraph only if there are valid questions
            if valid_qas:
                paragraph['qas'] = valid_qas
                valid_paragraphs.append(paragraph)

        # Keep the document only if it has valid paragraphs
        if valid_paragraphs:
            doc['paragraphs'] = valid_paragraphs
            filtered_data.append(doc)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'data': filtered_data}, f, indent=2)


In [11]:
def filter_json_2(input_file, output_file_valid, output_file_invalid):
    """Split a QA dataset file into a valid file and a removed ("invalid") file.

    The input JSON is read as ``{'data': [doc, ...]}``, where each doc has a
    'paragraphs' list and each paragraph has a 'context' string and a 'qas'
    list whose items carry a 'question'.

    * Paragraphs whose context fails ``is_valid_context`` go to the invalid
      output unchanged.
    * For paragraphs with a valid context, questions are split with
      ``is_valid_question``: the valid ones stay with the paragraph in the
      valid output, and the rejected ones go to the invalid output on a
      shallow copy of the paragraph.
    * A paragraph with a valid context but no valid question (including one
      with an empty 'qas' list) is recorded in the invalid output, so nothing
      dropped from the valid file disappears silently.

    Args:
        input_file: Path of the JSON file to read.
        output_file_valid: Path for the retained data.
        output_file_invalid: Path for the removed data.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_data = []
    removed_data = []

    for doc in data['data']:
        valid_paragraphs = []
        removed_paragraphs = []
        for paragraph in doc['paragraphs']:
            context = paragraph['context']
            if not is_valid_context(context):
                removed_paragraphs.append(paragraph)
                continue

            valid_qas = []
            invalid_qas = []
            for qa in paragraph['qas']:
                if is_valid_question(qa['question'], context):
                    valid_qas.append(qa)
                else:
                    invalid_qas.append(qa)

            if valid_qas:
                paragraph['qas'] = valid_qas
                valid_paragraphs.append(paragraph)

            # Record the paragraph as removed when it has rejected questions, or
            # when it has no valid question at all (e.g. an empty 'qas' list),
            # since in that case it is absent from the valid output.
            if invalid_qas or not valid_qas:
                # Shallow copy so that setting 'qas' here does not overwrite the
                # valid questions stored on the original paragraph dict.
                invalid_paragraph = paragraph.copy()
                invalid_paragraph['qas'] = invalid_qas
                removed_paragraphs.append(invalid_paragraph)

        if valid_paragraphs:
            doc['paragraphs'] = valid_paragraphs
            filtered_data.append(doc)

        if removed_paragraphs:
            # Shallow copy so the removed paragraphs do not replace the valid
            # ones stored on the original doc dict.
            doc_copy = doc.copy()
            doc_copy['paragraphs'] = removed_paragraphs
            removed_data.append(doc_copy)

    # Save valid data
    with open(output_file_valid, 'w', encoding='utf-8') as f_valid:
        json.dump({'data': filtered_data}, f_valid, indent=2)

    # Save invalid data
    with open(output_file_invalid, 'w', encoding='utf-8') as f_invalid:
        json.dump({'data': removed_data}, f_invalid, indent=2)

    print(f"Valid data saved to {output_file_valid}")
    print(f"Invalid data saved to {output_file_invalid}")



In [12]:
# Split dev.json into a file of retained data and a file of removed data.
input_file, output_file_valid, output_file_invalid = (
    'dev.json',
    'filtered_output.json',
    'filtered_output_invalid.json',
)
filter_json_2(input_file, output_file_valid, output_file_invalid)

Valid data saved to filtered_output.json
Invalid data saved to filtered_output_invalid.json


In [None]:
# Run the single-output filter on dev.json and report completion.
input_file, output_file = 'dev.json', 'filtered_output.json'
filter_json(input_file, output_file)
print("Filtering completed. Valid data saved to filtered_output.json.")