In [24]:
import random as random
import re

from datasets import load_dataset, load_dataset_builder

In [None]:
# Load the "train" split of microsoft/ms_marco; the second argument ('v2.1')
# selects which configuration of the dataset is loaded.
ds = load_dataset("microsoft/ms_marco", 'v2.1', split="train")


In [26]:
# Seed the global RNG so the same row indices are drawn on every run.
random.seed(42)
size = len(ds)
print(f"Total size of dataset: {size}")
# random.sample raises ValueError when asked for more items than the population
# holds, so never request more rows than the dataset actually contains.
sample_size = min(20000, size)
# Draw distinct row indices without replacement, then materialise that subset.
sample_indices = random.sample(range(size), sample_size)
ds_sampled = ds.select(sample_indices)
print(f"Sampled dataset size: {len(ds_sampled)}")

Total size of dataset: 808731
Sampled dataset size: 20000


In [None]:
print(ds_sampled.column_names)
print("\n")
# Peek at the first ten sampled rows: the query, its raw answers and its
# well-formed answers, one field per line, followed by blank lines.
for i in range(10):
    row = ds_sampled[i]  # look the row up once instead of once per field
    for field in ('query', 'answers', 'wellFormedAnswers'):
        print(row[field])
    print('\n')

In [None]:
# Merge answers with wellFormedAnswers, and drop other columns.
def simplify(example):
    """Reduce a dataset row to its query and a single answer string.

    The first entry of ``wellFormedAnswers`` is preferred. When that field is
    empty (or None), the first entry of ``answers`` is used instead. When both
    are empty the answer is the empty string, so a row without any answer no
    longer aborts the whole ``map`` with an IndexError.

    Args:
        example: Mapping with the keys 'query', 'answers' and
            'wellFormedAnswers'.

    Returns:
        A dict with exactly two keys: 'query' (copied from the input) and
        'answer' (the selected answer, or '' when none is available).
    """
    # `or` skips empty/None candidates; the final [''] is the no-answer fallback.
    candidates = example['wellFormedAnswers'] or example['answers'] or ['']
    return {
        'query': example['query'],
        'answer': candidates[0]
    }
# Apply `simplify` to every sampled row and discard all of the original columns,
# leaving only the fields that `simplify` returns.
ds_simplified = ds_sampled.map(
    simplify,
    remove_columns=ds_sampled.column_names,
)
print(ds_simplified.column_names)

# Show the first ten simplified rows, each followed by blank lines.
for i in range(10):
    row = ds_simplified[i]
    print(row)
    print('\n')


In [None]:
# A sentence boundary is whitespace that directly follows '.', '!' or '?'.
# Requiring the whitespace keeps decimals such as "3.14" in one piece.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def has_multiple_sentences(example):
    """Return True when the row's answer contains more than one sentence.

    The answer is split wherever terminal punctuation ('.', '!' or '?') is
    followed by whitespace, and the non-empty pieces are counted. Unlike a raw
    count of '.' characters, this does not treat the dot inside a number as a
    sentence end, and it recognises sentences that end in '?' or '!'.
    Abbreviations followed by a space (e.g. "Dr. Smith") are still counted as
    a boundary; this is a heuristic, not a full sentence tokenizer.

    Args:
        example: Mapping with an 'answer' key holding the answer text.

    Returns:
        bool: True if two or more sentence-like segments are found.
    """
    answer = example['answer'] or ''  # tolerate a missing (None) answer
    segments = [part for part in _SENTENCE_BOUNDARY.split(answer.strip()) if part]
    return len(segments) > 1

# Keep only the rows whose answer passes `has_multiple_sentences`.
ds_filtered = ds_simplified.filter(has_multiple_sentences)
print(f"Dataset size before filtering: {len(ds_simplified)}")
print(f"Dataset size after filtering: {len(ds_filtered)}")

# The number of surviving rows depends on the data, so cap the preview at the
# rows that exist; indexing past the end would raise IndexError.
for i in range(min(10, len(ds_filtered))):
    print(ds_filtered[i])
    print('\n')

In [None]:
def is_complete_question(example):
    """Return True when the query reads like a fully phrased question.

    The query is lower-cased and stripped of surrounding whitespace, then must
    satisfy both conditions:

    * it contains at least four whitespace-separated words, and
    * it begins with one of the recognised question openers immediately
      followed by a space.

    Args:
        example: Mapping with a 'query' key holding the query text.

    Returns:
        bool: True if both conditions hold, otherwise False.
    """
    normalized = example['query'].lower().strip()

    # Guard clause: too few words to count as a complete question.
    if len(normalized.split()) < 4:
        return False

    openers = (
        'what', 'when', 'where', 'who', 'whom', 'whose', 'which',
        'why', 'how', 'is', 'are', 'was', 'were', 'do', 'does',
        'did', 'can', 'could', 'will', 'would', 'should', 'may', 'might',
    )
    # str.startswith accepts a tuple of prefixes; the appended space makes sure
    # the opener is a whole word ("what " matches, "whatever " does not).
    return normalized.startswith(tuple(opener + ' ' for opener in openers))

# Keep only the rows whose query passes `is_complete_question`.
ds_filtered_2 = ds_filtered.filter(is_complete_question)
print(f"Dataset size before filtering: {len(ds_filtered)}")
print(f"Dataset size after filtering: {len(ds_filtered_2)}")

# The number of surviving rows depends on the data, so cap the preview at the
# rows that exist; indexing past the end would raise IndexError.
for i in range(min(10, len(ds_filtered_2))):
    print(ds_filtered_2[i])
    print('\n')