# Validation

In [2]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O validation.json

--2023-09-04 16:34:26--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘validation.json’


2023-09-04 16:34:28 (5.66 MB/s) - ‘validation.json’ saved [4370528/4370528]



In [2]:
# !pip install pandas qdrant-client openai --quiet

In [20]:
import pandas as pd
import json

def json_to_dataframe_with_titles(json_data):
    qas = []
    context = []
    is_impossible = []
    answers = []
    titles = []

    for article in json_data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qas.append(qa['question'].strip())
                context.append(paragraph['context'])
                is_impossible.append(qa['is_impossible'])
                
                ans_list = []
                for ans in qa['answers']:
                    ans_list.append(ans['text'])
                answers.append(ans_list)
                titles.append(title)

    df = pd.DataFrame({'title': titles, 'question': qas, 'context': context, 'is_impossible': is_impossible, 'answers': answers})
    return df

def get_diverse_sample(df, sample_size=100, random_state=42):
    sample_df = df.groupby(['title', 'is_impossible']).apply(lambda x: x.sample(min(len(x), max(1, sample_size // 50)), random_state=random_state)).reset_index(drop=True)
    
    if len(sample_df) < sample_size:
        remaining_sample_size = sample_size - len(sample_df)
        remaining_df = df.drop(sample_df.index).sample(remaining_sample_size, random_state=random_state)
        sample_df = pd.concat([sample_df, remaining_df]).sample(frac=1, random_state=random_state).reset_index(drop=True)

    return sample_df.sample(min(sample_size, len(sample_df)), random_state=random_state).reset_index(drop=True)

validation = json.load(open('validation.json', 'r'))
validation_df = json_to_dataframe_with_titles(validation)
df = get_diverse_sample(validation_df, sample_size=1000, random_state=37)

In [17]:
df.value_counts('is_impossible')

is_impossible
False    53
True     47
Name: count, dtype: int64

In [22]:
df.title.value_counts()

title
Southern_California                          33
Geology                                      33
Jacksonville,_Florida                        32
University_of_Chicago                        32
Imperialism                                  31
1973_oil_crisis                              31
Fresno,_California                           31
Harvard_University                           31
Scottish_Parliament                          31
Sky_(United_Kingdom)                         31
Normans                                      30
Packet_switching                             30
Ctenophora                                   30
Rhine                                        30
Pharmacy                                     30
Intergovernmental_Panel_on_Climate_Change    30
Warsaw                                       29
Steam_engine                                 29
Victoria_(Australia)                         29
Computational_complexity_theory              29
Amazon_rainforest                 

In [21]:
# write to csv
df.to_json('v2_1K_Seed=37_sample.json', orient='records', lines=True)