In [1]:
import json
from pathlib import Path
from typing import *

import tqdm
from joblib import delayed, Parallel

from qa2nli.qa_readers import race

In [2]:
# Dataset split to process; it is substituted into every path below.
_set = 'test'
# Input: file with the NLI samples for this split (loaded with race.read_nli_data).
unfiltered_nli_dataset_path = Path('.data/RACE/converted_{}.json'.format(_set))
# Input: directory handed to race.get_qa_sample when collecting the QA samples.
qa_data_dir = Path('../qa-labeling/RACE/{}'.format(_set))
# Outputs: JSON dumps of the filtered NLI samples and of the matching QA samples.
nli_output_path = Path('.data/RACE/set1/nli_{}.json'.format(_set))
qa_output_path = Path('.data/RACE/set1/qa_{}.json'.format(_set))

In [3]:
def filter_data(nli_dataset: List[Dict], qa_data_dir_path: Path):
    """Split NLI samples by conversion outcome and collect the matching QA samples.

    Args:
        nli_dataset: Already-loaded NLI samples. Each sample must have an
            ``'id'`` key and be accepted by the ``race`` helper functions.
        qa_data_dir_path: Directory passed to ``race.get_matching_qa_sample``
            to look up the QA sample of every successfully converted NLI sample.

    Returns:
        A dict with the keys ``'nli_successful'``, ``'nli_failed'`` and
        ``'qa_successful'`` (lists, in input order) and ``'nli_successful_ids'``,
        ``'nli_failed_ids'``, ``'qa_successful_ids'`` and ``'qa_failed_ids'`` (sets).
    """
    nli_successful = []
    nli_failed = []
    qa_successful = []
    nli_successful_ids = set()
    nli_failed_ids = set()
    qa_successful_ids = set()
    qa_failed_ids = set()
    # Iterate over the samples that were passed in. The previous version read
    # from an undefined global path and ignored the `nli_dataset` argument.
    for nli_sample in tqdm.tqdm(nli_dataset):
        converted = race.conversion_successful(nli_sample)
        qa_filename = race.get_qa_filename_from_nli_sample(nli_sample)
        if converted:
            nli_successful_ids.add(nli_sample['id'])
            qa_successful_ids.add(qa_filename)
            nli_successful.append(nli_sample)
            # The QA sample is only looked up for successful conversions.
            qa_successful.append(race.get_matching_qa_sample(nli_sample, qa_data_dir_path))
        else:
            nli_failed_ids.add(nli_sample['id'])
            qa_failed_ids.add(qa_filename)
            nli_failed.append(nli_sample)
    return {
        'nli_successful': nli_successful,
        'nli_failed': nli_failed,
        'qa_successful': qa_successful,
        'nli_failed_ids': nli_failed_ids,
        'nli_successful_ids': nli_successful_ids,
        'qa_successful_ids': qa_successful_ids,
        'qa_failed_ids': qa_failed_ids,
    }
            

In [4]:
def get_succesfully_converted_nli_sample_ids(nli_samples: List[Dict], n_jobs=1):
    """Return the ids of the samples accepted by ``race.conversion_successful``.

    The ids are returned as a list in input order. ``n_jobs`` is accepted but
    not used by this function.
    """
    converted_ids = []
    for sample in nli_samples:
        if race.conversion_successful(sample):
            converted_ids.append(sample['id'])
    return converted_ids

def get_questions_with_all_options_converted(successful):
    """Keep only the sample ids whose question has all four options present.

    Ids are expected to look like ``<filename>_<question>_<option>``
    (e.g. ``high123.txt_1_A``). An id is kept only when the ids for options
    A, B, C and D of the same file and question are all in ``successful``.

    Args:
        successful: Iterable of sample-id strings.

    Returns:
        The set of ids from ``successful`` that belong to a question with all
        four options present.

    Raises:
        ValueError: If an id contains fewer than two underscores.
    """
    options = ['A', 'B', 'C', 'D']
    # A set keeps the membership tests O(1) even when a list is passed in.
    successful_ids = set(successful)
    all_options_sucessful = set()
    for sample_id in successful_ids:
        # Split from the right: the filename part may itself contain underscores.
        fname, q, _opt = sample_id.rsplit('_', 2)
        if all('_'.join([fname, q, option]) in successful_ids for option in options):
            all_options_sucessful.add(sample_id)
    return all_options_sucessful


In [5]:
# Load every NLI sample for the configured split, before any filtering.
nli_unfiltered = race.read_nli_data(unfiltered_nli_dataset_path)

In [6]:
# Ids of the samples that race.conversion_successful accepts.
successful = get_succesfully_converted_nli_sample_ids(nli_unfiltered)

In [7]:
# Number of successfully converted samples.
len(successful)

9900

In [8]:
# Size of the unfiltered NLI data, for comparison with the count above.
len(nli_unfiltered)

19736

In [9]:
# Keep only ids whose question has options A-D all among the successful ids.
all_opts_successful = get_questions_with_all_options_converted(set(successful))

In [10]:
# Number of sample ids left after requiring all four options per question.
len(all_opts_successful)

9892

In [11]:
# Strip the trailing "_<option>" suffix (e.g. "high123.txt_1_A" -> "high123.txt_1")
# so that every fully converted question appears exactly once.
questions_with_all_options_converted = {sample_id[:-2] for sample_id in all_opts_successful}

In [12]:
# Save the fully converted question ids, one per line.
questions_list_path = Path('.data/RACE/set1/questions_with_all_options_converted_{}.txt'.format(_set))
# Create the output directory if needed so a fresh checkout can run this cell.
questions_list_path.parent.mkdir(parents=True, exist_ok=True)
with open(questions_list_path, 'w') as f:
    # Iteration order is left as-is; the QA samples cell iterates the same set.
    for q in questions_with_all_options_converted:
        f.write(q)
        f.write('\n')

In [13]:
def get_nli_samples(ids, all_samples):
    """Look up the samples with the given ids.

    Args:
        ids: Iterable of sample ids to fetch.
        all_samples: Iterable of samples, each with an ``'id'`` key. If an id
            occurs more than once, the last sample with that id is used.

    Returns:
        A list with one sample per entry of ``ids``, in the iteration order
        of ``ids``.

    Raises:
        KeyError: If an id is not present in ``all_samples``.
    """
    by_id = {}
    for record in all_samples:
        by_id[record['id']] = record
    return [by_id[sample_id] for sample_id in ids]
    

In [14]:
# Fetch the full sample dict for every id that survived the all-options filter.
nli_samples = get_nli_samples(all_opts_successful, nli_unfiltered)

In [15]:
# Sanity check: one sample per retained id.
len(nli_samples)

9892

In [16]:
# Dump the filtered NLI samples as a single JSON array.
# Create the output directory first so the write does not fail on a fresh checkout.
nli_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(nli_output_path, 'w') as f:
    json.dump(nli_samples, f)

In [17]:
# Fetch the QA sample for each fully converted question from the QA data directory.
qa_samples = [
    race.get_qa_sample(question_id, qa_data_dir)
    for question_id in questions_with_all_options_converted
]

In [18]:
# Dump the matching QA samples as a single JSON array.
# Create the output directory first so the write does not fail on a fresh checkout.
qa_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(qa_output_path, 'w') as f:
    json.dump(qa_samples, f)