## Generate Question Embedding Train Set

In [2]:
# Generate a dataset of 1-shot (question, example question) pairs, scored 1 when the example led to a correct answer and 0 otherwise

import glob
import json
import os

def build_train_example(question):
    """Convert one 1-shot result record into an embedding training pair.

    Args:
        question: A result record providing 'question' (the question text),
            'correct' (equal to 1 when the answer was correct) and 'examples'
            (a non-empty list; only the first entry's 'question' is read).

    Returns:
        A dict with 'sentence1' (the question), 'sentence2' (the first
        example's question) and 'score' (1 if correct, otherwise 0).
    """
    return {
        'sentence1': question['question'],
        'sentence2': question['examples'][0]['question'],
        'score': 1 if question['correct'] == 1 else 0,
    }


# Get all 1-shot experiment results file paths. In glob syntax "[1]" is a
# character class matching the single character "1", so this selects files
# whose names end in "results-1.json". The matches are sorted because glob's
# result order depends on the file system, which would otherwise make the
# order of the generated dataset irreproducible.
pattern = os.path.join('chapter-3', 'results', 'spider', '**', '*results-[1].json')
results = sorted(glob.glob(pattern, recursive=True))

# Initialize a list to store the embedding training examples.
train_examples = []

# Turn every question of every 1-shot experiment into a scored pair. Both
# correct (score 1) and incorrect (score 0) outcomes are kept, and nothing is
# de-duplicated across results files.
for file_path in results:
    # Read the file fully, then release the handle before processing it
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    train_examples.extend(build_train_example(question) for question in data['questions'])

output_path = os.path.join('data', 'spider', 'train-sets', 'question-embedding-train-set.json')

# Create the output directory up front so the dump cannot fail with
# FileNotFoundError after all the results have already been processed
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as output_file:
    json.dump(train_examples, output_file, indent=4)