#### Json Processors for questions in some specific Benchmarks

In [1]:
import json

In [2]:
class ARC_Challenge_Json_Processor:
    '''
    Json Processor for questions in the ARC Challenge Benchmark
    '''
    def __init__(self, question_path, input_files):
        '''
        Constructor
        @param question_path string: the path to the files in the input_files list
        @param List input_files: a list of files to process
        '''
        self.question_path = question_path
        self.input_files = input_files

    def read_data(self):
        '''
        read all the questions from all the json files into one big list of dictionaries
        The data in input_file is one dictionary per line
        @return List: Each entry in the list is a dictionary containing one question from the benchmark
        '''
        questions = []
        for input_file in self.input_files:
            #print("JsonProcessor.read_json(): reading", self.question_path + input_file)
            with open(self.question_path + input_file, 'r', encoding='utf-8') as file:
                for line in file:
                    #print("line:", line)
                    line = json.loads(line)  # convert string to dictionary
                    #print("Json_Processor.read_json(): type(line):", type(line))
                    question = dict()
                    tmp = line["question"]
                    # tmp is a dictionary with keys "stem" and "choices"
                    #print(tmp.keys())
                    question["input"] = tmp["stem"]
                    targets = tmp["choices"]
                    # targets is a list of dictionaries, We want the value from the key "text" in each dictionary
                    question["target"] = ""
                    for target in targets:
                        question["target"] += target["text"] + "\n"

                    #rint('question["input"]:', question["input"])
                    #rint('question["target"]:', question["target"])
                    questions.append(question)
        return questions

    def process_data(self, data):
        """
        Doesn't do anything, yet.
        """
        processed_data = data
        return processed_data

    def write_json(self, processed_data):
        """
        Doesn't do anything, yet.
        """
        with open(self.output_file, 'w') as file:
            json.dump(processed_data, file, indent=2)

    def process_json_files(self):
        """
        Doesn't do anything, yet.
        """
        data = self.read_json()
        processed_data = self.process_data(data)
        self.write_json(processed_data)

SyntaxError: EOL while scanning string literal (2635433795.py, line 36)

In [2]:
class Big_Bench_Json_Processor:
    '''
    Json Processor for questions in the Big Bench Hard Benchmark
    '''
    def __init__(self, question_path, input_files):
        '''
        Constructor
        @param question_path string: the path to the files in the input_files list
        @param List input_files: a list of files to process
        '''
        self.question_path = question_path
        self.input_files = input_files

    def read_data(self):
        '''
        read all the questions from all the json files into one big list of dictionaries
        The data in input_file is one dictionary per line
        @return List: Each entry in the list is a dictionary containing one question from the benchmark
        '''
        questions = []
        for input_file in self.input_files:
            #print("JsonProcessor.read_json(): reading", self.question_path + input_file)
            with open(self.question_path + input_file, 'r', encoding='utf-8') as file:
                for line in file:
                    #print("line:", line)
                    line = json.loads(line)  # convert string to dictionary
                    #print("Json_Processor.read_json(): type(line):", type(line))
                    text = line["examples"]
                    for q in text:
                        question = dict()
                        question["input"] = q["input"]
                        question["target"] = q["target"]
                        question["topic"] = input_file   # Label the question so we know which topic (actually, file) it came from
                        questions.append(question)
        return questions

    def process_data(self, data):
        """
        Doesn't do anything, yet.
        """
        processed_data = data
        return processed_data

    def write_json(self, processed_data):
        """
        Doesn't do anything, yet.
        """
        with open(self.output_file, 'w') as file:
            json.dump(processed_data, file, indent=2)

    def process_json_files(self):
        """
        Doesn't do anything, yet.
        """
        data = self.read_json()
        processed_data = self.process_data(data)
        self.write_json(processed_data)

In [None]:
class HellaSwag_Json_Processor:
    '''
    Json Processor for questions in the Hella Swag Benchmark
    '''
    def __init__(self, question_path, input_files):
        '''
        Constructor
        @param question_path string: the path to the files in the input_files list
        @param List input_files: a list of files to process
        '''
        self.question_path = question_path
        self.input_files = input_files

    def read_data(self):
        '''
        read all the questions from all the json files into one big list of dictionaries
        The data in input_file is one dictionary per line
        @return: A list of dictionaries. Each dictionary is a question from the benchmark
        '''
        questions = []
        for input_file in self.input_files:
            print("HellaSwag JsonProcessor.read_json(): reading", self.question_path + input_file)
            with open(self.question_path + input_file, 'r', encoding='utf-8') as file:
                for line in file:
                    #print("line:", line)
                    line = json.loads(line)  # convert string to dictionary
                    #print("Json_Processor.read_json(): type(line):", type(line))
                    question = dict()
                    #*************************************
                    #ToDo map the questions/answers into our dictionary format
                    ending_prefix = line["ctx_b"]
                    target = ""
                    question["input"] = line["ctx_a"]
                    question["topic"] = line["activity_label"]   # Label the question so we know which topic it came from
                    for ending in line["endings"]:
                        target += ending_prefix + " " + ending + "\n"
                    question["target"] = target
                    #*************************************
                    questions.append(question)
        return questions

    def process_data(self, data):
        """
        Doesn't do anything, yet.
        """
        processed_data = data
        return processed_data

    def write_json(self, processed_data):
        """
        Doesn't do anything, yet.
        """
        with open(self.output_file, 'w') as file:
            json.dump(processed_data, file, indent=2)

    def process_json_files(self):
        """
        Doesn't do anything, yet.
        """
        data = self.read_json()
        processed_data = self.process_data(data)
        self.write_json(processed_data)