In [1]:
import sys
import os

# Make the helper modules importable: they live in the `tools` directory that
# sits next to the current working directory (i.e. under its parent).
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
module_path = os.path.join(parent_dir, 'tools')
# Register the directory only once, so re-running this cell adds no duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

# Generate single-run prompts

Each `jsonl` file stores all prompts generated for one question, together with their corresponding parameters.

The naming convention for these `jsonl` files is: `{paper_idx}_{question_idx}.jsonl`.

- `paper_idx`: there are two kinds of paper: 1 (全国卷1) and 2 (全国卷2);
- `question_idx`: the question index in that paper.
  
Within the `jsonl` file, the data structure is:

```json
{
    "prompt_uuid": ..., # uuid to identify the query
    "prompt":
    {
        "system": "the_system_prompt",
        "user": "question_type_and_question"
    },
    "prompt_parameter":
    {
        "lang": "en/zh",
        "sequential": "answer-first/process-first", # whether to generate the answer first or the process first
        "cot": True/False, # whether to state "use CoT and think step by step"
        "print_question": True/False # whether to print the original question
    },
    "question":
    {
        "year": "2024",
        "paper": "1/2", # may be extended to provincial papers in the future
        "idx": "1_to_14", # covers the single-choice, multiple-choice, and fill-in-the-blank questions
        "type": "1/2/3", # question type: 1 := multiple choice (single answer); 2 := multiple choice (multiple answers); 3 := fill in the blank
        "type_detail": ..., # the description of question type
        "body": ..., # question body
        "answer": ..., 
        "value": ...,
    }
}
```

In [2]:
# Run configuration: selects which exam paper the prompts are generated for.

year = '2024'  # kept as a string; it is copied verbatim into each record
paper_type = '1'  # paper variant: '1' or '2'
# Combined identifier (e.g. '2024-1'), used as the sub-directory name under
# both the `paper` input directory and the `data` output directory.
paper = '-'.join((year, paper_type))

In [None]:
import uuid

import parse_data

# step 1: load the system prompts from the notebook's working directory
sys_prompt_fpath = os.path.join(cwd, 'system_prompt.jsonl')
sys_prompts = parse_data.read_jsonl(sys_prompt_fpath)
# Peek at the first entry. NOTE(review): with IPython's default settings only
# the last expression of a cell is rendered, so this is not displayed here.
sys_prompts[0]

# step 2: load the questions of the selected paper from
# `<parent_dir>/paper/<paper>/question.jsonl`
paper_dir = os.path.join(parent_dir, 'paper', paper)
ques_fpath = os.path.join(paper_dir, 'question.jsonl')
questions = parse_data.read_jsonl(ques_fpath)
questions[0]

# step 3: load the question-type descriptions; later indexed as
# ques_type[question_type][lang]
ques_type_fpath = os.path.join(paper_dir, 'question_type.json')
ques_type = parse_data.read_json(ques_type_fpath)
ques_type

# step 4 generate prompt
#
# For every question index, pair each entry of `questions` that has this index
# with every system prompt written in the same language, and write the
# resulting records to `<parent_dir>/data/<paper>/<idx>.jsonl`.

# Lead-in text placed before the question in the user prompt, keyed by the
# question's language.
user_prompt_prefix = {
    'zh': '题目如下:',
    'en': 'Here are the questions:',
}

# Distinct question indices in numeric order; `key=int` makes '10' sort after
# '9' (assumes every idx is convertible with int()).
ques_ids = sorted({ques['idx'] for ques in questions}, key=int)

# Create the output directory up front so the run does not depend on it
# already existing.
target_dir = os.path.join(parent_dir, 'data', paper)
os.makedirs(target_dir, exist_ok=True)

for ques_idx in ques_ids:
    data = []
    # Visit every entry that shares this idx (presumably one per language).
    for ques in questions:
        if ques['idx'] != ques_idx:
            continue
        lang = ques['lang']
        for sys_prompt in sys_prompts:
            # Only combine a question with system prompts in its own language.
            if sys_prompt['parameter']['lang'] != lang:
                continue
            # Fail loudly on an unsupported language. Previously the KeyError
            # was constructed but never raised, so the prefix of an earlier
            # iteration could be reused silently.
            if lang not in user_prompt_prefix:
                raise KeyError("only accept 'zh' and 'en'")
            prefix = user_prompt_prefix[lang]

            type_detail = ques_type[ques['question_type']][lang]
            # Key order is kept stable so the serialized records keep the same
            # layout: uuid, parameters, question, prompt.
            record = {
                'prompt_uuid': str(uuid.uuid4()),
                # Shallow copy, so the record does not alias the system
                # prompt's own parameter dict.
                'prompt_parameter': dict(sys_prompt['parameter']),
                'question': {
                    'idx': ques_idx,
                    'year': year,
                    'paper': paper_type,
                    'type': ques['question_type'],
                    'type_detail': type_detail,
                    'body': ques['question'],
                    'answer': ques['answer'],
                    'value': ques['value'],
                },
                'prompt': {
                    # NOTE(review): the schema described in the markdown cell
                    # names this key "system"; the key written here is 'sys'.
                    # Left unchanged because readers of these files may depend
                    # on it -- confirm which one is intended.
                    'sys': sys_prompt['prompt'],
                    'user': '\n\n'.join([prefix, type_detail, ques['question']]),
                },
            }
            data.append(record)

    # save the data: one file per question index
    target_fpath = os.path.join(target_dir, f'{ques_idx}.jsonl')
    parse_data.write_jsonl(data, target_fpath)
