In [8]:
import uuid
import pandas as pd
import json
import os
import glob
import jsonlines
import requests
from tqdm import trange
import random
import glob
import re

def get_context(language="EN"):
    """Load the background text that the star sentences are inserted into.

    Args:
        language: ``"EN"`` concatenates every ``*.txt`` file found under
            ``./context_data/PaulGrahamEssays/`` with newlines turned into
            spaces. Any other value reads
            ``./context_data/The_Story_of_the_Stone.txt`` and removes
            whitespace, ``------------`` separators, ASCII punctuation and
            ASCII letters.

    Returns:
        The whole context as one string.
    """
    if language == "EN":
        essays = []
        # NOTE: glob.glob() yields files in arbitrary order, so the
        # concatenation order follows whatever the OS returns.
        for file in glob.glob("./context_data/PaulGrahamEssays/*.txt"):
            with open(file, 'r') as f:
                essays.append(f.read().replace("\n", " "))
        # Join once instead of growing a string with += per file.
        return "".join(essays)

    string_punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # The with-block guarantees the handle is closed (it used to leak).
    with open("./context_data/The_Story_of_the_Stone.txt", "r", encoding="utf-8") as context_file:
        pieces = [
            line.strip().replace("------------", " ").replace("\n", " ").replace(" ", "")
            for line in context_file
        ]
    context = "".join(pieces)
    # Drop ASCII punctuation first, then any remaining ASCII letters.
    context = re.sub('[{}]'.format(string_punctuation), "", context)
    context = re.sub('[a-zA-Z]', '', context)
    return context

def get_stars(stars_dir, counting_times):
    """Return the entry for ``counting_times`` from a star-count file.

    The first line of the file is evaluated as a Python expression and the
    result is indexed with ``str(counting_times)``.

    Args:
        stars_dir: Path of the file to read (despite the name, a file path).
        counting_times: Key to look up; converted to ``str`` before indexing.

    Returns:
        Whatever value is stored under ``str(counting_times)``.
    """
    # The with-block closes the handle, which was previously left open.
    with open(stars_dir, "r") as stars_file:
        first_line = stars_file.readline()
    # NOTE(review): eval() executes arbitrary code from the file. Only use
    # this with trusted data files; if the line is always a plain literal,
    # ast.literal_eval would be the safe replacement.
    return eval(first_line)[str(counting_times)]

def sentence_with_star(language, test_type, indicator):
    """Build the star-counting sentence number ``indicator``.

    For the "multi-evidence-retrieval-searching" task the sentence states the
    count ``a_stars[indicator]``. For any other task it first states
    ``r_stars[indicator]`` as a miscount and then ``a_stars[indicator]`` as
    the corrected count. Chinese text is produced when ``language`` is
    ``"ZH"``; every other value yields English.

    Returns:
        The sentence, wrapped in a leading and a trailing newline.
    """
    searching = test_type == "multi-evidence-retrieval-searching"

    if language == "ZH":
        if searching:
            return f"\n小企鹅数了{a_stars[indicator]}颗★\n"
        return f"\n小企鹅数了{r_stars[indicator]}颗★，但发现数错了，于是又数了一遍，这次数对了，是{a_stars[indicator]}颗★\n"

    if searching:
        return f"\nThe little penguin counted {a_stars[indicator]} ★\n"
    return f"\nThe little penguin counted {r_stars[indicator]} ★, but found that a mistake had been made, so the counting was done again, and this time {a_stars[indicator]} ★ was counted correctly.\n"

def select_question(language, test_type):
    """Return the question appended after the context.

    The Chinese prompts are used when ``language`` is ``"ZH"``; every other
    value gets the English prompts. The "searching" wording is chosen when
    ``test_type`` is "multi-evidence-retrieval-searching"; any other value
    gets the "reasoning" wording, which asks for the *correct* counts.
    """
    # Keyed by "is this the searching task?".
    zh_prompts = {
        True: "\n\n\n\n在这个月光皎洁、云雾缭绕的夜晚，小企鹅正望向天空，全神贯注地数★。请帮助小企鹅收集所数★的颗数，按照如下格式：{\"小企鹅\":[x,x,x,...]}，不要求和，[x,x,x,...]中数字为小企鹅每次数★的颗数，仅以JSON格式输出结果，不需要输出任何解释。",
        False: "\n\n\n\n在这个月光皎洁、云雾缭绕的夜晚，小企鹅正望向天空，全神贯注地数★。请帮助小企鹅收集所数★的正确颗数，按照如下格式：{\"小企鹅\":[x,x,x,...]}，不要求和，[x,x,x,...]中数字为小企鹅正确数★的颗数，仅以JSON格式输出结果，不需要输出任何解释。",
    }
    en_prompts = {
        True: "\n\n\n\n" + "On this moonlit and misty night, the little penguin is looking up at the sky and concentrating on counting ★. Please help the little penguin collect the number of ★, for example: {\"little_penguin\": [x, x, x,...]}. The summation is not required, and the numbers in [x, x, x,...] represent the counted number of ★ by the little penguin. Only output the results in JSON format without any explanation.",
        False: "\n\n\n\n" + "On this moonlit and misty night, the little penguin is looking up at the sky and concentrating on counting ★. Please help the little penguin collect the correct number of ★, for example: {\"little_penguin\": [x, x, x,...]}. The summation is not required, and the numbers in [x, x, x,...] represent the correctly counted number of ★ by the little penguin. Only output the results in JSON format without any explanation.",
    }
    prompts = zh_prompts if language == "ZH" else en_prompts
    return prompts[test_type == "multi-evidence-retrieval-searching"]

# Number of star sentences inserted into each context; also the key used to
# pick the count lists out of the a_stars / r_stars files below.
m = 32
# Number of context sizes generated per output file: max_context_length is
# divided into n equal steps by the generation loop.
n = 32
# [m, n] pairs the generation loop iterates over.
version = [[m, n]]
# Languages to generate; "EN" selects the English context and prompts, any
# other value selects the Chinese ones.
language_types = ["EN", "ZH"]
# Task identifiers; they select the sentence/question wording and appear in
# the output file name.
task_types = ["multi-evidence-retrieval-searching", "multi-evidence-retrieval-reasoning"]
# MeRS-ZH, MeRS-EN, MeRR-EN, MeRR-ZH
# a_stars: counts stated in searching sentences and as the corrected count in
# reasoning sentences; also written out as "reference_counting_results".
# NOTE: both files are read at import time, with the module-level m as key.
a_stars = get_stars("./context_data/a_stars.txt", m)
# r_stars: the initial (miscounted) numbers used in reasoning sentences.
r_stars = get_stars("./context_data/r_stars.txt", m)
# Upper bound from which the context sizes are derived (scaled per language
# in the generation loop) and part of the output file name.
max_context_length = 128000

if __name__ == '__main__':
    # Build one JSONL file per (task type, language, [m, n]) combination.
    # Each line is a prompt: a truncated context with star sentences inserted
    # at regular intervals, followed by the counting question.
    for task_type in task_types:
        for language in language_types:
            # Share of max_context_length taken from the context; sizes are
            # counted in space-separated words for EN, in characters otherwise.
            scalar = 0.8 if language == "EN" else 0.7
            context = get_context(language=language)
            # Loop-invariant work: the question depends only on language and
            # task type, and the EN word list only on the context.
            question = select_question(language, task_type)
            context_words = context.split(" ") if language == "EN" else None
            for m, n in version:
                line_count = 0
                interval = int(max_context_length/n)
                context_size = [int(i*scalar) for i in range(interval, max_context_length+1, interval)]
                file_name = f"./test_data/Counting_Stars_{language}_{task_type}_{max_context_length}_{m}_{n}.jsonl"
                # The with-block closes (and flushes) the file even when
                # generation fails part-way through.
                with open(file_name, "w", encoding="utf-8") as test_data:
                    print(file_name)
                    for j in context_size:
                        indicator = 0
                        # Distance between two consecutive insertion points.
                        step = int(j / m)
                        sprinkle_stars_context = " ".join(context_words[:j]) if language == "EN" else context[:j]
                        for k in range(0, j, step):
                            single_star = sentence_with_star(language, task_type, indicator)
                            # NOTE(review): the offset assumes every sentence
                            # inserted so far has the same size as the current
                            # one (size * indicator). Kept as-is so generated
                            # data stays identical to earlier runs.
                            if language == "ZH":
                                cut = k + step + len(single_star)*indicator
                                sprinkle_stars_context = sprinkle_stars_context[:cut] + single_star + sprinkle_stars_context[cut:]
                            else:
                                words = sprinkle_stars_context.split(" ")
                                cut = len(single_star.split(" "))*indicator + k + step
                                sprinkle_stars_context = " ".join(words[:cut]) + single_star + " ".join(words[cut:])
                            indicator += 1
                            # Stop after exactly m insertions.
                            if indicator == m:
                                print(f"撒了{indicator}次星星")
                                break
                        full_question = sprinkle_stars_context + question
                        print(len(full_question))
                        output_template = {"question": full_question, "context_size": j, "retrieval_question": question,
                                    "reference_counting_results": a_stars, "parameters": {"temperature": 0.0}}
                        print(json.dumps(output_template, ensure_ascii=False), file=test_data)
                        line_count += 1
                print(f"共计{line_count}条数据")

./test_data/Counting_Stars_EN_multi-evidence-retrieval-searching_128000_32_32.jsonl
撒了32次星星
18866
撒了32次星星
37223
撒了32次星星
55676
撒了32次星星
72449
撒了32次星星
90226
撒了32次星星
108808
撒了32次星星
126043
撒了32次星星
143722
撒了32次星星
161214
撒了32次星星
178738
撒了32次星星
196217
撒了32次星星
214366
撒了32次星星
232257
撒了32次星星
250212
撒了32次星星
267685
撒了32次星星
284783
撒了32次星星
302384
撒了32次星星
320187
撒了32次星星
338073
撒了32次星星
355557
撒了32次星星
374231
撒了32次星星
392587
撒了32次星星
410318
撒了32次星星
428433
撒了32次星星
446051
撒了32次星星
463849
撒了32次星星
482058
撒了32次星星
499498
撒了32次星星
516604
撒了32次星星
533982
撒了32次星星
552214
撒了32次星星
570151
共计32条数据
./test_data/Counting_Stars_ZH_multi-evidence-retrieval-searching_128000_32_32.jsonl
撒了32次星星
3289
撒了32次星星
6089
撒了32次星星
8889
撒了32次星星
11689
撒了32次星星
14489
撒了32次星星
17289
撒了32次星星
20089
撒了32次星星
22889
撒了32次星星
25689
撒了32次星星
28489
撒了32次星星
31288
撒了32次星星
34089
撒了32次星星
36889
撒了32次星星
39689
撒了32次星星
42489
撒了32次星星
45289
撒了32次星星
48089
撒了32次星星
50889
撒了32次星星
53689
撒了32次星星
56489
撒了32次星星
59288
撒了32次星星
62088
撒了32次星星
64888
撒了32次星星
67689
撒了32次星星
70489
撒了