In [14]:
import uuid
import pandas as pd
import json
import os
import glob
import jsonlines
import requests
from tqdm import trange
import random
import glob
import re

def get_context():
    """Load the long background text for the module-level ``language``.

    * ``"EN"``: concatenates every ``*.txt`` file found under
      ``./context_data/PaulGrahamEssays/``, replacing newlines with spaces.
    * ``"ZH"``: reads ``./context_data/The_Story_of_the_Stone.txt``, strips
      each line, removes the ``------------`` separators and every space,
      then deletes ASCII punctuation and ASCII letters.

    Returns:
        The assembled text as a single ``str``, or ``None`` (after printing
        ``"wrong!"``) when ``language`` is neither ``"EN"`` nor ``"ZH"``.
    """
    if language == "EN":
        # NOTE(review): glob.glob returns files in arbitrary order, so the
        # concatenation order (and therefore the generated data) may differ
        # between machines. Consider sorting if reproducibility matters.
        pieces = []
        for file in glob.glob("./context_data/PaulGrahamEssays/*.txt"):
            # Explicit encoding so decoding does not depend on the platform
            # default (the ZH branch already reads as UTF-8).
            with open(file, 'r', encoding="utf-8") as f:
                pieces.append(f.read().replace("\n", " "))
        # Join once instead of growing a string with repeated `+=`.
        return "".join(pieces)
    elif language == "ZH":
        string_punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        # Context manager guarantees the file handle is released.
        with open("./context_data/The_Story_of_the_Stone.txt", "r", encoding="utf-8") as context_file:
            context = "".join(
                line.strip().replace("------------", " ").replace("\n", " ").replace(" ", "")
                for line in context_file
            )
        # Drop ASCII punctuation, then ASCII letters, leaving the Chinese text.
        context = re.sub('[{}]'.format(string_punctuation), "", context)
        context = re.sub('[a-zA-Z]', '', context)
        return context
    else:
        print("wrong!")
        return None

def get_stars(stars_dir, counting_times):
    """Return the star counts stored for ``counting_times`` in a data file.

    The first line of the file is evaluated as a Python expression that must
    yield a mapping keyed by the *string* form of the counting times.

    Args:
        stars_dir: Path of the text file to read.
        counting_times: Number of counts; looked up as ``str(counting_times)``.

    Returns:
        The value stored under ``str(counting_times)``.

    Raises:
        KeyError: If the mapping has no entry for ``str(counting_times)``.
    """
    # Context manager so the handle is closed (the original leaked it).
    with open(stars_dir, "r", encoding="utf-8") as stars_file:
        first_line = stars_file.readline()
    # SECURITY NOTE(review): eval executes arbitrary code from the file. This
    # is only acceptable for trusted local data; if the file holds a plain
    # literal, ast.literal_eval (or json.loads) would be a safer replacement.
    return eval(first_line)[str(counting_times)]


def sentence_with_star(language, test_type, indicator):
    """Build the star-counting sentence that is inserted into the context.

    The counts come from the module-level ``a_stars`` (used by both test
    types) and ``r_stars`` (the initial, mistaken count used only by the
    ``"reasoning"`` test type).

    Args:
        language: ``"ZH"`` or ``"EN"``.
        test_type: ``"acquisition"`` or ``"reasoning"``.
        indicator: Index into ``a_stars`` / ``r_stars`` for this sentence.

    Returns:
        The sentence, wrapped in a leading and trailing newline.

    Raises:
        ValueError: If ``language`` or ``test_type`` is not supported. The
            original printed ``"wrong!"`` and then failed with an
            ``UnboundLocalError`` on the return statement.
    """
    # Validate up front so a bad argument fails with a clear message.
    if language not in ("ZH", "EN"):
        raise ValueError(f"unsupported language: {language!r} (expected 'ZH' or 'EN')")
    if test_type not in ("acquisition", "reasoning"):
        raise ValueError(
            f"unsupported test_type: {test_type!r} (expected 'acquisition' or 'reasoning')"
        )

    if language == "ZH":
        if test_type == "acquisition":
            return f"\n小企鹅数了{a_stars[indicator]}颗★\n"
        return f"\n小企鹅数了{r_stars[indicator]}颗★，但发现数错了，于是又数了一遍，这次数对了，是{a_stars[indicator]}颗★\n"

    if test_type == "acquisition":
        return f"\nThe little penguin counted {a_stars[indicator]} ★\n"
    return f"\nThe little penguin counted {r_stars[indicator]} ★, but found that a mistake had been made, so the counting was done again, and this time {a_stars[indicator]} ★ was counted correctly.\n"

def select_question(language, test_type):
    """Return the final question appended after the star-sprinkled context.

    Args:
        language: ``"ZH"`` or ``"EN"``.
        test_type: ``"acquisition"`` or ``"reasoning"``.

    Returns:
        The question text (prefixed with four newlines), or ``None`` after
        printing a diagnostic when the language or test type is unknown.
    """
    # Unknown language is reported before the test type is even looked at.
    if language not in ("ZH", "EN"):
        print("wrong language")
        return None
    if test_type not in ("acquisition", "reasoning"):
        print("wrong test type!")
        return None

    leading_newlines = "\n\n\n\n"
    prompts = {
        "ZH": {
            "acquisition": "\n\n\n\n在这个月光皎洁、云雾缭绕的夜晚，小企鹅正望向天空，全神贯注地数★。请帮助小企鹅收集所数★的颗数，按照如下格式：{\"小企鹅\":[x,x,x,...]}，不要求和，[x,x,x,...]中数字为小企鹅每次数★的颗数，仅以JSON格式输出结果，不需要输出任何解释。",
            "reasoning": "\n\n\n\n在这个月光皎洁、云雾缭绕的夜晚，小企鹅正望向天空，全神贯注地数★。请帮助小企鹅收集所数★的正确颗数，按照如下格式：{\"小企鹅\":[x,x,x,...]}，不要求和，[x,x,x,...]中数字为小企鹅正确数★的颗数，仅以JSON格式输出结果，不需要输出任何解释。",
        },
        "EN": {
            "acquisition": leading_newlines + "On this moonlit and misty night, the little penguin is looking up at the sky and concentrating on counting ★. Please help the little penguin collect the number of ★, for example: {\"little_penguin\": [x, x, x,...]}. The summation is not required, and the numbers in [x, x, x,...] represent the counted number of ★ by the little penguin. Only output the results in JSON format without any explanation.",
            "reasoning": leading_newlines + "On this moonlit and misty night, the little penguin is looking up at the sky and concentrating on counting ★. Please help the little penguin collect the correct number of ★, for example: {\"little_penguin\": [x, x, x,...]}. The summation is not required, and the numbers in [x, x, x,...] represent the correctly counted number of ★ by the little penguin. Only output the results in JSON format without any explanation.",
        },
    }
    return prompts[language][test_type]

# --- Run configuration (read as module-level globals by the code in this file) ---
# m: selects which star list get_stars loads (key str(m)); the generation loop
#    also stops inserting sentences once m of them have been placed.
m = 32
# n: the generation loop divides max_context_length by n to get the step
#    between successive context sizes.
n = 32
# Each [m, n] pair listed here produces one output file.
version = [[m, n]]
# Alternative test type; swap with the assignment below to generate the
# "acquisition" variant instead.
#test_type = "acquisition"
test_type = "reasoning"
# Star counts are loaded at import time, using the module-level m above.
a_stars = get_stars("./context_data/a_stars.txt", m)
r_stars = get_stars("./context_data/r_stars.txt", m)
max_context_length = 128000
language = "EN"
# scalar multiplies every context length before the context is truncated.
# NOTE(review): presumably a length-unit conversion ratio per language
# (words for EN, characters for ZH) — confirm.
# `scalar` stays undefined for any other language value.
if language == "EN":
    scalar = 0.8
elif language == "ZH":
    scalar = 0.7


if __name__ == '__main__':
    # Generate one JSONL file per [m, n] pair in `version`. Each line holds a
    # context of a given size with up to m star sentences inserted at
    # regular intervals, followed by the question.
    #
    # NOTE(review): this loop rebinds the module-level m and n, while a_stars
    # and r_stars were loaded for the original m; a `version` entry with a
    # different m would use star lists loaded for the wrong size.
    if language not in ("EN", "ZH"):
        # Fail early: the original printed "wrong language!" inside the loop
        # and carried on with an unbound (or stale) context string.
        raise ValueError(f"unsupported language: {language!r} (expected 'EN' or 'ZH')")

    context = get_context()
    # The question depends only on the fixed configuration, so build it once.
    question = select_question(language, test_type)
    # EN contexts are measured in space-separated words; split the corpus once.
    context_words = context.split(" ") if language == "EN" else None

    for m, n in version:
        line_count = 0
        interval = int(max_context_length/n)
        context_size = [int(i*scalar) for i in range(interval, max_context_length+1, interval)]
        file_name = f"./test_data/Counting_Stars_{language}_{test_type}_{max_context_length}_{m}_{n}.jsonl"
        # Context manager closes the output file even if generation fails.
        with open(file_name, "w", encoding="utf-8") as test_data:
            print(file_name)
            for j in context_size:
                indicator = 0
                # Distance between consecutive insertion points.
                step = int(j / m)
                if language == "EN":
                    sprinkle_stars_context = " ".join(context_words[:j])
                else:
                    sprinkle_stars_context = context[:j]
                for k in range(0, j, step):
                    single_star = sentence_with_star(language, test_type, indicator)
                    if language == "ZH":
                        # Character offset, shifted by the sentences already inserted.
                        position = k + step + len(single_star) * indicator
                        sprinkle_stars_context = (
                            sprinkle_stars_context[:position]
                            + single_star
                            + sprinkle_stars_context[position:]
                        )
                    else:
                        # Word offset, shifted by the word count of the
                        # sentences already inserted.
                        words = sprinkle_stars_context.split(" ")
                        position = len(single_star.split(" ")) * indicator + k + step
                        sprinkle_stars_context = (
                            " ".join(words[:position])
                            + single_star
                            + " ".join(words[position:])
                        )
                    indicator += 1
                    if indicator == m:
                        print(f"撒了{indicator}次星星")
                        break
                full_question = sprinkle_stars_context + question
                print(len(full_question))
                output_template = {"question": full_question, "context_size": j, "retrieval_question": question,
                            "reference_counting_results": a_stars, "parameters": {"temperature": 0.0}}
                print(json.dumps(output_template, ensure_ascii=False), file=test_data)
                line_count += 1
                # Keep partial output readable on disk during long runs.
                test_data.flush()
        print(f"共计{line_count}条数据")

Counting_Stars_EN_reasoning_128000_32_32.jsonl
撒了32次星星
22906
撒了32次星星
41564
撒了32次星星
60401
撒了32次星星
77422
撒了32次星星
95635
撒了32次星星
114513
撒了32次星星
132050
撒了32次星星
150140
撒了32次星星
167780
撒了32次星星
185752
撒了32次星星
203625
撒了32次星星
222266
撒了32次星星
240235
撒了32次星星
258400
撒了32次星星
276155
撒了32次星星
293479
撒了32次星星
311767
撒了32次星星
330042
撒了32次星星
347861
撒了32次星星
366176
撒了32次星星
385336
撒了32次星星
403452
撒了32次星星
421675
撒了32次星星
440284
撒了32次星星
458045
撒了32次星星
476795
撒了32次星星
494661
撒了32次星星
512274
撒了32次星星
529935
撒了32次星星
547830
撒了32次星星
566207
撒了32次星星
584094
共计32条数据
