In [5]:
import json
import os

import dotenv
from aiohttp.typedefs import StrOrURL

from langchain_community.cache import SQLiteCache
from langchain_core.globals import set_llm_cache
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

from tqdm import tqdm

# Load environment variables via python-dotenv; WLAI_API_KEY and WLAI_BASE_URL
# are read with os.getenv when the chat model is created.
dotenv.load_dotenv()

True

In [6]:
# Load the final test set together with the example file it is checked against.
with open('data/Final_TestSet/Final_TestSet.json', 'r', encoding='utf-8') as f:
    dataset_init=json.load(f)
with open('data/Final_Example.json', 'r', encoding='utf-8') as f:
    preliminary_example=json.load(f)

# Check that both dataset files are consistent. The length guard turns a
# too-short example file into a readable assertion message instead of an
# IndexError raised halfway through the comparison.
assert len(preliminary_example) >= len(dataset_init), (
    f"example file has {len(preliminary_example)} entries, "
    f"test set has {len(dataset_init)}"
)
for i, (sample, example) in enumerate(zip(dataset_init, preliminary_example)):
    assert sample["ID"] == example["ID"], f"ID mismatch at index {i}"
    assert sample["question"] == example["question"], f"question mismatch at index {i}"

print("样本数量：",len(dataset_init))
# print("问题类型：",",".join(set([item["problem_type"] for item in dataset_init])))


# Work on a contiguous window of the test set. The slice is a new list, but it
# holds the same dict objects as dataset_init, so in-place edits made through
# `dataset` are visible through both names.
FROM=0
TO=FROM+100
dataset=dataset_init[FROM:TO]
# INDEX=[22,23,24,28,41,44,59,64,79,80,93,96]
# dataset=[dataset_init[i-1] for i in INDEX]

样本数量： 512


In [7]:
# Chat model used by the LangChain pipelines in this notebook; API key and
# endpoint are taken from environment variables.
gpt4o=ChatOpenAI(
    api_key=os.getenv("WLAI_API_KEY"),
    base_url=os.getenv("WLAI_BASE_URL"),
    model="gpt-4o",
)

# One request issued before the cache is installed; its result is discarded.
# NOTE(review): presumably a connectivity check -- confirm before reordering.
gpt4o.invoke("hello")
# Install the global LangChain LLM cache, backed by the SQLite file ".langchain.db.bk".
set_llm_cache(SQLiteCache(database_path=".langchain.db.bk"))

## 预处理


### 修改文件位置
1. 构建prompt
2. 修改题目中文件名位置

In [8]:
from gpt4o import *
# NOTE(review): the star import is relied on for d_template, extract_filenames,
# add_path and data_path below. If the module also exports a name `gpt4o`, it
# would shadow the ChatOpenAI instance created earlier -- confirm.
import re

# Build the prompt of every sample and point the file names mentioned in it at
# the test-set data directory.
for i in range(0, len(dataset)):
    content = d_template[dataset[i]["problem_type"]].format(dataset[i]["question"])
    # Map each distinct, non-empty file name to its path exactly once.
    path_by_name = {
        filename: add_path(filename, data_path / 'Final_TestSet/data')
        for filename in dict.fromkeys(extract_filenames(content))
        if filename
    }
    if path_by_name:
        # Single-pass substitution. Longer names are tried first, so a name that
        # is a substring of another one cannot corrupt it, and text that has
        # already been replaced is never scanned again. Chained str.replace
        # calls would prefix a name a second time when it is listed twice and
        # the generated path still contains it.
        pattern = re.compile("|".join(
            re.escape(name) for name in sorted(path_by_name, key=len, reverse=True)
        ))
        # A function replacement keeps backslashes in paths literal.
        content = pattern.sub(lambda match: path_by_name[match.group(0)], content)
    dataset[i]["content"]=content

### rag添加文档信息
1. 翻译
2. 从翻译提取函数和库
3. 查询
4. 计划
5. rag

如何衡量：暂时不做




In [9]:
from tool.model import translate_prompt

# Translate every question; results are already cached, so the whole working
# set is translated on each run.
translation_runnable= translate_prompt | gpt4o | StrOutputParser()
translation_inputs = [{"text": sample["question"]} for sample in dataset]
# return_exceptions=True: a failed call puts its exception object into the
# result list instead of aborting the batch.
translation_list = translation_runnable.batch(
    translation_inputs,
    config={"max_concurrency":1},
    return_exceptions=True,
)
for idx, sample in enumerate(dataset):
    sample["translation"]=translation_list[idx]

In [10]:
from tool.model import extract_runnable

# Extract the functions and libraries mentioned in each translation.
extract_inputs = [{"text": sample["translation"]} for sample in dataset]
# return_exceptions=True: failed calls show up as exception objects in the list.
extract_list=extract_runnable.batch(
    extract_inputs,
    config={"max_concurrency":5},
    return_exceptions=True,
)
for idx, extracted in enumerate(extract_list):
    # print(idx+1, extracted)
    dataset[idx]["func_extract"]=extracted

In [11]:
from tool.rag_tool import search_documents_by_help_function

# Look up the help text of every extracted (function, module) pair and store
# the wrapped documents on the sample as "rag_infos".
for i,key_work in tqdm(enumerate(extract_list), total=len(extract_list)):
    # A single extraction result is normalised to a one-element list.
    key_work=key_work if isinstance(key_work, list) else [key_work]
    # Dict keys keep insertion order, so duplicates are dropped while the order
    # stays stable between runs. A set of strings may iterate in a different
    # order in each interpreter session, which changes the prompt text that is
    # later joined from it.
    api_docs={}
    for kw in key_work:
        if isinstance(kw, BaseException):
            # The extraction batch runs with return_exceptions=True, so an entry
            # can be an exception object rather than a result; skip it.
            continue
        doc=search_documents_by_help_function(kw["function_name"],kw["module_name"])
        if not doc:
            # No help text came back: nothing to add (str + None would raise).
            continue
        api_docs["<api doc>\n" + doc + "\n</api doc>"]=None
    dataset[i]["rag_infos"]=list(api_docs)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'graph_tool'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer'}


100%|██████████| 100/100 [01:24<00:00,  1.18it/s]


In [12]:
import inspect
from tool.rag_tool import search_documents_in_mutil_keywords

# Prefix of the doc_json keys that name a method.
METHOD_KEY_PREFIX="Field List > Methods > "

# Collect optional candidate functions for each question via keyword search and
# store their one-line descriptions on the sample as "func_bk".
for item in tqdm(dataset):
    question=item["question"]
    tmp_l=[]
    for doc_json,_,_ in search_documents_in_mutil_keywords([], question,5):
        function_name=""
        for key in doc_json:
            if str(key).startswith(METHOD_KEY_PREFIX):
                # Slice by the real prefix length (23 characters). The former
                # hard-coded 22 was one short and left a leading space in
                # front of every method name.
                function_name=str(key)[len(METHOD_KEY_PREFIX):]
        # The section id is stored under either of two key spellings.
        section_id=doc_json["Section_id"] if "Section_id" in doc_json else doc_json["Section ID"]
        if function_name!="":
            # A method key was found: the section id names the owning class.
            class_name=section_id
        else:
            # No method key: the section id itself is used as the function name.
            function_name=section_id
            class_name=""
        package_name=doc_json["module"]

        help_doc=search_documents_by_help_function(function_name,package_name)
        # repr(...)[1:-1] escapes newlines and drops the surrounding quotes. A
        # missing help text is rendered as an empty doc, because
        # repr(None)[1:-1] would produce the bogus text "on".
        doc_text=repr(help_doc or "")[1:-1]
        tmp_l.append(f"function:{function_name}, class:{class_name}, package:{package_name}, doc:'{doc_text}'")
    item["func_bk"]=tmp_l


100%|██████████| 100/100 [05:39<00:00,  3.40s/it]


In [13]:
# Append the retrieved documentation to each prompt: first the help texts of
# the extracted functions, then the optional candidates from the keyword search.
# Running this cell again appends the same sections once more.
for sample in dataset:
    required_docs="\n".join(sample["rag_infos"])
    sample["content"]=sample["content"]+"\n\nThe following function must be used:\n"+required_docs
    optional_docs="\n".join(sample["func_bk"])
    sample["content"]=sample["content"]+"\n\nThe following functions can be used optionally:\n"+optional_docs

### 添加目标

In [14]:
from tqdm import tqdm
from tool.model import cal_prompt, draw_prompt, tof_prompt

def get_goals(text:str, problem_type:str):
    """Ask the model for one goal statement per sub-type of a problem.

    Args:
        text: question text passed to the prompt as ``question``.
        problem_type: either a single type name, or a string starting with
            "multi" whose first six and last characters are stripped and whose
            remainder is split on ", " to obtain the sub-types.

    Returns:
        List of model outputs, one per sub-type, in sub-type order.

    Raises:
        Exception: when a sub-type is not "calculations", "True/False" or "draw".
    """
    if problem_type.startswith("multi"):
        sub_types=problem_type[6:-1].split(", ")
    else:
        sub_types=[problem_type]

    # Prompt template used for each supported sub-type.
    prompt_by_type={
        "calculations": cal_prompt,
        "True/False": tof_prompt,
        "draw": draw_prompt,
    }

    goals=[]
    for sub_type in sub_types:
        if sub_type not in prompt_by_type:
            raise Exception("unknown problem type")
        chain=prompt_by_type[sub_type]|gpt4o|StrOutputParser()
        goals.append(chain.invoke({"question":text}))
    return goals
        
        
# Only problems whose type starts with "multi" get explicit goals appended to
# their prompt; every other sample is left unchanged.
for sample in tqdm(dataset):
    if not sample["problem_type"].startswith("multi"):
        continue
    goals=get_goals(sample["question"], sample["problem_type"])
    # print(sample["ID"], goals)
    sample["goals"]=goals
    sample["content"]=sample["content"]+"\n\n"+"\nwe need to answer following question：\n"+"\n".join(goals)
    

100%|██████████| 100/100 [00:00<00:00, 806.46it/s]


## 运行
### 运行agent

In [32]:
from autogen import Cache

def run(item: dict,cache_seed=1):
    """Solve one dataset item with the code-executor / code-writer agent pair.

    Args:
        item: dataset entry; its "content" value is sent as the opening message.
        cache_seed: seed passed to the autogen disk cache.

    Returns:
        The same ``item`` object, updated in place with "code" (the last code
        block found in the conversation, "" when there is none) and "answer"
        (the chat summary).
    """
    content = item["content"]

    # Use DiskCache as cache
    with Cache.disk(cache_path_root="./autogen_cache",cache_seed=cache_seed) as cache:
        chat_result = code_executor_agent.initiate_chat(
            code_writer_agent,
            message=content,
            summary_method='reflection_with_llm',
            summary_args=dict(summary_prompt='only return the code output'),
            cache=cache,
            silent=True,
        )

    # Walk the conversation from the newest message back to index 1; index 0
    # (the opening message) is never searched. The first message that yields
    # code wins, and its last code block is kept.
    # code = extract_python_code(chat_result.chat_history[-3]['content'])[-1]
    code=""
    for message in reversed(chat_result.chat_history[1:]):
        text=message.get('content')
        if not text:
            # NOTE(review): content may presumably be missing or None for some
            # message kinds; such messages cannot contain code, so skip them.
            continue
        blocks=extract_python_code(text)
        if len(blocks)>0:
            code=blocks[-1]
            break

    # The summary is either plain text or a dict carrying it under "content".
    answer = chat_result.summary
    if isinstance(answer, dict):
        answer = answer['content']
    item["code"]=code
    item["answer"]=answer
    # item['chat_history']=chat_result.chat_history
    return item


# Run the agents on the samples from index 54 onward; samples before that index
# are not processed by this loop.
for sample in tqdm(dataset[54:]):
    run(sample)


  0%|          | 0/46 [00:00<?, ?it/s]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is sh)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


  2%|▏         | 1/46 [00:35<26:27, 35.28s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


  4%|▍         | 2/46 [00:58<20:38, 28.16s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


  7%|▋         | 3/46 [01:09<14:28, 20.20s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


  9%|▊         | 4/46 [01:33<15:17, 21.84s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 11%|█         | 5/46 [02:35<24:43, 36.19s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 13%|█▎        | 6/46 [04:27<41:25, 62.15s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 15%|█▌        | 7/46 [04:36<29:07, 44.80s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 17%|█▋        | 8/46 [04:54<22:51, 36.10s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 20%|█▉        | 9/46 [05:13<18:55, 30.70s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 22%|██▏       | 10/46 [05:13<12:48, 21.35s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 24%|██▍       | 11/46 [05:35<12:35, 21.60s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 26%|██▌       | 12/46 [05:45<10:08, 17.90s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 28%|██▊       | 13/46 [05:59<09:12, 16.74s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 30%|███       | 14/46 [06:07<07:34, 14.21s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 33%|███▎      | 15/46 [06:15<06:20, 12.28s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 35%|███▍      | 16/46 [06:44<08:42, 17.42s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 37%|███▋      | 17/46 [07:03<08:38, 17.87s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 39%|███▉      | 18/46 [08:06<14:41, 31.47s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 41%|████▏     | 19/46 [08:18<11:29, 25.55s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 43%|████▎     | 20/46 [08:26<08:50, 20.39s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 46%|████▌     | 21/46 [08:34<06:56, 16.68s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 48%|████▊     | 22/46 [08:45<05:55, 14.83s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 50%|█████     | 23/46 [08:52<04:49, 12.59s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 52%|█████▏    | 24/46 [09:07<04:51, 13.26s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING 2 CODE BLOCKS (inferred languages are [sh, python])...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 54%|█████▍    | 25/46 [09:56<08:20, 23.83s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 57%|█████▋    | 26/46 [09:56<05:36, 16.81s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 59%|█████▊    | 27/46 [10:03<04:20, 13.72s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 61%|██████    | 28/46 [10:09<03:29, 11.62s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 63%|██████▎   | 29/46 [10:17<02:58, 10.51s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 65%|██████▌   | 30/46 [10:29<02:52, 10.81s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 67%|██████▋   | 31/46 [10:37<02:31, 10.13s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 70%|██████▉   | 32/46 [11:04<03:30, 15.05s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 72%|███████▏  | 33/46 [11:29<03:54, 18.03s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 74%|███████▍  | 34/46 [11:44<03:28, 17.34s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 76%|███████▌  | 35/46 [11:55<02:49, 15.41s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 78%|███████▊  | 36/46 [12:02<02:08, 12.82s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 80%|████████  | 37/46 [12:31<02:39, 17.74s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 83%|████████▎ | 38/46 [13:30<04:00, 30.12s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 85%|████████▍ | 39/46 [13:34<02:34, 22.08s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 87%|████████▋ | 40/46 [13:40<01:44, 17.39s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 89%|████████▉ | 41/46 [27:50<22:14, 266.99s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 91%|█████████▏| 42/46 [27:57<12:36, 189.12s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 93%|█████████▎| 43/46 [28:07<06:46, 135.40s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 96%|█████████▌| 44/46 [28:39<03:28, 104.36s/it]

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


 98%|█████████▊| 45/46 [28:47<01:15, 75.44s/it] 

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


100%|██████████| 46/46 [29:20<00:00, 38.26s/it]


### voting

In [None]:
# for item in [dataset[i-1] for i in [3,6,9,10,12,14,19,21,23,24,28,30,37,40]]:
#     temp_answer=[]
#     for seed in tqdm(range(1,17)):
#         item=run(item,seed)
#         code,answer=item["code"],item["answer"]
#         temp_answer.append(answer)
#     prompt=f"""从下面的不同人表达中，直接返回大部分人想表达的内容，不附带其他信息：\n"""+"\n".join(temp_answer)
#     print(item["ID"],(gpt4o|StrOutputParser()).invoke(prompt))
        
        

## 存储

In [33]:
# The json module cannot serialise sets, so turn every "rag_infos" set into a
# list before the dataset is written out.
for sample in dataset:
    infos=sample['rag_infos']
    if type(infos) is set:
        sample['rag_infos']=list(infos)

In [34]:
# Serialise first: if json.dumps raises (for example on a value it cannot
# encode), the target file has not been opened and truncated yet, so a results
# file from an earlier run survives.
s = json.dumps(dataset, indent=4, ensure_ascii=False)
with open('data/SMP_240915_check_1.json', 'w', encoding='utf-8') as f:
    f.write(s)

----

In [None]:
# Unconditional raise. NOTE(review): presumably a guard so that "Run All" does
# not continue into the cells below -- confirm before removing.
raise Exception("stop")

In [None]:
# Read data/SMP_240905_check_1.json into tmp_dataset for manual inspection.
with open('data/SMP_240905_check_1.json', encoding='utf-8') as fh:
    tmp_dataset=json.load(fh)

In [30]:
# Size of the "content" prompt built for the sample at index 53.
len(dataset[53]["content"])

37540

In [None]:
# Show one reloaded record, selected by its 1-based position.
tmp_id=50
i=tmp_id-1
record=tmp_dataset[i]
# ID and type first, then translation, answer, code and the original question,
# each separated by a "---" rule line.
print(
    record["ID"], record["problem_type"],
    "\n---\n", record["translation"],
    "\n---\n", record['answer'],
    "\n---\n", record["code"],
    "\n---\n", record["question"],
)

In [None]:


def remove_empty_values(d):
    """Recursively remove every empty value from a dictionary.

    Empty means None or a zero-length str, list, tuple, dict, set or frozenset.
    Numbers and booleans are always kept, including 0 and False. Nested dicts
    and lists are cleaned first and dropped afterwards when nothing is left in
    them, so no empty container remains anywhere in the result.

    Args:
        d: value to clean. Anything that is not a dict is returned unchanged.

    Returns:
        A new dict without empty values; the input is not modified.
    """
    def _is_empty(value):
        # Mirrors the values the original membership test treated as empty.
        return value is None or (
            isinstance(value, (str, list, tuple, dict, set, frozenset))
            and len(value) == 0
        )

    def _prune(value):
        if isinstance(value, dict):
            # Clean children first so that a dict holding only empty values
            # becomes empty itself and is removed by its parent.
            cleaned = {key: _prune(child) for key, child in value.items()}
            return {key: child for key, child in cleaned.items() if not _is_empty(child)}
        if isinstance(value, list):
            return [child for child in map(_prune, value) if not _is_empty(child)]
        return value

    if not isinstance(d, dict):
        return d
    return _prune(d)


# Length threshold (in characters) checked before each document is appended.
INFO_LIMIT=3000


def _append_docs(infos, api_docs):
    """Append each doc to ``infos`` for as long as ``infos`` is shorter than INFO_LIMIT."""
    for doc in api_docs:
        if len(infos)<INFO_LIMIT:
            # Entries starting with "no" are kept verbatim; everything else is
            # parsed as JSON and re-serialised without its empty values.
            if not doc.startswith("no"):
                doc=json.dumps(remove_empty_values(json.loads(doc)))
            infos=infos + "\n\n"+doc
    return infos


for i,key_work in tqdm(enumerate(extract_list), total=len(extract_list)):
    infos=""
    # A single extraction result is wrapped in a list, as the help-function
    # lookup loop does; iterating a bare dict would yield its keys and fail.
    key_work=key_work if isinstance(key_work, list) else [key_work]

    for item in key_work:
        if isinstance(item, BaseException):
            # The extraction batch runs with return_exceptions=True, so an
            # entry can be an exception object; skip it.
            continue
        if item["function_name"] != "":
            module,function = item['module_name'],item['function_name']
            api_docs=search_documents(function,module,dataset[i]["question"])
            # Only the first two hits are used for a named function.
            infos=_append_docs(infos, api_docs[:2])
        # Nothing was extracted: fall back to querying with the whole question.
        else:
            api_docs=search_documents(method_description=dataset[i]["question"])
            infos=_append_docs(infos, api_docs)
    dataset[i]["rag_infos"]=infos
for i in range(len(dataset)):
    dataset[i]["content"]=dataset[i]["content"]+"\n\n"+dataset[i]["rag_infos"]

In [None]:
# For every sample print a blank line, then its 1-based position, its
# extraction result and, per "rag_infos" entry, the sum of elements 1 and 2
# rounded to two decimals.
for idx, sample in enumerate(dataset):
    rag_infos=sample["rag_infos"]
    print()
    scores=[round(entry[1]+entry[2],2) for entry in rag_infos]
    print(idx+1,extract_list[idx],scores)
        

In [None]:
from tool.rag_tool import search_documents_by_help_function

# For every extracted entry, look up the help text using the last dotted part
# of the function name and the lower-cased first dotted part of the module
# name, then print the length of the result (None when nothing comes back).
for idx in range(len(dataset)):
    for entry in extract_list[idx]:
        fn=entry["function_name"].split(".")[-1]
        mo=str(entry["module_name"]).lower().strip().split(".")[0]
        print(idx, fn, mo,end=" ")
        doc=search_documents_by_help_function(fn, mo)
        print(len(doc) if doc else None)
            