# Init

In [1]:
import json
import os
import re
import sys
import time

from tqdm import tqdm

# Remember the directory the notebook was started in; the prompt, data and
# output paths used further down are all built relative to it.
cwd = os.getcwd()
# NOTE(review): this changes into the directory we are already in, so it is
# effectively a no-op.
os.chdir(cwd)
# Put ./tools on the import path (presumably where `chat` and `parse_data`
# live — confirm).
sys.path.append('tools')

import chat
import parse_data

In [26]:
# No system prompt is sent; the whole instruction lives in the template file.
system_prompt = ""

# The prompt template is selected by version; `version` is also reused in the
# names of the result files written by later cells.
prompt_dir = os.path.join(cwd, 'prompt')
version = 'v2.7.0'
fname = f'table_finder-stage_1-{version}.md'
prompt_fpath = os.path.join(prompt_dir, fname)

# Read with an explicit encoding rather than the platform's locale default,
# consistent with how the result files are read in the comparison cell.
with open(prompt_fpath, 'r', encoding='utf-8') as f:
    prompt_template = f.read()

def make_prompt(query: str) -> str:
    """Build the full prompt for one question.

    The result is the loaded `prompt_template` with `query` appended directly
    after it; no separator is inserted between the two.

    Args:
        query: The question text to append to the template.

    Returns:
        The template followed by `query`.
    """
    return prompt_template + query

In [4]:
# Load the question set from data/question.json under the working directory.
question_path = os.path.join(cwd, 'data', 'question.json')
questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [15]:
# Label for this model; it is interpolated into the names of the result files
# saved by the cells in this section.
model = 'glm_4_plus'

## Test

In [30]:
# Smoke test on a single question (index 12): send the first entry of its
# 'team' list through the prompt and time the round trip.
history = []
query = make_prompt(questions[12]['team'][0]['question'])

start_time = time.time()
message = chat.create_message(
    query,
    history=history,
    system_prompt=system_prompt,
    temperature=0.7,
    top_p=0.9,
    response_format='text',
)
end_time = time.time()
execution_time = end_time - start_time

# Pull the token accounting and the text out of the reply, then extend the
# conversation history with this reply.
usage = chat.get_token_usage(message, True)
content = chat.get_content(message, True)
history = chat.build_history(history, message=message)

{'prompt_tokens': 10491, 'completion_tokens': 736, 'total_tokens': 11227}
```json
{
  "raw_question": "今天是2020年10月27日，当日收盘价第3高的港股是？(以下都回答简称)",
  "data_source_reasoning": [
    {
      "step": "解析格式要求",
      "instruction": "识别用户对最终输出格式或数据精度的任何明确要求。",
      "cot_thinking": "用户要求返回公司简称，因此需要确保最终输出格式为简称，且无需额外的数据精度处理。"
    },
    {
      "step": "识别信息需求",
      "instruction": "提取 query 的核心意图，即用户希望获取什么类型的信息。",
      "cot_thinking": "用户的问题是询问某日港股收盘价第 3 高的股票简称。核心意图是获取特定日期的港股收盘价排名信息以及对应的港股股票简称。",
      "result": ["特定日期的港股收盘价排名", "港股股票简称"]
    },
    {
      "step": "定位相关数据库和表格（需求1：特定日期的港股收盘价排名）",
      "instruction": "根据第一个信息需求（收盘价排名），依次、逐步罗列**所有**可能包含相关数据的数据库和表格，再根据需求确定合适的表格。",
      "cot_thinking": "查看 Database-Table Schema，港股收盘价数据与 '港股行情表现' (CS_HKStockPerformance) 表相关。该表包含港股股票从最近一个交易日往前追溯一段时期的行情表现信息，也包括收盘价。但第一个信息需求核心是在于'特定日期的收盘价排名'，CS_HKStockPerformance 重点在于某时间段的行情数据，而我们需要的是当日数据。因此，CS_HKStockPerformance 不适合用于筛选 2020 年 10 月 27 日收盘价第 3 高的港股。"
    },
    {
      "step": "定位相关数据库和表格（需求2：港股股票简称）"

In [31]:
# Extract the JSON payload from the reply. `str.strip('`json')` removes any of
# the characters ` j s o n from both ends (it is not a prefix/suffix removal),
# so it breaks as soon as the reply has text or a newline outside the fence.
# Take whatever sits between the outermost code fences instead, and fall back
# to the raw text when the reply is not fenced at all.
fenced = re.search(r'```(?:json)?\s*(.*)```', content, re.DOTALL)
stage_1_result = json.loads(fenced.group(1) if fenced else content)

# Work on a shallow copy so the shared `questions` list is not modified.
t = dict(questions[12])
t['table_finder'] = {'stage_1': [stage_1_result]}
t['token_usage'] = {'table_finder-stage_1': [usage]}
t['time_usage'] = {'table_finder-stage_1': [f"{execution_time:.2f}s"]}
# The result file holds a list of records, even for a single test question.
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-table_finder-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [32]:
# Run stage 1 over every question. A failure on one question (request error or
# unparseable reply) is reported and recorded, and the run continues.
answers = []
failed_tids = []  # tids that raised, kept so they can be retried afterwards

for question in tqdm(questions):
    try:
        # Only the first entry of the question's 'team' list is used here.
        query = make_prompt(question['team'][0]['question'])

        history = []

        start_time = time.time()
        message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
        end_time = time.time()

        execution_time = end_time - start_time
        usage = chat.get_token_usage(message, False)
        content = chat.get_content(message, False)

        # `content.strip('`json')` strips a *set of characters*, not the fence,
        # and fails when anything else surrounds it. Take the text between the
        # outermost code fences, or the raw reply when it is not fenced.
        fenced = re.search(r'```(?:json)?\s*(.*)```', content, re.DOTALL)
        stage_1_result = json.loads(fenced.group(1) if fenced else content)

        # Shallow copy: the new keys are set on the copy, leaving `questions`
        # untouched for the other model runs.
        res = dict(question)
        res['table_finder'] = {'stage_1': [stage_1_result]}
        res['token_usage'] = {'table_finder-stage_1': [usage]}
        res['time_usage'] = {'table_finder-stage_1': [f"{execution_time:.2f}s"]}

        answers.append(res)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; print the reason so failures can be diagnosed.
        failed_tids.append(question['tid'])
        print(f"{question['tid']}: {type(exc).__name__}: {exc}")

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-table_finder-{version}.json')
parse_data.write_json(answers, saved_path)

  9%|▉         | 9/101 [04:44<50:36, 33.00s/it]  

tttt----9


 73%|███████▎  | 74/101 [39:29<15:32, 34.55s/it]

tttt----74


 84%|████████▍ | 85/101 [45:52<09:48, 36.80s/it]

tttt----85


 86%|████████▌ | 87/101 [47:20<09:34, 41.04s/it]

tttt----87


 88%|████████▊ | 89/101 [48:44<08:17, 41.49s/it]

tttt----89


 95%|█████████▌| 96/101 [53:46<03:36, 43.21s/it]

tttt----96


100%|██████████| 101/101 [57:26<00:00, 34.12s/it]

tttt----101





In [33]:
# Number of questions that produced a parsed answer; questions that raised
# inside the loop are never appended to `answers`.
len(answers)

94

# Deepseek-chat

In [34]:
# Label for this model; it is interpolated into the names of the result files
# saved by the cells in this section.
model = 'deepseek_v3'

In [35]:
# Read the DeepSeek API key from the environment instead of committing it to
# the notebook.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# through this file and should be revoked and replaced.
deepseek_api = os.environ.get('DEEPSEEK_API_KEY')
if not deepseek_api:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before running the DeepSeek cells.")

## Test

In [None]:
from openai import OpenAI

# Smoke test on a single question (index 12) against the DeepSeek endpoint.
query = make_prompt(questions[12]['team'][0]['question'])

# `client` is also used by the full-run cell further down.
client = OpenAI(api_key=deepseek_api, base_url="https://api.deepseek.com")

start_time = time.time()
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "user", "content": query},
    ],
    stream=False,
    # NOTE(review): the GLM cells call chat.create_message with
    # temperature=0.7, top_p=0.9; here the two values are the other way round.
    # Confirm this is intentional before comparing the two models.
    top_p=0.7,
    temperature=0.9
)
end_time = time.time()

# Convert the SDK response object into plain dicts/lists.
response = json.loads(response.to_json())
content = response['choices'][0]['message']['content']

# `content.strip('`json')` strips a *set of characters*, not the code fence,
# and fails when anything else surrounds it. Keep the text between the
# outermost fences, or the raw reply when it is not fenced.
fenced = re.search(r'```(?:json)?\s*(.*)```', content, re.DOTALL)
content = fenced.group(1) if fenced else content
usage = response['usage']
execution_time = end_time - start_time

In [37]:
# Extract the JSON payload from the reply. `str.strip('`json')` removes any of
# the characters ` j s o n from both ends (it is not a prefix/suffix removal),
# so it breaks as soon as the reply has text or a newline outside the fence.
# Take whatever sits between the outermost code fences instead, and fall back
# to the text as-is when no fence is present.
fenced = re.search(r'```(?:json)?\s*(.*)```', content, re.DOTALL)
stage_1_result = json.loads(fenced.group(1) if fenced else content)

# Work on a shallow copy so the shared `questions` list is not modified.
t = dict(questions[12])
t['table_finder'] = {'stage_1': [stage_1_result]}
t['token_usage'] = {'table_finder-stage_1': [usage]}
t['time_usage'] = {'table_finder-stage_1': [f"{execution_time:.2f}s"]}
# The result file holds a list of records, even for a single test question.
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-table_finder-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [38]:
# Run stage 1 over every question with DeepSeek. This relies on `client`
# created in the DeepSeek test cell. A failure on one question (request error
# or unparseable reply) is reported and recorded, and the run continues, so a
# single bad reply no longer discards the whole run before anything is saved.
answers = []
failed_tids = []  # tids that raised, kept so they can be retried afterwards

for question in tqdm(questions):
    try:
        # Only the first entry of the question's 'team' list is used here.
        query = make_prompt(question['team'][0]['question'])

        start_time = time.time()
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "user", "content": query},
            ],
            stream=False,
            # NOTE(review): the GLM run uses temperature=0.7, top_p=0.9; here
            # the two values are the other way round. Confirm this is intended.
            top_p=0.7,
            temperature=0.9
        )
        end_time = time.time()

        # Convert the SDK response object into plain dicts/lists.
        response = json.loads(response.to_json())
        content = response['choices'][0]['message']['content']

        # `content.strip('`json')` strips a *set of characters*, not the fence,
        # and fails when anything else surrounds it. Take the text between the
        # outermost code fences, or the raw reply when it is not fenced.
        fenced = re.search(r'```(?:json)?\s*(.*)```', content, re.DOTALL)
        stage_1_result = json.loads(fenced.group(1) if fenced else content)

        usage = response['usage']
        execution_time = end_time - start_time

        # Shallow copy: the new keys are set on the copy, leaving `questions`
        # untouched.
        res = dict(question)
        res['table_finder'] = {'stage_1': [stage_1_result]}
        res['token_usage'] = {'table_finder-stage_1': [usage]}
        res['time_usage'] = {'table_finder-stage_1': [f"{execution_time:.2f}s"]}

        answers.append(res)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; print the reason so failures can be diagnosed.
        failed_tids.append(question['tid'])
        print(f"{question['tid']}: {type(exc).__name__}: {exc}")

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-table_finder-{version}.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 101/101 [41:01<00:00, 24.37s/it]


# Compare Results

Compare the stage 1 results of the models, list the questions on which their `data_source` differs, and review those to determine the correct answer.

In [42]:
import os
import json
import copy

# Path to the folder containing model answer files
dir_path = os.path.join(cwd, 'answer_tmp')

# List of model names
models = ['deepseek_v3', 'glm_4_plus']

# Create a dictionary of file paths for each model's JSON file
model_files = {model: os.path.join(dir_path, f"stage_1-{model}-table_finder-{version}.json") for model in models}

# Dictionary to store the data of each model
model_data = {}

# Read the JSON data for each model
for model, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model] = json.load(f)

# Index every model's entries by normalized question id. The result files do
# not necessarily contain every question (a run skips questions that failed),
# so an entry's position in the list cannot be derived from its id.
entries_by_tid = {
    model: {entry['tid'].replace(' ', ''): entry for entry in entries}
    for model, entries in model_data.items()
}

# Dictionary to store the data_source for each question id across different models
data_sources = {}

# Traverse through each model's data to extract the data_source for each question id
for model in models:
    # Deep copy so that dropping 'question' below does not alter model_data.
    for entry in copy.deepcopy(model_data[model]):
        tmp = entry['table_finder']['stage_1'][0]['data_source']
        for item in tmp:
            # The 'question' text is excluded from the comparison; tolerate
            # items that do not carry it.
            item.pop('question', None)
        data_sources.setdefault(entry['tid'].replace(' ', ''), {}).update({model: tmp})

# Width used to left-align model names in the output (same for every question)
max_model_length = max(len(model) for model in models)

# Compare the data_source for each question id across models
for question_id, sources in data_sources.items():
    # Convert each model's data_source to a JSON string (to handle the dictionary comparison)
    serialized_sources = {model: json.dumps(ds, sort_keys=True, ensure_ascii=False) for model, ds in sources.items()}

    # If there are any differences in data_source, output the details
    if len(set(serialized_sources.values())) > 1:
        print(f"Question ID: {question_id}")

        # Print the full stage 1 result for each model, with aligned names
        for model in models:
            print(f"{model.ljust(max_model_length)}:")
            print('```')
            entry = entries_by_tid[model].get(question_id)
            if entry is None:
                print('(no answer for this question)')
            else:
                print(json.dumps(entry['table_finder']['stage_1'], indent=2, ensure_ascii=False))
            print('```')

        print("")

Question ID: tttt----1
deepseek_v3:
```
[
  {
    "raw_question": "600872的全称、A股简称、法人、法律顾问、会计师事务所及董秘是？",
    "data_source_reasoning": [
      {
        "step": "解析格式要求",
        "instruction": "识别用户对最终输出格式或数据精度的任何明确要求。",
        "result": "问题没有对回答格式或数据结构提出特定要求，因此无需进行额外的格式或精度处理。"
      },
      {
        "step": "分析实体",
        "instruction": "根据NER Result分析实体属于的证券市场：A股、港股、美股。",
        "result": "600872 是A股代码，因此需要查询A股相关的数据库和表格。"
      },
      {
        "step": "识别信息需求",
        "instruction": "提取用户 query 的核心意图，即用户希望获取什么类型的信息。",
        "cot_thinking": "用户的问题是询问某A股公司的全称、A股简称、法人、法律顾问、会计师事务所及董秘信息。核心意图是获取A股公司的基本信息、法人信息、法律顾问、会计师事务所及董秘信息。",
        "result": [
          "A股公司的全称",
          "A股简称",
          "法人",
          "法律顾问",
          "会计师事务所",
          "董秘"
        ]
      },
      {
        "step": "定位相关数据库和表格（需求1：A股公司的全称、A股简称）",
        "instruction": "根据第一个信息需求（A股公司的全称、A股简称），依次、逐步罗列**所有**可能包含相关数据的数据库和表格，再根据需求确定合适的表格。",
        "cot_thinking": "查看 Database-Table Schema，A股公司的全称和简称信

: 