# Init

In [11]:
import sys
import os
import json
import time
import re

from tqdm import tqdm

cwd = os.getcwd()
os.chdir(cwd)
sys.path.append('tools')

import chat
import parse_data

In [12]:
version = 'v2.7.1'

In [13]:
system_prompt = ""

prompt_dir = os.path.join(cwd, 'prompt')

fname = f'table_finder-stage_1-{version}.md'
prompt_fpath = os.path.join(prompt_dir, fname)

with open(prompt_fpath, 'r') as f:
    prompt_template = ''.join(f.readlines())


def parse_database_and_table(query: str) -> dict:
    """
    Parse the given SQL query to return the database and table names in a dictionary.
    
    Args:
    - query (str): The SQL query to parse.
    
    Returns:
    - dict: A dictionary with 'database' and 'table' keys.
    """

    pattern = r'FROM\s+([a-zA-Z0-9_]+)\.([a-zA-Z0-9_]+)'
    match = re.search(pattern, query, re.IGNORECASE)
    
    if match:
        database = match.group(1)
        table = match.group(2)
        return {'database': database, 'table': table}
    
    return {}


def make_prompt(query: str, ner: dict) -> str:

    """
    ner_res: content from the stage_3
    """

    prompt = prompt_template + query

    # ner_result is None
    if not ner['result']:
        return prompt

    ner_content = {}
    ner_content.update(ner['result'][0])

    sql_res = ner['sql']

    for k, v in sql_res.items():
        for j in v:
            if not j['result']:
                continue

            sql_query = j['query']
            # add database and table
            ner_content.update(parse_database_and_table(sql_query))
            ner_content['data_from_table'] = j['result']

    # add NER result

    ner_str = f"\n\n### **Name Entity Recognition Result**\n```json\n{json.dumps(ner_content, ensure_ascii=False,indent=2)}\n```"

    prompt += ner_str
    
    return prompt

In [14]:
question_path = os.path.join(cwd, 'answer_tmp' + os.sep + 'stage_1-glm_4_plus-ner-v2.0.0-sql-HF-Post.json')

questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [8]:
model = 'glm_4_plus'

## Test

In [20]:
idx = 12

history = []
table_finder = {}
table_finder['token_usage'] = {}
table_finder['time_usage'] = {}

for i in range(len(questions[idx]['team'])):
    if i == 0:
        query = make_prompt(questions[idx]['team'][i]['question'], questions[idx]['ner']['stage_1'])
    else:
        query = questions[idx]['team'][i]['question']

    start_time = time.time()
    message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
    end_time = time.time()

    execution_time = end_time - start_time
    usage = chat.get_token_usage(message, True)
    content = chat.get_content(message, True)
    history = chat.build_history(history, message=message)

    # the init is 0
    table_finder[f'stage_{i+1}'] = [json.loads(content.strip('`json'))]
    table_finder['token_usage'][f'stage_{i+1}'] = usage
    table_finder['time_usage'][f'stage_{i+1}'] = execution_time

{'prompt_tokens': 9383, 'completion_tokens': 544, 'total_tokens': 9927}
```json
{
    "raw_question": "今天是2020年10月27日，当日收盘价第3高的港股是？(以下都回答简称)",
    "data_source_reasoning": "现在进行逐步分析： 1. **解析输出格式要求**：要求返回公司简称。 2. **意图识别**： 问题关注的是特定日期的'收盘价第3高'的港股，并且要求'回答简称'。 3. **返回数据识别**： 收盘价、简称。 4. **关联信息**： 查询到收盘价第3高的港股后，需要获取该港股的简称。  5. **定位数据**： 查看Database-Table Schema，港股数据仅与'港股数据库' (HKStockDB) 中的'港股行情表现' (CS_HKStockPerformance) 表相关。CS_HKStockPerformance包含港股从最近一个交易日往前追溯一段时期的行情表现信息，包括收盘价，因此适合用于筛选收盘价第3高的港股。同时，还需要回答'简称'。查看Database-Table Schema，港股简称与'常量库'（ConstantDB）中的'港股证券主表'（HK_SecuMain）的强相关，和'港股数据库' (HKStockDB) 中的'港股公司概况' (HK_StockArchives)弱相关。根据给定的表描述，HK_SecuMain表记录港股单个证券品种的简称、中英文名称、上市交易所、上市状态等基础信息； 而HK_StockArchives表 收录港股上市公司的基础信息，包括名称、成立日期、注册地点、注册资本、公司业务、所属行业分类、主席、公司秘书、联系方式等信息，并没有明确提及简称。相较之下，HK_SecuMain更适合用于查询简称信息。 6. **结论**： 因此，要回答这个问题，我们需要先查询 HKStockDB 数据库中的 CS_HKStockPerformance 表，筛选出2020年10月27日的数据，并找出收盘价第3高的港股。然后，再查询 ConstantDB 数据库中的 HK_SecuMain 表，获取该港股的简称。",
    "data_source": [
        {"ques

In [None]:
t = questions[idx]
t['table_finder'] = table_finder

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_3-{model}-table_finder-test-{version}.json')
parse_data.write_json(t, saved_path)

print(json.dumps(table_finder, ensure_ascii=False, indent=2))

## ALL

In [25]:
answers = []

for question in tqdm(questions[:1]):
    try:
        res = question
        res['table_finder'] = {}
        history = []
        
        for i in range(len(question['team'])):

            if i == 0:
                query = make_prompt(question['team'][i]['question'], question['ner']['stage_1'])
            else:
                query = question['team'][i]['question']

            start_time = time.time()
            message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
            end_time = time.time()

            execution_time = end_time - start_time
            usage = chat.get_token_usage(message, False)
            content = chat.get_content(message, False)
            history = chat.build_history(history, message=message)

            res['table_finder'][f'stage_{i+1}'] = [json.loads(content.strip('`json'))]
            res['token_usage'][f'table_finder-stage_{i+1}'] = [usage]
            res['time_usage'][f'table_finder-stage_{i+1}'] = [f"{execution_time:.2f}s"]

            answers.append(res)
    except:
        print(question['tid'])

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_3-{model}-table_finder-{version}.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 1/1 [01:07<00:00, 67.98s/it]


# Deepseek-chat

In [15]:
model = 'deepseek_v3'

In [16]:
deepseek_api = 'sk-ba0f5eed3bea4fa6be16eb33b139c684'

## Test

In [32]:
from openai import OpenAI

client = OpenAI(api_key= deepseek_api, base_url="https://api.deepseek.com")

idx = 84

history = []
table_finder = {}
table_finder['token_usage'] = {}
table_finder['time_usage'] = {}

for i in range(len(questions[idx]['team'])):
    if i == 0:
        query = make_prompt(questions[idx]['team'][i]['question'], questions[idx]['ner']['stage_1'])
    else:
        query = questions[idx]['team'][i]['question']

    history.append({"role": "user", "content": query})

    start_time = time.time()
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages= history,
        stream=False,
        top_p=0.7,
        temperature=0.9
    )
    end_time = time.time()

    response = json.loads(response.to_json())
    content = response['choices'][0]['message']['content']
    content = content.strip('`json')
    usage = response['usage']
    execution_time = end_time - start_time

    history.append({"role": "assistant", "content": content})

    # the init is 0
    table_finder[f'stage_{i+1}'] = [json.loads(content.strip('`json'))]
    table_finder['token_usage'][f'stage_{i+1}'] = usage
    table_finder['time_usage'][f'stage_{i+1}'] = execution_time

In [33]:
print(json.dumps(table_finder, ensure_ascii=False, indent=2))

{
  "token_usage": {
    "stage_1": {
      "completion_tokens": 461,
      "prompt_tokens": 13434,
      "total_tokens": 13895,
      "prompt_tokens_details": {
        "cached_tokens": 10048
      },
      "prompt_cache_hit_tokens": 10048,
      "prompt_cache_miss_tokens": 3386
    },
    "stage_2": {
      "completion_tokens": 475,
      "prompt_tokens": 13920,
      "total_tokens": 14395,
      "prompt_tokens_details": {
        "cached_tokens": 13376
      },
      "prompt_cache_hit_tokens": 13376,
      "prompt_cache_miss_tokens": 544
    },
    "stage_3": {
      "completion_tokens": 549,
      "prompt_tokens": 14435,
      "total_tokens": 14984,
      "prompt_tokens_details": {
        "cached_tokens": 13888
      },
      "prompt_cache_hit_tokens": 13888,
      "prompt_cache_miss_tokens": 547
    }
  },
  "time_usage": {
    "stage_1": 159.93575406074524,
    "stage_2": 434.6102180480957,
    "stage_3": 475.40166902542114
  },
  "stage_1": [
    {
      "raw_question": "鹏华基金在2

In [34]:
t = questions[idx]
t['table_finder'] = table_finder

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_3-{model}-table_finder-test-{version}.json')
parse_data.write_json(t, saved_path)

print(json.dumps(table_finder, ensure_ascii=False, indent=2))

{
  "token_usage": {
    "stage_1": {
      "completion_tokens": 461,
      "prompt_tokens": 13434,
      "total_tokens": 13895,
      "prompt_tokens_details": {
        "cached_tokens": 10048
      },
      "prompt_cache_hit_tokens": 10048,
      "prompt_cache_miss_tokens": 3386
    },
    "stage_2": {
      "completion_tokens": 475,
      "prompt_tokens": 13920,
      "total_tokens": 14395,
      "prompt_tokens_details": {
        "cached_tokens": 13376
      },
      "prompt_cache_hit_tokens": 13376,
      "prompt_cache_miss_tokens": 544
    },
    "stage_3": {
      "completion_tokens": 549,
      "prompt_tokens": 14435,
      "total_tokens": 14984,
      "prompt_tokens_details": {
        "cached_tokens": 13888
      },
      "prompt_cache_hit_tokens": 13888,
      "prompt_cache_miss_tokens": 547
    }
  },
  "time_usage": {
    "stage_1": 159.93575406074524,
    "stage_2": 434.6102180480957,
    "stage_3": 475.40166902542114
  },
  "stage_1": [
    {
      "raw_question": "鹏华基金在2

## ALL

In [40]:
answers = []

for question in tqdm(questions[:1]):
    try:
        res = question
        res['table_finder'] = {}
        history = []
        
        for i in range(len(question['team'])):

            if i == 0:
                query = make_prompt(question['team'][i]['question'], question['ner']['stage_1'])
            else:
                query = question['team'][i]['question']


            history.append({"role": "user", "content": query})

            start_time = time.time()
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=history,
                stream=False,
                top_p=0.7,
                temperature=0.9
            )
            end_time = time.time()

            response = json.loads(response.to_json())
            content = response['choices'][0]['message']['content']
            content = content.strip('`json')
            usage = response['usage']
            execution_time = end_time - start_time
            
            history.append({"role": "assistant", "content": content})

            res['table_finder'][f'stage_{i+1}'] = [json.loads(content.strip('`json'))]
            res['token_usage'][f'table_finder-stage_{i+1}'] = [usage]
            res['time_usage'][f'table_finder-stage_{i+1}'] = [f"{execution_time:.2f}s"]

            answers.append(res)
    except:
        print(question['tid'])

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_3-{model}-table_finder-{version}.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 1/1 [00:28<00:00, 28.36s/it]


: 

# Compare Results

Compare the stage 1 results => find the differences => get the correct answer.

In [51]:
import os
import json
import copy

# Path to the folder containing model answer files
dir_path = os.path.join(cwd, 'answer_tmp')

# List of model names
models = ['deepseek_v3', 'glm_4_plus']
stage = "stage_3"
version = 'v2.7.3'

# Create a dictionary of file paths for each model's JSON file
model_files = {model: os.path.join(dir_path, f"stage_3-{model}-table_finder-{version}.json") for model in models}

# Dictionary to store the data of each model
model_data = {}

# Read the JSON data for each model
for model, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model] = json.load(f)

# sort data
for key in model_data:
    model_data[key] = sorted(model_data[key], key=lambda x: x['tid'])

# Dictionary to store the data_source for each question id across different models
data_sources = {}

# Traverse through each model's data to extract the data_source for each question id
for model in models:
    for entry in copy.deepcopy(model_data[model]):
        try:
            tmp = entry['table_finder'][stage][0]['data_source']
            data_sources.setdefault(entry['tid'].replace(' ', ''), {}).update({model: tmp})
        except:
            pass

# Compare the data_source for each question id across models
for question_id, sources in data_sources.items():
    # Check if the data_source is consistent across models
    # Convert each model's data_source to a JSON string (to handle the dictionary comparison)
    results = [v for k, v in sources.items()]
    if results[0] == results[1]:
        pass
    else:
        print(f"Question ID: {question_id}")

        q_id = int(question_id.split('-')[-1]) - 1
        
        # Calculate the maximum length of model names to align the output
        max_model_length = max(len(model) for model in models)
        # Print the data_source for each model, with aligned output
        for model in models:
            # Left-align model names with the calculated maximum length
            print(f"{model.ljust(max_model_length)}:")
            print('```')
            for d in model_data[model]:
                if d['tid'] == question_id:
                    print(json.dumps(d['table_finder'][stage], indent=2, ensure_ascii=False))
            print('```')
        
        print("")

Question ID: tttt----1
deepseek_v3:
```
[
  {
    "raw_question": "在实控人发生变化的当年股权发生了几次转让？",
    "data_source_reasoning": [
      {
        "step_1": "提取问题的信息意图",
        "restate_question_abstractly": "问题关注某A股公司在实控人发生变化的当年，股权转让的次数。",
        "information_intention": [
          "实控人变化当年的股权转让次数"
        ]
      },
      {
        "step_2": "定位其相关的所有表格",
        "list_all_related_tables": "查看 Database-Table Schema，A股公司股权转让信息与'股东股权变动' (LC_ShareTransfer) 表相关。该表记录公司股东的股权转让情况，包括转让时间、转让股数、出让方、受让方等信息。"
      },
      {
        "step_3": "结论",
        "identify_database_respectively": "查看 Database-Table Schema，LC_ShareTransfer 在'上市公司股东与股本/公司治理'（AStockShareholderDB）数据库中。",
        "conclusion": "综上所述，要回答这个问题，我们需要查询 AStockShareholderDB 数据库中的 LC_ShareTransfer 表，获取600872在实控人变化当年的股权转让次数。"
      }
    ],
    "data_source": [
      {
        "database": "AStockShareholderDB",
        "table": "LC_ShareTransfer"
      }
    ]
  }
]
```
glm_4_plus :
```
[
  {
    "raw_question": "基于之前的聊天记录。在实控人发生变化的当年股权发

In [40]:
import os
import json
import copy

# Path to the folder containing model answer files
dir_path = os.path.join(cwd, 'answer_tmp')

# List of model names
models = ['stage_1', 'stage_3']

# Create a dictionary of file paths for each model's JSON file
model_files = {model: os.path.join(dir_path, f"{model}-glm_4_plus-table_finder-{version}.json") for model in models}

# Dictionary to store the data of each model
model_data = {}

# Read the JSON data for each model
for model, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model] = json.load(f)

# sort data
for key in model_data:
    model_data[key] = sorted(model_data[key], key=lambda x: x['tid'])

# Dictionary to store the data_source for each question id across different models
data_sources = {}

# Traverse through each model's data to extract the data_source for each question id
for model in models:
    for entry in copy.deepcopy(model_data[model]):
        tmp = entry['table_finder']['stage_2'][0]['data_source']
        data_sources.setdefault(entry['tid'].replace(' ', ''), {}).update({model: tmp})

# Compare the data_source for each question id across models
for question_id, sources in data_sources.items():
    # Check if the data_source is consistent across models
    # Convert each model's data_source to a JSON string (to handle the dictionary comparison)
    results = [v for k, v in sources.items()]
    if results[0] == results[1]:
        pass
    else:
        print(f"Question ID: {question_id}")

        q_id = int(question_id.split('-')[-1]) - 1
        
        # Calculate the maximum length of model names to align the output
        max_model_length = max(len(model) for model in models)
        # Print the data_source for each model, with aligned output
        for model in models:
            # Left-align model names with the calculated maximum length
            print(f"{model.ljust(max_model_length)}:")
            print('```')
            for d in model_data[model]:
                if d['tid'] == question_id:
                    print(json.dumps(d['table_finder']['stage_1'], indent=2, ensure_ascii=False))
            print('```')
        
        print("")

KeyError: 'stage_2'