# Init

In [16]:
import sys
import os
import json
import time

from tqdm import tqdm

# Directory the kernel is currently running in; the paths in this notebook are
# built from it.
cwd = os.getcwd()
# Changes into the directory that was just read, so the working directory stays the same.
os.chdir(cwd)
# Relative entry: resolves against the working directory at import time.
# Presumably the local `chat`, `parse_data` and `sql` modules live in ./tools — confirm.
sys.path.append('tools')

import chat
import parse_data
import sql

In [17]:
# Empty system prompt: the instruction text comes from the prompt template below.
system_prompt = ""

# `task` and `version` select the prompt file and are also embedded in the
# names of the answer files written by the run cells.
version = 'v2.2.1'
task = 'ner'

prompt_dir = os.path.join(cwd, 'prompt')
fname = f'{task}-stage_1-{version}.md'
prompt_fpath = os.path.join(prompt_dir, fname)

# Read with an explicit encoding so the result does not depend on the platform
# default (the answer files in this notebook are read as UTF-8 as well).
# Assumes the prompt file is stored as UTF-8 — confirm.
with open(prompt_fpath, 'r', encoding='utf-8') as f:
    prompt_template = f.read()

def make_prompt(conversation_turn: dict) -> str:
    """Build the prompt for one conversation.

    Takes the ``question`` of the first entry in ``conversation_turn['team']``
    and appends it to the module-level ``prompt_template``; nothing is inserted
    between the two.

    Args:
        conversation_turn: mapping with a ``'team'`` sequence whose first item
            has a ``'question'`` entry.

    Returns:
        The template text followed directly by the question.
    """
    first_question = conversation_turn['team'][0]['question']
    return prompt_template + first_question

In [18]:
# Question set: <cwd>/data/question.json, read through the project helper.
question_path = os.path.join(cwd, 'data', 'question.json')
questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [19]:
# Model tag; it is interpolated into the answer file names written in this section.
model = 'glm_4_plus'

## Test

In [20]:
# Smoke test: send the prompt for the first question only.
query = make_prompt(questions[0])

# Start from an empty conversation.
history = []

start_time = time.time()
message = chat.create_message(
    query,
    history=history,
    system_prompt=system_prompt,
    temperature=0.1,
    top_p=1,
    response_format='text',
)
end_time = time.time()

# Wall-clock duration of the request, in seconds.
execution_time = end_time - start_time

# NOTE(review): the positional True presumably makes the helpers echo what they
# return (the full-run cell passes False) — confirm against tools/chat.
usage = chat.get_token_usage(message, True)
content = chat.get_content(message, True)
history = chat.build_history(history, message=message)

{'prompt_tokens': 912, 'completion_tokens': 98, 'total_tokens': 1010}
```json
{
    "reasoning_process_cot": "根据查询内容，'600872' 是一个股票代码，指向了一个上市公司，因此应该识别为一个代码。而'全称'、'A股简称'、'法人'、'法律顾问'、'会计师事务所'和'董秘'并没有出现在查询内容中，所以无需识别它们为实体。",
    "result": [
        {"代码": "600872"}
    ]
}
```


In [21]:
# Work on a shallow copy so the test-only keys added below are not written into
# questions[0], which the full run also reads and saves.
test_record = dict(questions[0])

# The reply is normally wrapped in a ```json fence. str.strip('`json') removes a
# *set of characters*, so any whitespace or text around the fence leaves the
# fence in place and json.loads fails. Slicing from the first '{' to the last
# '}' keeps just the JSON object; with no braces the slice is empty and
# json.loads still raises JSONDecodeError.
json_text = content[content.find('{'):content.rfind('}') + 1]

test_record['ner_result'] = {'stage_1': json.loads(json_text)}
test_record['token_usage'] = {'ner-stage_1': usage}
test_record['time_usage'] = {'ner-stage_1': f"{execution_time:.2f}s"}

# `t` ends up as a one-element list, which is what gets written to disk.
t = [test_record]

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-{task}-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [22]:
# Run the prompt for every question and collect one result record per question.
answers = []

for question in tqdm(questions):

    query = make_prompt(question)

    # Every question is sent with an empty history.
    history = []

    start_time = time.time()
    message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.1, top_p=1, response_format='text')
    end_time = time.time()

    execution_time = end_time - start_time
    usage = chat.get_token_usage(message, False)
    content = chat.get_content(message, False)

    # Keep only the JSON object. str.strip('`json') strips a character set and
    # leaves the code fence in place whenever whitespace or text surrounds it,
    # which would abort the whole run on a single reply.
    json_text = content[content.find('{'):content.rfind('}') + 1]

    # Shallow copy: the result keys go onto the copy, so the shared `questions`
    # entries stay untouched for later cells that reuse them.
    res = dict(question)
    res['ner'] = {'stage_1': json.loads(json_text)}
    res['token_usage'] = {'ner-stage_1': usage}
    res['time_usage'] = {'ner-stage_1': f"{execution_time:.2f}s"}

    answers.append(res)

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-{task}-{version}.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 101/101 [04:29<00:00,  2.67s/it]


# Deepseek-chat

In [23]:
# Model tag; it is interpolated into the answer file names written in this section.
model = 'deepseek_v3'

In [24]:
# Read the DeepSeek key from the environment so the secret is not stored in the
# notebook. NOTE: the key that used to be hard-coded here has been exposed and
# should be revoked.
deepseek_api = os.environ.get('DEEPSEEK_API_KEY', '')
if not deepseek_api:
    print('WARNING: DEEPSEEK_API_KEY is not set; DeepSeek requests will not authenticate.')

## Test

In [25]:
from openai import OpenAI

# Smoke test: send the prompt for the first question only.
query = make_prompt(questions[0])

# Client pointed at the DeepSeek endpoint; the full-run cell below reuses `client`.
client = OpenAI(api_key=deepseek_api, base_url="https://api.deepseek.com")

# NOTE(review): temperature/top_p here differ from the GLM-4-Plus run
# (temperature=0.1, top_p=1) — confirm this is intended for the comparison.
start_time = time.time()
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": query}],
    stream=False,
    top_p=0.7,
    temperature=0.9,
)
end_time = time.time()

# Turn the response object into a plain dict via its JSON form.
response = json.loads(response.to_json())

# Reply text with leading/trailing '`', 'j', 's', 'o', 'n' characters removed.
content = response['choices'][0]['message']['content'].strip('`json')
usage = response['usage']
execution_time = end_time - start_time

In [26]:
# Work on a shallow copy so the test-only keys added below are not written into
# questions[0], which the full run also reads and saves.
test_record = dict(questions[0])

# Keep only the JSON object. str.strip('`json') removes a *set of characters*,
# so whitespace or text around the code fence leaves the fence in place and
# json.loads fails. With no braces the slice is empty and json.loads still
# raises JSONDecodeError.
json_text = content[content.find('{'):content.rfind('}') + 1]

test_record['ner_result'] = {'stage_1': json.loads(json_text)}
test_record['token_usage'] = {'ner-stage_1': usage}
test_record['time_usage'] = {'ner-stage_1': f"{execution_time:.2f}s"}

# `t` ends up as a one-element list, which is what gets written to disk.
t = [test_record]

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-{task}-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [27]:
# Run the prompt for every question and collect one result record per question.
# Relies on `client` created in the DeepSeek test cell.
answers = []

for question in tqdm(questions):

    query = make_prompt(question)

    start_time = time.time()
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "user", "content": query},
        ],
        stream=False,
        top_p=0.7,
        temperature=0.9
    )
    end_time = time.time()

    # Turn the response object into a plain dict via its JSON form.
    response = json.loads(response.to_json())
    content = response['choices'][0]['message']['content']
    usage = response['usage']
    execution_time = end_time - start_time

    # Keep only the JSON object. str.strip('`json') strips a character set and
    # leaves the code fence in place whenever whitespace or text surrounds it,
    # which would abort the whole run on a single reply.
    json_text = content[content.find('{'):content.rfind('}') + 1]

    # Shallow copy: the result keys go onto the copy, so the shared `questions`
    # entries are not overwritten by this run.
    res = dict(question)
    res['ner'] = {'stage_1': json.loads(json_text)}
    res['token_usage'] = {'ner-stage_1': usage}
    res['time_usage'] = {'ner-stage_1': f"{execution_time:.2f}s"}

    answers.append(res)

saved_path = os.path.join(cwd, 'answer_tmp', f'stage_1-{model}-{task}-{version}.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 101/101 [05:33<00:00,  3.30s/it]


# Compare Results

Compare the stage 1 results => find the differences => get the correct answer.

In [29]:
import os
import json

# Path to the folder containing model answer files
dir_path = os.path.join(cwd, 'answer_tmp')

# List of model names
models = ['deepseek_v3', 'glm_4_plus']

# Loop variables are called `model_name` on purpose: a module-level
# `for model in ...` would overwrite the global `model` that the run cells
# interpolate into their output file names.
model_files = {
    model_name: os.path.join(dir_path, f"stage_1-{model_name}-{task}-{version}.json")
    for model_name in models
}

# Parsed answer file of each model
model_data = {}
for model_name, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model_name] = json.load(f)

# question id (spaces removed) -> {model name -> stage-1 `result` value}
data_sources = {}
for model_name in models:
    for entry in model_data[model_name]:
        question_id = entry['tid'].replace(' ', '')
        data_sources.setdefault(question_id, {})[model_name] = entry['ner']['stage_1']['result']

# Column width for aligned output; it does not change between questions.
max_model_length = max(len(model_name) for model_name in models)

# Placeholder shown for a model that has no entry for a question.
missing_marker = 'No data available'

for question_id, sources in data_sources.items():
    # Serialise per model so results can be compared as strings. Iterating over
    # `models` rather than `sources` means a question missing from one model
    # counts as a difference instead of being silently skipped.
    serialized_sources = {
        model_name: (
            json.dumps(sources[model_name], sort_keys=True, ensure_ascii=False)
            if model_name in sources
            else missing_marker
        )
        for model_name in models
    }

    # Report only the questions on which the models disagree.
    if len(set(serialized_sources.values())) > 1:
        print(f"Question ID: {question_id}")

        for model_name in models:
            print(f"{model_name.ljust(max_model_length)}: {serialized_sources[model_name]}")

        print("\n")


Question ID: tttt----27
deepseek_v3: []
glm_4_plus : [{"上市公司名称": "RXW"}]


Question ID: tttt----31
deepseek_v3: [{"行业名称": "证券公司"}]
glm_4_plus : []


Question ID: tttt----43
deepseek_v3: [{"代码": "TOUR"}]
glm_4_plus : [{"上市公司名称": "TOUR"}]


Question ID: tttt----47
deepseek_v3: []
glm_4_plus : [{"基金公司名称": "中证指数有限公司"}]


Question ID: tttt----91
deepseek_v3: [{"上市公司名称": "中文名称缩写"}, {"代码": "证券代码"}]
glm_4_plus : []




经验证，glm-4-plus 的正确率目前是 100%。
但这里存在一些考虑：
1. 没有行业信息，行业也是需要进行定位的。
2. 可能 deepseek-v3 和 glm-4-plus 一起错了。

----
v2.0.0

经验证，glm-4-plus 的正确率是领先的，bad cases：

- "tttt----4": "互联网金融属于科技概念的什么分支？这个概念的英文名称是什么？" => Ignore
- "tttt----72": "2020-07-02风电零部件行业的总市值是多少(元)？" => add shot
- "tttt----35": "北京国枫律师事务所2020年见证多少家公司的年度股东大会？" => GLM consistently fails on this case
----
v2.2.0

100%

# Obtain SQL Results

In [None]:
# Load the GLM-4-Plus stage-1 NER answers from the answer_tmp folder.
# NOTE(review): the full-run cell writes 'stage_1-{model}-{task}-{version}.json'
# (version included), while this name has no version suffix — presumably a
# manually renamed copy; confirm the file exists before running.
answer_dir = os.path.join(cwd, 'answer_tmp')
fname = 'stage_1-glm_4_plus-ner.json'
fpath = os.path.join(answer_dir, fname)

data = parse_data.read_json(fpath)

In [None]:
# Replace each record's stage-1 NER entry with the output of sql.process_ner_res.
# Records are updated in place, so re-running this cell passes already
# processed entries to the helper again.
for record in tqdm(data):
    ner_entry = record['ner']
    ner_entry['stage_1'] = sql.process_ner_res(ner_entry['stage_1'])

In [69]:
# Write the records to a separate '-sql' file next to the input file.
# Uses `answer_dir` and `data` from the cells above.
target_fpath = os.path.join(answer_dir, 'stage_1-glm_4_plus-ner-sql.json')
parse_data.write_json(data, target_fpath)

## Check Results

Bad cases:

- ~~`{ "id": "tttt----43----26-1-1", "question": "TOUR他是否已经退市了？（是或者否）" }` => "TOUR" => `SecuCode` in `US_SecuMain` is a name rather than a numeric code~~
- `{'id': 'tttt----39----22-2-1', 'question': 'JD.com, Inc.这家公司在美股英文名称是什么？'}`=> "JD.com, Inc." => use `LIKE` to handle this, although `LIKE` may match multiple rows, which is undesirable.
- `{'id': 'tttt----62----35-1-1', 'question': '博时基金公司成立于（XXXX年XX月XX日）？'}` => "博时基金公司"
- `{'id': 'tttt----74----18-2-1', 'question': '天弘增利短债C的基金管理人是谁？'}` => "天弘增利短债C" => not found

In [None]:
# Reload the records from the '-sql' answer file so the check below can be run
# on its own.
answer_dir = os.path.join(cwd, 'answer_tmp')
fname = 'stage_1-glm_4_plus-ner-sql.json'
fpath = os.path.join(answer_dir, fname)

data = parse_data.read_json(fpath)

In [None]:
# Print every record whose NER result is non-empty while every query under its
# `sql` entry has an empty `result`.
for record in data:
    stage_1 = record['ner']['stage_1']
    entities = stage_1['result']

    # True as soon as any query in any group carries a non-empty `result`;
    # a missing `sql` entry counts as no hits.
    has_sql_hit = any(
        query_info.get('result')
        for queries in stage_1.get('sql', {}).values()
        for query_info in queries
    )

    if not has_sql_hit and entities:
        print(record['team'][0])  # first element of the 'team' list
        print(stage_1)
        print()  # blank separator line

In [None]:
# The company codes are all the same

import os
import requests
import json

url = "https://comm.chatglm.cn/finglm2/api/query"

# The bearer token is read from the environment instead of being committed in
# the notebook. NOTE: the token that used to be hard-coded here has been
# exposed and should be rotated.
api_token = os.environ.get('FINGLM_API_TOKEN', '')
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

# Named `payload` so the request body does not overwrite the `data` record list
# loaded by the cells above.
payload = {
  "sql": """SELECT *
FROM ConstantDB.US_SecuMain
WHERE 
    SecuCode LIKE '%JD.com, Inc.%' OR
    SecuAbbr LIKE '%JD.com, Inc.%' OR
    ChiSpelling LIKE '%JD.com, Inc.%' OR
    EngName LIKE '%JD.com, Inc.%' OR
    ChiName LIKE '%JD.com, Inc.%';""",
  "limit": 1000
}

# A timeout keeps the cell from hanging indefinitely if the service does not answer.
response = requests.post(url, headers=headers, json=payload, timeout=60)
print(json.dumps(response.json(), indent=2, ensure_ascii=False))

In [None]:
import os
import requests
import json

url = "https://comm.chatglm.cn/finglm2/api/query"

# The bearer token is read from the environment instead of being committed in
# the notebook. NOTE: the token that used to be hard-coded here has been
# exposed and should be rotated.
api_token = os.environ.get('FINGLM_API_TOKEN', '')
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

# Named `payload` so the request body does not overwrite the `data` record list
# loaded by the cells above.
payload = {
  "sql": """SELECT *
FROM InstitutionDB.LC_InstiArchive
WHERE 
    ChiName LIKE '%博时基金%' OR
    AbbrChiName LIKE '%博时基金%' OR
    NameChiSpelling LIKE '%博时基金%' OR
    EngName LIKE '%博时基金%' OR
    AbbrEngName LIKE '%博时基金%';""",
  "limit": 1000
}

# A timeout keeps the cell from hanging indefinitely if the service does not answer.
response = requests.post(url, headers=headers, json=payload, timeout=60)
print(json.dumps(response.json(), indent=2, ensure_ascii=False))

In [None]:
import os
import requests
import json

url = "https://comm.chatglm.cn/finglm2/api/query"

# The bearer token is read from the environment instead of being committed in
# the notebook. NOTE: the token that used to be hard-coded here has been
# exposed and should be rotated.
api_token = os.environ.get('FINGLM_API_TOKEN', '')
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

# Named `payload` so the request body does not overwrite the `data` record list
# loaded by the cells above.
payload = {
  "sql": "SELECT * FROM ConstantDB.SecuMain WHERE ChiName LIKE '%增利%'",
  "limit": 1
}

# A timeout keeps the cell from hanging indefinitely if the service does not answer.
response = requests.post(url, headers=headers, json=payload, timeout=60)
print(json.dumps(response.json(), indent=2, ensure_ascii=False))

## Check NULL

In [7]:
# Reload the records from the '-sql' answer file; the query cells above rebind
# `data` to a request body, so it has to be read again before the check below.
answer_dir = os.path.join(cwd, 'answer_tmp')
fname = 'stage_1-glm_4_plus-ner-sql.json'
fpath = os.path.join(answer_dir, fname)

data = parse_data.read_json(fpath)

In [None]:
# Print the first `team` entry of every record that ended up with nothing:
# either the NER result is empty, or no query under `sql` has a non-empty `result`.
for record in data:
    stage_1 = record['ner']['stage_1']

    # Nothing was recognised for this record.
    if not stage_1['result']:
        print(record['team'][0])
        continue

    # Look at every group under `sql`, as the SQL-result check earlier in this
    # notebook does. Inspecting only the first group raised IndexError on an
    # empty `sql` dict and ignored hits recorded under any other key.
    has_sql_hit = any(
        query_info.get('result')
        for queries in stage_1.get('sql', {}).values()
        for query_info in queries
    )

    if not has_sql_hit:
        print(record['team'][0])