# Init

In [1]:
import sys
import os
import json
import time

from tqdm import tqdm

cwd = os.getcwd()
os.chdir(cwd)
sys.path.append('tools')

import chat
import parse_data

In [80]:
system_prompt = ""

def make_prompt(conversation_turn: dict) -> str:
    prompt = """## **Role**

你是一个专精于金融领域的命名实体识别（NER，Named Entity Recognition）任务的模型。

## **Task Description**

对 `Current Query` 里的内容进行命名实体识别。需要识别的实体为四类：

- 上市公司名称
- 代码
- 基金名称
- 基金公司名称

除此之外的实体不需要识别。

### **Output Format**

使用 JSON 进行输出。

```json
{
    "reasoning_process_cot": use CoT to step-by-step reason the NER results,
    "result": [
    {"{entity_type}": "{entity}"},
    ...
    ]
}
```

## **Shots**

**Shot 1**

Current Query: 唐山港集团股份有限公司是什么时间上市的（回答XXXX-XX-XX）
Output       : ```json
[{"上市公司名称": "唐山港集团股份有限公司"}]
```

**Shot 2**

Current Query: JD的职工总数有多少人？
Output       : ```json
[{"上市公司名称": "JD"}]
```

**Shot 3**

Current Query: 600872的全称、A股简称、法人、法律顾问、会计师事务所及董秘是？
Output       : ```json
{
    "reasoning_process_cot": "根据查询内容，'600872' 是一个股票代码，指向了一个上市公司，因此应该识别为一个代码。而'法人'、'法律顾问'、'会计师事务所'和'董秘'并没有出现在查询内容中，所以无需识别它们为实体。",
    "result": [
        {"代码": "600872"},
    ]
}

```

**Shot 4**

Current Query: 华夏鼎康债券A在2019年的分红次数是多少？每次分红的派现比例是多少？
Output       : ```json
{
    "reasoning_process_cot": "从当前查询中，可以看出涉及到一个基金名称‘华夏鼎康债券A’，以及基金的分红情况。‘华夏鼎康债券A’是一个基金名称，而查询中并未提及其他实体，如上市公司名称、股票代码等。",
    "result": [
        {"基金名称": "华夏鼎康债券A"}
    ]
}
```

**Shot 5**

Current Query: 易方达基金管理有限公司在19年成立了多少支基金？
Output       : ```json
{
    "reasoning_process_cot": "根据问题中的关键词，'易方达基金管理有限公司'是一个基金公司名称。问题询问的是该公司在2019年成立了多少支基金。因此，'易方达基金管理有限公司'是唯一的实体，属于基金公司名称。",
    "result": [
        {"基金公司名称": "易方达基金管理有限公司"}
    ]
}
```

**Shot 6**

Current Query: 化工纳入过多少个子类概念？
Output       : ```json
{
    "reasoning_process_cot": "该查询没有涉及任何具体的上市公司名称、股票代码、基金名称或基金公司名称。问题内容是关于'化工'（化学行业）及其子类概念的讨论，因此没有识别出命名实体。",
    "result": []
}
```

## **Current Query**
"""

    first_ques = conversation_turn['team'][0]['question']

    prompt = prompt + first_ques

    return prompt

In [81]:
question_path = os.path.join(cwd, 'data' + os.sep + 'question.json')

questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [82]:
model = 'glm_4_plus'

## Test

In [84]:
query = make_prompt(questions[0])

history = []

start_time = time.time()
message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.1, top_p=1, response_format='text')
end_time = time.time()

execution_time = end_time - start_time
usage = chat.get_token_usage(message, True)
content = chat.get_content(message, True)
history = chat.build_history(history, message=message)

{'prompt_tokens': 680, 'completion_tokens': 127, 'total_tokens': 807}
```json
{
    "reasoning_process_cot": "根据查询内容，'600872' 是一个股票代码，指向了一个上市公司，因此应该识别为一个代码。而'全称'、'A股简称'、'法人'、'法律顾问'、'会计师事务所'及'董秘'是关于该上市公司的具体信息，但它们本身并不是需要识别的实体类型（上市公司名称、代码、基金名称、基金公司名称）。因此，只需识别'600872'为代码。",
    "result": [
        {"代码": "600872"}
    ]
}
```


In [None]:
t = questions[0]
t['ner_result'] = {}
t['ner_result']['stage_1'] = json.loads(content.strip('`json'))
t['token_usage'] = {}
t['token_usage']['ner-stage_1'] = usage
t['time_usage'] = {}
t['time_usage']['ner-stage_1'] = f"{execution_time:.2f}s"
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + 'stage_1-glm_4_plus-ner-test.json')
parse_data.write_json(t, saved_path)

## ALL

In [None]:
answers = []

for question in tqdm(questions[:]):
    
    query = make_prompt(question)

    history = []

    start_time = time.time()
    message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.1, top_p=1, response_format='text')
    end_time = time.time()

    execution_time = end_time - start_time
    usage = chat.get_token_usage(message, False)
    content = chat.get_content(message, False)

    res = question
    res['ner_result'] = {}
    res['ner_result']['stage_1'] = json.loads(content.strip('`json'))
    res['token_usage'] = {}
    res['token_usage']['ner-stage_1'] = usage
    res['time_usage'] = {}
    res['time_usage']['ner-stage_1'] = f"{execution_time:.2f}s"

    answers.append(res)

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-ner.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 101/101 [05:15<00:00,  3.12s/it]


# Deepseek-chat

In [87]:
model = 'deepseek_v3'

## Test

In [88]:
from openai import OpenAI

query = make_prompt(questions[0])

client = OpenAI(api_key= deepseek_api, base_url="https://api.deepseek.com")

start_time = time.time()
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "user", "content": query},
    ],
    stream=False,
    top_p=0.7,
    temperature=0.9
)
end_time = time.time()

response = json.loads(response.to_json())
content = response['choices'][0]['message']['content']

content = content.strip('`json')
usage = response['usage']
execution_time = end_time - start_time

In [None]:
t = questions[0]
t['ner_result'] = {}
t['ner_result']['stage_1'] = json.loads(content.strip('`json'))
t['token_usage'] = {}
t['token_usage']['ner-stage_1'] = usage
t['time_usage'] = {}
t['time_usage']['ner-stage_1'] = f"{execution_time:.2f}s"
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-ner-test.json')
parse_data.write_json(t, saved_path)

## ALL

In [None]:
answers = []

for question in tqdm(questions[:]):
    
    query = make_prompt(question)

    start_time = time.time()
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "user", "content": query},
        ],
        stream=False,
        top_p=0.7,
        temperature=0.9
    )
    end_time = time.time()

    response = json.loads(response.to_json())
    content = response['choices'][0]['message']['content']

    content = content.strip('`json')
    usage = response['usage']
    execution_time = end_time - start_time

    res = question
    res['ner_result'] = {}
    res['ner_result']['stage_1'] = json.loads(content.strip('`json'))
    res['token_usage'] = {}
    res['token_usage']['ner-stage_1'] = usage
    res['time_usage'] = {}
    res['time_usage']['ner-stage_1'] = f"{execution_time:.2f}s"

    answers.append(res)

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-ner.json')
parse_data.write_json(answers, saved_path)

100%|██████████| 101/101 [02:53<00:00,  1.72s/it]


# Compare Results

Compare the stage 1 results => find the differences => get the correct answer.

In [None]:
import os
import json

# Path to the folder containing model answer files
dir_path = os.path.join(cwd, 'answer_tmp')

# List of model names
models = ['deepseek_v3', 'glm_4_plus']

# Create a dictionary of file paths for each model's JSON file
model_files = {model: os.path.join(dir_path, f"stage_1-{model}-ner.json") for model in models}

# Dictionary to store the data of each model
model_data = {}

# Read the JSON data for each model
for model, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model] = json.load(f)

# Dictionary to store the data_source for each question id across different models
data_sources = {}

# Traverse through each model's data to extract the data_source for each question id
for model in models:
    for entry in model_data[model]:
        data_sources.setdefault(entry['tid'].replace(' ', ''), {}).update({model: entry['ner_result']['stage_1']['result']})
            

# Compare the data_source for each question id across models
for question_id, sources in data_sources.items():
    # Check if the data_source is consistent across models
    # Convert each model's data_source to a JSON string (to handle the dictionary comparison)
    serialized_sources = {model: json.dumps(ds, sort_keys=True, ensure_ascii=False) for model, ds in sources.items()}
    
    # If there are any differences in data_source, output the details
    if len(set(serialized_sources.values())) > 1:
        print(f"Question ID: {question_id}")
        
        # Calculate the maximum length of model names to align the output
        max_model_length = max(len(model) for model in models)
        
        # Print the data_source for each model, with aligned output
        for model in models:
            # Left-align model names with the calculated maximum length
            print(f"{model.ljust(max_model_length)}: {serialized_sources.get(model, 'No data available')}")
        
        print("\n")


Question ID: tttt----11
deepseek_v3: [{"上市公司名称": "工商银行"}, {"代码": "H股代码"}]
glm_4_plus : [{"上市公司名称": "工商银行"}]


Question ID: tttt----27
deepseek_v3: []
glm_4_plus : [{"代码": "RXW"}]


Question ID: tttt----43
deepseek_v3: [{"代码": "TOUR"}]
glm_4_plus : [{"上市公司名称": "TOUR"}]




经验证，glm-4-plus 的正确率目前是 100%。
但这里存在一些考虑：
1. 没有行业信息，行业也是需要进行定位的。
2. 可能 deepseek-v3 和 glm-4-plus 一起错了。