# Init

In [1]:
import copy
import json
import os
import re
import sys
import time

from tqdm import tqdm

# Project root: the prompt/, data/ and answer_tmp/ paths used below are all
# resolved against this directory.
cwd = os.getcwd()

# Make the modules under tools/ importable (presumably where chat, parse_data
# and sql live - they are imported right after this cell).
# The path is absolute so imports keep working if the working directory changes
# later, and the membership check stops re-runs of this cell from stacking
# duplicate sys.path entries. The former os.chdir(cwd) was a no-op and is gone.
tools_dir = os.path.join(cwd, 'tools')
if tools_dir not in sys.path:
    sys.path.append(tools_dir)

import chat
import parse_data
import sql

# Craft Prompt

In [15]:
# Empty system prompt: it is passed as-is to chat.create_message in the GLM cells.
system_prompt = ""

# Versioned prompt template of the stage-1 SQL generator. `version` is also
# embedded in the names of the answer files written further down.
prompt_dir = os.path.join(cwd, 'prompt')
version = 'v1.1.0'
fname = f'sql_generator-stage_1-{version}.md'
prompt_fpath = os.path.join(prompt_dir, fname)

# Decode explicitly as UTF-8: without it open() falls back to the platform
# locale, which garbles or rejects non-ASCII prompt text on non-UTF-8 systems.
with open(prompt_fpath, 'r', encoding='utf-8') as f:
    prompt_template = f.read()

def make_prompt(data: dict) -> str:
    """Fill the stage-1 SQL-generator prompt template for one question record.

    The following placeholders of the module-level ``prompt_template`` are
    substituted:

    * ``<Database and Table>``   - the first table-finder data source, dumped as
      JSON without its ``question`` key.
    * ``<Table-Column Schema>``  - the content of
      ``<cwd>/data/table-column/<table>-with_table_name.md``.
    * ``<Background Knowledge>`` - the first non-empty NER SQL result, dumped as
      JSON. When ``data['ner']['stage_1']['result']`` is empty, both the
      placeholder and the ``## Background Knowledge`` heading are removed.
    * ``<Current Query>``        - ``data['team'][0]['question']``.

    Substitution is done with ``str.replace``. The previous ``re.sub`` calls
    treated the inserted text as a regex replacement template, so a backslash in
    the JSON, the schema or the question was either rewritten (``\\n`` became a
    real newline, ``\\\\`` collapsed to one backslash) or raised ``re.error``
    (``\\d``, ``\\1`` ...).

    Args:
        data: Question record. Only ``data['table_finder']``, ``data['ner']``
            and ``data['team']`` are read; the record is not modified.

    Returns:
        The prompt text with every placeholder filled in.

    Raises:
        KeyError, IndexError: If an expected field is missing or empty.
        OSError: If the table schema file cannot be read.
    """
    prompt = prompt_template

    # Database / table selected by the table finder. A filtered copy is built
    # instead of deleting the key in place, so the caller's record keeps its
    # 'question' field and repeated calls on the same record behave identically.
    data_source = data['table_finder']['stage_1'][0]['data_source'][0]
    table = data_source['table']
    table_finder_res = {key: value for key, value in data_source.items() if key != 'question'}
    table_finder_json = json.dumps(table_finder_res, ensure_ascii=False, indent=2)
    prompt = prompt.replace('<Database and Table>', table_finder_json)

    # Column schema of the selected table, read as UTF-8 text.
    table_fname = f'{table}-with_table_name.md'
    table_dir = os.path.join(cwd, 'data', 'table-column')
    table_fpath = os.path.join(table_dir, table_fname)
    with open(table_fpath, 'r', encoding='utf-8') as f:
        table_schema = f.read()
    prompt = prompt.replace('<Table-Column Schema>', table_schema)

    # Background knowledge from the NER stage (optional).
    ner_stage = data['ner']['stage_1']
    if ner_stage['result']:
        # First SQL entry that has any hits, then the first row of its first
        # non-empty result.
        first_hits = [hits for hits in ner_stage['sql'].values() if hits][0]
        ner_res = [hit['result'] for hit in first_hits if hit['result']][0][0]
        ner_json = json.dumps(ner_res, ensure_ascii=False, indent=2)
        prompt = prompt.replace('<Background Knowledge>', ner_json)
    else:
        # Nothing to show: drop the placeholder together with its heading.
        prompt = prompt.replace('<Background Knowledge>', '')
        prompt = prompt.replace('## Background Knowledge', '')

    # The user's question.
    query = data['team'][0]['question']
    prompt = prompt.replace('<Current Query>', query)

    return prompt

In [4]:
# Input of this stage: the answer file written by the stage-1 table finder
# (GLM-4-Plus run, table finder v2.6.2).
question_path = os.path.join(cwd, 'answer_tmp', 'stage_1-glm_4_plus-table_finder-v2.6.2.json')
questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [5]:
# Model tag; it is embedded in the names of the answer files saved by the cells below.
model = 'glm_4_plus'

## Test

In [6]:
# Smoke test: build the prompt for a single question and send it to the model once.
query = make_prompt(questions[1])
history = []

start_time = time.time()
message = chat.create_message(
    query,
    history=history,
    system_prompt=system_prompt,
    temperature=0.7,
    top_p=0.9,
    response_format='text',
)
end_time = time.time()
execution_time = end_time - start_time

# NOTE(review): the second argument (True here, False in the ALL loop) looks like
# an "echo the value" switch, judging by the output printed under this cell -
# confirm against tools/chat.py.
usage = chat.get_token_usage(message, True)
content = chat.get_content(message, True)
history = chat.build_history(history, message=message)

{'prompt_tokens': 3911, 'completion_tokens': 174, 'total_tokens': 4085}
```json
{
    "query": "今天是2021年12月24日，创近半年新高的股票有几只？",
    "sql_cot_reasoning": "为了回答这个问题，我们需要查询在2021年12月24日这一天创近半年新高的股票数量。根据表结构，创近半年新高的字段是`IfHighestHPriceRMSix`，其值为1表示创近半年新高。我们需要筛选出在2021年12月24日这一天的记录，并且`IfHighestHPriceRMSix`值为1的记录。使用`COUNT`函数可以统计满足条件的记录数量。",
    "sql_query": "SELECT COUNT(*) FROM CS_StockPatterns WHERE TradingDay LIKE '2021-12-24%' AND IfHighestHPriceRMSix = 1"
}
```


In [8]:
# Work on a deep copy: `questions[1]` is processed again by the ALL loop below,
# so the smoke test must not alter the shared record.
t = copy.deepcopy(questions[1])

# str.strip('`json') removes any leading/trailing run of the characters
# ` j s o n, which peels the ```json fence off the reply.
t['sql_generator'] = {'stage_1': [json.loads(content.strip('`json'))]}

# setdefault keeps the usage figures recorded by earlier stages (the ALL loop
# extends these dicts too) instead of replacing them with empty dicts.
t.setdefault('token_usage', {})['sql_generator-stage_1'] = [usage]
t.setdefault('time_usage', {})['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [9]:
# Run the SQL generator over every question and collect the enriched records.
answers = []
failed_tids = []  # tids whose prompt building, request or reply parsing failed

for question in tqdm(questions[:]):
    try:
        query = make_prompt(question)

        history = []

        start_time = time.time()
        message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
        end_time = time.time()

        execution_time = end_time - start_time
        usage = chat.get_token_usage(message, False)
        content = chat.get_content(message, False)

        try:
            # Usual case: the reply is a bare or ```json-fenced JSON object.
            parsed = json.loads(content.strip('`json'))
        except json.JSONDecodeError:
            # The model wrapped the JSON in extra text: retry on the outermost
            # {...} span and re-raise the original error if there is none.
            first, last = content.find('{'), content.rfind('}')
            if first == -1 or last < first:
                raise
            parsed = json.loads(content[first:last + 1])

        # Deep copy so the shared `questions` records are not modified; the
        # DeepSeek section iterates over the very same objects.
        res = copy.deepcopy(question)
        res['sql_generator'] = {'stage_1': [parsed]}
        # setdefault: extend usage recorded by earlier stages, or start a fresh
        # dict when the record has none.
        res.setdefault('token_usage', {})['sql_generator-stage_1'] = [usage]
        res.setdefault('time_usage', {})['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]

        answers.append(res)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still stops the run,
        # and report why the question failed instead of only its id.
        failed_tids.append(question['tid'])
        print(f"{question['tid']}: {type(exc).__name__}: {exc}")

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-{version}.json')
parse_data.write_json(answers, saved_path)

  1%|          | 1/101 [00:05<09:58,  5.99s/it]

tttt----2


 13%|█▎        | 13/101 [01:37<13:36,  9.28s/it]

tttt----13


 62%|██████▏   | 63/101 [07:06<05:29,  8.67s/it]

tttt----63


 63%|██████▎   | 64/101 [07:18<06:01,  9.78s/it]

tttt----64


 72%|███████▏  | 73/101 [08:09<02:47,  5.96s/it]

tttt----74


 83%|████████▎ | 84/101 [09:49<02:42,  9.53s/it]

tttt----84


 96%|█████████▌| 97/101 [11:48<00:34,  8.73s/it]

tttt----97


100%|██████████| 101/101 [12:18<00:00,  7.31s/it]


# Deepseek-chat

In [10]:
# Model tag; it is embedded in the names of the answer files saved by the cells below.
model = 'deepseek_v3'

In [11]:
# Never keep credentials in the notebook: read the DeepSeek key from the
# environment. The key that used to be hardcoded here has been exposed and must
# be revoked. Falls back to an empty string so the cell itself does not fail;
# requests will then be rejected by the API.
deepseek_api = os.environ.get('DEEPSEEK_API_KEY', '')
if not deepseek_api:
    print('WARNING: DEEPSEEK_API_KEY is not set; the DeepSeek requests below will fail to authenticate.')

## Test

In [18]:
from openai import OpenAI

# Smoke test: build the prompt for a single question and send it to DeepSeek once.
query = make_prompt(questions[1])

# DeepSeek is reached through the OpenAI client pointed at its base URL.
# `client` is reused by the ALL loop below.
client = OpenAI(api_key=deepseek_api, base_url="https://api.deepseek.com")

start_time = time.time()
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "user", "content": query},
    ],
    stream=False,
    # Same sampling as the GLM-4-Plus run (temperature=0.7, top_p=0.9). The two
    # values were swapped here, so the models were not compared on equal terms.
    temperature=0.7,
    top_p=0.9,
)
end_time = time.time()

# Convert the response object to a plain dict so `usage` is JSON-serialisable.
response = json.loads(response.to_json())
content = response['choices'][0]['message']['content']

# Peel the ```json fence: strip() drops any leading/trailing run of ` j s o n.
content = content.strip('`json')
usage = response['usage']
execution_time = end_time - start_time

In [19]:
# Work on a deep copy: `questions[1]` is processed again by the ALL loop below,
# so the smoke test must not alter the shared record.
t = copy.deepcopy(questions[1])

# `content` already had its fence stripped in the previous cell; stripping again
# is harmless and keeps this cell usable on a raw reply as well.
t['sql_generator'] = {'stage_1': [json.loads(content.strip('`json'))]}

# setdefault keeps the usage figures recorded by earlier stages (the ALL loop
# extends these dicts too) instead of replacing them with empty dicts.
t.setdefault('token_usage', {})['sql_generator-stage_1'] = [usage]
t.setdefault('time_usage', {})['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [21]:
# Run the SQL generator over every question with DeepSeek and collect the
# enriched records. Relies on `client` created in the DeepSeek test cell.
answers = []
failed_tids = []  # tids whose prompt building, request or reply parsing failed

for question in tqdm(questions[:]):
    try:
        query = make_prompt(question)

        start_time = time.time()
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "user", "content": query},
            ],
            stream=False,
            # Same sampling as the GLM-4-Plus run (temperature=0.7, top_p=0.9).
            # The two values were swapped here, so the models were not compared
            # on equal terms.
            temperature=0.7,
            top_p=0.9,
        )
        end_time = time.time()

        # Plain dict so that `usage` is JSON-serialisable.
        response = json.loads(response.to_json())
        content = response['choices'][0]['message']['content']

        usage = response['usage']
        execution_time = end_time - start_time

        try:
            # Usual case: the reply is a bare or ```json-fenced JSON object.
            parsed = json.loads(content.strip('`json'))
        except json.JSONDecodeError:
            # The model wrapped the JSON in extra text: retry on the outermost
            # {...} span and re-raise the original error if there is none.
            first, last = content.find('{'), content.rfind('}')
            if first == -1 or last < first:
                raise
            parsed = json.loads(content[first:last + 1])

        # Deep copy so the shared `questions` records are not modified by this run.
        res = copy.deepcopy(question)
        res['sql_generator'] = {'stage_1': [parsed]}
        # setdefault: extend usage recorded by earlier stages, or start a fresh
        # dict when the record has none.
        res.setdefault('token_usage', {})['sql_generator-stage_1'] = [usage]
        res.setdefault('time_usage', {})['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]

        answers.append(res)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still stops the run,
        # and report why the question failed instead of only its id.
        failed_tids.append(question['tid'])
        print(f"{question['tid']}: {type(exc).__name__}: {exc}")

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-{version}.json')
parse_data.write_json(answers, saved_path)

 44%|████▎     | 44/101 [03:42<04:42,  4.95s/it]

tttt----44


 57%|█████▋    | 58/101 [04:56<04:08,  5.78s/it]

tttt----58


 58%|█████▊    | 59/101 [05:06<04:59,  7.12s/it]

tttt----59


 59%|█████▉    | 60/101 [05:19<05:58,  8.75s/it]

tttt----60


 72%|███████▏  | 73/101 [06:39<03:03,  6.57s/it]

tttt----74


 91%|█████████ | 92/101 [08:40<01:04,  7.19s/it]

tttt----92


100%|██████████| 101/101 [09:45<00:00,  5.80s/it]


# Compare Results

Compare the stage 1 results => find the differences => get the correct answer.

In [None]:
import os
import json
import copy

# Folder containing the per-model answer files written by the "ALL" cells.
dir_path = os.path.join(cwd, 'answer_tmp')

# Models to compare.
models = ['deepseek_v3', 'glm_4_plus']

# Answer file of each model.
model_files = {name: os.path.join(dir_path, f"stage_1-{name}-sql_generator-{version}.json") for name in models}

# Parsed answer list of each model.
model_data = {}
for model_name, file_path in model_files.items():
    with open(file_path, 'r', encoding='utf-8') as f:
        model_data[model_name] = json.load(f)

# Index entries by question id. Questions that failed are missing from the
# answer files, so an entry's position in the list does NOT follow from its tid;
# looking entries up by id is the only reliable way to pair them.
entries_by_tid = {name: {} for name in models}

# Generated SQL per question id and model: {tid: {model: sql_query}}.
data_sources = {}

for model_name in models:
    for entry in model_data[model_name]:
        tid = entry['tid'].replace(' ', '')
        entries_by_tid[model_name][tid] = entry

        # `sql_query` is a plain SQL string (see the sample reply in the test
        # cell), so there is no nested 'question' key to delete. Whitespace is
        # collapsed so formatting-only differences are not reported.
        sql_query = entry['sql_generator']['stage_1'][0].get('sql_query')
        if isinstance(sql_query, str):
            sql_query = ' '.join(sql_query.split())
        data_sources.setdefault(tid, {})[model_name] = sql_query

# Width used to align the model names in the report.
max_model_length = max(len(name) for name in models)

# Report every question whose SQL differs between models, including questions
# that only some of the models answered.
for question_id, sources in data_sources.items():
    # Serialise so that non-string values compare reliably; a model without an
    # answer contributes `null` and therefore counts as a difference.
    serialized_sources = {name: json.dumps(sources.get(name), sort_keys=True, ensure_ascii=False) for name in models}

    if len(set(serialized_sources.values())) > 1:
        print(f"Question ID: {question_id}")

        for model_name in models:
            print(f"{model_name.ljust(max_model_length)}:")
            print('```')
            entry = entries_by_tid[model_name].get(question_id)
            if entry is None:
                print('<no answer recorded for this model>')
            else:
                print(json.dumps(entry['sql_generator']['stage_1'], indent=2, ensure_ascii=False))
            print('```')

        print("")