# Init

In [1]:
import sys
import os
import json
import time
import re

from tqdm import tqdm

cwd = os.getcwd()
os.chdir(cwd)
sys.path.append('tools')

import chat
import parse_data
import sql

In [26]:
api_key = "43d1f290abf2441ea6b52b6e1ef95e79"

import requests
import json

def fetch_data(data: dict):
    url = "https://comm.chatglm.cn/finglm2/api/query"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    response = requests.post(url, headers=headers, json=data)

    return response.json()

# Craft Prompt

In [9]:
system_prompt = ""

prompt_dir = os.path.join(cwd, 'prompt')
version = 'v2.0.0'
fname = f'sql_generator-stage_1-{version}.md'
prompt_fpath = os.path.join(prompt_dir, fname)

with open(prompt_fpath, 'r') as f:
    prompt_template = ''.join(f.readlines())

def make_prompt(data: dict) -> str:

    prompt = prompt_template

    # 
    table_finder_res = data['table_finder']['stage_1'][0]['data_source'][0]
    try:
        del table_finder_res['question']
    except:
        pass
    table = table_finder_res['table']
    table_finder_res = json.dumps(table_finder_res, ensure_ascii=False, indent=2)
    reg_p = re.compile('<Database and Table>')
    prompt = re.sub(reg_p, table_finder_res, prompt)

    # 
    table_fname = f'{table}-with_table_name.md'
    table_dir = os.path.join(cwd, 'data' + os.sep + 'table-column')
    table_fpath = os.path.join(table_dir, table_fname)
    with open(table_fpath,'r') as f:
        table_schema = ''.join(f.readlines())
    reg_p = re.compile('<Table-Column Schema>')
    prompt = re.sub(reg_p, table_schema, prompt)

    # 
    if data['ner']['stage_1']['result']:
        ner_res = [i for i in data['ner']['stage_1']['sql'].values() if i][0]
        ner_res = [i['result'] for i in ner_res if i['result']][0][0]
        ner_res = json.dumps(ner_res, ensure_ascii=False, indent=2)
        reg_p = re.compile('<Background Knowledge>')
        prompt = re.sub(reg_p, ner_res, prompt)
    else:
        reg_p = re.compile('<Background Knowledge>')
        prompt = re.sub(reg_p, '', prompt)
        reg_p = re.compile('## Background Knowledge')
        prompt = re.sub(reg_p, '', prompt)

    # replace query
    query = data['team'][0]['question']
    reg_p = re.compile('<Current Query>')
    prompt = re.sub(reg_p, query, prompt)

    return prompt

In [3]:
question_path = os.path.join(cwd, 'answer_tmp' + os.sep + 'stage_1-glm_4_plus-table_finder-v2.7.1.json')

questions = parse_data.read_json(question_path)

# GLM-4-Plus

In [4]:
model = 'glm_4_plus'

## Test

In [None]:
query = make_prompt(questions[1])

history = []

start_time = time.time()
message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
end_time = time.time()

execution_time = end_time - start_time
usage = chat.get_token_usage(message, True)
content = chat.get_content(message, True)
history = chat.build_history(history, message=message)

In [6]:
t = questions[1]
t['sql_generator'] = {}
t['sql_generator']['stage_1']= [json.loads(content.strip('`json'))]
t['token_usage'] = {}
t['token_usage']['sql_generator-stage_1'] = [usage]
t['time_usage'] = {}
t['time_usage']['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-test-{version}.json')
parse_data.write_json(t, saved_path)

## ALL

In [None]:
answers = []

for question in tqdm(questions[:]):
    try:
        # the first question
        query = make_prompt(question)

        history = []

        start_time = time.time()
        message = chat.create_message(query, history=history, system_prompt=system_prompt, temperature=0.7, top_p=0.9, response_format='text')
        end_time = time.time()

        execution_time = end_time - start_time
        usage = chat.get_token_usage(message, False)
        content = chat.get_content(message, False)

        res = question
        res['sql_generator'] = {}
        res['sql_generator']['stage_1']= [json.loads(content.strip('`json'))]
        res['token_usage']['sql_generator-stage_1'] = [usage]
        res['time_usage']['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]

        answers.append(res)
    except:
        print(question['tid'])

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-{version}.json')
parse_data.write_json(answers, saved_path)

# Deepseek-chat

In [12]:
model = 'deepseek_v3'

In [13]:
deepseek_api = 'sk-ba0f5eed3bea4fa6be16eb33b139c684'

## Test

In [14]:
from openai import OpenAI

query = make_prompt(questions[1])

client = OpenAI(api_key= deepseek_api, base_url="https://api.deepseek.com")

start_time = time.time()
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "user", "content": query},
    ],
    stream=False,
    top_p=0.7,
    temperature=0.9
)
end_time = time.time()

response = json.loads(response.to_json())
content = response['choices'][0]['message']['content']

content = content.strip('`json')
usage = response['usage']
execution_time = end_time - start_time

In [15]:
t = questions[1]
t['sql_generator'] = {}
t['sql_generator']['stage_1']= [json.loads(content.strip('`json'))]
t['token_usage'] = {}
t['token_usage']['sql_generator-stage_1'] = [usage]
t['time_usage'] = {}
t['time_usage']['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]
t = [t]

saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-test-{version}.json')
parse_data.write_json(t, saved_path)

# ALL

In [20]:
answers = []

for question in tqdm(questions[:]):

    max_retries = 5
    retries = 0

    res = question
    res['sql_generator'] = {}

    while retries < max_retries:
        try:
            query = make_prompt(question)

            start_time = time.time()
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "user", "content": query},
                ],
                stream=False,
                top_p=0.7,
                temperature=0.9
            )
            end_time = time.time()

            response = json.loads(response.to_json())
            content = response['choices'][0]['message']['content']

            content = content.strip('`json')
            usage = response['usage']
            execution_time = end_time - start_time

            res['sql_generator']['stage_1']= [json.loads(content.strip('`json'))]
            res['token_usage']['sql_generator-stage_1'] = [usage]
            res['time_usage']['sql_generator-stage_1'] = [f"{execution_time:.2f}s"]

            data = {
                "sql": json.loads(content.strip('`json'))['sql_query'],
                "limit": 100
            }
            sql_res = fetch_data(data)['data']
            
            if sql_res:
                answers.append(res)
                break

        except Exception as e:
            retries += 1
            print(f"Error processing question {question['tid']} (attempt {retries}/{max_retries}): {e}")
            if retries == max_retries:
                print(f"Failed to process question {question['tid']} after {max_retries} attempts.")
                answers.append(res)
        
saved_path = os.path.join(cwd, 'answer_tmp' + os.sep + f'stage_1-{model}-sql_generator-{version}.json')
parse_data.write_json(answers, saved_path)

 14%|█▍        | 14/101 [05:57<17:20, 11.96s/it] 

Error processing question tttt----9 (attempt 1/5): 'data'
Error processing question tttt----9 (attempt 2/5): 'data'
Error processing question tttt----9 (attempt 3/5): 'data'
Error processing question tttt----9 (attempt 4/5): 'data'


 15%|█▍        | 15/101 [07:01<39:43, 27.72s/it]

Error processing question tttt----9 (attempt 5/5): 'data'
Failed to process question tttt----9 after 5 attempts.


 16%|█▌        | 16/101 [07:09<30:43, 21.69s/it]

Error processing question tttt----13 (attempt 1/5): 'data'
Error processing question tttt----13 (attempt 2/5): 'data'
Error processing question tttt----13 (attempt 3/5): 'data'
Error processing question tttt----13 (attempt 4/5): 'data'


 17%|█▋        | 17/101 [08:03<44:14, 31.60s/it]

Error processing question tttt----13 (attempt 5/5): 'data'
Failed to process question tttt----13 after 5 attempts.


 19%|█▉        | 19/101 [08:22<27:44, 20.30s/it]

Error processing question tttt----8 (attempt 1/5): 'data'


 35%|███▍      | 35/101 [10:53<10:46,  9.79s/it]

Error processing question tttt----36 (attempt 1/5): 'data'
Error processing question tttt----36 (attempt 2/5): 'data'
Error processing question tttt----36 (attempt 3/5): 'data'


 36%|███▌      | 36/101 [11:48<25:08, 23.20s/it]

Error processing question tttt----38 (attempt 1/5): 'data'
Error processing question tttt----38 (attempt 2/5): 'data'
Error processing question tttt----38 (attempt 3/5): 'data'
Error processing question tttt----38 (attempt 4/5): 'data'


 37%|███▋      | 37/101 [12:24<29:03, 27.24s/it]

Error processing question tttt----38 (attempt 5/5): 'data'
Failed to process question tttt----38 after 5 attempts.


 50%|████▉     | 50/101 [14:12<07:13,  8.50s/it]

Error processing question tttt----49 (attempt 1/5): 'data'
Error processing question tttt----49 (attempt 2/5): 'data'
Error processing question tttt----49 (attempt 3/5): 'data'
Error processing question tttt----49 (attempt 4/5): 'data'


 50%|█████     | 51/101 [14:50<14:19, 17.19s/it]

Error processing question tttt----49 (attempt 5/5): 'data'
Failed to process question tttt----49 after 5 attempts.


 57%|█████▋    | 58/101 [15:51<06:57,  9.72s/it]

Error processing question tttt----58 (attempt 1/5): Invalid control character at: line 4 column 289 (char 337)
Error processing question tttt----58 (attempt 2/5): Invalid control character at: line 4 column 252 (char 300)
Error processing question tttt----58 (attempt 3/5): Invalid control character at: line 4 column 289 (char 337)
Error processing question tttt----58 (attempt 4/5): Invalid control character at: line 4 column 252 (char 300)


 58%|█████▊    | 59/101 [16:45<16:00, 22.87s/it]

Error processing question tttt----58 (attempt 5/5): Invalid control character at: line 4 column 289 (char 337)
Failed to process question tttt----58 after 5 attempts.


 61%|██████▏   | 62/101 [17:14<09:34, 14.72s/it]

Error processing question tttt----60 (attempt 1/5): Invalid control character at: line 4 column 255 (char 320)
Error processing question tttt----60 (attempt 2/5): Invalid control character at: line 4 column 218 (char 283)
Error processing question tttt----60 (attempt 3/5): Invalid control character at: line 4 column 236 (char 301)
Error processing question tttt----60 (attempt 4/5): Invalid control character at: line 4 column 205 (char 270)


 62%|██████▏   | 63/101 [18:39<22:43, 35.88s/it]

Error processing question tttt----60 (attempt 5/5): Invalid control character at: line 4 column 195 (char 260)
Failed to process question tttt----60 after 5 attempts.


 63%|██████▎   | 64/101 [19:11<21:18, 34.56s/it]

Error processing question tttt----72 (attempt 1/5): Invalid control character at: line 4 column 222 (char 269)
Error processing question tttt----72 (attempt 2/5): Invalid control character at: line 4 column 222 (char 269)
Error processing question tttt----72 (attempt 3/5): Invalid control character at: line 4 column 222 (char 269)
Error processing question tttt----72 (attempt 4/5): Invalid control character at: line 4 column 222 (char 269)


 64%|██████▍   | 65/101 [20:29<28:39, 47.76s/it]

Error processing question tttt----72 (attempt 5/5): Invalid control character at: line 4 column 222 (char 269)
Failed to process question tttt----72 after 5 attempts.
Error processing question tttt----59 (attempt 1/5): Invalid control character at: line 4 column 218 (char 271)
Error processing question tttt----59 (attempt 2/5): Invalid control character at: line 4 column 193 (char 246)
Error processing question tttt----59 (attempt 3/5): Invalid control character at: line 4 column 205 (char 258)
Error processing question tttt----59 (attempt 4/5): Invalid control character at: line 4 column 205 (char 258)


 65%|██████▌   | 66/101 [21:39<31:40, 54.31s/it]

Error processing question tttt----59 (attempt 5/5): Invalid control character at: line 4 column 193 (char 246)
Failed to process question tttt----59 after 5 attempts.


 66%|██████▋   | 67/101 [22:15<27:43, 48.92s/it]

Error processing question tttt----64 (attempt 1/5): 'data'


 69%|██████▉   | 70/101 [23:09<14:14, 27.55s/it]

Error processing question tttt----67 (attempt 1/5): Invalid control character at: line 4 column 235 (char 295)


 87%|████████▋ | 88/101 [28:11<02:37, 12.09s/it]

Error processing question tttt----93 (attempt 1/5): Invalid control character at: line 4 column 346 (char 441)
Error processing question tttt----93 (attempt 2/5): Invalid control character at: line 4 column 353 (char 448)
Error processing question tttt----93 (attempt 3/5): Invalid control character at: line 4 column 346 (char 441)
Error processing question tttt----93 (attempt 4/5): Invalid control character at: line 4 column 353 (char 448)


 88%|████████▊ | 89/101 [30:08<08:40, 43.40s/it]

Error processing question tttt----88 (attempt 1/5): Invalid control character at: line 4 column 394 (char 477)
Error processing question tttt----88 (attempt 2/5): Invalid control character at: line 4 column 394 (char 477)
Error processing question tttt----88 (attempt 3/5): Invalid control character at: line 4 column 394 (char 477)
Error processing question tttt----88 (attempt 4/5): Invalid control character at: line 4 column 394 (char 477)


 89%|████████▉ | 90/101 [31:43<10:49, 59.04s/it]

Error processing question tttt----88 (attempt 5/5): Invalid control character at: line 4 column 394 (char 477)
Failed to process question tttt----88 after 5 attempts.


 90%|█████████ | 91/101 [31:55<07:27, 44.75s/it]

Error processing question tttt----95 (attempt 1/5): Invalid control character at: line 4 column 256 (char 336)
Error processing question tttt----95 (attempt 2/5): Invalid control character at: line 4 column 257 (char 337)


 91%|█████████ | 92/101 [32:46<07:01, 46.82s/it]

Error processing question tttt----96 (attempt 1/5): Invalid control character at: line 4 column 258 (char 319)
Error processing question tttt----96 (attempt 2/5): Invalid control character at: line 4 column 270 (char 331)
Error processing question tttt----96 (attempt 3/5): Invalid control character at: line 4 column 273 (char 334)
Error processing question tttt----96 (attempt 4/5): Invalid control character at: line 4 column 270 (char 331)


 92%|█████████▏| 93/101 [34:00<07:18, 54.78s/it]

Error processing question tttt----96 (attempt 5/5): Invalid control character at: line 4 column 270 (char 331)
Failed to process question tttt----96 after 5 attempts.


 93%|█████████▎| 94/101 [34:10<04:50, 41.47s/it]

Error processing question tttt----90 (attempt 1/5): Invalid control character at: line 4 column 323 (char 422)


 95%|█████████▌| 96/101 [34:53<02:31, 30.31s/it]

Error processing question tttt----91 (attempt 1/5): 'data'


 97%|█████████▋| 98/101 [35:40<01:17, 25.90s/it]

Error processing question tttt----98 (attempt 1/5): 'data'
Error processing question tttt----98 (attempt 2/5): 'data'
Error processing question tttt----98 (attempt 3/5): 'data'
Error processing question tttt----98 (attempt 4/5): 'data'


100%|██████████| 101/101 [37:02<00:00, 22.00s/it]


# Obtain SQL Results

## GLM-4-Plus

In [17]:
model = 'glm_4_plus'

In [None]:
fname = f'stage_1-{model}-sql_generator-{version}.json'
fpath = os.path.join(cwd, 'answer_tmp' + os.sep + fname)
data = parse_data.read_json(fpath)

for i in tqdm(data[:]):
    tmp = sql.process_sql_generator_res(i['sql_generator']['stage_1'][0])

fname = f'stage_1-{model}-sql_generator-{version}-sql.json'
fpath = os.path.join(cwd, 'answer_tmp' + os.sep + fname)
data = parse_data.write_json(data, fpath)

## Deepseek-v3

In [21]:
model = 'deepseek_v3'

In [28]:
fname = f'stage_1-{model}-sql_generator-{version}.json'
fpath = os.path.join(cwd, 'answer_tmp' + os.sep + fname)
data = parse_data.read_json(fpath)

for i in tqdm(data[:]):
    try:
        tmp = sql.process_sql_generator_res(i['sql_generator']['stage_1'][0])
    except Exception as e:
        print(e)

fname = f'stage_1-{model}-sql_generator-{version}-sql.json'
fpath = os.path.join(cwd, 'answer_tmp' + os.sep + fname)
data = parse_data.write_json(data, fpath)

  0%|          | 0/101 [00:00<?, ?it/s]

 16%|█▌        | 16/101 [00:03<00:19,  4.37it/s]

Request failed: 500 Server Error: Internal Server Error for url: https://comm.chatglm.cn/finglm2/api/query


 17%|█▋        | 17/101 [00:03<00:19,  4.38it/s]

Request failed: 500 Server Error: Internal Server Error for url: https://comm.chatglm.cn/finglm2/api/query


 37%|███▋      | 37/101 [00:09<00:22,  2.89it/s]

Request failed: 500 Server Error: Internal Server Error for url: https://comm.chatglm.cn/finglm2/api/query


 50%|█████     | 51/101 [00:12<00:11,  4.22it/s]

Request failed: 500 Server Error: Internal Server Error for url: https://comm.chatglm.cn/finglm2/api/query


 57%|█████▋    | 58/101 [00:15<00:15,  2.81it/s]

'stage_1'


 61%|██████▏   | 62/101 [00:16<00:11,  3.53it/s]

'stage_1'


 64%|██████▍   | 65/101 [00:16<00:10,  3.42it/s]

'stage_1'


 91%|█████████ | 92/101 [00:24<00:02,  4.10it/s]

'stage_1'


100%|██████████| 101/101 [00:27<00:00,  3.63it/s]
