# Init

In [1]:
import sys
import os
import json
import time

from tqdm import tqdm

# Remember the notebook's working directory; later cells build paths from it.
cwd = os.getcwd()
# Make the local 'tools' folder importable (presumably where the chat,
# parse_data and sql modules imported below live). Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if 'tools' not in sys.path:
    sys.path.append('tools')

import chat
import parse_data
import sql

In [2]:
# SECURITY: API keys should not be committed inside a notebook. Prefer the
# SQL_API_KEY environment variable; the literal fallback only keeps existing
# runs working and should be rotated and removed.
sql_api = os.environ.get("SQL_API_KEY", "f8bc1dd6592f491895617e1dfd1dc89b")

# Table-column Table 的处理

## 拆分数据字典中的表字段关系

- Motivation: 表字段关系表太大，无法直接作为 context 喂给 llm，所以只能将需要的表字段关系提取出来后再喂给 llm。
- Data Description: 为了降低未来的工作成本，同一个表格会生成两个文件，一个文件有 table_name，另一个文件没有 table_name。文件名分别是 `{X}-with_table_name.md` 和 `{X}-without_table_name.md`。

In [None]:
import os

import pandas

# All paths are resolved from the notebook's current working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')

# Input workbook and the folder that receives the per-table Markdown files.
src_fpath = os.path.join(data_dir, '数据字典.xlsx')
target_dir = os.path.join(data_dir, "table-column")

In [2]:
import pandas as pd

# Lift every pandas display limit so frames are shown in full:
# all rows, all columns, unlimited total width and untruncated cell content.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

# Open the workbook once and close it deterministically. The sheet is parsed
# from the already-open handle instead of reading the file from disk twice.
with pd.ExcelFile(src_fpath) as xls:
    # Print all sheet names to check for Chinese names
    print(xls.sheet_names)

    # Read the specific sheet (its name is in Chinese)
    df = pd.read_excel(xls, sheet_name='表字段信息')

# Group the rows by 'table_name' so each table can be exported on its own
grouped = df.groupby('table_name')

# Function to convert DataFrame to Markdown table format
def df_to_markdown_compact(dataframe, include_table_name=True):
    """
    Render a DataFrame as a GitHub-style Markdown table.

    Missing values are shown as empty cells and the index is not emitted.
    The input frame itself is left untouched.

    :param dataframe: The DataFrame to render
    :param include_table_name: When False, the 'table_name' column (if present) is left out
    :return: The Markdown table as a string without leading/trailing whitespace
    """
    # Blank out missing values so they do not show up as "nan"
    cleaned = dataframe.fillna('')

    # Leave out 'table_name' only when asked to and when the column exists
    drop_name_column = (not include_table_name) and ('table_name' in cleaned.columns)
    if drop_name_column:
        cleaned = cleaned.drop(columns=['table_name'])

    # Render without the index column
    markdown = cleaned.to_markdown(index=False, tablefmt="github")
    return markdown.strip()

# Export every table group as two Markdown files:
# one keeping the 'table_name' column and one without it.
os.makedirs(target_dir, exist_ok=True)  # the output folder may not exist yet

for table_name, group in grouped:
    variants = {
        "with_table_name": df_to_markdown_compact(group, include_table_name=True),
        "without_table_name": df_to_markdown_compact(group, include_table_name=False),
    }

    # File names follow the pattern "<table_name>-<variant>.md"
    for suffix, markdown in variants.items():
        out_path = os.path.join(target_dir, f"{table_name}-{suffix}.md")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(markdown)

print('Done.')

['库表关系', '表字段信息']
Done.


## markdown 的格式预处理

1. 移除两个及以上的空格。
2. 改写第二行中 markdown table 的格式。

In [None]:
import os

import pandas

# Folder holding the generated per-table Markdown files, resolved from the
# notebook's current working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
src_dir = os.path.join(data_dir, "table-column")

# File inside src_dir that the preprocessing step must leave untouched.
exclusive_fname = "001-table.md"

In [None]:
def _normalize_table_lines(lines):
    """
    Clean up the lines of one generated Markdown table.

    Every pair of consecutive spaces is removed, the sequence '| ||' is
    replaced with ' | | |', and the second line (the header separator) is
    rewritten in compact form with one '---' per header column.

    :param lines: Lines of the Markdown file, as returned by ``readlines()``
    :return: New list with the cleaned lines; the input list is not modified
    """
    cleaned = []
    for line in lines:
        line = line.replace('  ', '')
        # make the format more readable
        line = line.replace('| ||', ' | | |')
        cleaned.append(line)

    # The separator can only be rewritten when a header and a second line exist.
    if len(cleaned) >= 2:
        # A row with N cells contains N + 1 pipes. Deriving the width from the
        # header keeps the separator valid for both file variants (with and
        # without the 'table_name' column) instead of assuming five columns.
        n_columns = cleaned[0].count('|') - 1
        if n_columns > 0:
            cleaned[1] = "|" + "---|" * n_columns + "\n"

    return cleaned


# Skip the original file and anything that is not a Markdown file
# (e.g. checkpoint folders); a missing original file is not an error.
files = [
    fname for fname in sorted(os.listdir(src_dir))
    if fname != exclusive_fname and fname.endswith('.md')
]

for fname in files:
    fpath = os.path.join(src_dir, fname)

    # The files were written as UTF-8, so read and write them as UTF-8
    # instead of relying on the platform default encoding.
    with open(fpath, 'r', encoding='utf-8') as f:
        content = f.readlines()

    new_content = _normalize_table_lines(content)

    with open(fpath, 'w', encoding='utf-8') as f:
        f.writelines(new_content)

## 重写备注



# Database-Table 的处理

## 给每一个表增加 data_type 的属性

Q：为什么需要添加 data type？
A：将 query 和 table-field schema 进行匹配的时候，避免 llm 因为缺乏必要的信息而将 query 的主体与不对应的 field 进行匹配。比如「小米的法人是谁？」，在 `LC_StockArchives` 中

## 提取 database-table 的 pair

- 保存为 JSON

In [7]:
import os
import json
from tqdm import tqdm


# Locate the database-table folder and its source JSON file.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
target_dir = os.path.join(data_dir, "database-table")
src_fpath = os.path.join(target_dir, 'database.json')

# Read explicitly as UTF-8: the JSON files in this notebook are written with
# encoding='utf-8' and ensure_ascii=False, so the platform default encoding
# must not be relied on.
with open(src_fpath, 'r', encoding='utf-8') as f:
    content = json.load(f)

In [9]:
# Attach the result of sql.get_description to every entry, in place.
for entry in tqdm(content):
    entry['sql_description'] = sql.get_description(sql_api, entry)

# Persist the enriched entries next to the source JSON file.
saved_fpath = os.path.join(target_dir, 'database-with_description.json')
with open(saved_fpath, 'w', encoding='utf-8') as f:
    json.dump(content, f, ensure_ascii=False, indent=2)

100%|██████████| 77/77 [00:16<00:00,  4.55it/s]


## 对于每一个 table 都获取例子

In [10]:
import os
import json
from tqdm import tqdm

# Locate the JSON file that already carries the 'sql_description' field.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
target_dir = os.path.join(data_dir, "database-table")
src_fpath = os.path.join(target_dir, 'database-with_description.json')

# This file is written with encoding='utf-8' and ensure_ascii=False, so it
# must be read back as UTF-8 rather than with the platform default encoding.
with open(src_fpath, 'r', encoding='utf-8') as f:
    content = json.load(f)

In [11]:
# Attach the result of sql.get_instance to every entry, in place.
# The 3 is passed straight through (presumably the number of example rows
# to fetch per table; confirm against sql.get_instance).
for entry in tqdm(content):
    try:
        entry['sql_instances'] = sql.get_instance(sql_api, entry, 3)
    except Exception as exc:
        # Best effort: report the failing table together with the error and
        # keep going. Catching Exception instead of using a bare except lets
        # KeyboardInterrupt (kernel interrupt) stop the loop.
        print(entry['database_name_en'], entry['table_name_en'], repr(exc))

100%|██████████| 77/77 [00:18<00:00,  4.15it/s]


In [12]:
# Persist the entries (now carrying 'sql_instances') as pretty-printed JSON,
# keeping non-ASCII characters readable.
saved_fpath = os.path.join(target_dir, 'database-with_instances.json')

serialized = json.dumps(content, ensure_ascii=False, indent=2)
with open(saved_fpath, 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)