In [4]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import ast

# Sentence-embedding model used for every encode() call in this notebook.
emb_model = SentenceTransformer('moka-ai/m3e-base')

# Data dictionary CSV; its column_name / column_description / 注释 fields are read below.
data_dict_df = pd.read_csv('data/data_dictionary.csv')

# Table-level metadata. Opened in a `with` block so the file handle is closed
# deterministically, and decoded as UTF-8 explicitly (like the other JSON
# reads/writes in this notebook) instead of relying on the platform default.
with open('data/database-table/database-with_description.json', 'r', encoding='utf-8') as f:
    db_description_json = json.load(f)

# Map each table's English name to its description.
db_description_dict = {
    db['table_name_en']: db['table_description'] for db in db_description_json
}

from tqdm import tqdm

# Embed every table description once; entries are keyed by the table's English name.
table_emb_dict = {}

for table, description in tqdm(db_description_dict.items(), total=len(db_description_dict)):
    table_emb_dict[table] = {
        'description': description,
        'description_emb': emb_model.encode(description),
    }

# Per-column texts and their embeddings, keyed by column name.
# NOTE(review): rows that share the same column_name overwrite one another
# here, so only the last such row survives — confirm this is intended.
columns_description_dict = {}

for _idx, row in tqdm(data_dict_df.iterrows(), total=data_dict_df.shape[0]):
    key = row['column_name']
    # Anything that is not a string (e.g. NaN for a blank cell) is treated as ''.
    description = row['column_description'] if isinstance(row['column_description'], str) else ''
    annotation = row['注释'] if isinstance(row['注释'], str) else ''

    columns_description_dict[key] = {
        'column_name': key,
        'column_description': description,
        'annotation': annotation,
        'column_description_emb': emb_model.encode(description) if description else None,
        'annotation_emb': emb_model.encode(annotation) if annotation else None,
        # With no text at all there is nothing meaningful to embed: store None,
        # as the two fields above do, rather than the embedding of a lone space.
        'comb_emb': emb_model.encode(description + ' ' + annotation) if (description or annotation) else None,
    }


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 77/77 [00:04<00:00, 17.22it/s]
100%|██████████| 3489/3489 [02:12<00:00, 26.32it/s]


保存embedding为文件

In [5]:
def convert_embeddings(obj):
    """Recursively replace numpy values in ``obj`` with JSON-serializable Python values.

    Args:
        obj: Arbitrarily nested structure of dicts, lists, tuples, numpy
            arrays, numpy scalars and plain Python values.

    Returns:
        A new structure in which every ``np.ndarray`` has become a (nested)
        list, every numpy scalar a plain Python scalar, and every tuple a list.
        Dict keys and all other values are returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Numpy scalars such as np.float32 are rejected by json.dump; unwrap them.
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_embeddings(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are walked too, so arrays nested inside them are not missed.
        return [convert_embeddings(item) for item in obj]
    return obj

# Turn the embedding dictionaries into plain lists/dicts that json can serialize.
table_emb_dict_serializable = convert_embeddings(table_emb_dict)
columns_description_dict_serializable = convert_embeddings(columns_description_dict)

# Write each serializable dictionary to its own UTF-8 JSON file.
for out_path, payload in (
    ('data/database-table/table_embeddings.json', table_emb_dict_serializable),
    ('data/database-table/columns_description.json', columns_description_dict_serializable),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

    

读取保存的文件

In [6]:
def parse_embedding(embedding_str):
    """Convert a stored embedding into a numpy array.

    Args:
        embedding_str: Either a string holding a list literal such as
            ``"[0.1, 0.2, ...]"``, or an already-decoded list / tuple / array
            of numbers (``json.load`` yields lists directly for JSON arrays).

    Returns:
        np.ndarray built from the parsed values.

    Raises:
        ValueError, SyntaxError: If a string argument is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    if isinstance(embedding_str, (list, tuple, np.ndarray)):
        # Already decoded: there is no string to parse.
        return np.array(embedding_str)
    # literal_eval only evaluates literals, never arbitrary expressions.
    return np.array(ast.literal_eval(embedding_str))

# Read a JSON file and rebuild the dictionary, restoring embeddings as arrays.
def load_json_to_dict(file_path):
    """Load a JSON file and turn every numeric list in it back into a numpy array.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, with each non-empty list made up solely of
        ints/floats replaced by an ``np.ndarray``. Dicts and all other lists
        are walked recursively; strings, ``None``, booleans, empty lists and
        lists containing booleans are returned as decoded.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def _is_number(value):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Recursively restore embedding data.
    def _restore(obj):
        if isinstance(obj, list):
            # all() is vacuously True for [], hence the explicit emptiness check.
            if obj and all(_is_number(x) for x in obj):
                return np.array(obj)
            return [_restore(item) for item in obj]
        if isinstance(obj, dict):
            return {key: _restore(value) for key, value in obj.items()}
        return obj

    return _restore(data)

# Reload both embedding dictionaries from the JSON files on disk.
table_emb_dict, columns_description_dict = (
    load_json_to_dict(json_path)
    for json_path in (
        'data/database-table/table_embeddings.json',
        'data/database-table/columns_description.json',
    )
)

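以下计算综合相似度（各项均为查询词与对应文本嵌入之间的余弦相似度）：
score = 0.5 * table_description_cos + 0.25 * (column_description_cos + column_annotation_cos)
若该列没有注释（annotation 为空），则改为：score = 0.5 * table_description_cos + 0.5 * column_description_cos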

In [7]:
import numpy as np
from sentence_transformers import util

def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors.

    Args:
        vec_a: First vector (array-like of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there).
    """
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        # Dividing would give NaN, which compares false with everything and
        # therefore makes any ordering by score unreliable.
        return 0.0
    return np.dot(vec_a, vec_b) / denom

def find_top_similar_columns(word, table_emb_dict, columns_description_dict, emb_model, top_k=20):
    """
    Rank columns by a weighted cosine similarity to ``word`` and keep the best ``top_k``.

    Score per entry:
    - with a non-empty annotation: 0.5 * table + 0.25 * (description + annotation)
    - otherwise:                   0.5 * table + 0.5 * description
    Each term is the cosine similarity between the encoded ``word`` and the
    corresponding stored embedding, or 0 when that embedding is missing.

    Args:
    - word (str): The query text.
    - table_emb_dict (dict): Maps a key to a dict holding 'description_emb'.
    - columns_description_dict (dict): Maps a key to a dict holding 'column_name'
      and, optionally, 'column_description', 'annotation',
      'column_description_emb' and 'annotation_emb'.
    - emb_model (SentenceTransformer): The embedding model; ``encode(word)``
      provides the query vector.
    - top_k (int): Maximum number of results to return.

    Returns:
    - List of tuples (column_name, similarity_score, column_description),
      sorted by descending score.
    """
    query_emb = emb_model.encode(word)

    def _similarity_or_zero(stored_emb):
        # A missing embedding contributes nothing to the score.
        if stored_emb is None:
            return 0
        return cosine_similarity(query_emb, stored_emb)

    scored = []
    for entry_key, info in columns_description_dict.items():
        column_name = info['column_name']
        column_description = info.get('column_description', '')

        # NOTE(review): the same key indexes both dictionaries. If
        # columns_description_dict is keyed by column name while table_emb_dict
        # is keyed by table name, this lookup misses and the table term is
        # always 0 — confirm against how the dictionaries are built.
        table_sim = _similarity_or_zero(table_emb_dict.get(entry_key, {}).get('description_emb'))
        desc_sim = _similarity_or_zero(info.get('column_description_emb'))
        annot_sim = _similarity_or_zero(info.get('annotation_emb'))

        if info.get('annotation', ''):
            score = 0.5 * table_sim + 0.25 * (desc_sim + annot_sim)
        else:
            # No annotation text: its weight is given to the description.
            score = 0.5 * table_sim + 0.5 * (desc_sim)

        scored.append((column_name, score, column_description))

    # Highest score first; the sort is stable, so ties keep insertion order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:top_k]

# Sample queries: short field-like terms plus one full natural-language question.
entities = [
    '中文全称', '全称变更', 'A股简称', '法人', '法律顾问', '会计师事务所', '董秘', '实控人',
    '近一个月最高价', '现金流量净额', '注册邮箱', '注册地址', '信披网址', '公司电话',
    '硕士及以上学历（硕士+博士）的人员占比', '一级行业',
    '唐山港集团股份有限公司是什么时间上市的',
]

# Print the five best-matching columns for each query.
for entity in entities:
    print(f"{entity}: ")
    results = find_top_similar_columns(entity, table_emb_dict, columns_description_dict, emb_model, top_k=5)
    for column_name, _score, column_description in results:
        print(f'{column_name} ({column_description})')
    print()

中文全称: 
ChiName (中文名称)
AbbrChiName (中文简称)
ChiNameAbbr (中文名称缩写)
EngName (英文名称)
AbbrEngName (英文简称)

全称变更: 
ChangeReason (简称变更原因)
ChangeDate (全称更改日期)
ChangeNote (变更内容)
FormerName (曾用名)
ChiName (中文名称)

A股简称: 
AShareAbbr (A股证券简称)
SecuAbbr (证券简称)
BShareAbbr (B股证券简称)
HShareAbbr (H股证券简称)
AStockCode (A股证券代码)

法人: 
LegalRepr (法人代表)
LegalPersonRepr (法人代表)
RegArea (注册地城市)
SHKind (股东性质)
RaisedLPShares (2.募集法人股(股))

法律顾问: 
LegalConsultant (法律顾问)
Lawyer (经办律师)
TestmonyLawOffice (见证律师事务所)
AttorneyFee (律师费用(元))
LegalRepr (法人代表)

会计师事务所: 
AccountingFirm (会计师事务所)
AccountingFirms (会计师事务所)
AuditInstitution (审计机构)
CPA (注册会计师)
CertifiedAccountant (合资格会计师)

董秘: 
SecretaryBD (董事会秘书)
AuthReprSBD (董秘授权代表)
SecretaryBDFax (董秘传真)
SecretaryBDTel (董秘电话)
CompanySecretary (公司秘书)

实控人: 
ControllerName (实际控制人)
StructureChart (实际控制人结构图)
ControllerNature (实际控制人所属性质)
ControllerCode (实际控制人代码)
MSHPercentage (持股比例)

近一个月最高价: 
HighPriceRM (近一月最高价(元))
HighPriceRMThree (近三个月以来最高价(元))
HighPriceTM (本月以来最高价(元))
HighPriceRMSix (近六个月以来

下面这种方法的结果与上面差不多：直接计算查询词 (word) 与“column_description + annotation”拼接文本的嵌入 (comb_emb) 之间的余弦相似度。

In [8]:
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors.

    Args:
        vec_a: First vector (array-like of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there).
    """
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        # Dividing would give NaN, which compares false with everything and
        # therefore makes any ordering by score unreliable.
        return 0.0
    return np.dot(vec_a, vec_b) / denom

# NOTE(review): this redefines find_top_similar_columns with a different
# signature than the earlier four-argument definition in this notebook;
# re-running the earlier call cell after this one would fail.
def find_top_similar_columns(word, columns_description_dict, emb_model, top_k=20):
    """
    Rank columns by cosine similarity between ``word`` and each entry's combined embedding.

    Args:
    - word (str): The query text.
    - columns_description_dict (dict): Maps a key to a dict holding 'column_name'
      and, optionally, 'column_description' and 'comb_emb'.
    - emb_model (SentenceTransformer): The embedding model; ``encode(word)``
      provides the query vector.
    - top_k (int): Maximum number of results to return.

    Returns:
    - List of tuples (column_name, similarity_score, column_description),
      sorted by descending score. Entries without a 'comb_emb' score 0.
    """
    query_emb = emb_model.encode(word)

    scored = []
    for info in columns_description_dict.values():
        column_name = info['column_name']
        column_description = info.get('column_description', '')

        comb_emb = info.get('comb_emb')
        if comb_emb is None:
            score = 0
        else:
            score = cosine_similarity(query_emb, comb_emb)

        scored.append((column_name, score, column_description))

    # Highest score first; the sort is stable, so ties keep insertion order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:top_k]

# Sample queries: short field-like terms plus one full natural-language question.
entities = [
    '中文全称', '全称变更', 'A股简称', '法人', '法律顾问', '会计师事务所', '董秘', '实控人',
    '近一个月最高价', '现金流量净额', '注册邮箱', '注册地址', '信披网址', '公司电话',
    '硕士及以上学历（硕士+博士）的人员占比', '一级行业',
    '唐山港集团股份有限公司是什么时间上市的',
]

# Print the five best-matching columns for each query.
for entity in entities:
    print(f"{entity}: ")
    results = find_top_similar_columns(entity, columns_description_dict, emb_model, top_k=5)
    for column_name, _score, column_description in results:
        print(f'{column_name} ({column_description})')
    print()

中文全称: 
ChiName (中文名称)
AbbrChiName (中文简称)
ChiNameAbbr (中文名称缩写)
EngName (英文名称)
AbbrEngName (英文简称)

全称变更: 
ChangeReason (简称变更原因)
ChangeDate (全称更改日期)
ChangeNote (变更内容)
FormerName (曾用名)
ChiName (中文名称)

A股简称: 
AShareAbbr (A股证券简称)
SecuAbbr (证券简称)
BShareAbbr (B股证券简称)
HShareAbbr (H股证券简称)
AStockCode (A股证券代码)

法人: 
LegalRepr (法人代表)
LegalPersonRepr (法人代表)
RegArea (注册地城市)
SHKind (股东性质)
RaisedLPShares (2.募集法人股(股))

法律顾问: 
LegalConsultant (法律顾问)
Lawyer (经办律师)
TestmonyLawOffice (见证律师事务所)
AttorneyFee (律师费用(元))
LegalRepr (法人代表)

会计师事务所: 
AccountingFirm (会计师事务所)
AccountingFirms (会计师事务所)
AuditInstitution (审计机构)
CPA (注册会计师)
CertifiedAccountant (合资格会计师)

董秘: 
SecretaryBD (董事会秘书)
AuthReprSBD (董秘授权代表)
SecretaryBDFax (董秘传真)
SecretaryBDTel (董秘电话)
CompanySecretary (公司秘书)

实控人: 
ControllerName (实际控制人)
StructureChart (实际控制人结构图)
MSHPercentage (持股比例)
SHName (股东名称)
MSHName (股东名称)

近一个月最高价: 
HighPriceRM (近一月最高价(元))
HighPriceRMThree (近三个月以来最高价(元))
HighPriceTM (本月以来最高价(元))
HighPriceRMSix (近六个月以来最高价(元))
HighPrice (最高价)

