In [1]:
import json
from sentence_transformers import SentenceTransformer
import chromadb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import shutil

# Remove any previously built vector store so the notebook starts from a clean
# directory. A missing directory is the normal first-run case and is ignored;
# any other filesystem error is reported instead of being silently swallowed,
# but the cleanup stays best-effort and never stops the notebook.
try:
    shutil.rmtree(r"C:\last_project\make_domain_selector\concept_store")
except FileNotFoundError:
    pass
except OSError as exc:
    print(f"Could not remove existing concept_store: {exc}")

In [3]:
# Directory holding the concept-definition files that get indexed.
base_path = r"C:\last_project\model_pipeline\3. Concept Resolver"

# os.listdir() returns entries in arbitrary order, and the record ids created
# when the collection is filled are derived from list position, so sort the
# names to keep ids reproducible between runs. Only *.json entries are kept
# because every listed file is parsed with json.load().
file_names = sorted(
    name for name in os.listdir(base_path) if name.lower().endswith(".json")
)
        
def build_document(concept):
    """Render a concept definition as a single-line passage for embedding.

    Args:
        concept: Mapping with a ``canonical`` name and a ``db_filter`` mapping
            (``table``, ``column``, ``operator``, ``values``), plus optional
            ``aliases`` and ``unit`` entries.

    Returns:
        The description text with every run of whitespace collapsed to a
        single space. The unit sentence is left out when no unit is given.

    Raises:
        KeyError: If ``canonical``, ``db_filter`` or one of the four
            ``db_filter`` entries is missing.
    """
    canonical = concept["canonical"]
    # ``or []`` also covers an explicit null in the source data.
    aliases = concept.get("aliases") or []
    db_filter = concept["db_filter"]

    table = db_filter["table"]
    column = db_filter["column"]
    operator = db_filter["operator"]
    values = db_filter["values"]
    unit = concept.get("unit")

    # A bare scalar would otherwise be iterated character by character (str)
    # or fail outright (int), so normalise it to a one-element list.
    if not isinstance(values, (list, tuple)):
        values = [values]

    aliases_str = ", ".join(aliases)
    values_str = ", ".join(map(str, values))

    if operator == "IN":
        condition_str = f"{column} IN ({values_str})"
    else:
        condition_str = f"{column} {operator} {values_str}"

    # Skip the unit sentence entirely instead of embedding "Unit: None.".
    unit_line = f"Unit: {unit}." if unit else ""

    # NOTE(review): the wording calls every concept a "drug"; confirm this is
    # intended for inputs that are not medications.
    sentence = f"""
{canonical} is a drug concept in the MIMIC-IV database.
Canonical name: {canonical}.
Aliases: {aliases_str}.
{unit_line}

When a natural language query refers to this drug or any of its aliases,
filter the {table} table using the following Oracle WHERE condition:

{condition_str}
"""
    # split()/join collapses newlines and blank lines into single spaces.
    return " ".join(sentence.split())

def build_metadata(concept):
    """Build the flat metadata record stored next to a concept document.

    Args:
        concept: Mapping with a ``canonical`` name, a ``db_filter`` mapping
            (``table``, ``column``, ``values``, optional ``operator``) and an
            optional ``unit``.

    Returns:
        A dict of string values with the keys ``table``, ``column``,
        ``values``, ``operator`` and ``name``; ``unit`` is included only when
        the concept has a truthy unit.

    Raises:
        KeyError: If ``canonical``, ``db_filter``, ``table``, ``column`` or
            ``values`` is missing.
    """
    db_filter = concept["db_filter"]
    unit = concept.get("unit")

    metadata = {
        "table": str(db_filter["table"]),
        "column": str(db_filter["column"]),
        # Stored as the Python repr of the value list.
        "values": str(db_filter["values"]),
        "unit": str(unit) if unit else None,
        # Record the concept's own operator so the metadata agrees with the
        # condition rendered by build_document; "IN" stays the fallback.
        "operator": str(db_filter.get("operator") or "IN"),
        "name": str(concept["canonical"]),
    }

    # Entries without a value are dropped rather than stored as None.
    return {k: v for k, v in metadata.items() if v is not None}

def build_drug_document(concept):
    """Render a medication concept as a single-line passage for embedding.

    Args:
        concept: Mapping with a ``canonical`` name, a ``db_filter`` mapping
            providing ``column`` and ``values`` (strings), and optional
            ``aliases``.

    Returns:
        The description text, ending in a ``REGEXP_LIKE`` condition, with
        every run of whitespace collapsed to a single space.
    """
    name = concept["canonical"]
    alias_list = concept.get("aliases", [])
    db_filter = concept["db_filter"]

    target_column = db_filter["column"]
    raw_values = db_filter["values"]

    # Strip leading/trailing '%' from each value, then OR the terms together
    # into one regex pattern.
    regex_pattern = "|".join([value.strip("%") for value in raw_values])

    aliases_str = ", ".join(alias_list)

    pieces = [
        f"{name} is a medication concept in the MIMIC-IV database.",
        f"Canonical name: {name}.",
        f"Aliases: {aliases_str}.",
        f"When a natural language query refers to {name} or any of its aliases,",
        "filter medication records in either the prescriptions or pharmacy tables",
        f"using the following Oracle WHERE condition on the {target_column} column:",
        f"REGEXP_LIKE({target_column}, '{regex_pattern}', 'i')",
    ]

    # Normalise all whitespace (including any inside interpolated values).
    tokens = " ".join(pieces).split()
    return " ".join(tokens)

def build_drug_metadata(concept):
    """Build the flat metadata record stored next to a medication document.

    Args:
        concept: Mapping with a ``canonical`` name, a ``db_filter`` mapping
            (``table``, ``column``, ``values`` as strings) and an optional
            ``unit``.

    Returns:
        A dict with ``table``, ``column``, ``values`` (a JSON array of the
        values with leading/trailing '%' removed), the fixed matching hints
        ``operator``/``match_type``/``case_sensitive``, and ``name``;
        ``unit`` is included only when the concept has a truthy unit.

    Raises:
        KeyError: If ``canonical``, ``db_filter``, ``table``, ``column`` or
            ``values`` is missing.
    """
    db_filter = concept["db_filter"]
    unit = concept.get("unit")

    metadata = {
        "table": str(db_filter["table"]),
        "column": str(db_filter["column"]),
        "values": json.dumps([i.strip('%') for i in db_filter["values"]]),
        "unit": str(unit) if unit else None,
        "operator": "MATCH_ANY",
        "match_type": "regex",
        "case_sensitive": False,
        # Same key build_metadata records, so a hit can be identified from
        # its metadata regardless of which builder produced it.
        "name": str(concept["canonical"]),
    }

    # Entries without a value are dropped rather than stored as None.
    return {k: v for k, v in metadata.items() if v is not None}

In [4]:
# Index-aligned accumulators: passage texts and their metadata records.
document, metadata = [], []

In [5]:
# Start from empty accumulators so that re-running this cell does not append
# the same concepts a second time.
document = []
metadata = []

for file_name in file_names:
    print(file_name)
    full_path = os.path.join(base_path, file_name)

    # Only the parsing needs the file handle; close it before building texts.
    with open(full_path, "r", encoding="utf-8") as f:
        concepts = json.load(f)

    # Decide on the file name alone: testing the full path would also match a
    # parent directory whose name happens to contain the marker.
    if "med_class_dictionary" in file_name:
        make_document, make_metadata = build_drug_document, build_drug_metadata
    else:
        make_document, make_metadata = build_document, build_metadata

    # Build per-file lists first so a failure inside one file does not leave
    # the accumulators partially extended for that file.
    mini_document = [make_document(concept) for concept in concepts]
    mini_metadata = [make_metadata(concept) for concept in concepts]

    document += mini_document
    metadata += mini_metadata

concepts_d_labitems_to_loinc.json
inputevents_to_rxnorm.json
lab_itemid_to_loinc.json
meas_chartevents_main.json
med_class_dictionary.json
numerics-summary.json
outputevents_to_loinc.json
proc_datetimeevents.json
proc_itemid.json
waveforms-summary.json


In [6]:
# Model identifier and on-disk location of the Chroma store.
_MODEL_NAME = "intfloat/e5-small-v2"
_STORE_DIR = r"C:\last_project\make_domain_selector\concept_store"

# Encoder shared by the passage and query embedding helpers.
embedding_model = SentenceTransformer(_MODEL_NAME)

# Persistent Chroma client rooted at the store directory.
client = chromadb.PersistentClient(path=_STORE_DIR)

def embed_passages(texts):
    """Embed documents for indexing.

    Every text is prefixed with "passage: " and encoded by the module-level
    ``embedding_model`` in batches of 64 with a progress bar.

    Args:
        texts: Iterable of document texts.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    prefixed = []
    for passage in texts:
        prefixed.append(f"passage: {passage}")

    vectors = embedding_model.encode(prefixed, batch_size=64, show_progress_bar=True)
    return vectors.tolist()

def embed_query(text):
    """Embed one or more search queries.

    A single string is treated as a one-item list. Every query is prefixed
    with "query: " and encoded by the module-level ``embedding_model``.

    Args:
        text: A query string, or an iterable of query strings.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    queries = [text] if isinstance(text, str) else text
    prefixed = [f"query: {q}" for q in queries]
    vectors = embedding_model.encode(prefixed, show_progress_bar=True)
    return vectors.tolist()

# Rebuild the collection from scratch: drop any existing one, then recreate it.
try:
    client.delete_collection(name="concept_all")
except Exception:
    # The collection not existing yet is fine, so keep going. Unlike a bare
    # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
    # NOTE(review): narrow this to the installed chromadb version's
    # "collection not found" exception once that type is confirmed.
    pass
collection = client.create_collection(name="concept_all")

# Documents and metadata records are paired by position; fail fast before the
# embedding pass if the two lists have drifted apart.
if len(document) != len(metadata):
    raise ValueError(
        f"document/metadata length mismatch: {len(document)} != {len(metadata)}"
    )

collection.add(
    documents=document,
    embeddings=embed_passages(document),
    metadatas=metadata,
    # Ids are positional: document_item0, document_item1, ...
    ids=[f"document_item{i}" for i in range(len(document))],
)

Batches: 100%|██████████| 44/44 [00:09<00:00,  4.59it/s]


In [7]:
collection = client.get_collection("concept_all")

# Search-time embedding: the query text is embedded with the "query:" prefix.
query_vectors = embed_query("MAP")
results = collection.query(query_embeddings=query_vectors, n_results=3)

# Show only the best hit for the first (and only) query.
best_document = results["documents"][0][0]
print(best_document)
best_metadata = results["metadatas"][0][0]
print(best_metadata)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.72it/s]

Output.stool [Volume] is a drug concept in the MIMIC-IV database. Canonical name: Output.stool [Volume]. Aliases: Output.stool [Volume], Stool, Fecal Bag, Ostomy (output), Rectal Tube, Ileoconduit. Unit: mL. When a natural language query refers to this drug or any of its aliases, filter the outputevents table using the following Oracle WHERE condition: itemid IN (226579, 226580, 226582, 226583, 226584)
{'unit': 'mL', 'operator': 'IN', 'values': '[226579, 226580, 226582, 226583, 226584]', 'table': 'outputevents', 'name': 'Output.stool [Volume]', 'column': 'itemid'}



