In [10]:
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
from langchain_community.document_loaders import JSONLoader
from dotenv import load_dotenv
import os
import pandas as pd
import json
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy


# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face API token, read from the environment.
# Indexing os.environ raises KeyError when HF_API_TOKEN is not set.
HF_API_TOKEN = os.environ["HF_API_TOKEN"]

In [17]:
# Read the adult KTAS JSON file into memory.
# The encoding is given explicitly: without it open() falls back to the
# platform locale encoding, which can fail on or mis-decode Korean text
# depending on the machine (assumes the file is stored as UTF-8, the JSON
# default -- confirm if it was exported by another tool).
with open('../data/KTAS_성인.json', 'r', encoding='utf-8') as f:
    json_KTAS_adult = json.load(f)

In [18]:
# Convert the JSON records into documents for embedding

def metadata_func(record: dict, metadata: dict) -> dict:
    """Build a document's metadata from one JSON record.

    Every field of ``record`` except ``symptom`` is copied into ``metadata``
    (``symptom`` is the field the loader below uses as the document content).
    Dict values are serialised to a JSON string; all other values, including
    lists, are copied unchanged.

    Args:
        record: One parsed JSON object.
        metadata: Metadata dict to extend; it is updated in place.

    Returns:
        The same ``metadata`` object, with the record's fields added and the
        ``source`` and ``seq_num`` keys removed if they were present.
    """
    for key, value in record.items():
        if key == 'symptom':
            continue
        if isinstance(value, dict):
            # dicts have no .astype(); the previous call raised AttributeError
            # and its result was discarded anyway. Serialise explicitly and
            # keep non-ASCII (Korean) text readable.
            value = json.dumps(value, ensure_ascii=False, default=str)
        metadata[key] = value

    # Explicitly drop the source and seq_num keys when present.
    metadata.pop('source', None)
    metadata.pop('seq_num', None)

    return metadata

# Loader over the adult KTAS file: the jq expression ".[]" selects each
# element of the top-level array, the "symptom" field becomes the document
# content, and metadata_func builds the metadata from the remaining fields.
loader = JSONLoader(
    metadata_func=metadata_func,
    content_key="symptom",
    jq_schema=".[]",
    file_path="../data/KTAS_성인.json",
)

In [19]:
# Run the loader to materialise the documents.
docs = loader.load()
# Preview the first five documents as the cell output.
docs[:5]

[Document(metadata={'ROS': '임신 / 여성생식계', 'primary': [{'consideration': '중증 호흡곤란', 'KTAS_level': 1}, {'consideration': '쇼크', 'KTAS_level': 1}, {'consideration': '무의식', 'KTAS_level': 1}, {'consideration': '중등도 호흡곤란', 'KTAS_level': 2}, {'consideration': '혈역학적 장애', 'KTAS_level': 2}, {'consideration': '의식변화', 'KTAS_level': 2}, {'consideration': '열, 면역저하 상태', 'KTAS_level': 2}, {'consideration': '패혈증 의증', 'KTAS_level': 2}, {'consideration': '급성 중심성 중증 통증', 'KTAS_level': 2}, {'consideration': '출혈성 질환 (생명 혹은 사지를 소실할 정도의 위급한 출혈)', 'KTAS_level': 2}, {'consideration': '고위험성 사고기전', 'KTAS_level': 2}, {'consideration': '경증 호흡곤란', 'KTAS_level': 3}, {'consideration': '비정상 맥박수지만 혈역학적으로 안정', 'KTAS_level': 3}, {'consideration': '전신염증반응증후군', 'KTAS_level': 3}, {'consideration': '열 (아파 보임)', 'KTAS_level': 3}, {'consideration': '급성 중심성 중등도 통증', 'KTAS_level': 3}, {'consideration': '출혈성 질환 (중등도나 경도의 출혈)', 'KTAS_level': 3}, {'consideration': '급성 중심성 경증 통증', 'KTAS_level': 4}], 'secondary': [{'consideration': '주기적

In [20]:
# Embedding model

# Hugging Face model id used to embed the documents.
model_name = "intfloat/multilingual-e5-large-instruct"

# Embedding client for the model above, authenticated with HF_API_TOKEN.
hf_embeddings = HuggingFaceEndpointEmbeddings(
    model=model_name,
    # Left disabled by the author; the `task` argument is not passed.
    # task = 'feature-extraction',
    huggingfacehub_api_token=HF_API_TOKEN
)

In [21]:
# Embed every document and build the FAISS vector store from the results.
# NOTE(review): confirm how the installed langchain_community FAISS wrapper
# handles DistanceStrategy.COSINE (index type and score function) before
# relying on the returned scores.

db = FAISS.from_documents(
    docs,
    hf_embeddings,
    distance_strategy=DistanceStrategy.COSINE,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [22]:
# Save the vector DB

# Persist the FAISS store under db/KTAS_adult_faiss; the path is relative,
# so it resolves against the notebook's current working directory.
db.save_local('db/KTAS_adult_faiss')