In [None]:
import os
import pandas as pd
import ast
from tqdm.auto import tqdm

# ChromaDB 영속 클라이언트
from chromadb import PersistentClient

# 문장 임베딩
from sentence_transformers import SentenceTransformer

# --- Settings ---
DATA_PATH = "/mnt/data/preprocessed_ingredients.csv"  # input CSV read with pandas
CHROMA_DIR = "./chroma_ingredients"  # directory handed to PersistentClient
COLLECTION_NAME = "ingredients_collection"
EMBED_MODEL = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer

# Optional: turn off the TensorFlow integration.
# NOTE(review): this assignment runs after `sentence_transformers` has already
# been imported at the top of the file; if the flag is only consulted at import
# time it has no effect here — confirm, and move it above the imports if so.
os.environ["TRANSFORMERS_NO_TF"] = "1"

# Register tqdm's pandas integration (provides `progress_apply`).
tqdm.pandas()

# --- 1) Load the CSV and verify the required columns ---
df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} rows, columns: {df.columns.tolist()}")

# Fail fast if a column that the rest of the script indexes is absent, rather
# than hitting a bare KeyError only after the embedding model has been loaded.
missing_columns = [
    column for column in ("description", "ingredient") if column not in df.columns
]
if missing_columns:
    raise KeyError(
        f"{DATA_PATH} is missing required column(s): {missing_columns}"
    )

# --- 2) Load the sentence-embedding model ---
embedder = SentenceTransformer(EMBED_MODEL)

# --- 3) Open the persistent Chroma client and fetch the collection ---
# The collection is created when it does not exist yet and reused otherwise.
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Collection ready: '{COLLECTION_NAME}'")

# --- 4) Build embeddings from `description` ---
# Rows without any description text cannot be embedded. Storing an empty vector
# for them (as was done before) makes the later upsert fail, because the vector
# store rejects vectors that are empty or whose length differs from the others.
# Such rows are therefore dropped here, with a notice, before encoding.
# Whitespace-only descriptions are treated as empty as well.
description_text = df['description'].fillna("").astype(str)
has_description = description_text.str.strip() != ""

skipped_rows = int((~has_description).sum())
if skipped_rows:
    print(f"Skipping {skipped_rows} rows with an empty description")

# `.copy()` detaches the filtered frame so the column assignment below does not
# write into a view of the original frame.
df = df.loc[has_description].copy()

# Encode every remaining description in one batched call instead of invoking
# the model once per row; the encoder shows its own progress bar.
texts = description_text[has_description].tolist()
vectors = embedder.encode(texts, show_progress_bar=True).tolist() if texts else []

# Build the column through a Series on df's index so each row receives exactly
# one vector (a plain Python list of floats).
df['embedding'] = pd.Series(vectors, index=df.index, dtype=object)

# --- 5) Build a unique ID per row: "<ingredient>::<row position>" ---
# The position suffix keeps IDs distinct even when an ingredient name repeats.
uid_values = [
    f"{name}::{position}" for position, name in enumerate(df['ingredient'])
]
df['uid'] = uid_values

# --- 6) Collect the parallel lists passed to the upsert call ---
# One {'ingredient': ...} dict per row.
metas = df.loc[:, ['ingredient']].to_dict('records')
embs = df['embedding'].tolist()
# Missing descriptions are stored as empty strings.
docs = df['description'].fillna("").tolist()
ids = df['uid'].tolist()

# --- 7) Insert or update the records with upsert, in bounded chunks ---
# Chroma limits how many records a single call may carry and raises when that
# limit is exceeded, so the data is sent in slices instead of all at once.
# Upsert overwrites by ID, so re-running after a mid-way failure is safe.
# NOTE(review): the real limit depends on the Chroma/SQLite build; this value
# is deliberately small — raise it if throughput matters and the limit allows.
UPSERT_BATCH_SIZE = 100

for start in range(0, len(ids), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    collection.upsert(
        ids=ids[start:stop],
        embeddings=embs[start:stop],
        documents=docs[start:stop],
        metadatas=metas[start:stop],
    )
print(f"✅ Upserted {len(ids)} items into '{COLLECTION_NAME}'")

# --- 8) Smoke-test the collection with a sample query ---
query_text = "피부 진정에 도움이 되는 성분"
# Embed the query with the same model that embedded the descriptions.
q_emb = embedder.encode(query_text).tolist()
results = collection.query(n_results=5, query_embeddings=[q_emb])

print("\n--- Top 5 Results ---")
# Only one query embedding was sent, so entry 0 of each field holds its hits.
hit_ids = results['ids'][0]
hit_docs = results['documents'][0]
hit_metas = results['metadatas'][0]
for hit_id, hit_doc, hit_meta in zip(hit_ids, hit_docs, hit_metas):
    # Show the ID, the ingredient name, and the first 60 characters of the text.
    print(f"• {hit_id} → {hit_meta['ingredient']}\n  설명: {hit_doc[:60]}...\n")
