In [3]:
# ------------------------------------------------------------
# 0. 준비 ─ 데이터 로드 (앞서 로드한 객체가 있으면 생략)
# ------------------------------------------------------------
import pandas as pd, re, numpy as np
from pathlib import Path

# 원본 노드·엣지
nodes  = pd.read_csv("data/DRKG+DrugBank/nodes.tsv", sep="\t",
                     names=["id", "name", "type"])
edges  = pd.read_csv("data/DRKG+DrugBank/edges.tsv", sep="\t",
                     names=["h", "r", "t"])

# FoodDB, DrugBank-SMILES, 교집합(compound) ⇒ df_common
foodb  = pd.read_excel("data/foodb_final.xlsx")
foodb['smiles'] = foodb['moldb_smiles']                       # 'orig_food_common_name', 'smiles'
df_smiles = (
    pd.read_csv("data/drugbank_smiles.txt", sep="\t", header=None,
                names=["drug_id", "smiles"])
      .dropna(subset=["drug_id", "smiles"])
      .assign(drug_id=lambda d: d["drug_id"].str.strip(),
              smiles  =lambda d: d["smiles"].str.strip())
)

# DrugBank ID 추출해 drug 노드에 붙여 두기
nodes["drug_id"] = nodes["name"].str.extract(r"Compound::(DB\d{5})")

# food 측 SMILES 목록(이미 추출해 두셨다면 생략 가능)
food_smiles_list = (
    Path("data/smiles_food.txt").read_text(encoding="utf-8")
      .split(",")
)
food_smiles_list = [tok.strip().strip("'\"") for tok in food_smiles_list if tok.strip()]
df_food = pd.DataFrame({"smiles": sorted(set(food_smiles_list))})

# Drug-Food 공통 SMILES (df_common) ------------------------------------------------
df_drug = nodes.dropna(subset=["drug_id"]).merge(
    df_smiles.rename(columns={"drug_id": "drug_id"}), on="drug_id", how="left")
df_common = (
    df_drug.dropna(subset=["smiles"])
           .merge(df_food, on="smiles", how="inner")
)

# ------------------------------------------------------------
# 1. foodb 와 df_common 병합  →  food_drug
# ------------------------------------------------------------
food_drug = (
    foodb.merge(df_common[["drug_id", "smiles"]], on="smiles", how="inner")
          .rename(columns={"drug_id": "drug_id",
                           "orig_food_common_name": "food_name"})
)

# ------------------------------------------------------------
# 2. Food 노드 신설  →  food_nodes
#     id = food_name  |  name = Food::{food_name}  |  type = 14
# ------------------------------------------------------------
food_nodes = (
    pd.DataFrame({
        "id":   food_drug["food_name"].unique(),
        "name": ["Food::" + n for n in food_drug["food_name"].unique()],
        "type": 14
    })
)

# ------------------------------------------------------------
# 3. 기존 + 신설 노드 합치기  →  new_nodes
#    그리고 id→index 매핑 딕셔너리(id2idx)
# ------------------------------------------------------------
new_nodes = pd.concat([nodes[["id", "name", "type"]], food_nodes],
                      ignore_index=True)

id2idx = pd.Series(new_nodes.index.values, index=new_nodes["id"]).to_dict()

# ------------------------------------------------------------
# 4. food_drug 에 h,r,t 컬럼 추가
#     h = food_name  ,  t = drug_id  ,  r = 108
# ------------------------------------------------------------
food_drug["h"] = food_drug["food_name"].map(id2idx)
food_drug["t"] = food_drug["drug_id"].map(id2idx)
food_drug["r"] = 108     # ‘food-compound 포함’ 관계 타입 고정

edge_cols = food_drug[["h", "r", "t"]]

# ------------------------------------------------------------
# 5. edges + edge_cols 합치기  →  new_edges
# ------------------------------------------------------------
new_edges = pd.concat([edges, edge_cols], ignore_index=True)

# ---- (선택) 결과 저장 -----------------------------------------------------------
out_dir = Path("data/_export"); out_dir.mkdir(exist_ok=True)
new_nodes.to_csv(out_dir / "new_nodes.tsv", sep="\t", index=False, header=False)
new_edges.to_csv(out_dir / "new_edges.tsv", sep="\t", index=False, header=False)

print(f"✅  신규 Food 노드   : {len(food_nodes):,}")
print(f"✅  신규 Food-Drug 엣지: {len(edge_cols):,}")
print(f"📁  new_nodes.tsv / new_edges.tsv 저장 완료 →  {out_dir.resolve()}")


✅  신규 Food 노드   : 1,614
✅  신규 Food-Drug 엣지: 230,650
📁  new_nodes.tsv / new_edges.tsv 저장 완료 →  C:\Users\als31\Documents\Code\HetDDI\data\_export
