In [None]:
import random
import numpy as np
import torch
# Seed every random number generator used in this notebook (Python's `random`,
# NumPy, PyTorch CPU and all CUDA devices) with the same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [None]:
!pip install sentence_transformers

In [None]:
# Cell 1: imports & config

import os
import json
import copy
import math
import queue

import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity as cos


# -----------------------
# Hyperparameters (tune as needed)
# NOTE(review): none of these four constants is referenced in the cells visible
# here — confirm they are still used before relying on them.
# -----------------------
TOP_K = 3          # maximum number of core classes per document
MIN_PROB = 0.05    # minimum softmax probability (drops overly ambiguous labels)
USE_MARGIN = True
MARGIN_DELTA = 0.02  # if the top-1/top-2 probability gap exceeds this, keep only top-1

# -----------------------
# Data paths (adjust only this section to your environment)
# -----------------------
DATA_DIR = "Amazon_products"      # e.g. "/content/data"
DATASET = "Amazon_products/train/train_corpus.txt"          # document corpus file
GPU = 0                         # CUDA device index

# Echo the active configuration.
# NOTE(review): the original comment here announced an output file name, but no
# such name is defined in this cell.
print("DATA_DIR:", DATA_DIR)
print("DATASET:", DATASET)
print("GPU:", GPU)


In [None]:
# Cell: Node class + graph construction function

class Node:
    """One taxonomy class plus its links to parent and child nodes.

    Attributes:
        node_id: the class id, always stored as a string (e.g. "0", "1").
        name: the class name (e.g. "grocery_gourmet_food").
        childs: list of child ``Node`` objects, initially empty.
        parents: list of parent ``Node`` objects, initially empty.
        path_score / similarity_score: float scores, both initialised to 0.0.
    """

    def __init__(self, node_id, name):
        # The id is normalised to str so that int and str ids compare equal.
        self.node_id = str(node_id)
        self.name = name
        # Each node gets its own fresh link lists.
        self.childs = list()
        self.parents = list()
        self.path_score = 0.0
        self.similarity_score = 0.0

    def __repr__(self):
        return "Node(id={}, name={})".format(self.node_id, self.name)


def build_graph_from_files(label_file, edge_file):
    """
    Build the label taxonomy as linked ``Node`` objects from two TSV files.

    label_file: one "id<tab>label_name" record per line
    edge_file : one "parent_id<tab>child_id" record per line

    Blank lines and lines that do not split into exactly two tab-separated
    fields are ignored, as are edges that mention an id missing from the label
    file.

    Returns (root, id2label, label2id, id2node). ``root`` is the only
    parentless node when there is exactly one; otherwise a synthetic
    Node("-1", "ROOT") is created and linked above every parentless node.
    """

    def _two_field_rows(path):
        # Yield [first, second] for every line holding exactly two tab-separated fields.
        with open(path, encoding="utf-8") as handle:
            for raw in handle:
                fields = raw.strip().split("\t")
                if len(fields) == 2:
                    yield fields

    id2label, label2id, id2node = {}, {}, {}

    # 1) Labels: register the name under its id and create the node.
    for idx, raw_name in _two_field_rows(label_file):
        name = raw_name.strip()
        id2label[idx] = name
        label2id[name] = idx
        id2node[idx] = Node(idx, name)

    # 2) Edges: link parent and child in both directions when both ids are known.
    for parent_id, child_id in _two_field_rows(edge_file):
        if parent_id in id2node and child_id in id2node:
            parent, child = id2node[parent_id], id2node[child_id]
            parent.childs.append(child)
            child.parents.append(parent)

    # 3) Parentless nodes are the top-level roots.
    roots = [node for node in id2node.values() if not node.parents]

    if len(roots) != 1:
        # Several (or zero) top-level nodes: hang them all under one super-root.
        root = Node("-1", "ROOT")
        for top in roots:
            root.childs.append(top)
            top.parents.append(root)
    else:
        root = roots[0]

    print(f"#labels: {len(id2label)}, #roots(before super-root): {len(roots)}")
    return root, id2label, label2id, id2node


In [None]:
import mmap
def get_num_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    A final line without a trailing newline is counted, and an empty file
    yields 0. The file is opened read-only in binary mode and closed before
    returning, so no write permission is needed and no file handle or memory
    map is left open.
    """
    with open(file_path, "rb") as fp:
        # Iterating a binary file splits on b"\n" only, which matches the
        # line boundaries the previous mmap.readline() loop counted.
        return sum(1 for _ in fp)

In [None]:
# Cell 2: prepare the taxonomy graph, the corpus and the embeddings

# 1) Load the per-label keywords from class_related_keywords.txt.
#    Each non-empty line is split on ":"; field 0 is the label name and field 1
#    is a comma-separated keyword list (empty keywords are dropped).
#    NOTE(review): a line without ":" raises IndexError, and anything after a
#    second ":" is ignored — confirm the file never contains such lines.
enriched_file = os.path.join(DATA_DIR, "class_related_keywords.txt")
label_keyterm_dict = {}

with open(enriched_file, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        components = line.split(":")
        node = components[0]          # label_name (with underscore)
        keywords = components[1]
        keyword_list = [k for k in keywords.split(",") if k]
        label_keyterm_dict[node] = keyword_list

print("num labels with keywords:", len(label_keyterm_dict))

# 2) Load the SentenceTransformer model on the configured CUDA device.
model_name = "Alibaba-NLP/gte-base-en-v1.5"
model = SentenceTransformer(model_name, device=f"cuda:{GPU}",trust_remote_code=True)
print("Loaded SentenceTransformer:", model_name)

# 3) Load the taxonomy graph.
LABEL_FILE = os.path.join(DATA_DIR, "classes.txt")      # example file name
EDGE_FILE  = os.path.join(DATA_DIR, "class_hierarchy.txt") # example file name

root, id2label, label2id, id2node = build_graph_from_files(LABEL_FILE, EDGE_FILE)

num_class = len(id2label)
print("num_class:", num_class)
# 4) Load the corpus: one "doc_id \t text" record per line.
#    os.path.join with a single argument returns DATASET unchanged.
corpus_path = os.path.join(DATASET)
num_line = get_num_lines(corpus_path)   # only used as the tqdm total

all_docs = []
all_docs_id = []

with open(corpus_path, encoding="utf-8") as f:
    for i, line in tqdm(enumerate(f), total=num_line):
        line = line.rstrip("\n")
        if not line:
            continue
        # Split on the first tab only, so the text itself may contain tabs.
        doc_id, doc = line.split("\t", 1)
        all_docs.append(doc)
        all_docs_id.append(doc_id)

print("num_docs:", len(all_docs))

# 5) Embed every document.
#    NOTE(review): total_doc_embedding is not referenced again in the cells
#    visible here; a later cell re-encodes all_docs into total_doc_emb with the
#    same model name — confirm whether this pass is still needed.
with torch.no_grad():
    total_doc_embedding = model.encode(
        all_docs,
        batch_size=128,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

# 6) Embed the key terms of each label.
#    Labels are visited by integer id 0..num_class-1, so id2label must contain
#    every str(i) and label_keyterm_dict every label name (KeyError otherwise).
key_term_emb_dict = {}
for i in tqdm(range(num_class)):
    current_label = id2label[str(i)]           # e.g. "hair_care"

    # Replace "_" with a space only in the label text that gets embedded.
    label_text = current_label.replace("_", " ")

    # The label itself (space version) + its key terms (verbatim, "_" kept).
    current_key = [label_text] + label_keyterm_dict[current_label]

    current_embed = model.encode(
        current_key,
        batch_size=128,
        convert_to_numpy=True,
    )  # (num_keyterms_for_label, dim)

    # Keys of key_term_emb_dict stay the original (underscore) label names.
    key_term_emb_dict[current_label] = current_embed

print("built key_term_emb_dict for all labels")




In [None]:
# Cell 4: adjacency matrix + parents/siblings
import json
import numpy as np
# Dense parent -> child adjacency matrix: adj_upper[p, c] == 1 when the edge
# file lists "p<tab>c".
adj_upper = np.zeros((num_class, num_class), dtype=np.int32)

with open(EDGE_FILE, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # NOTE(review): unlike build_graph_from_files, a line without exactly
        # two tab-separated fields raises here, as does an id >= num_class.
        p_str, c_str = line.split("\t")
        p = int(p_str)
        c = int(c_str)
        adj_upper[p, c] = 1


print("adj_upper shape:", adj_upper.shape)

# ----- Helper functions (same definitions as used earlier) -----
import numpy as np

def build_parents_children(adj: np.ndarray):
    """Derive per-node parent and child index arrays from an adjacency matrix.

    ``adj[p, c]`` non-zero means p is a parent of c. Returns two lists, each
    with one int64 array per row index of ``adj``: ``parents[j]`` holds the
    non-zero rows of column j and ``children[j]`` the non-zero columns of row j.
    """
    parents, children = [], []
    for node in range(adj.shape[0]):
        incoming = np.flatnonzero(adj[:, node])
        outgoing = np.flatnonzero(adj[node])
        parents.append(incoming.astype(np.int64))
        children.append(outgoing.astype(np.int64))
    return parents, children

def build_siblings(parents, children):
    """For every node, collect the other children of any of its parents.

    ``parents[c]`` and ``children[p]`` are index sequences. Returns a list with
    one sorted, duplicate-free int64 array per node; the node itself is never
    included in its own sibling array.
    """
    result = []
    for node, node_parents in enumerate(parents):
        shared = {
            int(other)
            for par in node_parents
            for other in children[par]
            if other != node
        }
        result.append(np.array(sorted(shared), dtype=np.int64))
    return result

# Derive the hierarchy lookups from the adjacency matrix.
parents, children = build_parents_children(adj_upper)
siblings = build_siblings(parents, children)
# Classes without any parent are the top-level roots (integer class ids).
roots = [i for i, ps in enumerate(parents) if len(ps) == 0]

print("built parents & siblings")
print(roots)


In [None]:
# Scratch note: 2.99 at start, 3.05 at end (difference 0.06)
# 257 calls

In [None]:
import pandas as pd
import numpy as np

# Load one generated description per class ("cid" -> "doc" columns).
df = pd.read_csv("Amazon_products/run_20251218T131003Z/label_docs.csv")

# cid -> doc map
cid2doc = df.set_index("cid")["doc"].to_dict()

# Order the descriptions by class id 0..num_class-1 so that row i of the
# embedding matrix belongs to class i. The class count comes from the taxonomy
# (num_class) instead of a hard-coded 531, because the similarity cell slices
# label_emb_gpt with indices in [0, num_class). A missing cid raises KeyError.
gpt_docs = [cid2doc[i] for i in range(num_class)]  # (C,)

# Embed the descriptions as L2-normalised float32 vectors.
label_emb_gpt = model.encode(
    gpt_docs,
    batch_size=128,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
).astype(np.float32)

print("label_emb_gpt:", label_emb_gpt.shape)  # (C, d)


In [None]:
# Cell 3: build label_emb (mean of each label's key-term embeddings)

import numpy as np

# Embedding dimension, read from the first label's key-term matrix.
dim = next(iter(key_term_emb_dict.values())).shape[1]
label_emb = np.zeros((num_class, dim), dtype=np.float32)

# Row i is the mean over all term embeddings (label name + keywords) of class i.
for i in range(num_class):
    label_name = id2label[str(i)]  # e.g. "hair_care"
    term_emb = key_term_emb_dict[label_name]  # (num_terms, dim)
    label_emb[i] = term_emb.mean(axis=0)

print("label_emb shape:", label_emb.shape)  # (num_class, dim)


In [None]:
from sentence_transformers import SentenceTransformer
# NOTE(review): `device` falls back to CPU when CUDA is unavailable, but the
# encoder below is pinned to f"cuda:{GPU}" regardless, so this cell still
# requires a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Separate encoder handle (same model name as the earlier `model`).
model_name = "Alibaba-NLP/gte-base-en-v1.5"
st_model = SentenceTransformer(model_name, device=f"cuda:{GPU}",trust_remote_code=True)

# Encode every corpus document and bring the result back as a NumPy array.
with torch.no_grad():
    total_doc_emb = st_model.encode(
        all_docs,
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    ).cpu().numpy()   # [N_docs, d_doc]

print("total_doc_emb:", total_doc_emb.shape)


In [None]:
# Functions assumed to already exist in an earlier cell/file:
# - hierarchical_beam_similarity_avg
# - compute_conf_doc

def compute_conf_doc(
    sims: np.ndarray,
    cand: np.ndarray,
    parents,
    siblings,
) -> np.ndarray:
    """Score how much each candidate class stands out from its neighbours.

    For every class ``c`` in ``cand`` the result is ``sims[c]`` minus the
    largest similarity found among its parents and siblings. A class with
    neither parents nor siblings is compared against 0.0.

    Args:
        sims: per-class similarity values for one document, indexed by class id.
        cand: candidate class ids.
        parents: ``parents[c]`` is an index array of the parents of class c.
        siblings: ``siblings[c]`` is an index array of the siblings of class c.

    Returns:
        float32 array with the same shape as ``cand``.
    """
    conf = np.empty_like(cand, dtype=np.float32)
    for pos, cls in enumerate(cand):
        # Best similarity inside each non-empty neighbour group.
        group_peaks = [
            sims[group].max()
            for group in (parents[cls], siblings[cls])
            if len(group) > 0
        ]
        strongest_neighbour = 0.0
        if group_peaks:
            strongest_neighbour = max(group_peaks)
        conf[pos] = float(sims[cls] - strongest_neighbour)
    return conf


In [None]:
# ================================================
# (Preparation) build label_term_emb / label_emb
#   - label_term_emb: (C, 11, d)  [0]=label_name, [1:]=keywords(10)
#   - label_emb: (C, d)  mean over the terms
# ================================================
import numpy as np

K = 10          # number of keywords expected per label
T = 1 + K       # label name + keywords

dim = next(iter(key_term_emb_dict.values())).shape[1]

label_term_emb = np.zeros((num_class, T, dim), dtype=np.float32)
label_emb      = np.zeros((num_class, dim), dtype=np.float32)

# Class ids whose term count differs from T; their label_term_emb rows stay zero.
stack = []
for i in range(num_class):
    label_name = id2label[str(i)]                  # underscore version
    term_emb = key_term_emb_dict[label_name].astype(np.float32)  # (11, d) expected

    # The mean is defined for any number of terms, so label_emb is always
    # filled. Previously a label with an unexpected term count was skipped
    # before this assignment and silently kept an all-zero label embedding.
    label_emb[i] = term_emb.mean(axis=0)

    if term_emb.shape[0] != T:
        # The fixed (T, dim) slot cannot hold this label's terms: record the
        # id and leave its label_term_emb row as zeros.
        stack.append(i)
        continue

    label_term_emb[i] = term_emb

print("label_term_emb:", label_term_emb.shape)  # (C, 11, d)
print("label_emb:", label_emb.shape)            # (C, d)
print(stack)


In [None]:
# 5-1. node score similarity (alpha * label_name + beta * keywords_agg)

EPS = 1e-12   # added to norms to avoid division by zero

# ===== Hyperparameters =====
ALPHA = 1.0
BETA  = 1.0
KW_AGG = "topk"   # "topk" or "mean"
K_TOP  = 3        # only used when KW_AGG="topk"
LABEL_CHUNK = 256 # number of labels processed per iteration

# ===== doc embedding =====
doc_emb = total_doc_emb.astype(np.float32)  # (N_docs, d)
# Row-normalise so that dot products below are cosine similarities.
doc_emb_cos = doc_emb / (np.linalg.norm(doc_emb, axis=1, keepdims=True) + EPS)

# ===== label_term_emb must already exist: (C, 11, d), [0]=label_name, [1:]=keywords(10)
# If it does not exist yet, build it first (see the preparation cell).
lab_term = label_term_emb.astype(np.float32)
lab_term_cos = lab_term / (np.linalg.norm(lab_term, axis=2, keepdims=True) + EPS)

N_docs = doc_emb_cos.shape[0]
C = lab_term_cos.shape[0]
K = lab_term_cos.shape[1] - 1  # 10

sim_matrix = np.zeros((N_docs, C), dtype=np.float32)

label_emb_gpt = label_emb_gpt.astype(np.float32)
label_emb_gpt_cos = label_emb_gpt / (np.linalg.norm(label_emb_gpt, axis=1, keepdims=True) + EPS)

# Process labels in chunks of LABEL_CHUNK.
for s in range(0, C, LABEL_CHUNK):
    e = min(C, s + LABEL_CHUNK)
    chunk = lab_term_cos[s:e]  # (Lc, 11, d)
    Lc = e - s

    # label-name cosine: (N, Lc)
    sim_name = doc_emb_cos @ chunk[:, 0, :].T

    # cosine against every keyword: (N, Lc, K)
    kw = chunk[:, 1:, :]  # (Lc, 10, d)
    sim_kw_all = np.einsum("nd,lkd->nlk", doc_emb_cos, kw)

    # keyword aggregate: (N, Lc)
    if KW_AGG == "mean":
        sim_kw = sim_kw_all.mean(axis=2)
    elif KW_AGG == "topk":
        k = min(K_TOP, sim_kw_all.shape[2])
        # np.partition places the k largest keyword similarities in the last k slots.
        topk = np.partition(sim_kw_all, kth=sim_kw_all.shape[2]-k, axis=2)[:, :, -k:]
        sim_kw = topk.mean(axis=2)
    else:
        raise ValueError(f"Unknown KW_AGG={KW_AGG}")

    # ---- (existing) node score ----
    sim_node = ALPHA * sim_name + BETA * sim_kw   # (N, Lc)

    # ---- (added) dot product (= cosine) with the GPT description embedding ----
    sim_gpt = doc_emb_cos @ label_emb_gpt_cos[s:e].T  # (N, Lc)

    # ---- (changed) weighted blend of the two scores: 0.7 node + 0.3 GPT ----
    sim_matrix[:, s:e] = 0.7*sim_node + 0.3* sim_gpt

print("sim_matrix(node+gpt avg):", sim_matrix.shape,
      "range:", float(sim_matrix.min()), "~", float(sim_matrix.max()))

In [None]:

# 5-2. Build path(c) = root → ... → c for every class (uses the parents list)
def build_paths_from_parents(parents_list):
    """
    Compute the root-to-node path of every class.

    parents_list[c]: array of parent indices of class c
    return: dict mapping c -> [root, ..., c]

    When a class has several parents only the first one is followed (a tree is
    assumed). The upward walk also stops if it would revisit a node, which
    guards against cycles.
    """
    paths = {}

    for cid in range(len(parents_list)):
        # Walk upward from cid, collecting nodes in leaf-to-root order.
        chain = [cid]
        seen = {cid}
        node = cid

        while len(parents_list[node]) > 0:
            first_parent = int(parents_list[node][0])
            if first_parent in seen:
                break
            chain.append(first_parent)
            seen.add(first_parent)
            node = first_parent

        # Flip into root → ... → cid order.
        chain.reverse()
        paths[cid] = chain

    return paths

In [None]:

paths = build_paths_from_parents(parents)
print("예시 path[0]:", paths[0], "->", [id2label[str(i)] for i in paths[0]])


# 5-3. Path score
# path score(d, c) = mean_{j ∈ path(c)} sim_matrix[d, j]
N_docs, C = sim_matrix.shape
path_scores = np.zeros_like(sim_matrix, dtype=np.float32)

for cid, path in paths.items():
    vals = sim_matrix[:, path]          # (N_docs, len(path))
    path_scores[:, cid] = vals.mean(axis=1)

print("path_scores:", path_scores.shape)


# 5-4. Select core classes per document (top-k, with confidence mixed in)
TOP_K_CORE   = 2      # maximum number of core classes per document (adjust as needed)
MIN_SCORE    = None   # optional path-score threshold (e.g. 0.2)
CAND_TOP     = 50     # number of candidates used for the confidence computation
CONF_ALPHA   = 0.2    # how strongly confidence is mixed into the path score
CONF_MIN     = 0.0   # candidates with confidence below this are dropped; None disables the filter

core_classes_per_doc = []      # per-document list of core class ids
core_primary         = np.full(N_docs, -1, dtype=np.int32)   # one representative core per document (-1 = none)
core_primary_score   = np.zeros(N_docs, dtype=np.float32)

for i in range(N_docs):
    path_row = path_scores[i]    # (C,) path score(d, c)
    sim_row  = sim_matrix[i]     # (C,) local similarity(d, c) — used for confidence

    # 1) First ordering by path score, descending
    idx_sorted = np.argsort(-path_row)

    # (optional) global path-score threshold
    if MIN_SCORE is not None:
        idx_sorted = [cid for cid in idx_sorted if path_row[cid] >= MIN_SCORE]

    if len(idx_sorted) == 0:
        core_classes_per_doc.append([])
        continue

    # 2) Keep only the top candidates and compute their confidence
    cand = np.array(idx_sorted[:CAND_TOP], dtype=np.int64)

    conf_vals = compute_conf_doc(
        sims=sim_row,      # local sim(d, :)
        cand=cand,
        parents=parents,
        siblings=siblings,
    )  # (len(cand),)

    # (optional) confidence threshold (e.g. drop classes that score below their parent/siblings)
    if CONF_MIN is not None:
        mask = conf_vals >= CONF_MIN
        cand = cand[mask]
        conf_vals = conf_vals[mask]
        if cand.size == 0:
            core_classes_per_doc.append([])
            continue

    # 3) Final score = path score + confidence
    #    → a small alpha, so confidence is mixed in only "a little"
    combined_scores = path_row[cand] + CONF_ALPHA * conf_vals

    order = np.argsort(-combined_scores)
    chosen = cand[order[:TOP_K_CORE]]

    core_classes_per_doc.append(chosen.tolist())

    # Representative core (highest combined score); its stored score is the path score
    best_c = int(chosen[0])
    core_primary[i] = best_c
    core_primary_score[i] = float(path_row[best_c])  # or combined_scores[order[0]]

# Example output
# NOTE(review): if document 0 ends up without a core class, core_primary[0] is
# -1 and the id2label lookup below raises KeyError('-1').
print("예시 문서 0의 core classes (id):", core_classes_per_doc[0])
print("예시 문서 0의 core classes (name):",
      [id2label[str(c)] for c in core_classes_per_doc[0]])
print("예시 문서 0의 대표 core:", core_primary[0],
      id2label[str(core_primary[0])], "score =", core_primary_score[0])



In [None]:
import numpy as np
from collections import Counter

def summarize_core_class_mapping_v2(
    core_classes_per_doc,
    core_primary,
    core_primary_score,
    id2label=None,
    depths=None,
    top_k_labels=20,
):
    """
    Print a text report describing the document -> core class assignment.

    core_classes_per_doc : List[List[int]]  # core class ids of each document
    core_primary         : array-like (N_docs,)  # representative core class id (-1 if none)
    core_primary_score   : array-like (N_docs,)  # path / combined score of the representative core
    id2label             : dict[str,str] or None # label id (as str) -> name
    depths               : array-like (C,) or None  # depth of each label (root=0)
    top_k_labels         : number of rows in the "Top" and "Worst" label tables

    Report sections: [1] distribution of the number of core classes per
    document, [2] quantiles and threshold counts of the representative score,
    [3] per-label core / primary frequencies, [4] per-depth counts (only when
    ``depths`` is given). Returns None.

    An empty ``core_classes_per_doc`` prints the header and a short notice
    instead of failing on the min/max of an empty array.
    """
    # Accept plain lists as well as ndarrays for the boolean-mask indexing below.
    core_primary = np.asarray(core_primary)
    core_primary_score = np.asarray(core_primary_score)

    N_docs = len(core_classes_per_doc)
    core_counts = np.array([len(x) for x in core_classes_per_doc], dtype=int)

    print("========== Core Class Mapping Summary (v2) ==========")
    print(f"총 문서 수                     : {N_docs}")
    print(f"core_primary 정의된 문서 수     : {(core_primary >= 0).sum()}")

    if N_docs == 0:
        # Nothing to aggregate; the sections below need at least one document.
        print("\n요약할 문서가 없음")
        return

    # 1) Distribution of the number of core classes per document
    print("\n[1] 문서당 core class 개수 분포")
    print(f" - 최소 : {core_counts.min()}  / 최대 : {core_counts.max()}")
    for k in sorted(np.unique(core_counts)):
        cnt = int((core_counts == k).sum())
        print(f"   · {k}개 core class 가진 문서 수 : {cnt} ({cnt / N_docs:.3%})")

    # 2) Representative core score distribution (quantiles + thresholds)
    mask_has_primary = core_primary >= 0
    scores = core_primary_score[mask_has_primary]
    if scores.size > 0:
        print("\n[2] 대표 core score 분포 (core_primary가 있는 문서 기준)")
        print(f" - 문서 수 : {scores.size}")
        qs = np.percentile(scores, [0, 25, 50, 75, 100])
        print(
            " - 최소 / 25% / 50% / 75% / 최대 : "
            f"{qs[0]:.4f} / {qs[1]:.4f} / {qs[2]:.4f} / {qs[3]:.4f} / {qs[4]:.4f}"
        )

        for t in [0.6, 0.7, 0.8, 0.9]:
            cnt = int((scores >= t).sum())
            print(f"   · score ≥ {t:.1f} 인 문서 수 : {cnt} ({cnt / scores.size:.3%})")
    else:
        print("\n[2] 대표 core score 분포: core_primary가 정의된 문서가 없음")

    # 3) Per-label core / primary frequencies (Top N + Worst N)
    print("\n[3] label별 core / primary 등장 횟수 (문서 기준)")

    core_freq = Counter(
        int(c) for core_list in core_classes_per_doc for c in core_list
    )
    primary_freq = Counter(int(c) for c in core_primary[mask_has_primary])

    print(f" - 한 번 이상 core로 등장한 label 수     : {len(core_freq)}")
    print(f" - 한 번 이상 primary로 등장한 label 수 : {len(primary_freq)}")

    def _label_name(cid: int) -> str:
        # Fall back to the numeric id when no name mapping is available.
        if id2label is None:
            return str(cid)
        return id2label.get(str(cid), str(cid))

    def _print_label_rows(rows):
        # One formatted line per (class id, core count) pair.
        for cid, cnt in rows:
            name = _label_name(cid)
            p_cnt = primary_freq.get(cid, 0)
            print(f"   · [{cid:4d}] {name:30s}  core docs: {cnt:5d}  | primary docs: {p_cnt:5d}")

    # Top N (most frequent as core)
    print(f"\n   Top {top_k_labels} (core 등장 문서 수 기준)")
    _print_label_rows(core_freq.most_common(top_k_labels))

    # Worst N (least frequent as core) — among labels that appeared at least
    # once, since the Counter only holds labels with a non-zero count.
    print(f"\n   Worst {top_k_labels} (core로 거의 안 뽑힌 label들)")
    _print_label_rows(
        sorted(core_freq.items(), key=lambda x: (x[1], x[0]))[:top_k_labels]
    )

    # 4) Per-depth distribution, only when depth information is supplied
    if depths is not None:
        print("\n[4] depth별 분포 (depths가 주어진 경우)")
        depths = np.asarray(depths, dtype=int)

        # depth distribution over every core assignment
        depth_core_counts = Counter(
            int(depths[c]) for core_list in core_classes_per_doc for c in core_list
        )
        # depth distribution over the representative core of each document
        depth_primary_counts = Counter(
            int(depths[int(c)]) for c in core_primary[mask_has_primary]
        )

        print("   · depth별 core label 등장 횟수:")
        for d in sorted(depth_core_counts.keys()):
            cnt = depth_core_counts[d]
            print(f"     depth {d}: {cnt} 회")

        print("   · depth별 primary label 등장 문서 수:")
        for d in sorted(depth_primary_counts.keys()):
            cnt = depth_primary_counts[d]
            print(f"     depth {d}: {cnt} 문서")


# -----------------------------
# Example call
# -----------------------------
summarize_core_class_mapping_v2(
    core_classes_per_doc = core_classes_per_doc,
    core_primary         = core_primary,
    core_primary_score   = core_primary_score,
    id2label             = id2label,   # pass None if no name mapping is available
    depths               = None,       # pass an np.array of depths to enable section [4]
    top_k_labels         = 60,
)


In [None]:
import pandas as pd
import json
import numpy as np

# ---------------------------------
# 1. Paths (change to match your file names)
# ---------------------------------
GPT_TEXT_PATH  = "Amazon_products/train/generated_docs.txt"    # e.g. 0<TAB>sentence
GPT_LABEL_PATH = "Amazon_products/train/generated_doc2label.json"  # e.g. {"0": {...}, "1": {...}, ...}


# ---------------------------------
# 2. Load the generated texts
#    The first whitespace-delimited token of each line is the doc_id, the
#    remainder of the line is the text.
# ---------------------------------
rows = []
with open(GPT_TEXT_PATH, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        parts = line.split(maxsplit=1)  # split off the first token from the rest
        if len(parts) == 1:
            # A line holding only an id: warn and skip it.
            print(f"[경고] 텍스트 없음 (line {line_no}): {line!r}")
            continue
        doc_id, text = parts[0], parts[1]
        rows.append((doc_id, text))

df_gpt = pd.DataFrame(rows, columns=["doc_id", "text"])
df_gpt["doc_id"] = df_gpt["doc_id"].astype(str)

print("GPT 텍스트 로드:", df_gpt.shape)
print(df_gpt.head(2))

# ---------------------------------
# 3. Load the label JSON
# ---------------------------------
with open(GPT_LABEL_PATH, "r", encoding="utf-8") as f:
    gpt_label_info = json.load(f)

print("GPT 라벨 키 수:", len(gpt_label_info))
example_key = list(gpt_label_info.keys())[0]
print("라벨 예시:", example_key, "->", gpt_label_info[example_key])

# ---------------------------------
# 4. Sort by doc_id and check that texts and labels line up
# ---------------------------------
text_ids  = set(df_gpt["doc_id"].tolist())
label_ids = set(gpt_label_info.keys())

print("\n텍스트 id 개수:", len(text_ids))
print("라벨   id 개수:", len(label_ids))
print("교집합 개수  :", len(text_ids & label_ids))

missing_in_labels = sorted(text_ids - label_ids)
missing_in_texts  = sorted(label_ids - text_ids)

# Show at most the first 10 mismatching ids on each side.
if missing_in_labels:
    print("\n[경고] 라벨이 없는 텍스트 id들:", missing_in_labels[:10], "...")
if missing_in_texts:
    print("[경고] 텍스트가 없는 라벨 id들:", missing_in_texts[:10], "...")

# Sort by id. doc_id is a string column, so the order is lexicographic
# ("10" sorts before "2"), not numeric.
df_gpt = df_gpt.sort_values("doc_id").reset_index(drop=True)
print("\n정렬 후 doc_id 상위 몇 개:", df_gpt["doc_id"].head().tolist())


In [None]:
import numpy as np

# Take the class count from the taxonomy rather than hard-coding 531: the GPT
# multi-hot rows built with this width are later concatenated with Y_train,
# whose width is the taxonomy's class count, so the two must agree.
num_classes = len(id2label)

def build_multihot_from_info(info, num_classes: int):
    """
    Turn one label record into a multi-hot target vector.

    Example ``info``:
    {
      "core_classes": "2",
      "with ancestors": ["0", "1", "2"]
    }

    Only the "with ancestors" entry is read. Each id in it is converted with
    int(); ids outside [0, num_classes) are ignored.

    Returns a float32 vector of length ``num_classes`` with 1.0 at every
    accepted id and 0.0 elsewhere.
    """
    target = np.zeros(num_classes, dtype=np.float32)
    for class_id in (int(raw) for raw in info["with ancestors"]):
        if class_id < 0 or class_id >= num_classes:
            continue
        target[class_id] = 1.0
    return target

# Parallel lists: generated text and its multi-hot label vector.
gpt_texts = []
gpt_Y_all = []

for _, row in df_gpt.iterrows():   # df_gpt: doc_id, text (built in the loading cell)
    doc_id = row["doc_id"]
    text   = row["text"]

    # Texts without a label record are dropped.
    if doc_id not in gpt_label_info:
        continue

    info = gpt_label_info[doc_id]
    y = build_multihot_from_info(info, num_classes)

    gpt_texts.append(text)
    gpt_Y_all.append(y)

# np.stack raises ValueError if no text had a label record.
gpt_Y_all = np.stack(gpt_Y_all, axis=0)  # [N_gpt, C]

print("GPT 샘플 수:", len(gpt_texts))
print("gpt_Y_all shape:", gpt_Y_all.shape)


In [None]:
# Encode the generated texts with the same encoder used for the corpus.
with torch.no_grad():
    gpt_emb = st_model.encode(
        gpt_texts,
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    ).cpu().numpy()   # [N_gpt, d_doc] — one row per entry of gpt_texts



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# 1) Reuse the embeddings computed above
doc_emb_np   = total_doc_emb        # [N_docs, d_doc]
label_emb_np = label_emb            # [num_class, d_lab]

N_docs, d_doc = doc_emb_np.shape
# This rebinds the module-level num_classes to the number of label_emb rows.
num_classes, d_lab = label_emb_np.shape
print("N_docs:", N_docs, "| num_classes:", num_classes)

# 2) Adjacency for the GAT (adj_upper → symmetric + self-loops)
adj_np = adj_upper.astype(np.float32)
adj_np = adj_np + adj_np.T          # make parent-child edges bidirectional
np.fill_diagonal(adj_np, 1.0)       # self-loop on every class

adj = torch.from_numpy(adj_np).to(device)           # [C, C]
init_class_emb = torch.from_numpy(label_emb_np).float().to(device)  # [C, d_lab]



In [None]:
# core_classes_per_doc: length N_docs, one list of core class ids per document (e.g. [3, 27, 81] …)
N_docs     = total_doc_emb.shape[0]
num_labels = num_class   # = label_emb.shape[0]

# Multi-hot target matrix: Y_ALL[i, c] == 1.0 when c is a core class of
# document i. Documents with an empty core list keep an all-zero row.
Y_ALL = np.zeros((N_docs, num_labels), dtype=np.float32)
for i, cores in enumerate(core_classes_per_doc):
    for cid in cores:
        Y_ALL[i, cid] = 1.0


In [None]:
# core_primary: (N_docs,) representative class id per document, -1 if none
core_primary = np.array(core_primary, dtype=np.int64)

# Keep only documents with a valid representative label (drop -1).
# NOTE(review): in the cells visible here doc_train is only used for its
# embedding width (model construction) and y_idx is not used at all; the
# split below is taken over ALL documents, including those with no core class.
mask = core_primary >= 0
doc_train = total_doc_emb[mask]          # [N_use, d_doc]
y_idx     = core_primary[mask]           # [N_use]


# Shuffle the document indices with a fixed seed, then split 80 / 20.
indices = np.arange(N_docs)
np.random.seed(42)
np.random.shuffle(indices)

split = int(N_docs * 0.8)
train_idx = indices[:split]
val_idx   = indices[split:]

X_train = total_doc_emb[train_idx]
Y_train = Y_ALL[train_idx]
X_val   = total_doc_emb[val_idx]
Y_val   = Y_ALL[val_idx]

print("train size:", X_train.shape[0])
print("val size  :", X_val.shape[0])



In [None]:
# 1) Keep the existing train/val split
print("원래 X_train:", X_train.shape, "Y_train:", Y_train.shape)
print("원래 X_val  :", X_val.shape,   "Y_val  :", Y_val.shape)

# 2) Append the generated samples to the training set only.
#    Both concatenations require matching column counts (embedding width for
#    X, class count for Y).
X_train_aug = np.concatenate([X_train, gpt_emb], axis=0)      # [N_train + N_gpt, d_doc]
Y_train_aug = np.concatenate([Y_train, gpt_Y_all], axis=0)    # [N_train + N_gpt, C]

print("증강 X_train:", X_train_aug.shape)
print("증강 Y_train:", Y_train_aug.shape)
print("val은 그대로 :", X_val.shape, Y_val.shape)


In [None]:
from torch.utils.data import Dataset, DataLoader

class DocLabelDataset(Dataset):
    """Pairs each document embedding with its target vector.

    Both NumPy inputs are converted once, at construction time, to float32
    tensors. Indexing returns the tuple ``(doc[idx], y[idx])``; the length is
    the number of rows in the document array.
    """

    def __init__(self, doc_np, y_np):
        self.doc = torch.from_numpy(doc_np).to(torch.float32)
        self.y = torch.from_numpy(y_np).to(torch.float32)

    def __len__(self):
        return self.doc.shape[0]

    def __getitem__(self, idx):
        features = self.doc[idx]
        target = self.y[idx]
        return features, target

batch_size = 256

# Training uses the augmented set; validation uses the untouched hold-out split.
train_dataset = DocLabelDataset(X_train_aug, Y_train_aug)
val_dataset   = DocLabelDataset(X_val,   Y_val)


# Only the training loader shuffles.
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    num_workers=2, pin_memory=True,
)
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True,
)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelGATv2(nn.Module):
    """Single multi-head graph-attention layer over the label graph.

    Label features are projected per head, an attention score is computed for
    every pair of labels, scores of non-adjacent pairs are masked out, and each
    label aggregates its neighbours' activated features. The result goes
    through an output projection, an optional residual connection and L2
    normalisation.

    NOTE(review): the pairwise score is LeakyReLU(alpha_i + alpha_j) with one
    attention vector per head shared by both endpoints — confirm that this is
    the intended attention variant for a class named "GATv2".
    """

    def __init__(self, in_dim, hidden_dim, out_dim, adj, num_heads=4, dropout=0.1):
        """
        in_dim     : dimension of the input label embeddings
        hidden_dim : hidden dimension per head
        out_dim    : dimension of the output label embeddings
        adj        : [C, C] adjacency (0/1 or positive values, self-loops included)
        num_heads  : number of attention heads
        dropout    : dropout probability applied to the attention coefficients
        """
        super().__init__()
        # Stored as a buffer so it moves with the module but is not trained.
        self.register_buffer("adj", adj)  # [C, C]

        self.num_heads = num_heads
        self.head_dim = hidden_dim

        # node feature projection: W
        self.lin = nn.Linear(in_dim, num_heads * hidden_dim, bias=False)

        # one attention vector a_h per head
        self.att = nn.Parameter(torch.empty(num_heads, hidden_dim))

        self.leakyrelu = nn.LeakyReLU(0.2)
        self.dropout = nn.Dropout(dropout)

        self.fc_out = nn.Linear(num_heads * hidden_dim, out_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform init for all weight matrices; zero the output bias."""
        nn.init.xavier_uniform_(self.lin.weight)
        nn.init.xavier_uniform_(self.att)
        nn.init.xavier_uniform_(self.fc_out.weight)
        if self.fc_out.bias is not None:
            nn.init.zeros_(self.fc_out.bias)

    def forward(self, x):
        """
        x : [C, in_dim]  (current label embeddings)
        return : [C, out_dim], each row L2-normalised
        """
        C = x.size(0)

        # 1) Linear projection, then reshape into heads
        h = self.lin(x)                              # [C, H*D]
        h = h.view(C, self.num_heads, self.head_dim) # [C, H, D]

        # Non-linearity is applied to the projected features before scoring
        h_act = F.elu(h)                             # [C, H, D]

        # 2) Attention scores
        #   alpha[i,h] = a_h^T * h_act[i,h,:]
        alpha = (h_act * self.att.unsqueeze(0)).sum(-1)  # [C, H]

        # e_ij^h = LeakyReLU(alpha[i,h] + alpha[j,h])
        e = alpha.unsqueeze(1) + alpha.unsqueeze(0)      # [C, C, H]
        e = self.leakyrelu(e)

        # Mask with the adjacency (pairs without an edge get -inf).
        # A row with no positive adjacency entry would become all -inf and
        # softmax would return NaN, hence the self-loop requirement on adj.
        mask = (self.adj > 0).unsqueeze(-1)              # [C, C, 1]
        e = e.masked_fill(~mask, float('-inf'))

        # 3) Softmax into attention coefficients
        #   for each target i, normalise over its neighbours j (dim=1)
        attn = F.softmax(e, dim=1)      # [C, C, H]
        attn = self.dropout(attn)

        # 4) Neighbour aggregation: out[i,h,:] = Σ_j attn[i,j,h] * h_act[j,h,:]
        attn_h_first = attn.permute(2, 0, 1)   # [H, C, C]
        h_h_first    = h_act.permute(1, 0, 2)  # [H, C, D]

        out_heads = torch.bmm(attn_h_first, h_h_first)   # [H, C, D]
        out = out_heads.permute(1, 0, 2).contiguous()    # [C, H, D]
        out = out.view(C, self.num_heads * self.head_dim)  # [C, H*D]

        out = self.fc_out(out)  # [C, out_dim]
        out = F.elu(out)

        # residual: add x when input and output dimensions match
        if x.size(1) == out.size(1):
            out = out + x

        # label embeddings are normalised here; the author notes this is usually more stable
        return F.normalize(out, dim=1)


In [None]:
import math
import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn import init

# transformers.AutoModel 는 이제 안 써도 됨
# from transformers import AutoModel


class ClassModel(nn.Module):
    """Scores document embeddings against graph-refined label embeddings.

    The label embeddings are a trainable parameter initialised from
    ``init_class_emb``. On every forward pass they are passed through a
    ``LabelGATv2`` layer and then matched with the document embeddings by the
    bilinear ``LBM`` module.
    """

    def __init__(self, enc_dim, init_class_emb: torch.Tensor,
                 adj_norm: torch.Tensor, gnn_hidden=256):
        """
        enc_dim        : dimension of the incoming document embeddings
        init_class_emb : [C, label_dim] initial label embeddings (cloned and detached)
        adj_norm       : [C, C] adjacency handed to the GAT layer, which uses it
                         only as a mask (entries > 0 are edges)
        gnn_hidden     : hidden dimension per attention head
        """
        super().__init__()

        self.doc_dim = enc_dim
        self.num_classes, self.label_dim = init_class_emb.size()

        # Trainable copy of the initial label embeddings.
        self.label_embedding = nn.Parameter(
            init_class_emb.clone().detach(), requires_grad=True
        )


        # Label encoder: graph attention layer (output dim = input dim)
        self.label_gnn = LabelGATv2(
            in_dim=self.label_dim,
            hidden_dim=gnn_hidden,
            out_dim=self.label_dim,
            adj=adj_norm,          # or adj_mask
            num_heads=4,
            dropout=0.1,
        )

        # Bilinear document/label interaction without a per-class bias.
        self.interaction = LBM(
            self.doc_dim,
            self.label_dim,
            n_classes=self.num_classes,
            bias=False,
        )

    def forward(self, doc_emb):
        """Return raw scores of shape [B, C] for doc_emb of shape [B, enc_dim]."""
        # doc_emb: [B, enc_dim]
        label_emb = self.label_gnn(self.label_embedding)    # [C, label_dim]
        scores = self.interaction(doc_emb, label_emb)       # [B, C]
        return scores







class LBM(nn.Module):
    """Bilinear matcher: ``scores = e1 @ W @ e2.T (+ bias)``.

    Args:
        l_dim: Feature width of the left input (document embeddings).
        r_dim: Feature width of the right input (label embeddings).
        n_classes: Number of rows expected in the right input; only needed to
            size the bias vector, so it is required when ``bias=True``.
        bias: Whether to add a learnable per-class bias to the scores.

    Raises:
        ValueError: If ``bias=True`` but ``n_classes`` is not given.
    """

    def __init__(self, l_dim, r_dim, n_classes=None, bias=True):
        super(LBM, self).__init__()
        if bias and n_classes is None:
            # Without this check the failure surfaces as an opaque TypeError
            # from the tensor constructor.
            raise ValueError("n_classes must be provided when bias=True")

        self.use_bias = bias
        self.weight = Parameter(torch.empty(l_dim, r_dim))
        if self.use_bias:
            self.bias = Parameter(torch.empty(n_classes))
        else:
            # Registered as None so the attribute exists but stays out of
            # state_dict(), keeping checkpoints compatible.
            self.register_parameter("bias", None)

        # Uniform init scaled by the left input width; weight first, then
        # bias, so the RNG consumption order is unchanged.
        bound = 1.0 / math.sqrt(l_dim)
        init.uniform_(self.weight, -bound, bound)
        if self.use_bias:
            init.uniform_(self.bias, -bound, bound)

    def forward(self, e1, e2):
        """
        e1: tensor of size (batch_size, l_dim)  -> doc emb
        e2: tensor of size (n_classes, r_dim)   -> label emb
        return: tensor of size (batch_size, n_classes)
        """
        scores = torch.matmul(torch.matmul(e1, self.weight), e2.T)
        if self.use_bias:
            scores = scores + self.bias
        return scores



In [None]:
# Instantiate the classifier and move it to `device`.
# enc_dim is taken from the second dimension of `doc_train`; the initial
# class embeddings and the label-graph adjacency (`adj`) come from earlier cells.
model = ClassModel(
    enc_dim=doc_train.shape[1],
    init_class_emb=init_class_emb,
    adj_norm=adj,
    gnn_hidden=256,
).to(device)



In [None]:
from sklearn.metrics import f1_score
import numpy as np
import torch
import torch.nn.functional as F

def logits_to_pred_binary(logits, threshold=0.5, min_k=1, max_k=3):
    """Turn a batch of logits into a 0/1 prediction matrix.

    For each row, every class whose sigmoid probability reaches ``threshold``
    is selected. If fewer than ``min_k`` classes qualify, the highest-scoring
    remaining classes are added until ``min_k`` are selected. If more than
    ``max_k`` are selected (and ``max_k`` is not None), only the ``max_k``
    most probable are kept.

    Args:
        logits: ``[B, C]`` tensor.
    Returns:
        ``[B, C]`` numpy array of int32 zeros and ones.
    """
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    n_rows, _ = probs.shape
    pred_bin = np.zeros_like(probs, dtype=np.int32)

    for row in range(n_rows):
        scores = probs[row]
        selected = np.flatnonzero(scores >= threshold).tolist()

        # Lower bound: top up from the ranking until min_k classes are chosen.
        if len(selected) < min_k:
            for cand in np.argsort(-scores):
                if cand not in selected:
                    selected.append(cand)
                if len(selected) >= min_k:
                    break

        # Upper bound: keep only the max_k most probable classes.
        if max_k is not None and len(selected) > max_k:
            selected.sort(key=lambda j: -scores[j])
            selected = selected[:max_k]

        pred_bin[row, selected] = 1

    return pred_bin


def evaluate_f1(model, dev_loader, device,
                threshold=0.5, min_k=1, max_k=3):
    """Compute micro and macro F1 of ``model`` over ``dev_loader``.

    Each batch from ``dev_loader`` is expected to be a pair
    ``(doc_emb, labels)``:
        - doc_emb : [B, enc_dim] document embeddings
        - labels  : [B, C] multi-hot targets
    Predictions are binarised with ``logits_to_pred_binary`` using the given
    ``threshold`` / ``min_k`` / ``max_k``.

    Returns:
        ``(micro_f1, macro_f1)``.
    """
    model.eval()
    true_chunks = []
    pred_chunks = []

    with torch.no_grad():
        for batch_emb, batch_labels in dev_loader:
            # The model consumes embeddings only; targets are turned into numpy.
            batch_logits = model(batch_emb.to(device))  # [B, C]
            pred_chunks.append(
                logits_to_pred_binary(
                    batch_logits,
                    threshold=threshold,
                    min_k=min_k,
                    max_k=max_k,
                )
            )
            true_chunks.append(batch_labels.cpu().numpy())

    y_true = np.concatenate(true_chunks, axis=0)
    y_pred = np.concatenate(pred_chunks, axis=0)

    micro, macro = (
        f1_score(y_true, y_pred, average=mode, zero_division=0)
        for mode in ("micro", "macro")
    )
    return micro, macro


In [None]:
import torch
import torch.nn.functional as F

def build_pos_neg_masks_from_core(core_label_lists, num_labels, parents, children, device):
    """Build positive / negative class masks from each document's core labels.

    For a document with core labels Ci:
        positives = Ci plus every entry of ``parents[c]`` for c in Ci
        negatives = all classes minus positives minus every entry of
                    ``children[c]`` for c in Ci
    Documents with no core labels get all-zero rows in both masks.

    Args:
        core_label_lists: List[List[int]], core label indices per document.
        num_labels: Total number of classes C.
        parents, children: Indexable by class index, yielding iterables of
            class indices.
        device: Device on which the masks are allocated.
    Returns:
        pos_mask : [B, C] float (1.0 at positive positions)
        neg_mask : [B, C] float (1.0 at negative positions)
    """
    n_docs = len(core_label_lists)
    pos_mask = torch.zeros(n_docs, num_labels, dtype=torch.float32, device=device)
    neg_mask = torch.zeros_like(pos_mask)

    universe = frozenset(range(num_labels))

    for row, core in enumerate(core_label_lists):
        if not core:
            # No core labels: leave this row empty in both masks.
            continue

        core_set = set(core)
        ancestors = set().union(*(parents[c] for c in core_set))
        core_children = set().union(*(children[c] for c in core_set))

        positives = core_set | ancestors
        # Children of core labels are neither positive nor negative.
        negatives = universe - positives - core_children

        pos_mask[row, list(positives)] = 1.0
        neg_mask[row, list(negatives)] = 1.0

    return pos_mask, neg_mask


In [None]:
def hierarchical_bce_loss(logits, core_label_lists, num_labels, parents, children,
                          pos_alpha=3.0, neg_alpha=1.0,
                          pos_target=0.9, neg_target=0.1):
    """Label-smoothed, hierarchy-aware binary cross-entropy.

    Positive and negative class sets per document come from
    ``build_pos_neg_masks_from_core``; classes in neither mask contribute
    nothing. Each document's loss is the weighted sum over its masked classes
    divided by the total weight of those classes, and the result is the mean
    over documents that have at least one masked class.

    Args:
        logits: ``[B, C]`` raw scores.
        core_label_lists: List[List[int]], core label indices per document.
        num_labels: Total number of classes C.
        parents, children: Hierarchy lookups forwarded to the mask builder.
        pos_alpha, neg_alpha: Weights of the positive / negative terms.
        pos_target, neg_target: Smoothed targets (defaults 0.9 and 0.1).
    Returns:
        Scalar loss tensor that is always connected to ``logits``' graph.
    """
    device = logits.device
    pos_mask, neg_mask = build_pos_neg_masks_from_core(
        core_label_lists, num_labels, parents, children, device
    )

    log_p = F.logsigmoid(logits)        # log sigmoid(x),      [B, C]
    log_1m_p = F.logsigmoid(-logits)    # log(1 - sigmoid(x)), [B, C]

    # Soft-target cross-entropy: positives aim at pos_target, negatives at neg_target.
    loss_pos = -pos_alpha * pos_mask * (
        pos_target * log_p + (1.0 - pos_target) * log_1m_p
    )
    loss_neg = -neg_alpha * neg_mask * (
        neg_target * log_p + (1.0 - neg_target) * log_1m_p
    )

    per_doc_loss = (loss_pos + loss_neg).sum(dim=1)  # [B]

    # Normalise by the total weight rather than the raw count of masked classes.
    per_doc_denom = (
        pos_alpha * pos_mask + neg_alpha * neg_mask
    ).sum(dim=1) + 1e-8

    per_doc_loss = per_doc_loss / per_doc_denom

    # Rows whose masks are empty only carry the 1e-8 epsilon and are dropped.
    valid = per_doc_denom > 1e-6
    if valid.any():
        return per_doc_loss[valid].mean()

    # No usable document in the batch: return a zero that is still attached to
    # the graph so the caller's loss.backward() does not raise.
    return logits.sum() * 0.0


In [None]:
from torch.utils.data import Dataset
import torch

class UnlabeledEmbDataset(Dataset):
    """Dataset over pre-computed embeddings of unlabeled documents.

    Each item is ``(embedding, index)``; the index is the position in the
    original tensor so callers can track which documents they have seen.
    """

    def __init__(self, emb_tensor):
        # emb_tensor: [N_unlab, enc_dim], embeddings computed ahead of time.
        self.emb_tensor = emb_tensor

    def __len__(self):
        n_docs = self.emb_tensor.shape[0]
        return n_docs

    def __getitem__(self, idx):
        # Returns the [enc_dim] embedding together with its original index.
        doc_vec = self.emb_tensor[idx]
        return doc_vec, idx


In [None]:


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----- mpnet-based class_emb setup -----
# NOTE(review): `label_emb` is expected to be a 2-D numpy array from an
# earlier cell (it is unpacked into two dims and passed to torch.from_numpy).
num_labels, enc_dim = label_emb.shape
class_emb = torch.from_numpy(label_emb).float()   # [C, d]
class_emb = F.normalize(class_emb, dim=1)        # optional, but usually helps
class_emb = class_emb.to(device)



# ----- optimizer / loss -----
# NOTE(review): the original note suggested roughly 20 epochs because pseudo
# labels were meant to start after epoch 5; the value actually used is 140.
EPOCHS           = 140                 # number of training epochs
PSEUDO_START_EP  = 5                  # pseudo-labelling starts after this epoch
PSEUDO_THRESH    = 0.45               # pseudo label threshold
lr               = 1e-3
weight_decay     = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# NOTE(review): `criterion` does not appear to be used by the training loop,
# which calls hierarchical_bce_loss instead — confirm before removing.
criterion = nn.BCEWithLogitsLoss()

# Best dev macro-F1 seen so far and the matching model weights.
best_macro = -1.0
best_state = None



In [None]:
# Sanity check: number of mini-batches produced by the training loader.
print(len(train_loader))

In [None]:

for epoch in range(1, EPOCHS + 1):
    # ===== 1) train on whatever train_loader yields =====
    model.train()
    total_loss = 0.0

    for doc_emb, labels in train_loader:   # batches are (doc_emb, multi-hot labels)
        doc_emb = doc_emb.to(device)       # [B, enc_dim]
        labels  = labels.to(device)        # [B, C]

        optimizer.zero_grad()

        logits = model(doc_emb)            # [B, C]

        # Recover each document's core label indices from the multi-hot row.
        core_label_lists = [
            (row > 0.5).nonzero(as_tuple=False).squeeze(1).tolist()
            for row in labels
        ]

        # An alternative call to hierarchical_bce_loss_v2 (separate core /
        # ancestor / negative weights and targets) used to sit here as a
        # disabled string literal; hierarchical_bce_loss is the loss in use.
        loss_sup = hierarchical_bce_loss(
            logits,
            core_label_lists,
            num_labels,
            parents,
            children,
        )

        loss = loss_sup
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # ===== 2) dev F1 =====
    micro_f1, macro_f1 = evaluate_f1(model, val_loader, device)

    print(f"[Epoch {epoch}/{EPOCHS}] "
          f"train_loss={avg_loss:.4f}  "
          f"dev_micro={micro_f1:.4f}  dev_macro={macro_f1:.4f}")

    # Keep the weights of the epoch with the best dev macro-F1.
    if macro_f1 > best_macro:
        best_macro = macro_f1
        # .cpu() returns the very same tensor when the model already lives on
        # the CPU, so clone to get a snapshot that later optimizer steps
        # cannot overwrite.
        best_state = {
            k: v.detach().cpu().clone() for k, v in model.state_dict().items()
        }
        print("  -> new best macro F1, saving state")




In [None]:
import os

# Resolve the checkpoint path unconditionally: the following cell reloads the
# weights from `save_path`, so it must be defined even when no new best state
# was recorded in this session (e.g. training was skipped).
save_path = os.path.join(DATA_DIR, "baba3_ablation_woutgnn_with_gpt.pt")

if best_state is not None:
    torch.save(best_state, save_path)
    print("saved best model to:", save_path)
else:
    print("no best state recorded in this session; nothing written to:", save_path)




In [None]:
# Restore the saved weights into `model`; weights_only=True restricts
# torch.load to tensor data instead of arbitrary pickled objects.
model.load_state_dict(torch.load(save_path, weights_only=True))

In [None]:
import os
import csv
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Read the test corpus: one "doc_id<TAB>text" record per non-empty line.
test_corpus_path = os.path.join(DATA_DIR, "test", "test_corpus.txt")

test_doc_ids, test_texts = [], []

with open(test_corpus_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if line:
            # Split on the first tab only, so tabs inside the text are kept.
            doc_id, text = line.split("\t", 1)
            test_doc_ids.append(doc_id)
            test_texts.append(text)

print(f"# test docs: {len(test_doc_ids)}")


In [None]:
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Encode every test document with the sentence encoder from an earlier cell
# (`st_model`); gradients are not needed for inference.
with torch.no_grad():
    test_emb = st_model.encode(
        test_texts,
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    ).cpu()   # keep the embeddings on the CPU (avoids DataLoader worker issues)

print("test_emb shape:", test_emb.shape)


In [None]:
from torch.utils.data import Dataset

class TestEmbDataset(Dataset):
    """Dataset over pre-computed test embeddings; each item is one ``[enc_dim]`` row."""

    def __init__(self, emb_tensor):
        # Accept lists / numpy arrays as well as tensors.
        tensor = (
            emb_tensor
            if isinstance(emb_tensor, torch.Tensor)
            else torch.as_tensor(emb_tensor)
        )
        # Always store the data on the CPU, even if it arrived on a GPU.
        self.emb_tensor = tensor.cpu() if tensor.is_cuda else tensor  # [N_test, enc_dim]

    def __len__(self):
        n_docs = self.emb_tensor.shape[0]
        return n_docs

    def __getitem__(self, idx):
        row = self.emb_tensor[idx]   # [enc_dim]
        return row


# Wrap the test embeddings in a DataLoader, keeping the original order.
# NOTE(review): the inference cell later in this notebook slices `test_emb`
# directly, so this loader may be unused — confirm before relying on it.
test_dataset = TestEmbDataset(test_emb)
test_loader  = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=4,   # drop to 0 if worker processes raise errors
)


In [None]:
import numpy as np
import torch

CAND_TOP   = 50     # number of candidates kept for the confidence computation
CONF_ALPHA = 0.2    # weight of the confidence term added to the path score
CONF_MIN   = None   # set to e.g. 0.0 to drop candidates with conf below it

def decode_core_from_logits(
    logits: torch.Tensor,
    parents,
    siblings,
    paths,
    top_k_min: int = 2,
    top_k_max: int = 3,
    cand_top: int = CAND_TOP,
    conf_alpha: float = CONF_ALPHA,
    conf_min: float | None = CONF_MIN,
):
    """Select between ``top_k_min`` and ``top_k_max`` classes per document.

    Per document:
      1. the path score of class ``c`` is the mean sigmoid probability over
         the class indices in ``paths[c]``;
      2. the ``cand_top`` classes with the best path score become candidates;
      3. ``compute_conf_doc`` scores each candidate from ``parents`` and
         ``siblings``; candidates below ``conf_min`` are dropped when it is set;
      4. candidates are ranked by ``path + conf_alpha * conf`` and the best
         ``top_k_max`` are kept;
      5. if fewer than ``top_k_min`` remain, the best remaining classes by
         path score are added. This also applies when the ``conf_min`` filter
         removed every candidate.

    Args:
        logits: ``[B, C]`` model scores.
        parents, siblings: Hierarchy lookups forwarded to ``compute_conf_doc``.
        paths: Mapping from class index to a sequence of class indices.
    Returns:
        ``[B, C]`` boolean numpy array marking the selected classes.
    """
    # detach() keeps this usable outside torch.no_grad(), matching
    # logits_to_pred_binary.
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    B, C = probs.shape

    pred_bin = np.zeros((B, C), dtype=bool)

    # 1) Path scores for the whole batch: one pass over `paths` per batch
    #    instead of one pass per document.
    path_scores = np.zeros((B, C), dtype=np.float32)
    for cid, path in paths.items():
        path_scores[:, cid] = probs[:, path].mean(axis=1)

    for i in range(B):
        sim_row = probs[i]          # (C,)
        path_row = path_scores[i]   # (C,)

        # 2) First-stage ranking by path score.
        idx_sorted = np.argsort(-path_row)
        cand = np.array(idx_sorted[:cand_top], dtype=np.int64)

        # 3) Confidence of each candidate relative to parents / siblings.
        conf_vals = compute_conf_doc(
            sims=sim_row,
            cand=cand,
            parents=parents,
            siblings=siblings,
        )  # (len(cand),)

        # Optional confidence threshold. An empty result is allowed here and
        # is handled by the lower-bound fill below.
        if conf_min is not None:
            mask = conf_vals >= conf_min
            cand = cand[mask]
            conf_vals = conf_vals[mask]

        # 4) Final score: path + alpha * conf.
        combined = path_row[cand] + conf_alpha * conf_vals
        order = np.argsort(-combined)

        # Upper bound: at most top_k_max classes.
        k = min(top_k_max, len(order))
        chosen = cand[order[:k]]

        # 5) Lower bound: top up by path score until top_k_min are chosen.
        if len(chosen) < top_k_min:
            need = top_k_min - len(chosen)
            taken = set(chosen.tolist())
            extra = []
            for c in idx_sorted:
                if len(extra) >= need:
                    break
                if c not in taken:
                    extra.append(c)
            if extra:
                chosen = np.concatenate([chosen, np.array(extra, dtype=np.int64)])

        pred_bin[i, chosen] = True

    return pred_bin


In [None]:
# Decode the test set batch by batch and collect label ids per document.
TOP_K_MIN = 2
TOP_K_MAX = 3
BATCH_SIZE = 64

model.eval()
predictions = {}
N = test_emb.shape[0]

with torch.no_grad():
    for start in range(0, N, BATCH_SIZE):
        end = min(start + BATCH_SIZE, N)
        doc_emb_b = test_emb[start:end].to(device)   # [B, enc_dim]
        logits = model(doc_emb_b)                    # [B, C]

        # Core-class style decoding rather than plain thresholding.
        pred_bin = decode_core_from_logits(
            logits,
            parents=parents,
            siblings=siblings,
            paths=paths,
            top_k_min=TOP_K_MIN,
            top_k_max=TOP_K_MAX,
        )  # [B, C] bool

        for offset, row in enumerate(pred_bin):
            doc_id = test_doc_ids[start + offset]
            # Indices of the selected labels, stored as strings.
            predictions[doc_id] = [str(k) for k in np.flatnonzero(row)]


In [None]:
# Write the submission CSV: one row per test document, labels comma-joined.
submit_path = os.path.join(DATA_DIR, "baba3_ablation_woutgnn_with_gpt.csv")
with open(submit_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])  # adjust the header to the required format
    # Each prediction is a list such as ["3", "40", "169"].
    writer.writerows(
        [doc_id, ",".join(predictions[doc_id])] for doc_id in test_doc_ids
    )

print("saved submission to:", submit_path)


In [None]:
import pandas as pd
from collections import Counter
import numpy as np
import re

# Read every column as text. Otherwise pandas may parse a label column made of
# single ids plus empty cells as float ("3.0"), which int() below rejects.
df = pd.read_csv(submit_path, dtype=str)

print("=== 기본 정보 ===")
print("행 개수 (docs):", len(df))
print("컬럼:", list(df.columns))

# Pick the label column automatically ("label" or "labels").
if "label" in df.columns:
    label_col = "label"
elif "labels" in df.columns:
    label_col = "labels"
else:
    raise ValueError("라벨 컬럼명을 찾을 수 없습니다. 'label' 또는 'labels'가 필요합니다.")

label_strs = df[label_col].fillna("").astype(str)

all_labels_flat  = []         # individual numeric labels
combo_counter    = Counter()  # counts of label combinations such as (169, 199)
label_len_counter = Counter() # distribution of labels per document

for s in label_strs:
    s = s.strip(",")
    if not s:
        combo = ()
    else:
        # Split on any mix of commas and whitespace: "3 40,169" -> ["3","40","169"]
        parts = re.split(r"[,\s]+", s)
        parts = [x for x in parts if x != ""]
        combo = tuple(int(x) for x in parts)

    if combo:
        combo_counter[combo] += 1
        label_len_counter[len(combo)] += 1
        all_labels_flat.extend(combo)
    else:
        label_len_counter[0] += 1

# 1) Distribution of the number of labels per document
print("\n=== 문서당 라벨 개수 분포 ===")
for k in sorted(label_len_counter.keys()):
    print(f"{k}개 라벨: {label_len_counter[k]} docs")

# min()/max() raise on an empty sequence, so skip them for an empty file.
if label_len_counter:
    print("min labels/doc:", min(label_len_counter.keys()))
    print("max labels/doc:", max(label_len_counter.keys()))

# 2) Statistics over individual labels
single_counter = Counter(all_labels_flat)
unique_single_labels = sorted(single_counter.keys())
print("\n=== 개별 라벨 기준 ===")
print("고유 라벨 개수:", len(unique_single_labels))

# NOTE(review): hard-coded class count — confirm it matches the label file.
TOTAL_CLASSES = 531
print("coverage(single):",
      f"{len(unique_single_labels)}/{TOTAL_CLASSES} = {len(unique_single_labels)/TOTAL_CLASSES:.3f}")

print("\nTop 20 single labels:")
for lid, c in single_counter.most_common(20):
    print(f"{lid} : {c}")

# 3) Statistics over label combinations (tuples kept as written)
print("\n=== 라벨 조합 기준 (tuple 그대로) ===")
print("고유 조합 개수:", len(combo_counter))

print("\nTop 20 label combos:")
for combo, c in combo_counter.most_common(20):
    print(f"{','.join(map(str, combo))} : {c}")
