In [1]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [2]:

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
from transformers import AutoTokenizer, AutoModel

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re



def load_label_file(path: str) -> str:
    """key: value1,value2,... 형식으로 된 .txt 파일을 통째로 읽어서 문자열로 반환"""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def parse_key_value_lines(text: str):
    """'key:val1,val2,...' 여러 줄을 딕셔너리로 변환"""
    id2label = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or ":" not in line:
            continue
        key, vals = line.split(":", 1)
        id2label[key.strip()] = vals.strip()
    return id2label

def preprocess_label_text(label_path_str: str):
    cleaned = label_path_str.lower()
    cleaned = re.sub(r"[:,]", " ", cleaned)
    cleaned = re.sub(r"_", " ", cleaned)
    cleaned = re.sub(r"[^a-z0-9 ]", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned

def build_tfidf_vectorizer(label_texts):
    vectorizer = TfidfVectorizer()
    label_tfidf = vectorizer.fit_transform(label_texts)
    return vectorizer, label_tfidf

def compute_lexical_similarity(doc_text, vectorizer, label_tfidf):
    doc_vec = vectorizer.transform([doc_text])
    sims = cosine_similarity(doc_vec, label_tfidf)[0]
    return sims

    
label_raw_text = load_label_file("Amazon_products/class_related_keywords.txt")  # 네 파일 이름에 맞춰 바꿔
id2label = parse_key_value_lines(label_raw_text)

# 2) 라벨 텍스트 전처리해서 TF-IDF 학습
label_keys = list(id2label.keys())
label_texts = [
    preprocess_label_text(f"{k} {id2label[k]}")
    for k in label_keys
]
vectorizer, label_tfidf = build_tfidf_vectorizer(label_texts)

# 3) 테스트용 문서 하나 넣어보기
doc = "gourmet organic chocolate snack"
doc_clean = preprocess_label_text(doc)
sims = compute_lexical_similarity(doc_clean, vectorizer, label_tfidf)

# 4) 결과 보기
label_sims = list(zip(label_keys, sims))
label_sims.sort(key=lambda x: x[1], reverse=True)

for lbl, score in label_sims:
    print(lbl, round(score, 4))


chocolate_bars 0.5059
chocolate_gifts 0.4276
chocolate 0.3754
chocolate_covered_fruit 0.3548
dried_fruit_raisins 0.2765
chocolate_pretzels 0.2373
fresh_baked_cookies 0.2332
grocery_gourmet_food 0.2266
snack_gifts 0.2254
chocolate_assortments 0.2173
candy_chocolate 0.1635
hot_cocoa 0.1609
food 0.1457
gourmet_gifts 0.1342
snack_food 0.121
trail_mix 0.1181
granola_trail_mix_bars 0.1044
fruit_leather 0.1023
toaster_pastries 0.1015
cookies 0.0996
fruit 0.0941
raisins 0.0925
meat_poultry 0.0903
marshmallows 0.087
changing_table_pads_covers 0.0863
popcorn 0.085
granola_bars 0.0846
produce 0.0833
solid_feeding 0.0831
milk 0.0827
chocolate_truffles 0.0822
rice_cakes 0.0777
nutrition_wellness 0.0746
party_mix 0.0734
p_t_s 0.0716
fruit_gifts 0.0684
sensual_delights 0.0649
foie_gras_p_t_s 0.062
sugars_sweeteners 0.0604
salsas 0.0594
eggs 0.059
cakes 0.0569
nutrition_bars_drinks 0.0569
chocolate_covered_nuts 0.0565
dessert_gifts 0.0551
spices_gifts 0.0545
meat_gifts 0.0541
crackers 0.0533
juices 0.

In [3]:
def build_label_embeddings(label_keys, label_tfidf, dense: bool = True):
    """
    label_keys: 라벨 이름 리스트 (vectorize할 때 썼던 순서와 같아야 함)
    label_tfidf: shape = (n_labels, vocab_size) 인 sparse matrix
    dense: True면 numpy array로 바꿔서 돌려줌

    return:
        dict: {label_name: embedding_vector}
    """
    embeddings = {}
    if dense:
        label_tfidf_dense = label_tfidf.toarray()
        for i, label in enumerate(label_keys):
            embeddings[label] = label_tfidf_dense[i]
    else:
        # sparse 그대로
        for i, label in enumerate(label_keys):
            embeddings[label] = label_tfidf[i]
    return embeddings

label_embeddings = build_label_embeddings(label_keys, label_tfidf, dense=True)
print(label_embeddings["grocery_gourmet_food"].shape)  # (vocab_size,)


(3466,)


In [4]:

def load_edges(path):
    edges = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            raw = line.strip()
            if not raw or raw.startswith("#"):
                continue
            parts = raw.split()
            if len(parts) < 2:
                continue
            try:
                u, v = int(parts[0]), int(parts[1])
            except ValueError:
                continue
            edges.append((u, v))
    return edges

def find_roots(edges):
    parents = set()
    children = set()
    for u, v in edges:
        parents.add(u)
        children.add(v)
    # 부모로만 나온 애들 = 루트들
    roots = parents - children
    return sorted(roots)

# --- 사용 ---
E = load_edges("Amazon_products/class_hierarchy.txt")

N = 531
A = np.zeros((N, N), dtype=np.uint8)
for u, v in E:
    A[u, v] = 1
    A[v, u] = 1   # 탐색용으로는 무방향 인접행렬 써도 됨

B = np.zeros((N, N), dtype=np.uint8)
for u, v in E:
    B[u, v] = 1

roots = find_roots(E)
print("roots:", roots)


roots: [0, 3, 10, 23, 40, 169]


In [5]:

# ---------------------------
# GAT 
# ---------------------------

class SimpleGATLayer(nn.Module):
    def __init__(self, in_dim, out_dim, heads=4, concat=True, dropout=0.2, negative_slope=0.2, residual=True):
        super().__init__()
        self.heads = heads
        self.out_dim = out_dim
        self.concat = concat
        self.dropout = nn.Dropout(dropout)
        self.leaky_relu = nn.LeakyReLU(negative_slope)
        self.lin = nn.Linear(in_dim, heads * out_dim, bias=False)
        self.a_src = nn.Parameter(torch.Tensor(heads, out_dim))
        self.a_dst = nn.Parameter(torch.Tensor(heads, out_dim))
        self.residual = residual
        if residual and (in_dim == (heads * out_dim if concat else out_dim)):
            self.res_proj = nn.Identity()
        elif residual:
            self.res_proj = nn.Linear(in_dim, heads * out_dim if concat else out_dim, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.lin.weight)
        nn.init.xavier_uniform_(self.a_src)
        nn.init.xavier_uniform_(self.a_dst)
        if self.residual and not isinstance(getattr(self, "res_proj", None), nn.Identity):
            nn.init.xavier_uniform_(self.res_proj.weight)

    def forward(self, x, adj):
        """
        x: [N, Fin]
        adj: [N, N] (0/1; self-loop 없음)
        """
        N = x.size(0)
        Wh = self.lin(x).view(N, self.heads, self.out_dim)  # [N, H, F]

        e_src = (Wh * self.a_src).sum(dim=-1)  # [N, H]
        e_dst = (Wh * self.a_dst).sum(dim=-1)  # [N, H]
        e = e_src.unsqueeze(1) + e_dst.unsqueeze(0)  # [N, N, H]
        e = self.leaky_relu(e)
        # --- 안전한 masked softmax ---
        mask = (adj > 0).unsqueeze(-1)                    # [N, N, 1]
        e = e.masked_fill(~mask, -1e9)                    # -inf 대신 -1e9로 NaN 방지
        alpha = torch.softmax(e, dim=1)                   # 소프트맥스
        alpha = alpha * mask.float()                      # 마스크로 0 처리
        denom = alpha.sum(dim=1, keepdim=True).clamp(min=1e-12)  # 이웃 없을 때 0 분모 방지
        alpha = alpha / denom                             # 이웃들로 정규화

        out = torch.einsum("ijh,jhf->ihf", alpha, Wh)     # [N, H, F]
        out = out.reshape(N, self.heads * self.out_dim) if self.concat else out.mean(dim=1)
        out = self.dropout(out)
        if self.residual:
            out = out + self.res_proj(x)                  # self-loop 없는 대신 residual로 자기정보 유지
        return out

class GATEncoder(nn.Module):
    def __init__(self, in_dim, hid_dim=64, out_dim=768, heads1=4, heads2=4, dropout=0.2):
        super().__init__()
        self.gat1 = SimpleGATLayer(in_dim, hid_dim, heads=heads1, concat=True,  dropout=dropout, residual=True)
        self.gat2 = SimpleGATLayer(hid_dim*heads1, out_dim, heads=heads2, concat=False, dropout=dropout, residual=True)
        self.act = nn.ELU(); self.dropout = nn.Dropout(dropout)
    def forward(self, x, adj):
        h = self.gat1(x, adj); h = self.act(h); h = self.dropout(h)
        z = self.gat2(h, adj)
        return z  # [N, out_dim]


In [6]:
import pandas as pd

In [7]:
hidden_dim=64
out_dim=3466
heads1=8
heads2=8
dropout=0.2
epochs=200
lr=1e-3
weight_decay=5e-4
neg_ratio=1.0
eval_every=20
use_full_graph_for_final=True
pad_width=2
normalize_out = True
device = "cuda" if torch.cuda.is_available() else "cpu"
X = np.vstack([label_embeddings[k] for k in label_keys]).astype(np.float32)
X = torch.tensor(X, dtype=torch.float32, device=device)

N, d0 = X.shape


gat = GATEncoder(in_dim=d0, hid_dim=hidden_dim, out_dim=out_dim, heads1=heads1, heads2=heads2, dropout=dropout).to(device)


In [8]:
def load_docs_txt(path):
    """
    'idx<TAB>text' 형태의 파일을 읽어서
    ids: [int, ...]
    texts: [str, ...]
    을 리턴
    """
    ids = []
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # 탭 기준
            idx_str, txt = line.split("\t", 1)
            ids.append(int(idx_str))
            texts.append(txt)
    return ids, texts



def build_doc_embeddings_from_existing_vectorizer(doc_texts, vectorizer):
    """
    doc_texts: 전처리 전의 원문 리스트
    vectorizer: 라벨에 대해 fit되어 있는 TfidfVectorizer
    return: dense numpy array [N_docs, vocab]
    """
    # 라벨이랑 동일 규칙으로 전처리
    cleaned_docs = [preprocess_label_text(t) for t in doc_texts]
    doc_tfidf = vectorizer.transform(cleaned_docs)   # sparse
    doc_emb = doc_tfidf.toarray().astype(np.float32)
    return doc_emb

# 사용 예시
# 1) 문서 읽기
doc_ids, doc_texts = load_docs_txt("Amazon_products/train/train_corpus.txt")

# 2) 라벨 때 만든 vectorizer 재사용해서 임베딩 만들기
doc_embeddings = build_doc_embeddings_from_existing_vectorizer(doc_texts, vectorizer)

In [9]:
print(doc_embeddings.shape)

(29487, 3466)


In [10]:
import numpy as np
import numpy as np

def hierarchical_beam_similarity_avg(
    doc_vec: np.ndarray,
    label_emb: np.ndarray,
    adj_upper: np.ndarray,
    roots: list[int] = [0],       # 여러 루트
    beam: int = 5,
    per_parent: str | int = "l+2",
    tau: float = 0.35,
    eps: float = 1e-9,
    max_depth: int | None = None,
    normalize: bool = False,      # 필요하면 True로
):
    doc = np.asarray(doc_vec, dtype=np.float32)
    L = np.asarray(label_emb, dtype=np.float32)
    A = np.asarray(adj_upper).astype(bool)
    N, d = L.shape

    if normalize:
        doc = doc / (np.linalg.norm(doc) + eps)
        L = L / (np.linalg.norm(L, axis=1, keepdims=True) + eps)

    # 로컬 점수
    sims = L @ doc
    p = 1.0 / (1.0 + np.exp(-sims / max(tau, 1e-6)))

    children = [np.flatnonzero(A[i]) for i in range(N)]

    S = np.full(N, -np.inf, dtype=np.float32)
    K = np.full(N, -np.inf, dtype=np.float32)
    Llen = np.zeros(N, dtype=np.int32)

    roots = list(roots)
    for r in roots:
        S[r] = 0.0
        Llen[r] = 0
        K[r] = -np.inf

    levels = [roots[:]]
    cur = roots[:]
    level_id = 0

    while True:
        cand_best = {}
        k_parent = (level_id + 2) if (per_parent == "l+2") else int(per_parent)

        for par in cur:
            ch = children[par]
            if ch.size == 0:
                continue
            if ch.size > k_parent:
                idx = np.argpartition(-sims[ch], k_parent - 1)[:k_parent]
                ch = ch[idx]
            for c in ch:
                S_c = S[par] + float(p[c])
                L_c = Llen[par] + 1
                K_c = S_c / (L_c + eps)
                if (c not in cand_best) or (K_c > cand_best[c][2]):
                    cand_best[c] = (S_c, L_c, K_c)

        if not cand_best:
            break

        kept = sorted(cand_best.items(), key=lambda x: x[1][2], reverse=True)[:min(beam, len(cand_best))]
        next_level = [i for i, _ in kept]
        for i, (Si, Li, Ki) in kept:
            S[i], Llen[i], K[i] = Si, Li, Ki

        levels.append(next_level)
        cur = next_level
        level_id += 1
        if max_depth is not None and level_id >= max_depth:
            break

    return K, levels, sims, p



def topk_labels_by_avg(
    doc_vec, label_emb, adj_upper, rootㄴ=(0,), beam=5, per_parent="l+2", k=5, **kw
):
    """평균 점수 기반 최종 상위 k 라벨(루트 제외)."""
    K, levels, sims, p = hierarchical_beam_similarity_avg(
        doc_vec, label_emb, adj_upper, root=list(roots), beam=beam, per_parent=per_parent, **kw
    )
    root_set = set(roots)
    order = np.argsort(-K)
    order = [i for i in order if i not in root_set and np.isfinite(K[i])]
    top = order[:k]
    return top, K[top]

In [12]:
"""
Self-training pipeline with hierarchical silver labeling and dynamic dataloaders.

- Reads document/label embeddings CSVs (first column "id", rest feat000..feat127)
- Reads upper-triangular adjacency (A[i,j]=1 means i->j)
- Makes initial silver labels via hierarchical beam search (average score)
- Splits into train/val on silver set; keeps the rest as unlabeled pool
- Trains a multi-label classifier (Linear/MLP) with BCEWithLogitsLoss
- Each epoch, pseudo-labels unlabeled docs whose predicted probs exceed a threshold
- Adds them to the training set (up to top_k per doc), with patience-based early stopping

Run example
-----------
python self_training_pipeline.py \
  --doc_csv docs.csv \
  --label_csv labels.csv \
  --adj adj.npy \
  --val_ratio 0.2 --epochs 50 --patience 5 \
  --silver_threshold 0.60 --silver_topk 3 --beam 5 --tau 0.35 --root_id 0 \
  --pseudo_threshold 0.70 --pseudo_topk 3 --batch_size 256 --lr 1e-3
"""
from __future__ import annotations
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
def load_embeddings_csv(path: str | Path, id_col: str = "id") -> Tuple[List[int], np.ndarray]:
    """Load embeddings from CSV where the first column is an id and the rest are feature columns.
    Returns (ids, float32 matrix).
    """
    df = pd.read_csv(path)
    cols = list(df.columns)
    if id_col in df.columns:
        id_series = df[id_col]
        X = df.drop(columns=[id_col])
    else:
        # Fallback: use the first column as id
        id_series = df.iloc[:, 0]
        X = df.iloc[:, 1:]
    ids = id_series.astype(int).tolist()
    X = X.to_numpy(dtype=np.float32)
    return ids, X


# ----------------------------- Datasets -----------------------------

class MultiLabelDataset(Dataset):
    def __init__(self, X: np.ndarray, Y: np.ndarray, indices: List[int] | None = None):
        self.X = X
        self.Y = Y
        self.indices = np.array(indices if indices is not None else np.arange(X.shape[0]), dtype=np.int64)
    def __len__(self):
        return self.indices.shape[0]
    def __getitem__(self, idx: int):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        y = torch.from_numpy(self.Y[i])
        return x, y

class UnlabeledDataset(Dataset):
    def __init__(self, X: np.ndarray, indices: List[int]):
        self.X = X
        self.indices = np.array(indices, dtype=np.int64)
    def __len__(self):
        return self.indices.shape[0]
    def __getitem__(self, idx: int):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        return x, i

# ----------------------------- Model -----------------------------

class MLPHead(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, hidden: int | None = 256, dropout: float = 0.1):
        super().__init__()
        if hidden is None or hidden <= 0:
            self.net = nn.Sequential(
                nn.LayerNorm(in_dim),
                nn.Linear(in_dim, out_dim),
            )
        else:
            self.net = nn.Sequential(
                nn.LayerNorm(in_dim),
                nn.Linear(in_dim, hidden),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden, out_dim),
            )
    def forward(self, x):
        return self.net(x)

# ----------------------------- Utils -----------------------------

def to_device(batch, device):
    if isinstance(batch, (tuple, list)):
        return [b.to(device) if torch.is_tensor(b) else b for b in batch]
    return batch.to(device)


def micro_f1(y_true: np.ndarray, y_prob: np.ndarray, thr: float = 0.5, eps: float = 1e-9) -> float:
    y_pred = (y_prob >= thr).astype(np.float32)
    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()
    prec = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2 * prec * rec / (prec + rec + eps)
    return float(f1)

# -------- Initial silver labeling (no CSV save; in-memory) --------
def make_initial_silver_hier(
    docs: np.ndarray,
    labels: np.ndarray,
    adj: np.ndarray,
    roots: list[int] = [0],
    silver_threshold: float = 0.6,    # 이건 avg(K) 기준
    silver_topk: int = 3,
    beam: int = 5,
    per_parent: str | int = "l+2",
    tau: float = 0.35,
) -> list[list[int]]:
    """
    계층 빔 서치로 각 문서의 라벨 후보를 뽑는다.
    - 계층 밖 라벨은 애초에 안 들어옴
    - 루트들은 결과에서 제외
    - K(경로 평균) >= silver_threshold 인 애들 중 top-k
    """
    N = labels.shape[0]
    silver: list[list[int]] = []
    root_set = set(roots)

    for d in docs:
        K, levels, sims, p = hierarchical_beam_similarity_avg(
            d, labels, adj,
            roots=roots,
            beam=beam,
            per_parent=per_parent,
            tau=tau,
            normalize=False,   # 너 임베딩이 이미 L2라면 False
        )
        # 평균 점수로 정렬
        order = np.argsort(-K)
        # 루트는 제외, 유한한 것만
        order = [i for i in order if (i not in root_set) and np.isfinite(K[i])]
        # threshold 통과한 것만
        cand = [i for i in order if K[i] >= silver_threshold]
        selected = cand[:silver_topk]
        silver.append(selected)

    return silver

def make_initial_silver(
    docs: np.ndarray,
    labels: np.ndarray,
    adj: np.ndarray,              # 이제 안 씀 (호환용으로만 둠)
    silver_threshold: float = 0.9,
    silver_topk: int = 3,
    beam: int = 5,                # 이제 안 씀
    tau: float = 0.35,
    root_id: int = 0,
) -> List[List[int]]:
    """
    문서마다 전 라벨 임베딩과의 유사도를 보고 초기 silver label을 만든다.
    - 트리/경로 탐색 안 함
    - root_id는 결과에서 제외
    - p >= silver_threshold인 라벨 중에서 상위 silver_topk만 남김
    """
    N = labels.shape[0]
    silver: List[List[int]] = []

    for d in docs:
        # 문서 vs 모든 라벨 점수
        sims, p = all_label_similarity(d, labels, tau=tau, normalize=True)

        # threshold 통과 + root 제외
        cand = [
            (i, float(p[i]))
            for i in range(N)
            if i != root_id and np.isfinite(p[i]) and p[i] >= silver_threshold
        ]

        # 점수 높은 순
        cand.sort(key=lambda x: x[1], reverse=True)

        # label index만 추출
        selected = [i for i, _ in cand[:silver_topk]]
        silver.append(selected)

    return silver
# ------------------------ Train / Self-Training ------------------------

def train_epoch(model, loader, optim, device, criterion):
    model.train()
    total = 0.0
    for x, y in loader:
        x, y = to_device(x, device), to_device(y, device)
        optim.zero_grad(set_to_none=True)
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optim.step()
        total += float(loss.detach().cpu().item()) * x.size(0)
    return total / max(1, len(loader.dataset))


def eval_epoch(model, loader, device, criterion, thr=0.5):
    model.eval()
    total = 0.0
    ys = []
    ps = []
    with torch.no_grad():
        for x, y in loader:
            x, y = to_device(x, device), to_device(y, device)
            logits = model(x)
            loss = criterion(logits, y)
            total += float(loss.detach().cpu().item()) * x.size(0)
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            ys.append(y.detach().cpu().numpy())
            ps.append(prob)
    y_true = np.concatenate(ys, axis=0)
    y_prob = np.concatenate(ps, axis=0)
    f1 = micro_f1(y_true, y_prob, thr=thr)
    return total / max(1, len(loader.dataset)), f1, y_prob


def pseudo_label_and_grow(model, unl_ds: UnlabeledDataset,
                          num_labels: int,
                          pseudo_threshold: float = 0.9, pseudo_topk: int = 3,
                          device: str = "cpu", batch_size: int = 512):
    """Infer on unlabeled, select labels with prob>=threshold (top-k), and return new_indices and Y matrix."""
    if len(unl_ds) == 0:
        return [], np.zeros((0, num_labels), dtype=np.float32)
    loader = DataLoader(unl_ds, batch_size=batch_size, shuffle=False)
    model.eval()
    all_idx: List[int] = []
    all_y: List[np.ndarray] = []
    with torch.no_grad():
        for xb, idxs in loader:
            xb = xb.to(device)
            logits = model(xb)
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            for p, i in zip(prob, idxs.numpy().tolist()):
                sel = np.flatnonzero(p >= pseudo_threshold)
                if sel.size > 0:
                    # keep at most top-k by prob
                    if sel.size > pseudo_topk:
                        top = np.argpartition(-p[sel], pseudo_topk - 1)[:pseudo_topk]
                        sel = sel[top]
                    y = np.zeros(num_labels, dtype=np.float32)
                    y[sel] = 1.0
                    all_idx.append(int(i))
                    all_y.append(y)
    if len(all_idx) == 0:
        return [], np.zeros((0, num_labels), dtype=np.float32)
    Y_new = np.stack(all_y, axis=0)
    return all_idx, Y_new






In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader

doc_ids = np.arange(len(doc_embeddings), dtype=np.int64)   # 0..num_docs-1
X = doc_embeddings.astype(np.float32)                      # [num_docs, d_doc]

# 라벨 임베딩 세팅
label_ids = np.arange(len(label_keys), dtype=np.int64)     # 0..530
L = np.vstack([label_embeddings[k] for k in label_keys]).astype(np.float32)   # [531, d_label]


# 1) 라벨 순서와 B(부모->자식) 맞추기
order = np.argsort(label_ids)
label_ids = [label_ids[i] for i in order]
L = L[order]
assert B.shape == (L.shape[0], L.shape[0]), "Adjacency/label size mismatch"




# 2) 계층 silver 만들기
silver = make_initial_silver_hier(
    X,          # docs (N, d)
    L,          # label_emb (C, d)
    B,          # upper adj (C, C)
    roots=roots,
    silver_threshold=0.6,
    silver_topk=3,
    beam=5,
    per_parent="l+2",
    tau=0.35,
)


# -------------------------------------------------
# 1) 계층 정보에서 parents / children 뽑기
#    B[parent, child] = 1 이라고 했으니까 그대로 씀
# -------------------------------------------------
# B: [C, C] (parent -> child)
def build_parents_children(adj):
    C = adj.shape[0]
    parents = [np.flatnonzero(adj[:, j]).astype(np.int64) for j in range(C)]
    children = [np.flatnonzero(adj[j]).astype(np.int64) for j in range(C)]
    return parents, children

parents, children = build_parents_children(B)


# -------------------------------------------------
# 2) silver → 계층 pos/neg 마스크로 변환
# -------------------------------------------------
def build_pos_neg_masks(silver, parents, children, num_labels):
    """
    silver: list[list[int]]  # 문서마다 core label index들
    parents / children: list[np.ndarray]
    return:
      pos_masks: np.array [N_docs, C]
      neg_masks: np.array [N_docs, C]
    """
    N = len(silver)
    C = num_labels
    pos_masks = np.zeros((N, C), dtype=np.float32)
    neg_masks = np.zeros((N, C), dtype=np.float32)

    all_idx = np.arange(C)

    for i, core in enumerate(silver):
        core = list(core)
        # 1) core의 부모까지 positive
        pos_set = set(core)
        for c in core:
            for p in parents[c]:
                pos_set.add(int(p))

        # 2) children은 나중에 negative에서 제외
        child_set = set()
        for c in core:
            for ch in children[c]:
                child_set.add(int(ch))

        # pos 마스크
        for p in pos_set:
            pos_masks[i, p] = 1.0

        # neg = 전체 - pos - children
        for j in all_idx:
            if j in pos_set:
                continue
            if j in child_set:
                continue
            neg_masks[i, j] = 1.0

    return pos_masks, neg_masks

# -------------------------------------------------
# 3) Dataset: 문서 임베딩 + pos/neg 마스크
# -------------------------------------------------
class HierMultiLabelDataset(Dataset):
    def __init__(self, X, pos_masks, neg_masks, indices=None):
        self.X = X.astype(np.float32)
        self.pos = pos_masks.astype(np.float32)
        self.neg = neg_masks.astype(np.float32)
        if indices is None:
            self.indices = np.arange(self.X.shape[0], dtype=np.int64)
        else:
            self.indices = np.array(indices, dtype=np.int64)

    def __len__(self):
        return self.indices.shape[0]

    def __getitem__(self, idx):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        pos = torch.from_numpy(self.pos[i])
        neg = torch.from_numpy(self.neg[i])
        return x, pos, neg

class UnlabeledDataset(Dataset):
    def __init__(self, X, indices):
        self.X = X.astype(np.float32)
        self.indices = np.array(indices, dtype=np.int64)
    def __len__(self):
        return len(self.indices)
    def __getitem__(self, idx):
        i = int(self.indices[idx])
        return torch.from_numpy(self.X[i]), i




In [67]:

# -------------------------------------------------
# 4) Bilinear classifier
#    doc_emb: [B, d_doc]
#    label_emb: [C, d_lab]  (미리 GAT로 만든 거)
#    점수: doc @ W @ label_emb^T
# -------------------------------------------------
class GATHierClassifier(nn.Module):
    def __init__(
        self,
        doc_dim,
        gat_encoder,       # 미리 만들어서 넣는 GATEncoder
        label_features,    # GAT 입력 X_label: [C, d0]  (TF-IDF label emb 등)
        label_adj,         # 라벨 그래프 A: [C, C]
        hidden_dim=None,
        normalize_label=True,
    ):
        super().__init__()
        self.gat = gat_encoder
        self.normalize_label = normalize_label
        dev = next(gat_encoder.parameters()).device  # 보통 cuda:0

        # label_features / adj는 매 번 같으니까 buffer로 들고 있음
        self.register_buffer(
            "label_feat",
            torch.tensor(label_features, dtype=torch.float32, device=dev)
        )
        self.register_buffer(
            "label_adj",
            torch.tensor(label_adj, dtype=torch.float32, device=dev)
        )

        # 라벨 임베딩 차원(d_lab)은 GAT의 out_dim이랑 같아야 함
        # ⇒ GATEncoder(out_dim=...)에서 쓴 값.
        # 여기서는 한 번 더 forward 없이, gat_encoder의 설정을 알고 있다고 가정.
        # 예: out_dim=3466이었다면 그걸 그대로 넣어줬다고 가정.
        # 편하게 하려면 아래처럼 실제로 한 번 뽑아서 써도 됨:
        with torch.no_grad():
            z_sample = self.gat(self.label_feat, self.label_adj)  # [C, d_lab]
        d_lab = z_sample.size(1)

        self.doc_dim = doc_dim
        self.label_dim = d_lab

        if hidden_dim is None:
            self.interaction = nn.Linear(doc_dim, d_lab, bias=False)
        else:
            self.interaction = nn.Sequential(
                nn.Linear(doc_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, d_lab, bias=False),
            )

    def forward(self, x):
        """
        x: [B, d_doc]
        return: logits [B, C]
        """
        # 1) 현재 GAT로부터 라벨 임베딩 계산
        label_emb = self.gat(self.label_feat, self.label_adj)  # [C, d_lab]
        if self.normalize_label:
            label_emb = F.normalize(label_emb, p=2, dim=1)

        # 2) 문서 → 라벨 공간으로 투영
        h = self.interaction(x)                                # [B, d_lab]

        # 3) bilinear 점수
        logits = torch.matmul(h, label_emb.t())                # [B, C]
        return logits

# -------------------------------------------------
# 5) loss: 계층 마스크를 씌운 BCE
# -------------------------------------------------
def hierarchical_bce_loss(logits, pos_mask, neg_mask):
    # logits: [B, C]
    # pos_mask, neg_mask: [B, C]
    loss_pos = -(pos_mask * F.logsigmoid(logits)).sum()
    loss_neg = -(neg_mask * F.logsigmoid(-logits)).sum()
    denom = (pos_mask.sum() + neg_mask.sum()).clamp(min=1.0)
    return (loss_pos + loss_neg) / denom

# -------------------------------------------------
# 6) 학습 루프 예시
# -------------------------------------------------
# 이미 있는 것들: X (문서 BERT 임베딩) : [N_docs, d_doc]
#                  L (라벨 GAT 임베딩)  : [C, d_lab]
#                  B_adj (부모->자식)   : [C, C]
#                  silver (list[list[int]]) : 문서별 core label index
def train_epoch_hier(model, loader, opt, device):
    model.train()
    total = 0.0
    for xb, posb, negb in loader:
        xb = xb.to(device)
        posb = posb.to(device)
        negb = negb.to(device)
        logits = model(xb)
        loss = hierarchical_bce_loss(logits, posb, negb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total += loss.item() * xb.size(0)
    return total / len(loader.dataset)




# 1) micro F1 계산
def micro_f1_from_logits(logits, pos_mask, thr=0.5, eps=1e-9):
    """
    logits: [B, C]
    pos_mask: [B, C]  (1: positive, 0: else)
    """
    probs = torch.sigmoid(logits)
    preds = (probs >= thr).float()

    y_true = pos_mask
    y_pred = preds

    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()

    precision = tp / (tp + fp + eps)
    recall    = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return f1.item()

# 2) eval 함수 수정: loss + f1 둘 다
def eval_epoch_hier(model, loader, device, k=3, thr=None):
    model.eval()
    total_loss = 0.0
    f1_list = []
    with torch.no_grad():
        for xb, posb, negb in loader:
            xb = xb.to(device)
            posb = posb.to(device)
            negb = negb.to(device)

            logits = model(xb)
            loss = hierarchical_bce_loss(logits, posb, negb)  # 위에 바꾼 버전
            total_loss += loss.item() * xb.size(0)

            probs = torch.sigmoid(logits)

            if thr is not None:
                pred = (probs >= thr).float()
            else:
                # top-k 방식
                B, C = probs.shape
                pred = torch.zeros_like(probs)
                topk = probs.topk(k, dim=1).indices
                pred.scatter_(1, topk, 1.0)

            # micro-f1
            y_true = posb
            y_pred = pred
            tp = (y_true * y_pred).sum().item()
            fp = ((1 - y_true) * y_pred).sum().item()
            fn = (y_true * (1 - y_pred)).sum().item()
            prec = tp / (tp + fp + 1e-9)
            rec  = tp / (tp + fn + 1e-9)
            f1   = 2 * prec * rec / (prec + rec + 1e-9)
            f1_list.append(f1)

    avg_loss = total_loss / len(loader.dataset)
    avg_f1 = float(np.mean(f1_list)) if f1_list else 0.0
    return avg_loss, avg_f1
def pseudo_label_and_grow_hier(
    model,
    unl_ds,             # UnlabeledDataset
    X_all,              # 전체 문서 임베딩 (numpy)
    parents, children,
    num_labels,
    device,
    pseudo_threshold=0.45,
    pseudo_topk=3,
    batch_size=512,
):
    if len(unl_ds) == 0:
        return [], None, None

    loader = DataLoader(unl_ds, batch_size=batch_size, shuffle=False)
    model.eval()
    new_idx = []
    new_pos_list = []
    new_neg_list = []

    with torch.no_grad():
        for xb, idxs in loader:
            xb = xb.to(device)
            logits = model(xb)
            prob = torch.sigmoid(logits).cpu().numpy()

            for p, i_doc in zip(prob, idxs.numpy().tolist()):
                order = np.argsort(-p)
                top1 = p[order[0]]
                # 1) top-1이 threshold를 못 넘으면 그냥 버린다
                if top1 < pseudo_threshold:
                    continue
                core = [j for j in order if p[j] >= pseudo_threshold][:pseudo_topk]
                if len(core) == 0:
                    # 아예 이 문서는 이번 epoch에 안 넣음
                    continue

                # 계층 pos/neg 구성
                pos = set(core)
                for c in core:
                    for pa in parents[c]:
                        pos.add(int(pa))
                child = set()
                for c in core:
                    for ch in children[c]:
                        child.add(int(ch))

                pos_mask = np.zeros(num_labels, dtype=np.float32)
                neg_mask = np.zeros(num_labels, dtype=np.float32)
                for j in pos:
                    pos_mask[j] = 1.0
                for j in range(num_labels):
                    if j in pos:    # 이미 양성
                        continue
                    if j in child:  # 모르겠음 → negative에서 제외
                        continue
                    neg_mask[j] = 1.0

                new_idx.append(int(i_doc))
                new_pos_list.append(pos_mask)
                new_neg_list.append(neg_mask)



    if len(new_idx) == 0:
        return [], None, None

    new_pos = np.stack(new_pos_list, axis=0)
    new_neg = np.stack(new_neg_list, axis=0)
    return new_idx, new_pos, new_neg



device = "cuda" if torch.cuda.is_available() else "cpu"
has_silver = np.array([len(lbls) > 0 for lbls in silver], dtype=bool)
N_docs = X.shape[0]
C = L.shape[0]

# silver 있는 문서 / 없는 문서
has_silver = np.array([len(lbls) > 0 for lbls in silver], dtype=bool)
idx_silver = np.flatnonzero(has_silver)      # 여기가 train/val 후보
idx_unl    = np.flatnonzero(~has_silver)     # 진짜 unl

print("total:", N_docs)
print("with silver:", len(idx_silver))
print("unlabeled :", len(idx_unl))

# 이제 train/val은 silver 있는 애들만 섞어서 나눈다
rng = np.random.default_rng(42)
rng.shuffle(idx_silver)
n_val = int(len(idx_silver) * 0.2)
idx_val   = idx_silver[:n_val]
idx_train = idx_silver[n_val:]

# parents, children 만들기
def build_parents_children(adj):
    C = adj.shape[0]
    parents = [np.flatnonzero(adj[:, j]).astype(np.int64) for j in range(C)]
    children = [np.flatnonzero(adj[j]).astype(np.int64) for j in range(C)]
    return parents, children

parents, children = build_parents_children(B)

pos_masks = np.zeros((N_docs, C), dtype=np.float32)
neg_masks = np.zeros((N_docs, C), dtype=np.float32)

for i in idx_silver:  # silver 있는 애만 돈다
    core = silver[i]

    # 1) core + parents
    pos = set(core)
    for c in core:
        for p in parents[c]:
            pos.add(int(p))

    # 2) children은 모름
    child = set()
    for c in core:
        for ch in children[c]:
            child.add(int(ch))

    for p in pos:
        pos_masks[i, p] = 1.0

    for j in range(C):
        if j in pos:      # 이미 양성
            continue
        if j in child:    # 모름
            continue
        neg_masks[i, j] = 1.0



train_ds = HierMultiLabelDataset(X, pos_masks, neg_masks, indices=idx_train)
val_ds   = HierMultiLabelDataset(X, pos_masks, neg_masks, indices=idx_val) if len(idx_val) > 0 else None
unl_ds   = UnlabeledDataset(X, idx_unl.tolist())
print(len(train_ds),len(val_ds),len(unl_ds))

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=256, shuffle=False)


total: 29487
with silver: 12014
unlabeled : 17473
9612 2402 17473


In [68]:

model = GATHierClassifier(doc_dim=X.shape[1], gat_encoder = gat, label_features=L,label_adj = A, hidden_dim=512).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
epochs = 150

N_labels = L.shape[0]
best_f1 = -1.0
patience = 10
no_improve = 0
warmup_self = 1   # 1 epoch은 self-training 안 하게 해서 한 번 안정화

for epoch in range(1, epochs + 1):
    # train
    tr_loss = train_epoch_hier(model, train_loader, opt, device)
    

    # val: f1 기준
    if val_loader is not None and len(val_ds) > 0:
        va_loss, va_f1 = eval_epoch_hier(model, val_loader, device, k=3)
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}  val_loss={va_loss:.3f}  val_f1={va_f1:.3f}")

        # early stopping을 f1로
        if va_f1 > best_f1 + 1e-6:
            best_f1 = va_f1
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"Early stopping at epoch {epoch} (best f1={best_f1:.4f})")
                break
    else:
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}")

    # self-training: 1에폭에 전부 들어가는 거 방지용으로 warmup 넣음
    if epoch <= warmup_self:
        print("  + (skip pseudo-labeling on warmup epoch)")
        continue

    new_idx, new_pos, new_neg = pseudo_label_and_grow_hier(
        model,
        unl_ds,
        X,
        parents,
        children,
        C,                   # num_labels
        device=device,
        pseudo_threshold=0.4,
        pseudo_topk=3,
        batch_size=512,
    )

    if len(new_idx) > 0:
        # 전역 마스크 갱신
        pos_masks[new_idx] = new_pos
        neg_masks[new_idx] = new_neg

        # unl에서 제거
        keep_mask = ~np.isin(unl_ds.indices, np.array(new_idx, dtype=np.int64))
        unl_ds.indices = unl_ds.indices[keep_mask]

        # train에 추가
        train_ds.indices = np.concatenate([train_ds.indices, np.array(new_idx, dtype=np.int64)])
        train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, drop_last=False)

        print(f"  + Added {len(new_idx)} pseudo-labeled docs (unl pool → {len(unl_ds)} left)")
    else:
        print("  + No pseudo-labeled docs added this epoch")

Epoch 001 | train_loss=0.139  val_loss=0.039  val_f1=0.164
  + (skip pseudo-labeling on warmup epoch)
Epoch 002 | train_loss=0.034  val_loss=0.031  val_f1=0.164
  + No pseudo-labeled docs added this epoch
Epoch 003 | train_loss=0.031  val_loss=0.030  val_f1=0.191
  + No pseudo-labeled docs added this epoch
Epoch 004 | train_loss=0.029  val_loss=0.028  val_f1=0.304
  + Added 303 pseudo-labeled docs (unl pool → 17170 left)
Epoch 005 | train_loss=0.026  val_loss=0.024  val_f1=0.355
  + Added 1024 pseudo-labeled docs (unl pool → 16146 left)
Epoch 006 | train_loss=0.023  val_loss=0.022  val_f1=0.385
  + Added 730 pseudo-labeled docs (unl pool → 15416 left)
Epoch 007 | train_loss=0.020  val_loss=0.019  val_f1=0.455
  + Added 489 pseudo-labeled docs (unl pool → 14927 left)
Epoch 008 | train_loss=0.018  val_loss=0.017  val_f1=0.508
  + Added 810 pseudo-labeled docs (unl pool → 14117 left)
Epoch 009 | train_loss=0.015  val_loss=0.015  val_f1=0.537
  + Added 462 pseudo-labeled docs (unl pool → 1

In [69]:
import csv, os
from pathlib import Path
import numpy as np
import pandas as pd

# ------------ Paths (edit if needed) ------------
TEST_CORPUS = "Amazon_products/test/test_corpus.txt"   # lines: pid \t text
OUT_PATH    = "submission_bda.csv"
# ------------ Hyperparams ------------
MIN_LABS  = 2
MAX_LABS  = 3
BATCH = 1024
doc_ids, doc_texts = load_docs_txt(TEST_CORPUS)

# 2) 라벨 때 만든 vectorizer 재사용해서 임베딩 만들기
test_embeddings = build_doc_embeddings_from_existing_vectorizer(doc_texts, vectorizer)
test_embeddings = test_embeddings.astype(np.float32)   # [num_test, d]
# load test pids
pids = doc_ids   # 이미 문자열 id
if "L" in globals():
    if not isinstance(L, np.ndarray):
        # 예: L이 torch.Tensor인 경우
        L = L.detach().cpu().numpy().astype(np.float32)
else:
    raise ValueError("라벨 임베딩 L이 메모리에 없어! GAT 끝난 뒤의 임베딩을 L로 둬야 해.")

# 5) 라벨 id는 0..N-1로 생성 (네가 말한 대로 adjacency랑 순서가 이미 맞다고 했으니까)
lab_ids = np.arange(L.shape[0], dtype=np.int64)

# 6) adjacency도 메모리에 있는 걸 그대로 쓴다
#    여기서 A는 531x531 같은 numpy array라고 가정
assert B.shape == (L.shape[0], L.shape[0]), "Adjacency/label size mismatch"

# 7) children 리스트 미리 만들어두기
children = [np.flatnonzero(B[i]) for i in range(B.shape[0])]



In [70]:
model.eval()

def ancestors_of(node, adj):
    # adj[parent, child] = 1 가정
    parents = np.flatnonzero(adj[:, node])  # (N,)
    return parents.tolist()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

IN_DIM = test_embeddings.shape[1]
missing = 0  # 지금은 쓸 일 없지만 원래 코드랑 형태 맞춰둠

with open(OUT_PATH, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["id", "label"])

    buf_x, buf_pid = [], []

    def flush():
        if not buf_x:
            return
        xb = torch.from_numpy(np.stack(buf_x, axis=0).astype(np.float32)).to(device)
        with torch.inference_mode():
            prob = torch.sigmoid(model(xb)).detach().cpu().numpy()
        prob = np.nan_to_num(prob, nan=-1.0, posinf=1.0, neginf=0.0)

        for pid, p in zip(buf_pid, prob):
            order = np.argsort(-p)

            # 1) 기본 후보 뽑기
            thr_keep = [i for i in order if p[i] >= 0.5][:MAX_LABS]
            if len(thr_keep) >= MIN_LABS:
                keep = thr_keep[:MAX_LABS]
            else:
                keep = order[:max(MIN_LABS, len(thr_keep))]
                if len(keep) < MIN_LABS:
                    keep = order[:MIN_LABS]

            # 2) 부모 후보
            parent_cands = []
            for c in keep:
                pars = ancestors_of(c, B)
                for pa in pars:
                    if pa not in keep and pa not in parent_cands:
                        parent_cands.append(pa)

            parent_cands.sort(key=lambda idx: p[idx], reverse=True)

            # 3) 남는 슬롯 부모로 채우기
            final_idxs = list(keep)
            for pa in parent_cands:
                if len(final_idxs) >= MAX_LABS:
                    break
                final_idxs.append(pa)

            # 4) 그래도 모자라면 확률순
            if len(final_idxs) < MIN_LABS:
                for idx in order:
                    if idx not in final_idxs:
                        final_idxs.append(idx)
                    if len(final_idxs) >= MIN_LABS:
                        break

            labels = sorted(int(lab_ids[i]) for i in final_idxs)
            w.writerow([pid, ",".join(map(str, labels))])

        buf_x.clear()
        buf_pid.clear()

    # 여기서 바로 pids와 test_embeddings를 같이 순회
    for pid, emb in zip(pids, test_embeddings):
        x = emb
        if x.dtype != np.float32:
            x = x.astype(np.float32, copy=False)
        buf_x.append(x)
        buf_pid.append(pid)
        if len(buf_x) >= BATCH:
            flush()
    flush()

print(f"Saved: {OUT_PATH} | samples={len(pids)} | min-max labels per sample={MIN_LABS}-{MAX_LABS} | missing_pids={missing}")


Saved: submission_bda.csv | samples=19658 | min-max labels per sample=2-3 | missing_pids=0
