In [1]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [2]:

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
from transformers import AutoTokenizer, AutoModel



class LabelEncoder(nn.Module):
    def __init__(self, model_name="bert-base-uncased", alpha=0.7, l2norm=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.alpha = alpha
        self.l2norm = l2norm

    @torch.no_grad()  # ÏûÑÎ≤†Îî© Ï∂îÏ∂úÎßå Ìï† Í≤ΩÏö∞ ÎìúÎ°≠ÏïÑÏõÉ/Í∑∏ÎùºÎìú off
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        last = out.last_hidden_state            # [B, T, H]
        cls = last[:, 0, :]                     # [B, H]

        # masked mean pooling (Ìå®Îî© Ï†úÏô∏, CLS Ï†úÏô∏)
        mask = attention_mask.clone()           # [B, T]
        mask[:, 0] = 0                          # CLS Ï†úÏô∏
        lengths = mask.sum(dim=1, keepdim=True).clamp(min=1)  # [B, 1]
        mean = (last * mask.unsqueeze(-1)).sum(dim=1) / lengths  # [B, H]

        h = self.alpha * cls + (1.0 - self.alpha) * mean         # [B, H]
        if self.l2norm:
            h = F.normalize(h, p=2, dim=-1)
        return h

# ÏÇ¨Ïö© ÏòàÏãú
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = LabelEncoder(model_name="bert-base-uncased", alpha=0.7).to(device).eval()

texts = [
    'This is the class "grocery_gourmet_food". It is related to snacks, condiments, beverages, specialty foods, spices, cooking oils, baking ingredients, gourmet chocolates, artisanal cheeses, and organic foods.'
]


enc = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    emb = model(input_ids=enc["input_ids"].to(device),
                attention_mask=enc["attention_mask"].to(device))




2025-11-11 01:28:55.261564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762824535.273565    2050 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762824535.277230    2050 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-11 01:28:55.289279: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.decomposition import PCA



# --------- 1) ÌååÏùº Î°úÎçî ----------
def load_class_mapping(path):
    """classes.txt: '<id> <key>' ÌòïÏãùÏùÑ (idÏò§Î¶ÑÏ∞®Ïàú)ÏúºÎ°ú Î∞òÌôò"""
    pairs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            raw = line.strip()
            if not raw or raw.startswith("#"):
                continue
            parts = raw.split()
            if len(parts) < 2:
                continue
            try:
                cid = int(parts[0])
            except ValueError:
                continue
            key = parts[1].strip()
            pairs.append((cid, key))
    # id Ï†ïÎ†¨ + Ï§ëÎ≥µ id Ï≤òÎ¶¨
    dedup = {}
    for cid, key in sorted(pairs, key=lambda x: x[0]):
        dedup[cid] = key
    return [ (cid, dedup[cid]) for cid in sorted(dedup.keys()) ]

def load_class_keywords(path):
    """'key: v1, v2, ...' ‚Üí dict[key]=[values]"""
    d = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            raw = line.strip()
            if not raw or raw.startswith("#"):
                continue
            if ":" not in raw:
                continue
            key, rest = raw.split(":", 1)
            key = key.strip()
            vals = [v.strip() for v in rest.split(",") if v.strip()]
            d[key] = vals
    return d

# --------- 2) ÌïòÎìú ÌîÑÎ°¨ÌîÑÌä∏ ----------
def _pretty(x): return x.replace("_", " ").strip()

def make_prompt(key, values):
    key_txt = _pretty(key)
    vals = [_pretty(v) for v in values] if values else []
    if not vals:
        return f'This is the class "{key_txt}".'
    if len(vals) == 1:
        vals_txt = vals[0]
    else:
        vals_txt = ", ".join(vals[:-1]) + f", and {vals[-1]}"
    return f'This is the class "{key_txt}". It is related to {vals_txt}.'

# --------- 3) BERT ÏûÑÎ≤†Îçî (CLS+mean) ----------
class LabelEncoder(nn.Module):
    def __init__(self, model_name="bert-base-uncased", alpha=0.7, l2norm=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.alpha = alpha
        self.l2norm = l2norm

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        last = out.last_hidden_state        # [B,T,H]
        cls = last[:, 0, :]                 # [B,H]
        # Ìå®Îî©/CLS Ï†úÏô∏ mean
        mask = attention_mask.clone()       # [B,T]
        mask[:, 0] = 0
        lengths = mask.sum(1, keepdim=True).clamp(min=1)
        mean = (last * mask.unsqueeze(-1)).sum(1) / lengths
        h = self.alpha * cls + (1 - self.alpha) * mean
        if self.l2norm:
            h = F.normalize(h, p=2, dim=-1)
        return h

# --------- 4) ÏóîÎìúÌà¨ÏóîÎìú: CSV(Ïà´ÏûêÎßå) Ï†ÄÏû• ----------
@torch.no_grad()
def build_and_save_csv_with_headers(
    classes_path="classes.txt",
    keywords_path="class_related_keywords.txt",
    model_name="bert-base-uncased",
    alpha=0.7,
    batch_size=32,
    max_length=128,
    out_csv="class_embeddings_with_id.csv",
    pad_width=None,   # NoneÏù¥Î©¥ ÏûêÎèô(ÌäπÏßï ÏàòÏóê ÎßûÏ∂∞ ÏµúÏÜå ÏûêÎ¶øÏàò), Ï†ïÏàòÎ°ú Í∞ïÏ†ú Í∞ÄÎä•(Ïòà: 2 ‚Üí feat00)
):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    mapping = load_class_mapping(classes_path)           # [(id, key)]  id Ïò§Î¶ÑÏ∞®Ïàú
    kw = load_class_keywords(keywords_path)              # key -> [values]

    keys_in_order = [key for _, key in mapping]
    prompts = [make_prompt(k, kw.get(k, [])) for k in keys_in_order]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = LabelEncoder(model_name=model_name, alpha=alpha, l2norm=True).to(device).eval()

    embs = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        h = model(input_ids=enc["input_ids"].to(device),
                  attention_mask=enc["attention_mask"].to(device),
                  token_type_ids=enc.get("token_type_ids").to(device) if enc.get("token_type_ids") is not None else None)
        embs.append(h.cpu())
    E = torch.cat(embs, dim=0).numpy()   # [N, H]
    
    # E: [N, 768] (CSV Ï†ÄÏû• ÏßÅÏ†ÑÏùò ÏûÑÎ≤†Îî© ÌñâÎ†¨)
    # ids: classes.txtÏùò id Î¶¨Ïä§Ìä∏
    

    # ---- Ìó§Îçî ÎßåÎì§Í∏∞: feat000, feat001, ... ----
    H = E.shape[1]
    auto_width = len(str(H-1)) if H > 1 else 1
    width = max(2, auto_width) if pad_width is None else pad_width   # ÏµúÏÜå 2ÏûêÎ¶¨Îäî Î≥¥Ïû•(ÏöîÏ≤≠ Î∞òÏòÅ)
    feat_cols = [f"feat{str(i).zfill(width)}" for i in range(H)]


    ids = [cid for cid, _ in mapping]    # classes.txtÏùò id ÏàúÏÑú

    df = pd.DataFrame(E, columns=feat_cols)
    df.insert(0, "id", ids)
    df.to_csv(out_csv, index=False,encoding="utf-8")
    print(f"[OK] saved ‚Üí {out_csv}  shape={df.shape}  (rows=id order from classes.txt)")
    return df



In [4]:

build_and_save_csv_with_headers(
    classes_path="Amazon_products/classes.txt",
    keywords_path="Amazon_products/class_related_keywords.txt",
    model_name="bert-base-uncased",
    alpha=0.7,
    batch_size=32,
    max_length=128,
    out_csv="Amazon_products/class_embeddings_matrix.csv",
)

[OK] saved ‚Üí Amazon_products/class_embeddings_matrix.csv  shape=(531, 769)  (rows=id order from classes.txt)


Unnamed: 0,id,feat000,feat001,feat002,feat003,feat004,feat005,feat006,feat007,feat008,...,feat758,feat759,feat760,feat761,feat762,feat763,feat764,feat765,feat766,feat767
0,0,-0.010211,-0.011956,0.002603,-0.000184,0.000567,-0.032215,-0.016893,0.063920,-0.027164,...,-0.013105,-0.032628,-0.005374,-0.015925,0.006762,-0.012706,-0.005882,-0.013056,0.011159,0.058906
1,1,-0.017012,-0.016444,-0.000128,-0.011100,-0.025949,-0.002887,-0.025018,0.049013,-0.033802,...,0.007400,-0.008585,-0.007387,-0.026812,0.019292,0.020590,0.003997,-0.000089,0.029783,0.046100
2,2,-0.034676,-0.019762,-0.008492,-0.011459,-0.022838,-0.006900,-0.010470,0.080544,-0.028093,...,0.023415,-0.017251,-0.005550,-0.031397,0.019254,0.012294,0.000938,0.000992,0.038855,0.031516
3,3,-0.011635,-0.017314,0.028135,0.017923,-0.013000,-0.041281,-0.012311,0.041047,-0.049561,...,-0.011168,-0.017969,0.001752,-0.028430,0.036308,0.023512,-0.017785,-0.008129,-0.000691,0.092230
4,4,-0.007398,-0.023895,0.033761,0.002061,-0.029120,-0.016012,-0.012304,0.056903,-0.053605,...,-0.000004,-0.032577,0.002364,-0.035561,0.031751,0.030951,-0.018557,-0.018734,-0.009401,0.093541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,526,-0.021016,-0.019151,0.041768,0.016444,-0.007131,-0.041068,-0.020283,0.000831,-0.017557,...,0.040430,-0.030489,0.003122,-0.030808,-0.006206,0.015905,-0.015440,-0.029978,-0.015221,0.062001
527,527,-0.032303,-0.000794,-0.006452,-0.038560,-0.058474,-0.003561,0.006136,0.049258,-0.041083,...,0.018024,-0.014804,-0.005148,-0.028593,0.047571,0.013935,-0.003430,0.007667,0.037184,0.045884
528,528,-0.044695,-0.023169,0.058748,0.001255,-0.017672,-0.035153,-0.003713,-0.009553,-0.018632,...,0.016477,-0.039585,-0.008159,-0.049479,0.033699,0.001436,-0.002274,-0.023200,-0.037373,0.054765
529,529,-0.023375,-0.003089,0.006700,-0.008055,-0.025490,0.001360,-0.010454,0.081578,-0.038161,...,-0.001798,-0.038106,-0.004135,-0.045608,0.017642,-0.030496,-0.018465,0.013859,-0.008486,0.050243


In [5]:

def load_edges(path):
    edges = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            raw = line.strip()
            if not raw or raw.startswith("#"):
                continue
            parts = raw.split()
            if len(parts) < 2:
                continue
            try:
                u, v = int(parts[0]), int(parts[1])
            except ValueError:
                continue
            edges.append((u, v))
    return edges

def find_roots(edges):
    parents = set()
    children = set()
    for u, v in edges:
        parents.add(u)
        children.add(v)
    # Î∂ÄÎ™®Î°úÎßå ÎÇòÏò® Ïï†Îì§ = Î£®Ìä∏Îì§
    roots = parents - children
    return sorted(roots)

# --- ÏÇ¨Ïö© ---
E = load_edges("Amazon_products/class_hierarchy.txt")

N = 531
A = np.zeros((N, N), dtype=np.uint8)
for u, v in E:
    A[u, v] = 1
    A[v, u] = 1   # ÌÉêÏÉâÏö©ÏúºÎ°úÎäî Î¨¥Î∞©Ìñ• Ïù∏Ï†ëÌñâÎ†¨ Ïç®ÎèÑ Îê®

roots = find_roots(E)
print("roots:", roots)


roots: [0, 3, 10, 23, 40, 169]


In [6]:
EMB_CSV  = "Amazon_products/class_embeddings_matrix.csv"   # id + featXX ... (Ï¥àÍ∏∞ ÏûÑÎ≤†Îî©)
OUT_CSV  = "Amazon_products/label_embeddings_gat.csv"

def load_initial_embeddings(path):
    df = pd.read_csv(path, encoding="utf-8-sig")  # Î®ºÏ†Ä ÏãúÎèÑ
    ids = df.iloc[:, 0].astype(int).tolist()
    X = df.iloc[:, 1:].to_numpy(dtype=np.float32)  # [N, d]
    return ids, X

# ---------------------------
# GAT 
# ---------------------------

class SimpleGATLayer(nn.Module):
    def __init__(self, in_dim, out_dim, heads=4, concat=True, dropout=0.2, negative_slope=0.2, residual=True):
        super().__init__()
        self.heads = heads
        self.out_dim = out_dim
        self.concat = concat
        self.dropout = nn.Dropout(dropout)
        self.leaky_relu = nn.LeakyReLU(negative_slope)
        self.lin = nn.Linear(in_dim, heads * out_dim, bias=False)
        self.a_src = nn.Parameter(torch.Tensor(heads, out_dim))
        self.a_dst = nn.Parameter(torch.Tensor(heads, out_dim))
        self.residual = residual
        if residual and (in_dim == (heads * out_dim if concat else out_dim)):
            self.res_proj = nn.Identity()
        elif residual:
            self.res_proj = nn.Linear(in_dim, heads * out_dim if concat else out_dim, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.lin.weight)
        nn.init.xavier_uniform_(self.a_src)
        nn.init.xavier_uniform_(self.a_dst)
        if self.residual and not isinstance(getattr(self, "res_proj", None), nn.Identity):
            nn.init.xavier_uniform_(self.res_proj.weight)

    def forward(self, x, adj):
        """
        x: [N, Fin]
        adj: [N, N] (0/1; self-loop ÏóÜÏùå)
        """
        N = x.size(0)
        Wh = self.lin(x).view(N, self.heads, self.out_dim)  # [N, H, F]

        e_src = (Wh * self.a_src).sum(dim=-1)  # [N, H]
        e_dst = (Wh * self.a_dst).sum(dim=-1)  # [N, H]
        e = e_src.unsqueeze(1) + e_dst.unsqueeze(0)  # [N, N, H]
        e = self.leaky_relu(e)
        # --- ÏïàÏ†ÑÌïú masked softmax ---
        mask = (adj > 0).unsqueeze(-1)                    # [N, N, 1]
        e = e.masked_fill(~mask, -1e9)                    # -inf ÎåÄÏã† -1e9Î°ú NaN Î∞©ÏßÄ
        alpha = torch.softmax(e, dim=1)                   # ÏÜåÌîÑÌä∏Îß•Ïä§
        alpha = alpha * mask.float()                      # ÎßàÏä§ÌÅ¨Î°ú 0 Ï≤òÎ¶¨
        denom = alpha.sum(dim=1, keepdim=True).clamp(min=1e-12)  # Ïù¥ÏõÉ ÏóÜÏùÑ Îïå 0 Î∂ÑÎ™® Î∞©ÏßÄ
        alpha = alpha / denom                             # Ïù¥ÏõÉÎì§Î°ú Ï†ïÍ∑úÌôî

        out = torch.einsum("ijh,jhf->ihf", alpha, Wh)     # [N, H, F]
        out = out.reshape(N, self.heads * self.out_dim) if self.concat else out.mean(dim=1)
        out = self.dropout(out)
        if self.residual:
            out = out + self.res_proj(x)                  # self-loop ÏóÜÎäî ÎåÄÏã† residualÎ°ú ÏûêÍ∏∞Ï†ïÎ≥¥ Ïú†ÏßÄ
        return out

class GATEncoder(nn.Module):
    def __init__(self, in_dim, hid_dim=64, out_dim=768, heads1=4, heads2=4, dropout=0.2):
        super().__init__()
        self.gat1 = SimpleGATLayer(in_dim, hid_dim, heads=heads1, concat=True,  dropout=dropout, residual=True)
        self.gat2 = SimpleGATLayer(hid_dim*heads1, out_dim, heads=heads2, concat=False, dropout=dropout, residual=True)
        self.act = nn.ELU(); self.dropout = nn.Dropout(dropout)
    def forward(self, x, adj):
        h = self.gat1(x, adj); h = self.act(h); h = self.dropout(h)
        z = self.gat2(h, adj)
        return z  # [N, out_dim]


In [7]:
# ---------------------------
# ÌïôÏäµ Ïú†Ìã∏: ÏùåÏÑ± Ïó£ÏßÄ ÏÉòÌîå/Î°úÏä§
# ---------------------------
def to_upper_pos_edges(A):
    pos = []
    N = A.shape[0]
    for i in range(N):
        for j in range(i+1, N):
            if A[i, j] == 1:
                pos.append((i, j))
    return pos

def sample_neg(A, k):
    N = A.shape[0]
    neg = set()
    while len(neg) < k:
        u = np.random.randint(0, N); v = np.random.randint(0, N)
        if u == v: continue
        a, b = (u, v) if u < v else (v, u)
        if A[a, b] == 0:
            neg.add((a, b))
    return list(neg)

def sample_neg_excluding(A, k, exclude_edges):
    """
    A: np.array [N,N]  (0/1)
    k: ÎΩëÏùÑ ÏùåÏÑ± Í∞úÏàò
    exclude_edges: {(u,v), ...}  Î¨¥Ï°∞Í±¥ ÎπºÏïº ÌïòÎäî ÏñëÏÑ±(ÎòêÎäî Í∏àÏßÄ) Ïó£ÏßÄÎì§ (u<v ÌòïÌÉúÎ°ú ÎÑ£Í∏∞)
    """
    N = A.shape[0]
    neg = set()
    while len(neg) < k:
        u = np.random.randint(0, N); v = np.random.randint(0, N)
        if u == v:
            continue
        a, b = (u, v) if u < v else (v, u)
        if A[a, b] == 0 and (a, b) not in exclude_edges:
            neg.add((a, b))
    return list(neg)


def edge_score(z, edges):
    u = torch.tensor([a for a, _ in edges], device=z.device, dtype=torch.long)
    v = torch.tensor([b for _, b in edges], device=z.device, dtype=torch.long)
    return (z[u] * z[v]).sum(dim=1)  # ÎÇ¥Ï†Å ÎîîÏΩîÎçî

from sklearn.metrics import roc_auc_score
def eval_auc(z, pos_edges, A_full, k_factor=1.0):
    z = F.normalize(z, p=2, dim=1)
    neg_edges = sample_neg(A_full, int(len(pos_edges) * k_factor))
    s = torch.cat([edge_score(z, pos_edges), edge_score(z, neg_edges)]).detach().cpu().numpy()
    y = np.concatenate([np.ones(len(pos_edges)), np.zeros(len(neg_edges))])
    return roc_auc_score(y, s)

hidden_dim=64
out_dim=768
heads1=8
heads2=8
dropout=0.2
epochs=700
lr=1e-3
weight_decay=5e-4
neg_ratio=1.0
eval_every=20
use_full_graph_for_final=True
pad_width=2
normalize_out = True
device = "cuda" if torch.cuda.is_available() else "cpu"

ids, X = load_initial_embeddings(EMB_CSV)      # ids: [0..530] ÏàúÏÑú, X: [N, d0]
N, d0 = X.shape
pos_edges = to_upper_pos_edges(A)

rng = np.random.default_rng(42)
idx = rng.permutation(len(pos_edges))
n_val = max(1, int(0.1 * len(pos_edges)))          # 10% val
pos_val = [pos_edges[i] for i in idx[:n_val]]
pos_train = [pos_edges[i] for i in idx[n_val:]]

# train Í∑∏ÎûòÌîÑÎßåÏúºÎ°ú ÌïôÏäµ(ÎàÑÏ∂ú Î∞©ÏßÄ)
A_train = np.zeros_like(A)
for u, v in pos_train:
    A_train[u, v] = 1; A_train[v, u] = 1

adj_train = torch.tensor(A_train, dtype=torch.float32, device=device)
# ÌÖêÏÑú
x = torch.tensor(X, dtype=torch.float32, device=device)
adj = torch.tensor(A, dtype=torch.float32, device=device)  # softmax ÎßàÏä§ÌÅ¨Ïö©

model = GATEncoder(in_dim=d0, hid_dim=hidden_dim, out_dim=out_dim, heads1=heads1, heads2=heads2, dropout=dropout).to(device)
opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
bce = nn.BCEWithLogitsLoss()

maxauc = 0
best_ckpt = "Amazon_products/best_gat.ckpt"
# Í∏àÏßÄ Ïó£ÏßÄ ÏßëÌï© (train+val Î™®Îëê)
forbidden = set()
for u, v in pos_edges:        # pos_edges = train+val Ï†ÑÏ≤¥
    a, b = (u, v) if u < v else (v, u)
    forbidden.add((a, b))

for ep in range(1, epochs+1):
    model.train()
    # üî¥ Ïó¨Í∏∞ÏÑú Ï†ÑÏ≤¥ adj ÎßêÍ≥† trainÏö© adjÎßå Î≥∏Îã§
    z = model(x, adj_train)                          # [N, out_dim]
    if normalize_out:
        z = F.normalize(z, p=2, dim=1)

    # üî¥ Ïã§Ï†úÎ°ú ÌïôÏäµÏóê Ïì∞Îäî ÏñëÏÑ± Ïàò Í∏∞Ï§ÄÏúºÎ°ú ÏùåÏÑ± Ïàò Í≤∞Ï†ï
    num_pos = len(pos_train)
    num_neg = int(num_pos * neg_ratio)
    # üî¥ train Í∑∏ÎûòÌîÑ Í∏∞Ï§ÄÏúºÎ°ú ÎΩëÎêò, train+val ÏñëÏÑ±ÏùÄ Î¨¥Ï°∞Í±¥ Ï†úÏô∏
    neg_edges = sample_neg_excluding(A_train, num_neg, forbidden)

    score_pos = edge_score(z, pos_train)
    score_neg = edge_score(z, neg_edges)
    scores = torch.cat([score_pos, score_neg], dim=0)
    labels = torch.cat([torch.ones_like(score_pos), torch.zeros_like(score_neg)], dim=0)

    loss = bce(scores, labels)

    opt.zero_grad()
    loss.backward()
    opt.step()

    # ÌèâÍ∞Ä Î∂ÄÎ∂ÑÏùÄ Í±∞Ïùò Í∑∏ÎåÄÎ°ú
    if ep % 1 == 0 or ep == 1:
        model.eval()
        with torch.no_grad():
            # valÏùÄ Ïó¨Ï†ÑÌûà train Í∑∏ÎûòÌîÑÎ°ú ÏûÑÎ≤†Îî©
            z_val = F.normalize(model(x, adj_train), p=2, dim=1)
            auc_val = eval_auc(z_val, pos_val, A, k_factor=1.0)
        print(f"[{ep:03d}/{epochs}] loss={loss.item():.4f} | "
              f"pos={score_pos.mean().item():.3f} neg={score_neg.mean().item():.3f} | "
              f"val AUC={auc_val:.4f}")
        if maxauc < auc_val:
            maxauc = auc_val
            torch.save(model.state_dict(), best_ckpt)

model.load_state_dict(torch.load(best_ckpt, weights_only=True))

# ÏµúÏ¢Ö ÏûÑÎ≤†Îî© Ï∂îÏ∂ú
model.eval()
with torch.no_grad():
    z = model(x, adj)
    if normalize_out:
        z = F.normalize(z, p=2, dim=1)
    Z = z.detach().cpu().numpy()  # [N, out_dim]

# CSV Ï†ÄÏû• (id + feat00..)
pad = max(2, len(str(out_dim-1)))
feat_cols = [f"feat{str(i).zfill(pad)}" for i in range(out_dim)]
df = pd.DataFrame(Z, columns=feat_cols)
df.insert(0, "id", ids)
df.to_csv(OUT_CSV, index=False)
print(f"[OK] saved GAT label embeddings ‚Üí {OUT_CSV}  shape={df.shape}")



[001/700] loss=0.7186 | pos=0.661 neg=0.572 | val AUC=0.2631
[002/700] loss=0.6633 | pos=0.399 neg=0.223 | val AUC=0.3791
[003/700] loss=0.6425 | pos=0.450 neg=0.181 | val AUC=0.6173
[004/700] loss=0.6169 | pos=0.489 neg=0.108 | val AUC=0.6999
[005/700] loss=0.5937 | pos=0.559 neg=0.060 | val AUC=0.7334
[006/700] loss=0.5713 | pos=0.653 neg=0.024 | val AUC=0.6256
[007/700] loss=0.5601 | pos=0.712 neg=0.012 | val AUC=0.7302
[008/700] loss=0.5584 | pos=0.755 neg=0.026 | val AUC=0.7490
[009/700] loss=0.5484 | pos=0.785 neg=-0.000 | val AUC=0.7143
[010/700] loss=0.5562 | pos=0.817 neg=0.042 | val AUC=0.7513
[011/700] loss=0.5509 | pos=0.834 neg=0.027 | val AUC=0.7589
[012/700] loss=0.5459 | pos=0.842 neg=0.011 | val AUC=0.7988
[013/700] loss=0.5391 | pos=0.842 neg=-0.014 | val AUC=0.7506
[014/700] loss=0.5331 | pos=0.853 neg=-0.035 | val AUC=0.8600
[015/700] loss=0.5366 | pos=0.870 neg=-0.015 | val AUC=0.7618
[016/700] loss=0.5396 | pos=0.885 neg=0.002 | val AUC=0.7844
[017/700] loss=0.543

In [8]:
class DocumentEncoder(nn.Module):
    def __init__(self, model_name="bert-base-uncased", alpha=0.6, l2norm=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.alpha = alpha
        self.l2norm = l2norm

    @torch.no_grad()  # ÏûÑÎ≤†Îî© Ï∂îÏ∂úÎßå Ìï† Í≤ΩÏö∞ ÎìúÎ°≠ÏïÑÏõÉ/Í∑∏ÎùºÎìú off
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        last = out.last_hidden_state            # [B, T, H]
        cls = last[:, 0, :]                     # [B, H]

        # masked mean pooling (Ìå®Îî© Ï†úÏô∏, CLS Ï†úÏô∏)
        mask = attention_mask.clone()           # [B, T]
        mask[:, 0] = 0                          # CLS Ï†úÏô∏
        lengths = mask.sum(dim=1, keepdim=True).clamp(min=1)  # [B, 1]
        mean = (last * mask.unsqueeze(-1)).sum(dim=1) / lengths  # [B, H]

        h = self.alpha * cls + (1.0 - self.alpha) * mean         # [B, H]
        if self.l2norm:
            h = F.normalize(h, p=2, dim=-1)

        return h

In [9]:
import io
from typing import List, Dict, Any
def read_lines_robust(path):
    trials = ("utf-8", "utf-8-sig", "cp949", "euc-kr", "latin1")
    last = None
    for enc in trials:
        try:
            with io.open(path, "r", encoding=enc) as f:
                return f.readlines()
        except Exception as e:
            last = e
    raise RuntimeError(f"ÎîîÏΩîÎî© Ïã§Ìå®: {path}") from last

def parse_id_text(line):
    s = line.strip()
    if not s or s.startswith("#"):
        return None
    if "\t" in s:
        left, right = s.split("\t", 1)
    else:
        parts = s.split(maxsplit=1)
        if len(parts) < 2:
            return None
        left, right = parts[0], parts[1]
    try:
        doc_id = int(left)
    except ValueError:
        return None
    return doc_id, right.strip()

# ========= 2) Dataset / Collate =========
class TextCorpusDataset(Dataset):
    def __init__(self, corpus_path):
        lines = read_lines_robust(corpus_path)
        pairs = [parse_id_text(ln) for ln in lines]
        pairs = [p for p in pairs if p is not None]
        # ÌïÑÏöîÌïòÎ©¥ Ï†ïÎ†¨: pairs.sort(key=lambda x: x[0])
        self.ids = [p[0] for p in pairs]
        self.texts = [p[1] for p in pairs]

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        return {"id": self.ids[idx], "text": self.texts[idx]}

def make_collate_fn(tokenizer, max_length=256):
    def collate(batch):
        ids = [b["id"] for b in batch]
        texts = [b["text"] for b in batch]
        enc = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        out = {
            "doc_ids": torch.tensor(ids, dtype=torch.long),
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
        }
        if "token_type_ids" in enc:
            out["token_type_ids"] = enc["token_type_ids"]
        else:
            out["token_type_ids"] = None
        return out
    return collate

# ========= 3) ÏûÑÎ≤†Îî© ÏÉùÏÑ± & CSV Ï†ÄÏû• =========
@torch.no_grad()
def build_and_save_document_embeddings(
    corpus_path="text_corpus.txt",
    model_name="bert-base-uncased",
    alpha=0.6,
    max_length=256,
    batch_size=32,
    out_csv="document_embeddings_768d.csv",
    pad_width=2,     # 'feat00' Ïä§ÌÉÄÏùº(Îëê ÏûêÎ¶¨)
):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # data
    ds = TextCorpusDataset(corpus_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    collate_fn = make_collate_fn(tokenizer, max_length=max_length)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # model
    model = DocumentEncoder(model_name=model_name, alpha=alpha, l2norm=True).to(device).eval()

    all_ids, all_embs = [], []
    for batch in dl:
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        tt = batch["token_type_ids"]
        if tt is not None:
            tt = tt.to(device)

        emb = model(input_ids=input_ids, attention_mask=attn, token_type_ids=tt)  # [B,out_dim]
        all_embs.append(emb.cpu())
        all_ids.extend(batch["doc_ids"].tolist())

    E = torch.cat(all_embs, dim=0).numpy()  # [N, out_dim]

    # CSV: id + featXX...
    feat_cols = [f"feat{str(i).zfill(pad_width)}" for i in range(E.shape[1])]
    df = pd.DataFrame(E, columns=feat_cols)
    df.insert(0, "id", all_ids)
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"[OK] saved document embeddings ‚Üí {out_csv}  shape={df.shape}")

In [10]:
build_and_save_document_embeddings(
    corpus_path="Amazon_products/train/train_corpus.txt",
    model_name="bert-base-uncased",
    alpha=0.7,
    max_length=256,    # Î¨∏ÏÑú Í∏∏Ïù¥Ïóê ÎßûÍ≤å Ï°∞Ï†à
    batch_size=256,
    out_csv="Amazon_products/document_embeddings_768d.csv",
    pad_width=2,
)

[OK] saved document embeddings ‚Üí Amazon_products/document_embeddings_768d.csv  shape=(29487, 769)


In [10]:
N = 531 
B = np.zeros((N, N), dtype=np.uint8)

for u, v in E:
    B[u, v] = 1
print(B)
print(roots)

[[0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[0, 3, 10, 23, 40, 169]


In [11]:
import numpy as np
import numpy as np

def hierarchical_beam_similarity_avg(
    doc_vec: np.ndarray,
    label_emb: np.ndarray,
    adj_upper: np.ndarray,
    roots: list[int] = [0],       # Ïó¨Îü¨ Î£®Ìä∏
    beam: int = 5,
    per_parent: str | int = "l+2",
    tau: float = 0.35,
    eps: float = 1e-9,
    max_depth: int | None = None,
    normalize: bool = False,      # ÌïÑÏöîÌïòÎ©¥ TrueÎ°ú
):
    doc = np.asarray(doc_vec, dtype=np.float32)
    L = np.asarray(label_emb, dtype=np.float32)
    A = np.asarray(adj_upper).astype(bool)
    N, d = L.shape

    if normalize:
        doc = doc / (np.linalg.norm(doc) + eps)
        L = L / (np.linalg.norm(L, axis=1, keepdims=True) + eps)

    # Î°úÏª¨ Ï†êÏàò
    sims = L @ doc
    p = 1.0 / (1.0 + np.exp(-sims / max(tau, 1e-6)))

    children = [np.flatnonzero(A[i]) for i in range(N)]

    S = np.full(N, -np.inf, dtype=np.float32)
    K = np.full(N, -np.inf, dtype=np.float32)
    Llen = np.zeros(N, dtype=np.int32)

    roots = list(roots)
    for r in roots:
        S[r] = 0.0
        Llen[r] = 0
        K[r] = -np.inf

    levels = [roots[:]]
    cur = roots[:]
    level_id = 0

    while True:
        cand_best = {}
        k_parent = (level_id + 2) if (per_parent == "l+2") else int(per_parent)

        for par in cur:
            ch = children[par]
            if ch.size == 0:
                continue
            if ch.size > k_parent:
                idx = np.argpartition(-sims[ch], k_parent - 1)[:k_parent]
                ch = ch[idx]
            for c in ch:
                S_c = S[par] + float(p[c])
                L_c = Llen[par] + 1
                K_c = S_c / (L_c + eps)
                if (c not in cand_best) or (K_c > cand_best[c][2]):
                    cand_best[c] = (S_c, L_c, K_c)

        if not cand_best:
            break

        kept = sorted(cand_best.items(), key=lambda x: x[1][2], reverse=True)[:min(beam, len(cand_best))]
        next_level = [i for i, _ in kept]
        for i, (Si, Li, Ki) in kept:
            S[i], Llen[i], K[i] = Si, Li, Ki

        levels.append(next_level)
        cur = next_level
        level_id += 1
        if max_depth is not None and level_id >= max_depth:
            break

    return K, levels, sims, p



def topk_labels_by_avg(
    doc_vec, label_emb, adj_upper, root„Ñ¥=(0,), beam=5, per_parent="l+2", k=5, **kw
):
    """ÌèâÍ∑† Ï†êÏàò Í∏∞Î∞ò ÏµúÏ¢Ö ÏÉÅÏúÑ k ÎùºÎ≤®(Î£®Ìä∏ Ï†úÏô∏)."""
    K, levels, sims, p = hierarchical_beam_similarity_avg(
        doc_vec, label_emb, adj_upper, root=list(roots), beam=beam, per_parent=per_parent, **kw
    )
    root_set = set(roots)
    order = np.argsort(-K)
    order = [i for i in order if i not in root_set and np.isfinite(K[i])]
    top = order[:k]
    return top, K[top]

In [12]:
# silver label
# documnet bert ÎßàÏßÄÎßâÏ∏µ unfrozenÌïòÍ≥†
# ÌïôÏäµ ÎèåÎ¶¨Î©¥ÏÑú label Ï∂îÍ∞ÄÌïòÍ∏∞

from __future__ import annotations
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd

# ----------------------------- IO Utils -----------------------------

def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    n = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (n + eps)


def load_embeddings_csv(path: str | Path, id_col: str = "id") -> Tuple[List[int], np.ndarray]:
    """Load embeddings from CSV where the first column is an id and the rest are feature columns.
    Returns (ids, float32 matrix).
    """
    df = pd.read_csv(path)
    cols = list(df.columns)
    if id_col in df.columns:
        id_series = df[id_col]
        X = df.drop(columns=[id_col])
    else:
        # Fallback: use the first column as id
        id_series = df.iloc[:, 0]
        X = df.iloc[:, 1:]
    ids = id_series.astype(int).tolist()
    X = X.to_numpy(dtype=np.float32)
    return ids, X



# ---------------- Hierarchical beam search (average score) ----------------


def all_label_similarity(
    doc_vec: np.ndarray,
    label_emb: np.ndarray,
    tau: float = 0.35,
    normalize: bool = True,
) -> Tuple[np.ndarray, np.ndarray]:
    """Î¨∏ÏÑú ÏûÑÎ≤†Îî© vs Î™®Îì† ÎùºÎ≤® ÏûÑÎ≤†Îî© Ï†êÏàò Ìïú Î≤àÏóê Í≥ÑÏÇ∞.

    Î∞òÌôò:
        sims: (N,) ÏõêÏãú Ïú†ÏÇ¨ÎèÑ (cosine Í∏∞Î∞ò)
        p:    (N,) sigmoid Ï†êÏàò
    """
    doc = np.asarray(doc_vec, dtype=np.float32)
    L = np.asarray(label_emb, dtype=np.float32)

    # Î™®Îì† ÎùºÎ≤®Ïóê ÎåÄÌï¥ Ìïú Î≤àÏóê
    sims = L @ doc  # (N,)
    p = 1.0 / (1.0 + np.exp(-sims / max(tau, 1e-6)))
    return sims, p


def silver_labeling(
    doc_ids: List[int],
    docs: np.ndarray,
    label_ids: List[int],
    label_emb: np.ndarray,
    threshold: float = 0.8,
    top_k: int = 3,
    tau: float = 0.35,
    root_id: int = 0,
    normalize: bool = True,
) -> pd.DataFrame:
    """Î¨∏ÏÑúÎßàÎã§ 'Î™®Îì†' ÎùºÎ≤® ÏûÑÎ≤†Îî©ÏùÑ ÎπÑÍµêÌï¥ÏÑú Ï†êÏàò ÎÜíÏùÄ ÎùºÎ≤®ÏùÑ ÎΩëÎäî Î≤ÑÏ†Ñ.

    - Ìä∏Î¶¨(adj) Ïïà ÌÉê.
    - root_idÎäî Í≤∞Í≥ºÏóêÏÑú Ï†úÏô∏.
    - p >= threshold Ïù∏ Ïï†Îì§ Ï§ëÏóêÏÑú top_kÎßå Í≥†Î¶Ñ.
    """
    # Í∏∞Î≥∏ Ï†ïÌï©ÏÑ± Ï≤¥ÌÅ¨
    label_ids = list(label_ids)
    N = label_emb.shape[0]
    if len(label_ids) != N:
        raise ValueError(f"label_ids Í∏∏Ïù¥({len(label_ids)})ÏôÄ label_emb Ìñâ({N})Ïù¥ Îã§Î¶ÖÎãàÎã§.")

    rows = []
    for idx, (doc_id, d) in enumerate(zip(doc_ids, docs)):
        sims, p = all_label_similarity(d, label_emb, tau=tau, normalize=normalize)

        # rootÎäî Ï†úÏô∏ÌïòÍ≥†, threshold Ïù¥ÏÉÅÎßå ÌõÑÎ≥¥Î°ú
        candidates = [
            (i, float(p[i]))
            for i in range(N)
            if i != root_id and np.isfinite(p[i]) and p[i] >= threshold
        ]

        # Ï†êÏàò ÎÇ¥Î¶ºÏ∞®Ïàú Ï†ïÎ†¨
        candidates.sort(key=lambda x: x[1], reverse=True)

        selected = candidates[:top_k]

        row = {"doc_id": int(doc_id)}
        for j in range(top_k):
            if j < len(selected):
                li, sc = selected[j]
                row[f"label_id_{j+1}"] = int(label_ids[li])
                row[f"score_{j+1}"] = float(sc)
            else:
                row[f"label_id_{j+1}"] = np.nan
                row[f"score_{j+1}"] = np.nan

        rows.append(row)

        if (idx + 1) % 100 == 0:
            print(f"Processed {idx+1} / {len(doc_ids)} docs...")

    return pd.DataFrame(rows)





In [13]:
"""
Self-training pipeline with hierarchical silver labeling and dynamic dataloaders.

- Reads document/label embeddings CSVs (first column "id", rest feat000..feat127)
- Reads upper-triangular adjacency (A[i,j]=1 means i->j)
- Makes initial silver labels via hierarchical beam search (average score)
- Splits into train/val on silver set; keeps the rest as unlabeled pool
- Trains a multi-label classifier (Linear/MLP) with BCEWithLogitsLoss
- Each epoch, pseudo-labels unlabeled docs whose predicted probs exceed a threshold
- Adds them to the training set (up to top_k per doc), with patience-based early stopping

Run example
-----------
python self_training_pipeline.py \
  --doc_csv docs.csv \
  --label_csv labels.csv \
  --adj adj.npy \
  --val_ratio 0.2 --epochs 50 --patience 5 \
  --silver_threshold 0.60 --silver_topk 3 --beam 5 --tau 0.35 --root_id 0 \
  --pseudo_threshold 0.70 --pseudo_topk 3 --batch_size 256 --lr 1e-3
"""
from __future__ import annotations
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
def load_embeddings_csv(path: str | Path, id_col: str = "id") -> Tuple[List[int], np.ndarray]:
    """Load embeddings from CSV where the first column is an id and the rest are feature columns.
    Returns (ids, float32 matrix).
    """
    df = pd.read_csv(path)
    cols = list(df.columns)
    if id_col in df.columns:
        id_series = df[id_col]
        X = df.drop(columns=[id_col])
    else:
        # Fallback: use the first column as id
        id_series = df.iloc[:, 0]
        X = df.iloc[:, 1:]
    ids = id_series.astype(int).tolist()
    X = X.to_numpy(dtype=np.float32)
    return ids, X


# ----------------------------- Datasets -----------------------------

class MultiLabelDataset(Dataset):
    def __init__(self, X: np.ndarray, Y: np.ndarray, indices: List[int] | None = None):
        self.X = X
        self.Y = Y
        self.indices = np.array(indices if indices is not None else np.arange(X.shape[0]), dtype=np.int64)
    def __len__(self):
        return self.indices.shape[0]
    def __getitem__(self, idx: int):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        y = torch.from_numpy(self.Y[i])
        return x, y

class UnlabeledDataset(Dataset):
    def __init__(self, X: np.ndarray, indices: List[int]):
        self.X = X
        self.indices = np.array(indices, dtype=np.int64)
    def __len__(self):
        return self.indices.shape[0]
    def __getitem__(self, idx: int):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        return x, i

# ----------------------------- Model -----------------------------

class MLPHead(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, hidden: int | None = 256, dropout: float = 0.1):
        super().__init__()
        if hidden is None or hidden <= 0:
            self.net = nn.Sequential(
                nn.LayerNorm(in_dim),
                nn.Linear(in_dim, out_dim),
            )
        else:
            self.net = nn.Sequential(
                nn.LayerNorm(in_dim),
                nn.Linear(in_dim, hidden),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden, out_dim),
            )
    def forward(self, x):
        return self.net(x)

# ----------------------------- Utils -----------------------------

def to_device(batch, device):
    if isinstance(batch, (tuple, list)):
        return [b.to(device) if torch.is_tensor(b) else b for b in batch]
    return batch.to(device)


def micro_f1(y_true: np.ndarray, y_prob: np.ndarray, thr: float = 0.5, eps: float = 1e-9) -> float:
    y_pred = (y_prob >= thr).astype(np.float32)
    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()
    prec = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2 * prec * rec / (prec + rec + eps)
    return float(f1)

# -------- Initial silver labeling (no CSV save; in-memory) --------
def make_initial_silver_hier(
    docs: np.ndarray,
    labels: np.ndarray,
    adj: np.ndarray,
    roots: list[int] = [0],
    silver_threshold: float = 0.6,    # Ïù¥Í±¥ avg(K) Í∏∞Ï§Ä
    silver_topk: int = 3,
    beam: int = 5,
    per_parent: str | int = "l+2",
    tau: float = 0.35,
) -> list[list[int]]:
    """
    Í≥ÑÏ∏µ Îπî ÏÑúÏπòÎ°ú Í∞Å Î¨∏ÏÑúÏùò ÎùºÎ≤® ÌõÑÎ≥¥Î•º ÎΩëÎäîÎã§.
    - Í≥ÑÏ∏µ Î∞ñ ÎùºÎ≤®ÏùÄ Ïï†Ï¥àÏóê Ïïà Îì§Ïñ¥Ïò¥
    - Î£®Ìä∏Îì§ÏùÄ Í≤∞Í≥ºÏóêÏÑú Ï†úÏô∏
    - K(Í≤ΩÎ°ú ÌèâÍ∑†) >= silver_threshold Ïù∏ Ïï†Îì§ Ï§ë top-k
    """
    N = labels.shape[0]
    silver: list[list[int]] = []
    root_set = set(roots)

    for d in docs:
        K, levels, sims, p = hierarchical_beam_similarity_avg(
            d, labels, adj,
            roots=roots,
            beam=beam,
            per_parent=per_parent,
            tau=tau,
            normalize=False,   # ÎÑà ÏûÑÎ≤†Îî©Ïù¥ Ïù¥ÎØ∏ L2ÎùºÎ©¥ False
        )
        # ÌèâÍ∑† Ï†êÏàòÎ°ú Ï†ïÎ†¨
        order = np.argsort(-K)
        # Î£®Ìä∏Îäî Ï†úÏô∏, Ïú†ÌïúÌïú Í≤ÉÎßå
        order = [i for i in order if (i not in root_set) and np.isfinite(K[i])]
        # threshold ÌÜµÍ≥ºÌïú Í≤ÉÎßå
        cand = [i for i in order if K[i] >= silver_threshold]
        selected = cand[:silver_topk]
        silver.append(selected)

    return silver

def make_initial_silver(
    docs: np.ndarray,
    labels: np.ndarray,
    adj: np.ndarray,              # Ïù¥Ï†ú Ïïà ÏîÄ (Ìò∏ÌôòÏö©ÏúºÎ°úÎßå Îë†)
    silver_threshold: float = 0.9,
    silver_topk: int = 3,
    beam: int = 5,                # Ïù¥Ï†ú Ïïà ÏîÄ
    tau: float = 0.35,
    root_id: int = 0,
) -> List[List[int]]:
    """
    Î¨∏ÏÑúÎßàÎã§ Ï†Ñ ÎùºÎ≤® ÏûÑÎ≤†Îî©Í≥ºÏùò Ïú†ÏÇ¨ÎèÑÎ•º Î≥¥Í≥† Ï¥àÍ∏∞ silver labelÏùÑ ÎßåÎì†Îã§.
    - Ìä∏Î¶¨/Í≤ΩÎ°ú ÌÉêÏÉâ Ïïà Ìï®
    - root_idÎäî Í≤∞Í≥ºÏóêÏÑú Ï†úÏô∏
    - p >= silver_thresholdÏù∏ ÎùºÎ≤® Ï§ëÏóêÏÑú ÏÉÅÏúÑ silver_topkÎßå ÎÇ®ÍπÄ
    """
    N = labels.shape[0]
    silver: List[List[int]] = []

    for d in docs:
        # Î¨∏ÏÑú vs Î™®Îì† ÎùºÎ≤® Ï†êÏàò
        sims, p = all_label_similarity(d, labels, tau=tau, normalize=True)

        # threshold ÌÜµÍ≥º + root Ï†úÏô∏
        cand = [
            (i, float(p[i]))
            for i in range(N)
            if i != root_id and np.isfinite(p[i]) and p[i] >= silver_threshold
        ]

        # Ï†êÏàò ÎÜíÏùÄ Ïàú
        cand.sort(key=lambda x: x[1], reverse=True)

        # label indexÎßå Ï∂îÏ∂ú
        selected = [i for i, _ in cand[:silver_topk]]
        silver.append(selected)

    return silver
# ------------------------ Train / Self-Training ------------------------

def train_epoch(model, loader, optim, device, criterion):
    model.train()
    total = 0.0
    for x, y in loader:
        x, y = to_device(x, device), to_device(y, device)
        optim.zero_grad(set_to_none=True)
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optim.step()
        total += float(loss.detach().cpu().item()) * x.size(0)
    return total / max(1, len(loader.dataset))


def eval_epoch(model, loader, device, criterion, thr=0.5):
    model.eval()
    total = 0.0
    ys = []
    ps = []
    with torch.no_grad():
        for x, y in loader:
            x, y = to_device(x, device), to_device(y, device)
            logits = model(x)
            loss = criterion(logits, y)
            total += float(loss.detach().cpu().item()) * x.size(0)
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            ys.append(y.detach().cpu().numpy())
            ps.append(prob)
    y_true = np.concatenate(ys, axis=0)
    y_prob = np.concatenate(ps, axis=0)
    f1 = micro_f1(y_true, y_prob, thr=thr)
    return total / max(1, len(loader.dataset)), f1, y_prob


def pseudo_label_and_grow(model, unl_ds: UnlabeledDataset,
                          num_labels: int,
                          pseudo_threshold: float = 0.9, pseudo_topk: int = 3,
                          device: str = "cpu", batch_size: int = 512):
    """Infer on unlabeled, select labels with prob>=threshold (top-k), and return new_indices and Y matrix."""
    if len(unl_ds) == 0:
        return [], np.zeros((0, num_labels), dtype=np.float32)
    loader = DataLoader(unl_ds, batch_size=batch_size, shuffle=False)
    model.eval()
    all_idx: List[int] = []
    all_y: List[np.ndarray] = []
    with torch.no_grad():
        for xb, idxs in loader:
            xb = xb.to(device)
            logits = model(xb)
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            for p, i in zip(prob, idxs.numpy().tolist()):
                sel = np.flatnonzero(p >= pseudo_threshold)
                if sel.size > 0:
                    # keep at most top-k by prob
                    if sel.size > pseudo_topk:
                        top = np.argpartition(-p[sel], pseudo_topk - 1)[:pseudo_topk]
                        sel = sel[top]
                    y = np.zeros(num_labels, dtype=np.float32)
                    y[sel] = 1.0
                    all_idx.append(int(i))
                    all_y.append(y)
    if len(all_idx) == 0:
        return [], np.zeros((0, num_labels), dtype=np.float32)
    Y_new = np.stack(all_y, axis=0)
    return all_idx, Y_new






In [14]:

doc_ids, X = load_embeddings_csv("Amazon_products/document_embeddings_768d.csv")
label_ids, L = load_embeddings_csv("Amazon_products/label_embeddings_gat.csv")
B

# Ensure label order matches adjacency
order = np.argsort(label_ids)
label_ids_sorted = [label_ids[i] for i in order]
L = L[order]
assert B.shape == (L.shape[0], L.shape[0]), "Adjacency/label size mismatch"

# Initial silver labeling
# silver = make_initial_silver(X, L, B, silver_threshold=0.9, silver_topk=3, tau=0.35, root_id=0)
silver = make_initial_silver_hier(
    X,          # docs
    L,          # label_emb
    B,          # upper adj
    roots=roots,  # Ïó¨Îü¨ Í∞úÎ©¥ [0, 10, 25] Ïù¥Îü∞ Ïãù
    silver_threshold=0.55,
    silver_topk=3,
    beam=5,
    per_parent="l+2",
    tau=0.35,
)

val_ratio  = 0.2
epochs = 50
patience = 5
batch_size = 256
lr = 1e-3
hidden = 256
dropout = 0.1
pseudo_threshold = 0.6
pseudo_topk = 3
seed = 42
# Build multi-hot targets for silver docs; split into train/val; keep unlabeled indices
N_labels = L.shape[0]
Y = np.zeros((X.shape[0], N_labels), dtype=np.float32)
silver_mask = np.zeros(X.shape[0], dtype=bool)
for i, lab_list in enumerate(silver):
    if len(lab_list) > 0:
        Y[i, lab_list] = 1.0
        silver_mask[i] = True

idx_silver = np.flatnonzero(silver_mask)
idx_unl = np.flatnonzero(~silver_mask)

# Train/val split
rng = np.random.default_rng(seed)
rng.shuffle(idx_silver)
n_val = max(1, int(len(idx_silver) * val_ratio)) if len(idx_silver) > 0 else 0
idx_val = idx_silver[:n_val]
idx_train = idx_silver[n_val:]

train_ds = MultiLabelDataset(X, Y, indices=idx_train)
val_ds = MultiLabelDataset(X, Y, indices=idx_val) if n_val > 0 else None
unl_ds = UnlabeledDataset(X, idx_unl.tolist())

print(len(train_ds))
print(len(val_ds))
print(len(unl_ds))
# Model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MLPHead(in_dim=X.shape[1], out_dim=N_labels, hidden=hidden, dropout=dropout).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=lr)
crit = nn.BCEWithLogitsLoss()

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False) if val_ds is not None else None

best_val = float("inf")
best_epoch = -1
no_improve = 0


5650
1412
22425


In [None]:
print(len(train_ds))
print(len(val_ds))

print(len(unl_ds))

In [23]:

for epoch in range(1, epochs + 1):
    tr_loss = train_epoch(model, train_loader, opt, device, crit)
    if val_loader is not None and len(val_ds) > 0:
        va_loss, va_f1, _ = eval_epoch(model, val_loader, device, crit, thr=0.5)
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}  val_loss={va_loss:.3f}  val_f1={va_f1:.3f}")
        # Early stopping on val_loss
        if va_loss + 1e-6 < best_val:
            best_val = va_loss
            best_epoch = epoch
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"Early stopping at epoch {epoch} (best@{best_epoch})")
                break
    else:
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}")

    # Pseudo-labeling step (after each epoch)
    new_idx, Y_new = pseudo_label_and_grow(
        model, unl_ds, N_labels,
        pseudo_threshold=pseudo_threshold,
        pseudo_topk=pseudo_topk,
        device=device,
        batch_size=batch_size,
    )
    if len(new_idx) > 0:
        # Extend training set
        # Update Y with new labels
        for i, y in zip(new_idx, Y_new):
            Y[i] = np.maximum(Y[i], y)  # merge if any
        # Move indices from unlabeled to train
        keep_mask = ~np.isin(unl_ds.indices, np.array(new_idx, dtype=np.int64))
        unl_ds.indices = unl_ds.indices[keep_mask]
        train_ds.indices = np.concatenate([train_ds.indices, np.array(new_idx, dtype=np.int64)])
        # Rebuild train loader to reflect length change
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
        print(f"  + Added {len(new_idx)} pseudo-labeled docs (unl pool ‚Üí {len(unl_ds)} left)")
    else:
        print("  + No pseudo-labeled docs added this epoch")

print("Training complete.")

Epoch 001 | train_loss=0.183  val_loss=0.021  val_f1=0.000
  + No pseudo-labeled docs added this epoch
Epoch 002 | train_loss=0.020  val_loss=0.018  val_f1=0.001
  + No pseudo-labeled docs added this epoch
Epoch 003 | train_loss=0.017  val_loss=0.015  val_f1=0.203
  + Added 827 pseudo-labeled docs (unl pool ‚Üí 21598 left)
Epoch 004 | train_loss=0.014  val_loss=0.014  val_f1=0.257
  + Added 1096 pseudo-labeled docs (unl pool ‚Üí 20502 left)
Epoch 005 | train_loss=0.012  val_loss=0.013  val_f1=0.277
  + Added 2385 pseudo-labeled docs (unl pool ‚Üí 18117 left)
Epoch 006 | train_loss=0.009  val_loss=0.013  val_f1=0.329
  + Added 1878 pseudo-labeled docs (unl pool ‚Üí 16239 left)
Epoch 007 | train_loss=0.008  val_loss=0.013  val_f1=0.342
  + Added 2058 pseudo-labeled docs (unl pool ‚Üí 14181 left)
Epoch 008 | train_loss=0.006  val_loss=0.013  val_f1=0.345
  + Added 2849 pseudo-labeled docs (unl pool ‚Üí 11332 left)
Epoch 009 | train_loss=0.005  val_loss=0.012  val_f1=0.369
  + Added 2377 p

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader

# 0) Î¨∏ÏÑú/ÎùºÎ≤® ÏûÑÎ≤†Îî© Î°úÎìú
doc_ids, X = load_embeddings_csv("Amazon_products/document_embeddings_768d.csv")
label_ids, L = load_embeddings_csv("Amazon_products/label_embeddings_gat.csv")

# 1) ÎùºÎ≤® ÏàúÏÑúÏôÄ B(Î∂ÄÎ™®->ÏûêÏãù) ÎßûÏ∂îÍ∏∞
order = np.argsort(label_ids)
label_ids = [label_ids[i] for i in order]
L = L[order]
assert B.shape == (L.shape[0], L.shape[0]), "Adjacency/label size mismatch"

# 2) Í≥ÑÏ∏µ silver ÎßåÎì§Í∏∞
silver = make_initial_silver_hier(
    X,          # docs (N, d)
    L,          # label_emb (C, d)
    B,          # upper adj (C, C)
    roots=roots,
    silver_threshold=0.55,
    silver_topk=3,
    beam=5,
    per_parent="l+2",
    tau=0.35,
)


# -------------------------------------------------
# 1) Í≥ÑÏ∏µ Ï†ïÎ≥¥ÏóêÏÑú parents / children ÎΩëÍ∏∞
#    B[parent, child] = 1 Ïù¥ÎùºÍ≥† ÌñàÏúºÎãàÍπå Í∑∏ÎåÄÎ°ú ÏîÄ
# -------------------------------------------------
# B: [C, C] (parent -> child)
def build_parents_children(adj):
    C = adj.shape[0]
    parents = [np.flatnonzero(adj[:, j]).astype(np.int64) for j in range(C)]
    children = [np.flatnonzero(adj[j]).astype(np.int64) for j in range(C)]
    return parents, children

parents, children = build_parents_children(B)


# -------------------------------------------------
# 2) silver ‚Üí Í≥ÑÏ∏µ pos/neg ÎßàÏä§ÌÅ¨Î°ú Î≥ÄÌôò
# -------------------------------------------------
def build_pos_neg_masks(silver, parents, children, num_labels):
    """
    silver: list[list[int]]  # Î¨∏ÏÑúÎßàÎã§ core label indexÎì§
    parents / children: list[np.ndarray]
    return:
      pos_masks: np.array [N_docs, C]
      neg_masks: np.array [N_docs, C]
    """
    N = len(silver)
    C = num_labels
    pos_masks = np.zeros((N, C), dtype=np.float32)
    neg_masks = np.zeros((N, C), dtype=np.float32)

    all_idx = np.arange(C)

    for i, core in enumerate(silver):
        core = list(core)
        # 1) coreÏùò Î∂ÄÎ™®ÍπåÏßÄ positive
        pos_set = set(core)
        for c in core:
            for p in parents[c]:
                pos_set.add(int(p))

        # 2) childrenÏùÄ ÎÇòÏ§ëÏóê negativeÏóêÏÑú Ï†úÏô∏
        child_set = set()
        for c in core:
            for ch in children[c]:
                child_set.add(int(ch))

        # pos ÎßàÏä§ÌÅ¨
        for p in pos_set:
            pos_masks[i, p] = 1.0

        # neg = Ï†ÑÏ≤¥ - pos - children
        for j in all_idx:
            if j in pos_set:
                continue
            if j in child_set:
                continue
            neg_masks[i, j] = 1.0

    return pos_masks, neg_masks

# -------------------------------------------------
# 3) Dataset: Î¨∏ÏÑú ÏûÑÎ≤†Îî© + pos/neg ÎßàÏä§ÌÅ¨
# -------------------------------------------------
class HierMultiLabelDataset(Dataset):
    def __init__(self, X, pos_masks, neg_masks, indices=None):
        self.X = X.astype(np.float32)
        self.pos = pos_masks.astype(np.float32)
        self.neg = neg_masks.astype(np.float32)
        if indices is None:
            self.indices = np.arange(self.X.shape[0], dtype=np.int64)
        else:
            self.indices = np.array(indices, dtype=np.int64)

    def __len__(self):
        return self.indices.shape[0]

    def __getitem__(self, idx):
        i = int(self.indices[idx])
        x = torch.from_numpy(self.X[i])
        pos = torch.from_numpy(self.pos[i])
        neg = torch.from_numpy(self.neg[i])
        return x, pos, neg

class UnlabeledDataset(Dataset):
    def __init__(self, X, indices):
        self.X = X.astype(np.float32)
        self.indices = np.array(indices, dtype=np.int64)
    def __len__(self):
        return len(self.indices)
    def __getitem__(self, idx):
        i = int(self.indices[idx])
        return torch.from_numpy(self.X[i]), i


# -------------------------------------------------
# 4) Bilinear classifier
#    doc_emb: [B, d_doc]
#    label_emb: [C, d_lab]  (ÎØ∏Î¶¨ GATÎ°ú ÎßåÎì† Í±∞)
#    Ï†êÏàò: doc @ W @ label_emb^T
# -------------------------------------------------
class BilinearHierClassifier(nn.Module):
    def __init__(self, doc_dim, label_emb, hidden_dim=None):
        super().__init__()
        # label_embÎäî ÌååÎùºÎØ∏ÌÑ∞Î°ú Îì§Í≥†ÏûàÎêò, ÏóÖÎç∞Ïù¥Ìä∏ Ïïà ÌïúÎã§Í≥† Í∞ÄÏ†ï(ÏõêÌïòÎ©¥ nn.ParameterÎ°ú)
        self.register_buffer("label_emb", torch.tensor(label_emb, dtype=torch.float32))
        C, d_lab = self.label_emb.shape
        self.doc_dim = doc_dim
        self.label_dim = d_lab

        if hidden_dim is None:
            # Î∞îÎ°ú doc_dim -> label_dim
            self.interaction = nn.Linear(doc_dim, d_lab, bias=False)
            self.proj = None
        else:
            # doc_dim -> hidden -> label_dim Í∞ôÏùÄ Í≤ÉÎèÑ Í∞ÄÎä•
            self.interaction = nn.Sequential(
                nn.Linear(doc_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, d_lab, bias=False),
            )

    def forward(self, x):
        """
        x: [B, d_doc]
        return: logits [B, C]
        """
        # x -> same dim as label
        h = self.interaction(x)                             # [B, d_lab]
        # [B, d_lab] @ [d_lab, C] -> [B, C]
        logits = torch.matmul(h, self.label_emb.t())
        return logits

# -------------------------------------------------
# 5) loss: Í≥ÑÏ∏µ ÎßàÏä§ÌÅ¨Î•º ÏîåÏö¥ BCE
# -------------------------------------------------
def hierarchical_bce_loss(logits, pos_mask, neg_mask):
    # logits: [B, C]
    # pos_mask, neg_mask: [B, C]
    loss_pos = -(pos_mask * F.logsigmoid(logits)).sum()
    loss_neg = -(neg_mask * F.logsigmoid(-logits)).sum()
    denom = (pos_mask.sum() + neg_mask.sum()).clamp(min=1.0)
    return (loss_pos + loss_neg) / denom

# -------------------------------------------------
# 6) ÌïôÏäµ Î£®ÌîÑ ÏòàÏãú
# -------------------------------------------------
# Ïù¥ÎØ∏ ÏûàÎäî Í≤ÉÎì§: X (Î¨∏ÏÑú BERT ÏûÑÎ≤†Îî©) : [N_docs, d_doc]
#                  L (ÎùºÎ≤® GAT ÏûÑÎ≤†Îî©)  : [C, d_lab]
#                  B_adj (Î∂ÄÎ™®->ÏûêÏãù)   : [C, C]
#                  silver (list[list[int]]) : Î¨∏ÏÑúÎ≥Ñ core label index
def train_epoch_hier(model, loader, opt, device):
    model.train()
    total = 0.0
    for xb, posb, negb in loader:
        xb = xb.to(device)
        posb = posb.to(device)
        negb = negb.to(device)
        logits = model(xb)
        loss = hierarchical_bce_loss(logits, posb, negb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total += loss.item() * xb.size(0)
    return total / len(loader.dataset)

# 1) micro F1 Í≥ÑÏÇ∞
def micro_f1_from_logits(logits, pos_mask, thr=0.5, eps=1e-9):
    """
    logits: [B, C]
    pos_mask: [B, C]  (1: positive, 0: else)
    """
    probs = torch.sigmoid(logits)
    preds = (probs >= thr).float()

    y_true = pos_mask
    y_pred = preds

    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()

    precision = tp / (tp + fp + eps)
    recall    = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return f1.item()

# 2) eval Ìï®Ïàò ÏàòÏ†ï: loss + f1 Îëò Îã§
def eval_epoch_hier(model, loader, device, k=3, thr=None):
    model.eval()
    total_loss = 0.0
    f1_list = []
    with torch.no_grad():
        for xb, posb, negb in loader:
            xb = xb.to(device)
            posb = posb.to(device)
            negb = negb.to(device)

            logits = model(xb)
            loss = hierarchical_bce_loss(logits, posb, negb)  # ÏúÑÏóê Î∞îÍæº Î≤ÑÏ†Ñ
            total_loss += loss.item() * xb.size(0)

            probs = torch.sigmoid(logits)

            if thr is not None:
                pred = (probs >= thr).float()
            else:
                # top-k Î∞©Ïãù
                B, C = probs.shape
                pred = torch.zeros_like(probs)
                topk = probs.topk(k, dim=1).indices
                pred.scatter_(1, topk, 1.0)

            # micro-f1
            y_true = posb
            y_pred = pred
            tp = (y_true * y_pred).sum().item()
            fp = ((1 - y_true) * y_pred).sum().item()
            fn = (y_true * (1 - y_pred)).sum().item()
            prec = tp / (tp + fp + 1e-9)
            rec  = tp / (tp + fn + 1e-9)
            f1   = 2 * prec * rec / (prec + rec + 1e-9)
            f1_list.append(f1)

    avg_loss = total_loss / len(loader.dataset)
    avg_f1 = float(np.mean(f1_list)) if f1_list else 0.0
    return avg_loss, avg_f1
def pseudo_label_and_grow_hier(
    model,
    unl_ds,             # UnlabeledDataset
    X_all,              # Ï†ÑÏ≤¥ Î¨∏ÏÑú ÏûÑÎ≤†Îî© (numpy)
    parents, children,
    num_labels,
    device,
    pseudo_threshold=0.6,
    pseudo_topk=3,
    batch_size=512,
):
    if len(unl_ds) == 0:
        return [], None, None

    loader = DataLoader(unl_ds, batch_size=batch_size, shuffle=False)
    model.eval()
    new_idx = []
    new_pos_list = []
    new_neg_list = []

    with torch.no_grad():
        for xb, idxs in loader:
            xb = xb.to(device)
            logits = model(xb)
            prob = torch.sigmoid(logits).cpu().numpy()

            for p, i_doc in zip(prob, idxs.numpy().tolist()):
                order = np.argsort(-p)
                top1 = p[order[0]]
                # 1) top-1Ïù¥ thresholdÎ•º Î™ª ÎÑòÏúºÎ©¥ Í∑∏ÎÉ• Î≤ÑÎ¶∞Îã§
                if top1 < pseudo_threshold:
                    continue
                core = [j for j in order if p[j] >= pseudo_threshold][:pseudo_topk]
                if len(core) == 0:
                    # ÏïÑÏòà Ïù¥ Î¨∏ÏÑúÎäî Ïù¥Î≤à epochÏóê Ïïà ÎÑ£Ïùå
                    continue

                # Í≥ÑÏ∏µ pos/neg Íµ¨ÏÑ±
                pos = set(core)
                for c in core:
                    for pa in parents[c]:
                        pos.add(int(pa))
                child = set()
                for c in core:
                    for ch in children[c]:
                        child.add(int(ch))

                pos_mask = np.zeros(num_labels, dtype=np.float32)
                neg_mask = np.zeros(num_labels, dtype=np.float32)
                for j in pos:
                    pos_mask[j] = 1.0
                for j in range(num_labels):
                    if j in pos:    # Ïù¥ÎØ∏ ÏñëÏÑ±
                        continue
                    if j in child:  # Î™®Î•¥Í≤†Ïùå ‚Üí negativeÏóêÏÑú Ï†úÏô∏
                        continue
                    neg_mask[j] = 1.0

                new_idx.append(int(i_doc))
                new_pos_list.append(pos_mask)
                new_neg_list.append(neg_mask)



    if len(new_idx) == 0:
        return [], None, None

    new_pos = np.stack(new_pos_list, axis=0)
    new_neg = np.stack(new_neg_list, axis=0)
    return new_idx, new_pos, new_neg



device = "cuda" if torch.cuda.is_available() else "cpu"
has_silver = np.array([len(lbls) > 0 for lbls in silver], dtype=bool)
N_docs = X.shape[0]
C = L.shape[0]

# silver ÏûàÎäî Î¨∏ÏÑú / ÏóÜÎäî Î¨∏ÏÑú
has_silver = np.array([len(lbls) > 0 for lbls in silver], dtype=bool)
idx_silver = np.flatnonzero(has_silver)      # Ïó¨Í∏∞Í∞Ä train/val ÌõÑÎ≥¥
idx_unl    = np.flatnonzero(~has_silver)     # ÏßÑÏßú unl

print("total:", N_docs)
print("with silver:", len(idx_silver))
print("unlabeled :", len(idx_unl))

# Ïù¥Ï†ú train/valÏùÄ silver ÏûàÎäî Ïï†Îì§Îßå ÏÑûÏñ¥ÏÑú ÎÇòÎààÎã§
rng = np.random.default_rng(42)
rng.shuffle(idx_silver)
n_val = int(len(idx_silver) * 0.2)
idx_val   = idx_silver[:n_val]
idx_train = idx_silver[n_val:]

# parents, children ÎßåÎì§Í∏∞
def build_parents_children(adj):
    C = adj.shape[0]
    parents = [np.flatnonzero(adj[:, j]).astype(np.int64) for j in range(C)]
    children = [np.flatnonzero(adj[j]).astype(np.int64) for j in range(C)]
    return parents, children

parents, children = build_parents_children(B)

pos_masks = np.zeros((N_docs, C), dtype=np.float32)
neg_masks = np.zeros((N_docs, C), dtype=np.float32)

for i in idx_silver:  # silver ÏûàÎäî Ïï†Îßå ÎèàÎã§
    core = silver[i]

    # 1) core + parents
    pos = set(core)
    for c in core:
        for p in parents[c]:
            pos.add(int(p))

    # 2) childrenÏùÄ Î™®Î¶Ñ
    child = set()
    for c in core:
        for ch in children[c]:
            child.add(int(ch))

    for p in pos:
        pos_masks[i, p] = 1.0

    for j in range(C):
        if j in pos:      # Ïù¥ÎØ∏ ÏñëÏÑ±
            continue
        if j in child:    # Î™®Î¶Ñ
            continue
        neg_masks[i, j] = 1.0



train_ds = HierMultiLabelDataset(X, pos_masks, neg_masks, indices=idx_train)
val_ds   = HierMultiLabelDataset(X, pos_masks, neg_masks, indices=idx_val) if len(idx_val) > 0 else None
unl_ds   = UnlabeledDataset(X, idx_unl.tolist())
print(len(train_ds),len(val_ds),len(unl_ds))

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=256, shuffle=False)

model = BilinearHierClassifier(doc_dim=X.shape[1], label_emb=L, hidden_dim=256).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
epochs = 150

N_labels = L.shape[0]
best_f1 = -1.0
patience = 15
no_improve = 0
warmup_self = 1   # 1 epochÏùÄ self-training Ïïà ÌïòÍ≤å Ìï¥ÏÑú Ìïú Î≤à ÏïàÏ†ïÌôî

for epoch in range(1, epochs + 1):
    # train
    tr_loss = train_epoch_hier(model, train_loader, opt, device)

    # val: f1 Í∏∞Ï§Ä
    if val_loader is not None and len(val_ds) > 0:
        va_loss, va_f1 = eval_epoch_hier(model, val_loader, device, k=3)
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}  val_loss={va_loss:.3f}  val_f1={va_f1:.3f}")

        # early stoppingÏùÑ f1Î°ú
        if va_f1 > best_f1 + 1e-6:
            best_f1 = va_f1
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"Early stopping at epoch {epoch} (best f1={best_f1:.4f})")
                break
    else:
        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.3f}")

    # self-training: 1ÏóêÌè≠Ïóê Ï†ÑÎ∂Ä Îì§Ïñ¥Í∞ÄÎäî Í±∞ Î∞©ÏßÄÏö©ÏúºÎ°ú warmup ÎÑ£Ïùå
    if epoch <= warmup_self:
        print("  + (skip pseudo-labeling on warmup epoch)")
        continue

    new_idx, new_pos, new_neg = pseudo_label_and_grow_hier(
        model,
        unl_ds,
        X,
        parents,
        children,
        C,                   # num_labels
        device=device,
        pseudo_threshold=0.60,
        pseudo_topk=3,
        batch_size=512,
    )

    if len(new_idx) > 0:
        # Ï†ÑÏó≠ ÎßàÏä§ÌÅ¨ Í∞±Ïã†
        pos_masks[new_idx] = new_pos
        neg_masks[new_idx] = new_neg

        # unlÏóêÏÑú Ï†úÍ±∞
        keep_mask = ~np.isin(unl_ds.indices, np.array(new_idx, dtype=np.int64))
        unl_ds.indices = unl_ds.indices[keep_mask]

        # trainÏóê Ï∂îÍ∞Ä
        train_ds.indices = np.concatenate([train_ds.indices, np.array(new_idx, dtype=np.int64)])
        train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, drop_last=False)

        print(f"  + Added {len(new_idx)} pseudo-labeled docs (unl pool ‚Üí {len(unl_ds)} left)")
    else:
        print("  + No pseudo-labeled docs added this epoch")


total: 29487
with silver: 7062
unlabeled : 22425
5650 1412 22425
Epoch 001 | train_loss=0.607  val_loss=0.444  val_f1=0.000
  + (skip pseudo-labeling on warmup epoch)
Epoch 002 | train_loss=0.269  val_loss=0.110  val_f1=0.072
  + No pseudo-labeled docs added this epoch
Epoch 003 | train_loss=0.076  val_loss=0.058  val_f1=0.063
  + No pseudo-labeled docs added this epoch
Epoch 004 | train_loss=0.055  val_loss=0.051  val_f1=0.005
  + No pseudo-labeled docs added this epoch
Epoch 005 | train_loss=0.049  val_loss=0.047  val_f1=0.018
  + No pseudo-labeled docs added this epoch
Epoch 006 | train_loss=0.045  val_loss=0.043  val_f1=0.100
  + No pseudo-labeled docs added this epoch
Epoch 007 | train_loss=0.043  val_loss=0.041  val_f1=0.192
  + No pseudo-labeled docs added this epoch
Epoch 008 | train_loss=0.041  val_loss=0.039  val_f1=0.203
  + Added 1 pseudo-labeled docs (unl pool ‚Üí 22424 left)
Epoch 009 | train_loss=0.039  val_loss=0.037  val_f1=0.209
  + Added 2 pseudo-labeled docs (unl po

In [25]:
# Minimal submission generator: pick 2‚Äì3 labels per sample via hierarchical beam scoring
import csv, os
from pathlib import Path
import numpy as np
import pandas as pd

# ------------ Paths (edit if needed) ------------
TEST_CORPUS = "Amazon_products/test/test_corpus.txt"   # lines: pid \t text
DOC_CSV     = "Amazon_products/test_doc_embeddings.csv"  # first col: id (pid), rest: feat000..feat127
LABEL_CSV   = "Amazon_products/label_embeddings_gat.csv" # first col: id (== node index 0..N-1)
OUT_PATH    = "submission.csv"
DOC_CSV = "Amazon_products/test_doc_embeddings.csv" # first col: id (pid), rest: feat000..feat127
build_and_save_document_embeddings(
    corpus_path=TEST_CORPUS,
    model_name="bert-base-uncased",
    alpha=0.7,
    max_length=256,    # Î¨∏ÏÑú Í∏∏Ïù¥Ïóê ÎßûÍ≤å Ï°∞Ï†à
    batch_size=32,
    out_csv=DOC_CSV,
    pad_width=2,
)
# ------------ Hyperparams ------------
MIN_LABS  = 2
MAX_LABS  = 3
BATCH = 1024

# load test pids
pids = []
with open(TEST_CORPUS, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.rstrip("\n").split("\t", 1)
        if len(parts) == 2:
            pids.append(parts[0])

# load doc embeddings (map pid->vec)
df_doc = pd.read_csv(DOC_CSV)
doc_ids = df_doc.iloc[:,0].astype(str).tolist()
D = df_doc.iloc[:,1:].to_numpy(dtype=np.float32)
D = l2_normalize(D)
pid2idx = {pid: i for i, pid in enumerate(doc_ids)}

# load label embeddings (ensure order aligns with adjacency rows)
df_lab = pd.read_csv(LABEL_CSV)
lab_ids = df_lab.iloc[:,0].astype(int).to_numpy()
L = df_lab.iloc[:,1:].to_numpy(dtype=np.float32)
ord = np.argsort(lab_ids)
lab_ids = lab_ids[ord]
L = L[ord]

# precompute children lists
children = [np.flatnonzero(A[i]) for i in range(A.shape[0])]



[OK] saved document embeddings ‚Üí Amazon_products/test_doc_embeddings.csv  shape=(19658, 769)


In [27]:
import csv, os
from pathlib import Path
import numpy as np
import pandas as pd

# ------------ Paths (edit if needed) ------------
TEST_CORPUS = "Amazon_products/test/test_corpus.txt"   # lines: pid \t text
DOC_CSV     = "Amazon_products/test_doc_embeddings.csv"  # first col: id (pid), rest: feat000..feat127
LABEL_CSV   = "Amazon_products/label_embeddings_gat.csv" # first col: id (== node index 0..N-1)
OUT_PATH    = "submission.csv"
DOC_CSV = "Amazon_products/test_doc_embeddings.csv" # first col: id (pid), rest: feat000..feat127
# ------------ Hyperparams ------------
MIN_LABS  = 2
MAX_LABS  = 3
BATCH = 1024

# load test pids
pids = []
with open(TEST_CORPUS, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.rstrip("\n").split("\t", 1)
        if len(parts) == 2:
            pids.append(parts[0])

# load doc embeddings (map pid->vec)
df_doc = pd.read_csv(DOC_CSV)
doc_ids = df_doc.iloc[:,0].astype(str).tolist()
D = df_doc.iloc[:,1:].to_numpy(dtype=np.float32)
pid2idx = {pid: i for i, pid in enumerate(doc_ids)}

# load label embeddings (ensure order aligns with adjacency rows)
df_lab = pd.read_csv(LABEL_CSV)
lab_ids = df_lab.iloc[:,0].astype(int).to_numpy()
L = df_lab.iloc[:,1:].to_numpy(dtype=np.float32)
ord = np.argsort(lab_ids)
lab_ids = lab_ids[ord]
L = L[ord]

# precompute children lists
children = [np.flatnonzero(A[i]) for i in range(A.shape[0])]


In [28]:
model.eval()

def ancestors_of(node, adj):
    # adj[parent, child] = 1 Í∞ÄÏ†ï
    parents = np.flatnonzero(adj[:, node])  # (N,)
    return parents.tolist()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

IN_DIM = D.shape[1]   # ‚ùóÔ∏èÏó¨Í∏∞ DÎ°ú
missing = 0  # ÏóÜÎäî pid Ïπ¥Ïö¥Ìä∏

with open(OUT_PATH, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["id", "label"])

    buf_x, buf_pid = [], []

    def flush():
        if not buf_x:
            return
        xb = torch.from_numpy(np.stack(buf_x, axis=0).astype(np.float32)).to(device)
        with torch.inference_mode():
            prob = torch.sigmoid(model(xb)).detach().cpu().numpy()
        prob = np.nan_to_num(prob, nan=-1.0, posinf=1.0, neginf=0.0)
    
        for pid, p in zip(buf_pid, prob):
            order = np.argsort(-p)
    
            # 1) Í∏∞Î≥∏ ÌõÑÎ≥¥ ÎΩëÍ∏∞
            thr_keep = [i for i in order if p[i] >= 0.5][:MAX_LABS]
            if len(thr_keep) >= MIN_LABS:
                keep = thr_keep[:MAX_LABS]
            else:
                keep = order[:max(MIN_LABS, len(thr_keep))]
                if len(keep) < MIN_LABS:
                    keep = order[:MIN_LABS]
    
            # 2) Î∂ÄÎ™® ÌõÑÎ≥¥ Î™®ÏúºÍ∏∞ (keepÏóêÎäî ÏïÑÏßÅ Ïïà Îì§Ïñ¥Í∞Ñ Í≤ÉÎßå)
            parent_cands = []
            for c in keep:
                pars = ancestors_of(c, B)
                for pa in pars:
                    if pa not in keep and pa not in parent_cands:
                        parent_cands.append(pa)
    
            # Î∂ÄÎ™® ÌõÑÎ≥¥Î•º ÌôïÎ•† ÏàúÏúºÎ°ú Ï†ïÎ†¨ (ÏûàÏúºÎ©¥ Îçî ÎÜíÏùÄ Î∂ÄÎ™®Î∂ÄÌÑ∞)
            parent_cands.sort(key=lambda idx: p[idx], reverse=True)
    
            # 3) ÎÇ®Îäî Ïä¨Î°ØÎßåÌÅº Î∂ÄÎ™® Ï±ÑÏö∞Í∏∞
            final_idxs = list(keep)
            for pa in parent_cands:
                if len(final_idxs) >= MAX_LABS:
                    break
                final_idxs.append(pa)
    
            # 4) ÌòπÏãúÎùºÎèÑ ÏµúÏÜå Í∞úÏàò Î™ª Ï±ÑÏõ†ÏúºÎ©¥ orderÏóêÏÑú Ï±ÑÏö∞Í∏∞
            if len(final_idxs) < MIN_LABS:
                for idx in order:
                    if idx not in final_idxs:
                        final_idxs.append(idx)
                    if len(final_idxs) >= MIN_LABS:
                        break
    
            labels = sorted(int(lab_ids[i]) for i in final_idxs)
            w.writerow([pid, ",".join(map(str, labels))])
    
        buf_x.clear()
        buf_pid.clear()

    for pid in pids:
        j = pid2idx.get(pid, None)
        if j is None:
            x = np.zeros(IN_DIM, dtype=np.float32)
            missing += 1
        else:
            x = D[j]
            if x.dtype != np.float32:
                x = x.astype(np.float32, copy=False)
        buf_x.append(x)
        buf_pid.append(pid)
        if len(buf_x) >= BATCH:
            flush()
    flush()

print(f"Saved: {OUT_PATH} | samples={len(pids)} | min-max labels per sample={MIN_LABS}-{MAX_LABS} | missing_pids={missing}")


Saved: submission.csv | samples=19658 | min-max labels per sample=2-3 | missing_pids=0


In [89]:
"""
# ------------------------
# Dummy baseline for Kaggle submission
# Generates random multi-label predictions
# ------------------------
import os
import csv
import random
from tqdm import tqdm

# --- Paths ---
TEST_DIR = "Amazon_products/test"  # modify if needed
TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
SUBMISSION_PATH = "submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0‚Äì530)
MIN_LABELS = 1     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# --- Load test corpus ---
def load_corpus(path):
    """Load test corpus into {pid: text} dictionary."""
"""
    pid2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

pid2text_test = load_corpus(TEST_CORPUS_PATH)
pid_list_test = list(pid2text_test.keys())

# --- Generate random predictions ---
all_pids, all_labels = [], []
for pid in tqdm(pid_list_test, desc="Generating dummy predictions"):
    n_labels = random.randint(MIN_LABELS, MAX_LABELS)
    labels = random.sample(range(NUM_CLASSES), n_labels)
    labels = sorted(labels)
    all_pids.append(pid)
    all_labels.append(labels)

# --- Save submission file ---
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])
    for pid, labels in zip(all_pids, all_labels):
        writer.writerow([pid, ",".join(map(str, labels))])

print(f"Dummy submission file saved to: {SUBMISSION_PATH}")
print(f"Total samples: {len(all_pids)}, Classes per sample: {MIN_LABELS}-{MAX_LABELS}")"""

Generating dummy predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19658/19658 [00:00<00:00, 419908.88it/s]

Dummy submission file saved to: submission.csv
Total samples: 19658, Classes per sample: 1-3



