In [70]:
# Mount Google Drive at /content/drive (Colab-only helper); the data paths
# defined further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
import json
from tqdm import tqdm
from pathlib import Path
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset

# numpy must be imported in this cell: np.random.seed() below needs it, and the
# helper cell that also imports numpy only runs later in the notebook.
import numpy as np

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed torch and numpy so repeated runs draw the same random numbers.
torch.manual_seed(42)
np.random.seed(42)

In [72]:
# Default paths: everything is resolved relative to ROOT on the mounted drive.
ROOT = Path("/content/drive/MyDrive/BDA Project/Amazon_products")
TRAIN_PATH = ROOT.joinpath("train")
TEST_PATH = ROOT.joinpath("test")

# One corpus file inside each split folder.
TRAIN_CORPUS_PATH = TRAIN_PATH.joinpath("train_corpus.txt")
TEST_CORPUS_PATH = TEST_PATH.joinpath("test_corpus.txt")

# Class-level files stored directly under ROOT.
CLASSES_PATH = ROOT.joinpath("classes.txt")
HIERARCHY_PATH = ROOT.joinpath("class_hierarchy.txt")
REL_KEYWORDS_PATH = ROOT.joinpath("class_related_keywords.txt")

In [73]:
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import itertools
from pathlib import Path
import json

# ------------------------
# Data-loading helpers
# ------------------------

def load_lines(p: Path):
    """Return the lines of a UTF-8 text file with trailing newlines removed.

    Args:
        p: path of the file to read.

    Returns:
        list[str]: one entry per line, in file order.
    """
    stripped = []
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            stripped.append(raw.rstrip("\n"))
    return stripped

def load_pid2text(p: Path):
    """Read a two-column, tab-separated file into a ``{pid: text}`` dict.

    Only the first tab on a line separates pid from text; lines that contain
    no tab at all are ignored. A pid that appears twice keeps its last text.

    Args:
        p: path of the UTF-8 TSV file.

    Returns:
        dict[str, str]: mapping from pid to text.
    """
    mapping = {}
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            pid, tab, text = raw.rstrip("\n").partition("\t")
            if not tab:
                # No separator found: the line is not a (pid, text) pair.
                continue
            mapping[pid] = text
    return mapping

def load_classes_int(p: Path):
    """Parse a ``<id><TAB><name>`` file into a dict keyed by integer id.

    Args:
        p: path of the UTF-8 class file; every line must hold exactly two
            tab-separated fields.

    Returns:
        dict[int, str]: class id -> class name.

    Raises:
        ValueError: if a line does not split into exactly two fields or the
            id field is not an integer.
    """
    id_to_name = {}
    with p.open("r", encoding="utf-8") as handle:
        rows = (raw.rstrip("\n").split("\t") for raw in handle)
        for raw_id, name in rows:
            id_to_name[int(raw_id)] = name
    return id_to_name

def load_classes_str(p: Path):
    """Parse a ``<id><TAB><name>`` file into a dict keyed by class name.

    Args:
        p: path of the UTF-8 class file; every line must hold exactly two
            tab-separated fields.

    Returns:
        dict[str, int]: class name -> class id.

    Raises:
        ValueError: if a line does not split into exactly two fields or the
            id field is not an integer.
    """
    with p.open("r", encoding="utf-8") as handle:
        pairs = (raw.rstrip("\n").split("\t") for raw in handle)
        return {name: int(raw_id) for raw_id, name in pairs}

def load_keywords(p: Path):
    """Parse a ``<key>:<item>,<item>,...`` file into a ``{key: [items]}`` dict.

    Only the first colon on a line separates the key from the item list, so
    items may themselves contain colons. Blank (whitespace-only) lines are
    skipped. Items are returned exactly as written (no whitespace trimming).

    Args:
        p: path of the UTF-8 keyword file.

    Returns:
        dict[str, list[str]]: key -> list of comma-separated items.

    Raises:
        ValueError: if a non-blank line contains no colon.
    """
    keywords = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            # maxsplit=1: a colon inside the item list must not break parsing.
            key, items = line.split(":", 1)
            keywords[key] = items.split(",")
    return keywords

def load_class_graph(p: Path):
    """Read tab-separated integer pairs, one per line, as a list of edges.

    Args:
        p: path of the UTF-8 edge file; every line must hold exactly two
            tab-separated integers.

    Returns:
        list[tuple[int, int]]: the (first, second) pair of every line, in
        file order.

    Raises:
        ValueError: if a line does not hold exactly two integer fields.
    """
    edge_list = []
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            parent_id, child_id = (int(tok) for tok in raw.rstrip("\n").split("\t"))
            edge_list.append((parent_id, child_id))
    return edge_list

def load_json(path):
    """Load a JSON file into a Python object.

    The file is decoded as UTF-8 explicitly, like the other loaders in this
    notebook, instead of relying on the platform's default encoding.

    Args:
        path: str or path-like location of the JSON file.

    Returns:
        The deserialized JSON content (dict, list, ...).
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# ------------------------
# Visualization
# ------------------------

def plot_results(results_dict, split="valid", metric='Accuracy'):
    """
    Draw one line per model showing a metric across epochs.

    Args:
        results_dict: dict of dicts, indexed first by split and then by
            model label; each leaf is the per-epoch list of metric values.
            Example: results_dict["valid"]["mlp_partial"] = [0.8, 0.82, ...]
        split: key of results_dict to plot (e.g. "valid" or "test")
        metric: metric name used for the title and the y-axis label

    Raises:
        AssertionError: if split is not a key of results_dict.
    """
    assert split in results_dict, f"{split} not in results_dict"

    fig, ax = plt.subplots(figsize=(8, 5))
    for series_name, values in results_dict[split].items():
        ax.plot(values, label=series_name)

    ax.set_title(f"{split.capitalize()} {metric} over Epochs")
    ax.set_xlabel("Epoch")
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# ------------------------
# Printing evaluation results
# ------------------------

def print_eval_result(metrics: dict, stage="val", is_improved=False):
    """
    Print a one-line summary of accuracy and macro F1.

    Args:
        metrics: dict providing the keys 'accuracy' and 'f1_macro'
        stage: label shown upper-cased in brackets (e.g., "val", "test"),
            padded to at least four characters
        is_improved: when True, the line ends with ' *'
    """
    tag = f"{stage.upper():4}"
    accuracy = metrics['accuracy']
    f1_macro = metrics['f1_macro']
    marker = " *" if is_improved else ""
    print(f"[{tag}] Acc: {accuracy:.4f} | F1-macro: {f1_macro:.4f}{marker}")


def print_eval_result_esci(metrics: dict, stage="val", is_improved=False):
    """
    Print accuracy and macro F1, followed by per-class accuracy if present.

    Args:
        metrics: dict providing 'accuracy' and 'f1_macro'; when it also has
            'per_class_accuracy' (a dict keyed by class id 0-3), a second
            line lists the accuracy of each class under its E/S/C/I letter
        stage: label shown upper-cased in brackets (e.g., "val", "test")
        is_improved: when True, the summary line ends with ' *'
    """
    tag = f"{stage.upper():4}"
    accuracy = metrics['accuracy']
    f1_macro = metrics['f1_macro']
    marker = " *" if is_improved else ""
    print(f"[{tag}] Acc: {accuracy:.4f} | F1-macro: {f1_macro:.4f}{marker}")

    # The per-class breakdown is optional.
    if "per_class_accuracy" not in metrics:
        return

    esci_letter = {0: "E", 1: "S", 2: "C", 3: "I"}
    parts = []
    for cls_id, acc in metrics["per_class_accuracy"].items():
        parts.append(f"{esci_letter[cls_id]}: {acc:.4f}")
    print("        " + " | ".join(parts))

In [74]:
# ---------- Read-only loads ----------

# pid -> text mapping for each split.
train_pid2text = load_pid2text(TRAIN_CORPUS_PATH)
test_pid2text = load_pid2text(TEST_CORPUS_PATH)

# The class file read in both directions: id -> name and name -> id.
classes_int = load_classes_int(CLASSES_PATH)
classes_str = load_classes_str(CLASSES_PATH)

# Keyword lists per key, and the (parent, child) edges of the class hierarchy.
rel_keywords = load_keywords(REL_KEYWORDS_PATH)
class_graph_edges = load_class_graph(HIERARCHY_PATH)

n_train, n_test = len(train_pid2text), len(test_pid2text)
print(f"#train={n_train:,}  #test={n_test:,}")

#train=29,487  #test=19,658


In [75]:
EMB_PATH = Path("/content/drive/MyDrive/BDA Project/Embeddings3")

# Load the three embedding dictionaries in the order train, test, class.
train_emb_dict, test_emb_dict, class_emb_dict = (
    load_json(EMB_PATH / file_name)
    for file_name in (
        "train_embeddings.json",
        "test_embeddings.json",
        "class_embeddings.json",
    )
)

# Stack the dict values into dense tensors; row i is the i-th value in dict order.
# NOTE(review): later cells index class_emb rows by class id, so this assumes
# class_embeddings.json lists classes in id order — confirm.
train_emb = torch.tensor([vec for vec in train_emb_dict.values()])  # (N, d)
test_emb = torch.tensor([vec for vec in test_emb_dict.values()])    # (N', d)
class_emb = torch.tensor([vec for vec in class_emb_dict.values()])  # (C, d)

In [76]:
# L2-normalise every row so that a dot product between rows is a cosine similarity.
doc_embs, class_embs = (
    F.normalize(emb, p=2, dim=1)   # (N, d) and (C, d) respectively
    for emb in (train_emb, class_emb)
)

# Row counts of the two normalised matrices.
num_docs, num_classes = doc_embs.size(0), class_embs.size(0)

def compute_doc_class_sim(doc_embs, class_embs, batch_size=512):
    """
    Multiply document embeddings by class embeddings in row batches.

    doc_embs: (N_docs, d)
    class_embs: (N_classes, d)
    batch_size: number of document rows multiplied at a time
    return: sim_matrix (N_docs, N_classes) of dot products, which are cosine
        similarities when both inputs hold L2-normalised rows
    """
    batch_starts = range(0, doc_embs.size(0), batch_size)
    blocks = [
        # Slicing past the end is clamped, so the last batch may be shorter.
        doc_embs[lo:lo + batch_size] @ class_embs.T      # (B, N_classes)
        for lo in batch_starts
    ]
    return torch.cat(blocks, dim=0)                      # (N_docs, N_classes)

In [77]:
# Example: parents[c] lists the parent of class c; a root class has an empty list.
# Fill in the actual values according to your own taxonomy.
# parents = [...]  # length = num_classes

def build_class_paths(parents):
    """
    Build the root-to-class path of every class in a single-parent taxonomy.

    Args:
        parents: List[List[int]]
            parents[c] = []       (root)
                       = [p_idx]  (one parent)
            If a class lists several parents, only the first one is followed.

    Returns:
        paths: List[List[int]]
            paths[c] = [root, ..., c]

    Raises:
        ValueError: if following the parent links revisits a class, i.e. the
            hierarchy contains a cycle (the walk would otherwise never end).
    """
    paths = []

    for cid in range(len(parents)):
        path = [cid]
        visited = {cid}
        cur = cid
        # Climb towards the root, one parent at a time.
        while parents[cur]:
            cur = parents[cur][0]   # the single parent
            if cur in visited:
                raise ValueError(
                    f"cycle detected in class hierarchy while walking up from class {cid}"
                )
            visited.add(cur)
            path.append(cur)
        # The walk collected child -> root; callers expect root -> child.
        path.reverse()
        paths.append(path)

    return paths


In [78]:
def compute_path_scores(sims, paths):
    """
    Average each document's similarities along the path of every class.

    sims:  (N_docs, N_classes), sims[i, c] = similarity of doc i and class c
    paths: List[List[int]], paths[c] = class indices on the path ending at c

    return:
        path_scores: (N_docs, N_classes)
            path_scores[i, c] = mean_{k in paths[c]} sims[i, k]
    """
    _, n_classes = sims.shape
    scores = torch.empty_like(sims)

    for cls_id in range(n_classes):
        # Column indices of all classes on this path, on the same device as sims.
        on_path = torch.tensor(paths[cls_id], device=sims.device, dtype=torch.long)
        # (N_docs, path_len) -> mean over the path -> (N_docs,)
        scores[:, cls_id] = sims.index_select(1, on_path).mean(dim=1)

    return scores


In [79]:
from collections import deque

num_classes = len(classes_int)

# 1) Start with an empty parent list and an empty child list for every class id.
#    NOTE(review): later code follows only parents[c][0], i.e. it assumes each
#    class has at most one parent — nothing here enforces that.
parents = [[] for _ in range(num_classes)]
children = [[] for _ in range(num_classes)]

# 2) Record each (parent, child) edge on both endpoints.
for p, c in class_graph_edges:   # p: parent, c: child
    children[p].append(c)
    parents[c].append(p)

# 3) Roots are the classes that never appeared as a child.
root_classes = [cid for cid, own_parents in enumerate(parents) if not own_parents]

print("roots:", root_classes)

roots: [0, 3, 10, 23, 40, 169]


In [80]:
# 1) Document-class cosine similarity
sims = compute_doc_class_sim(doc_embs, class_embs)   # (N_docs, C)

# 2) Build the class paths (parents was already built from the parent->child edges)
paths = build_class_paths(parents)                  # List[List[int]]

# 3) Compute the path scores
path_scores = compute_path_scores(sims, paths)      # (N_docs, C)

path_scores

tensor([[0.1224, 0.1776, 0.1828,  ..., 0.2621, 0.1382, 0.2010],
        [0.2853, 0.2496, 0.2305,  ..., 0.2192, 0.3404, 0.2348],
        [0.3100, 0.3251, 0.3104,  ..., 0.2018, 0.3650, 0.2239],
        ...,
        [0.1477, 0.1185, 0.1248,  ..., 0.1707, 0.1679, 0.1814],
        [0.1482, 0.1312, 0.1361,  ..., 0.1846, 0.1413, 0.1666],
        [0.2024, 0.2311, 0.2399,  ..., 0.2425, 0.2726, 0.2556]])

In [81]:
# 4) Pick the core labels of each document (based on the Taxo-style path score)
k = 3
top_k_scores, top_k_labels = path_scores.topk(k=k, dim=1) # (N_docs, k), (N_docs, k)

# top_k_labels[i] now holds the k class indices with the highest path score for document i
# top_k_scores[i] holds the matching k path scores of that document

In [83]:
import os, csv
from pathlib import Path
import pandas as pd

SUBMISSION_PATH = Path("/content/drive/MyDrive/BDA Project/outputs") / "(alibaba)path-core-silver.csv"

# Make sure the output folder exists so that open() cannot fail on a missing directory.
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write one CSV row per document: its row index and its top-k class ids.
# NOTE(review): "id" is the row position in top_k_labels, not the product id —
# confirm that this is what the consumer of the file expects.
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "labels"])
    # tolist() turns the (N_docs, k) tensor into plain lists of ints, one per document.
    for row_i, labels in enumerate(top_k_labels.tolist()):
        # Labels are comma-separated inside a single CSV field.
        label_str = ",".join(map(str, labels))
        writer.writerow([row_i, label_str])

print("File saved at", SUBMISSION_PATH)

File saved at /content/drive/MyDrive/BDA Project/outputs/(alibaba)path-core-silver.csv
