In [11]:
from pathlib import Path
import torch

# Pick the compute device: "cuda" when PyTorch reports an available GPU, otherwise "cpu".
# NOTE(review): `device` is not referenced by any cell visible in this section — confirm it is still needed.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
# Default dataset layout: every input file lives under a single root directory.
ROOT = Path("Amazon_products")
TRAIN_PATH = ROOT.joinpath("train")
TEST_PATH = ROOT.joinpath("test")

# One corpus file per split.
TRAIN_CORPUS_PATH = TRAIN_PATH.joinpath("train_corpus.txt")
TEST_CORPUS_PATH = TEST_PATH.joinpath("test_corpus.txt")

# Label metadata shared by both splits: class names, hierarchy edges, related keywords.
CLASSES_PATH = ROOT.joinpath("classes.txt")
HIERARCHY_PATH = ROOT.joinpath("class_hierarchy.txt")
REL_KEYWORDS_PATH = ROOT.joinpath("class_related_keywords.txt")

In [13]:
# ---------- Loader functions ----------

def load_lines(p: Path):
    """Return the lines of the UTF-8 text file ``p`` as a list, without trailing newlines.

    Only the newline character is removed; other trailing whitespace is kept.
    """
    collected = []
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            collected.append(raw.rstrip("\n"))
    return collected

def load_pid2text(p: Path):
    """Read a ``pid<TAB>text`` file into a ``{pid: text}`` dict.

    Each line is split at its first tab only, so any further tabs stay in the
    text. Lines containing no tab are skipped. When a pid occurs more than
    once, the last occurrence wins.
    """
    mapping = {}
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            pid, tab, text = raw.rstrip("\n").partition("\t")
            if not tab:
                # No tab on this line: not a (pid, text) record.
                continue
            mapping[pid] = text
    return mapping

def load_classes_int(p: Path):
    """Read a ``label_id<TAB>label_name`` file into ``{int(label_id): label_name}``.

    Every line must split on tabs into exactly two fields; otherwise the
    unpacking raises ``ValueError``.
    """
    with p.open("r", encoding="utf-8") as handle:
        rows = (raw.rstrip("\n").split("\t") for raw in handle)
        return {int(label_id): label_name for label_id, label_name in rows}

def load_classes_str(p: Path):
    """Read a ``label_id<TAB>label_name`` file into ``{label_name: int(label_id)}``.

    Every line must split on tabs into exactly two fields; otherwise the
    unpacking raises ``ValueError``.
    """
    with p.open("r", encoding="utf-8") as handle:
        rows = (raw.rstrip("\n").split("\t") for raw in handle)
        return {label_name: int(label_id) for label_id, label_name in rows}

def load_keywords(p: Path):
    """Read a ``class_name:kw1,kw2,...`` file into ``{class_name: [kw1, kw2, ...]}``.

    Args:
        p: Path of the UTF-8 keyword file, one class per line.

    Returns:
        Dict mapping each class name to its list of keyword strings, exactly as
        they appear between commas (no whitespace trimming).

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    keywords = {}
    with p.open("r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.rstrip("\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
                continue
            # Split at the FIRST colon only, so a colon inside the keyword list
            # no longer breaks the two-way unpacking.
            key, sep, items = line.partition(":")
            if not sep:
                raise ValueError(
                    f"{p}:{lineno}: expected 'class:kw1,kw2,...', got {line!r}"
                )
            keywords[key] = items.split(",")
    return keywords

def load_class_graph(p: Path):
    """Read the class hierarchy file into a list of ``(parent_id, child_id)`` edges.

    Args:
        p: Path of a UTF-8 file with one ``parent_id<TAB>child_id`` pair per line.

    Returns:
        List of ``(int, int)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line does not hold exactly two
            tab-separated integers.
    """
    edges = []
    with p.open("r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead of crashing on int('').
                continue
            # Distinct names: the loop must not rebind the Path parameter `p`.
            parent_id, child_id = map(int, line.split("\t"))
            edges.append((parent_id, child_id))
    return edges

In [14]:
# ---------- Read-only loads ----------
import pprint

# Load every input file into memory using the loader functions defined above.
train_pid2text    = load_pid2text(TRAIN_CORPUS_PATH)    # pid -> text, train split
test_pid2text     = load_pid2text(TEST_CORPUS_PATH)     # pid -> text, test split
classes_int       = load_classes_int(CLASSES_PATH)      # int label ID -> label name
classes_str       = load_classes_str(CLASSES_PATH)      # label name -> int label ID
rel_keywords      = load_keywords(REL_KEYWORDS_PATH)    # label name -> list of keywords
class_graph_edges = load_class_graph(HIERARCHY_PATH)    # list of (parent_id, child_id)

# Sanity check: corpus sizes.
print(f"#train={len(train_pid2text):,}  #test={len(test_pid2text):,}")
# NOTE(review): this prints the entire edge list (one line per edge); consider
# showing only a slice such as class_graph_edges[:10] to keep the output short.
pprint.pprint(class_graph_edges)

#train=29,487  #test=19,658
[(0, 1),
 (0, 8),
 (0, 208),
 (0, 211),
 (0, 213),
 (0, 216),
 (0, 229),
 (0, 255),
 (0, 265),
 (0, 218),
 (0, 271),
 (0, 277),
 (0, 249),
 (0, 288),
 (0, 313),
 (0, 357),
 (1, 2),
 (1, 434),
 (1, 413),
 (1, 483),
 (1, 464),
 (1, 520),
 (1, 527),
 (3, 4),
 (3, 5),
 (3, 13),
 (3, 15),
 (3, 17),
 (3, 21),
 (3, 28),
 (3, 30),
 (3, 34),
 (3, 35),
 (3, 50),
 (3, 51),
 (3, 53),
 (3, 85),
 (3, 111),
 (3, 120),
 (3, 147),
 (4, 7),
 (4, 19),
 (4, 70),
 (4, 174),
 (4, 176),
 (4, 183),
 (4, 188),
 (4, 210),
 (4, 238),
 (4, 294),
 (4, 375),
 (4, 392),
 (4, 407),
 (4, 433),
 (4, 435),
 (5, 6),
 (5, 57),
 (5, 97),
 (5, 133),
 (5, 228),
 (5, 331),
 (5, 385),
 (8, 9),
 (8, 206),
 (8, 235),
 (8, 270),
 (8, 285),
 (8, 308),
 (8, 327),
 (8, 391),
 (8, 405),
 (8, 410),
 (8, 514),
 (10, 11),
 (10, 44),
 (10, 54),
 (10, 60),
 (10, 64),
 (10, 220),
 (11, 12),
 (11, 69),
 (11, 45),
 (11, 109),
 (11, 205),
 (11, 67),
 (11, 419),
 (13, 14),
 (13, 20),
 (13, 37),
 (13, 122),
 (13, 150

In [15]:
# Parent/child lookup dicts (keyed by integer label ID)
from collections import defaultdict

# parents[c]  -> list of direct parents of label c
# children[p] -> list of direct children of label p
# A label may appear under more than one parent, hence lists rather than single values.
parents = defaultdict(list)
children = defaultdict(list)
for p_id, c_id in class_graph_edges:
    parents[c_id].append(p_id)
    children[p_id].append(c_id)

# Membership tests on a defaultdict do not create keys, so `parents` is left unchanged here.
roots = [cid for cid in classes_int.keys() if cid not in parents]  # labels with no parent
print(f"#nodes={len(classes_int)}  #edges={len(class_graph_edges)}  #roots={len(roots)}")

#nodes=531  #edges=568  #roots=6


In [16]:
# class_related_keywords.txt has the form "class_str:kw1,kw2,..."
# -> build a mapping from integer label ID to a descriptive text
def build_label_texts(classes_int, rel_keywords):
    """Build ``{label_id: "name kw1 kw2 ..."}`` for every label.

    Args:
        classes_int: Dict mapping integer label ID to label name.
        rel_keywords: Dict mapping label name to a list of keyword strings.
            Labels missing from this dict get their name alone as text.

    Returns:
        Dict mapping each integer label ID to its name followed by its
        keywords, space-separated, with surrounding whitespace stripped.
    """
    return {
        label_id: " ".join([label_name, *rel_keywords.get(label_name, [])]).strip()
        for label_id, label_name in classes_int.items()  # keywords are looked up by label name
    }

# One descriptive text per label (label name + related keywords), keyed by integer label ID.
label_texts = build_label_texts(classes_int, rel_keywords)
# Preview the first three entries as a sanity check.
print(list(label_texts.items())[:3])


[(0, 'grocery_gourmet_food snacks condiments beverages specialty_foods spices cooking_oils baking_ingredients gourmet_chocolates artisanal_cheeses organic_foods'), (1, 'meat_poultry butcher cuts marination grilling roasting seasoning halal organic deli marbling'), (2, 'jerky beef turkey chicken venison buffalo kangaroo elk ostrich bison spicy')]


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Document texts (the vectorizer is fit on train and only applied to test)
# The *_ids tuples keep the dict iteration order, so row i of X_test belongs to test_ids[i].
train_ids, train_texts = zip(*train_pid2text.items())
test_ids,  test_texts  = zip(*test_pid2text.items())

# Unigram + bigram TF-IDF, vocabulary capped at 100k features, terms must occur in >= 2 documents.
vec = TfidfVectorizer(max_features=100_000, ngram_range=(1,2), min_df=2)
X_train = vec.fit_transform(train_texts)   # fit
X_test  = vec.transform(test_texts)        # transform

# Project the label texts into the same vocabulary
# Fix the order of label IDs (sorting makes the column-index -> label-ID mapping easy later)
# NOTE(review): label names contain underscores (e.g. "grocery_gourmet_food"); with the
# vectorizer's default tokenization these presumably stay single tokens that may not
# occur in the corpus vocabulary — confirm, and consider replacing "_" with " ".
label_ids_sorted = sorted(classes_int.keys())   # [0,1,2,...]
label_text_list  = [label_texts[cid] for cid in label_ids_sorted]
X_labels = vec.transform(label_text_list)

# Score matrix (n_test x n_labels): dot product of each test document with each label text
# NOTE(review): presumably equals cosine similarity, assuming the vectorizer's default
# row normalization — confirm against the scikit-learn documentation.
S = X_test @ X_labels.T
S = S.tocsr()   # CSR makes row-wise access easy
S.shape

(19658, 531)

In [18]:
def force_include_parents(pred_set_ids, parent_map=None):
    """Add every ancestor of the given labels to the set (integer label IDs).

    Args:
        pred_set_ids: Set of integer label IDs. It is modified IN PLACE.
        parent_map: Optional mapping ``child_id -> iterable of parent_ids``.
            Defaults to the notebook-level ``parents`` dict, so existing
            one-argument calls behave as before.

    Returns:
        The same set object, now also containing all direct and indirect
        parents of its original members.
    """
    if parent_map is None:
        # Resolved at call time so the function follows the current global hierarchy.
        parent_map = parents
    pending = list(pred_set_ids)
    while pending:
        node = pending.pop()
        for parent_id in parent_map.get(node, ()):
            # Each ID is queued at most once (only when first added),
            # so the walk terminates even if the hierarchy contains a cycle.
            if parent_id not in pred_set_ids:
                pred_set_ids.add(parent_id)
                pending.append(parent_id)
    return pred_set_ids

def decode_row(scores_csr_row, k=3):
    """Choose up to ``k`` integer label IDs for one document from its score row.

    Args:
        scores_csr_row: A 1 x n_labels sparse row (any object exposing
            ``toarray()``); column ``i`` holds the score of ``label_ids_sorted[i]``.
        k: Maximum number of labels to return. Values larger than the number of
            available labels are clamped instead of raising.

    Returns:
        List of at most ``k`` label IDs ordered by descending score. Empty when
        ``k <= 0`` or the row has no columns.

    NOTE(review): parents are force-included and the result is then cut back to
    the ``k`` best scores, so a forced parent survives only when its score ties
    with (or beats) one of the original top-k labels. Raise ``k`` or drop the
    final truncation if hierarchy-consistent output is required.
    """
    scores = scores_csr_row.toarray().ravel()
    if k <= 0 or scores.size == 0:
        return []
    # np.argpartition raises ValueError when kth is out of bounds,
    # so never request more entries than the row has.
    top_n = min(k, scores.size)
    # Column indices of the top-n scores (positions in label_ids_sorted), best first.
    idx = np.argpartition(scores, -top_n)[-top_n:]
    idx = idx[np.argsort(-scores[idx])]
    # Column index -> actual integer label ID.
    pred_ids = {label_ids_sorted[i] for i in idx}
    # Force-include all ancestors of the selected labels.
    pred_ids = force_include_parents(pred_ids)
    # If the set grew, keep only the k highest-scoring labels.
    keep = sorted(
        pred_ids,
        key=lambda cid: scores[label_ids_sorted.index(cid)],
        reverse=True,
    )[:k]
    return keep


In [20]:
import os, csv
# Make sure the output directory exists before writing.
os.makedirs("outputs", exist_ok=True)

SUBMISSION_PATH = "outputs/tf-idf-submission.csv"

# Write one CSV row per test document: its id and a comma-joined list of predicted label IDs.
# newline="" lets the csv module control line endings itself.
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "labels"])
    for row_i, pid in enumerate(test_ids):  # keep the original order of the test file
        labels = decode_row(S[row_i], k=3)  # [integer label ID, ...]
        writer.writerow([pid, ",".join(map(str, labels))])

print("File saved at", SUBMISSION_PATH)


File saved at outputs/tf-idf-submission.csv
