In [2]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json, os
from tqdm import tqdm
from pathlib import Path
from utils import *
import numpy as np
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Default paths
ROOT = Path("/content/drive/MyDrive/BDA Project/Amazon_products")

# One sub-directory per split.
TRAIN_PATH = ROOT.joinpath("train")
TEST_PATH = ROOT.joinpath("test")

# Corpus file of each split.
TRAIN_CORPUS_PATH = TRAIN_PATH.joinpath("train_corpus.txt")
TEST_CORPUS_PATH = TEST_PATH.joinpath("test_corpus.txt")

# Label resources stored directly under ROOT.
CLASSES_PATH = ROOT.joinpath("classes.txt")
HIERARCHY_PATH = ROOT.joinpath("class_hierarchy.txt")
REL_KEYWORDS_PATH = ROOT.joinpath("class_related_keywords.txt")

In [5]:
# ---------- Loader functions ----------

def load_lines(p: Path):
    """Return the lines of a UTF-8 text file with trailing newlines removed.

    Args:
        p: Path of the file to read.

    Returns:
        list[str]: One entry per line, in file order.
    """
    stripped = []
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            stripped.append(raw.rstrip("\n"))
    return stripped

def load_pid2text(p: Path):
    """Parse a tab-separated ``pid<TAB>text`` file into a ``{pid: text}`` dict.

    Only the first tab on a line separates the two fields, so the text part
    may itself contain tabs. Lines without any tab are ignored, and when a
    pid appears more than once the last occurrence wins.

    Args:
        p: Path of the TSV file to read (UTF-8).

    Returns:
        dict[str, str]: Mapping from pid to its text.
    """
    mapping = {}
    with p.open("r", encoding="utf-8") as handle:
        for row in handle:
            pid, tab, text = row.rstrip("\n").partition("\t")
            if tab:  # empty separator means the row had no tab at all
                mapping[pid] = text
    return mapping

def load_classes_int(p: Path):
    """Build the ``id -> name`` lookup from a two-column tab-separated file.

    Each line must hold exactly ``<int id><TAB><name>``; any other shape
    raises ``ValueError``.

    Args:
        p: Path of the class list (UTF-8).

    Returns:
        dict[int, str]: Class name keyed by integer class id.
    """
    with p.open("r", encoding="utf-8") as handle:
        # Lazily split each row; the comprehension consumes it while the file is open.
        rows = (raw.rstrip("\n").split("\t") for raw in handle)
        return {int(idx): name for idx, name in rows}

def load_classes_str(p: Path):
    """Build the ``name -> id`` lookup from a two-column tab-separated file.

    Each line must hold exactly ``<int id><TAB><name>``; any other shape
    raises ``ValueError``.

    Args:
        p: Path of the class list (UTF-8).

    Returns:
        dict[str, int]: Integer class id keyed by class name.
    """
    name_to_id = {}
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.rstrip("\n").split("\t")
            idx, name = fields  # exactly two columns expected
            name_to_id[name] = int(idx)
    return name_to_id

def load_keywords(p: Path):
    """Read a ``key:item1,item2,...`` file into ``{key: [item1, item2, ...]}``.

    Only the first colon on a line separates the key from its items, so an
    item may itself contain a colon. Blank lines are skipped. Items are kept
    exactly as written (no whitespace trimming).

    Args:
        p: Path of the keyword file (UTF-8).

    Returns:
        dict[str, list[str]]: Comma-split items for each key.

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    keywords = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            # maxsplit=1: a colon inside the keyword list must not break parsing
            key, items = line.split(":", 1)
            keywords[key] = items.split(",")
    return keywords

def load_class_graph(p: Path):
    """Read tab-separated integer pairs into a list of edges.

    Each non-blank line must hold exactly two integers separated by a tab.
    The pair is stored in file order; the local names ``parent``/``child``
    reflect the presumed meaning of the two columns (verify against the
    hierarchy file). Blank lines are skipped.

    Args:
        p: Path of the edge list (UTF-8).

    Returns:
        list[tuple[int, int]]: One ``(first, second)`` tuple per line.

    Raises:
        ValueError: If a non-blank line does not contain exactly two integers.
    """
    edges = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            # Distinct names so the path parameter `p` is not rebound mid-loop.
            parent, child = map(int, line.split("\t"))
            edges.append((parent, child))
    return edges

In [6]:
# ---------- Read-only loads ----------

# Corpora: pid -> text for each split.
train_pid2text = load_pid2text(TRAIN_CORPUS_PATH)
test_pid2text = load_pid2text(TEST_CORPUS_PATH)

# Class vocabulary in both directions (id -> name and name -> id).
classes_int = load_classes_int(CLASSES_PATH)
classes_str = load_classes_str(CLASSES_PATH)

# Auxiliary label resources: related keywords and hierarchy edges.
rel_keywords = load_keywords(REL_KEYWORDS_PATH)
class_graph_edges = load_class_graph(HIERARCHY_PATH)

# Quick sanity check on the corpus sizes.
print(f"#train={len(train_pid2text):,}  #test={len(test_pid2text):,}")

#train=29,487  #test=19,658


## Using BERT to make embeddings

In [7]:
# ===== BERT Tokenizer and Model =====
from transformers import BertTokenizer, BertModel

MODEL_NAME = "bert-base-uncased"

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, put it in eval mode, then move it onto `device`.
bert_model = BertModel.from_pretrained(MODEL_NAME)
bert_model.eval()
bert_model = bert_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state``, the per-token
                      embeddings that are pooled over dimension 1.
        attention_mask (torch.Tensor): Mask with one entry per token; it is
                                       expanded along a new last dimension to
                                       match ``last_hidden_state``. Positions
                                       with 0 contribute nothing to the mean.

    Returns:
        torch.Tensor: Masked sum over dimension 1 divided by the mask total.
    """
    hidden = model_output.last_hidden_state
    # Stretch the mask over the hidden dimension so it lines up with `hidden`.
    weights = attention_mask.unsqueeze(-1).expand_as(hidden).float()
    masked_total = (hidden * weights).sum(dim=1)
    # Clamp so a fully masked row does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

def encode_texts(texts, batch_size=64):
    """
    Encode a list of texts into mean-pooled BERT embeddings.

    Uses the module-level ``tokenizer``, ``bert_model`` and ``mean_pooling``.
    Each text is truncated to at most 512 tokens and batches are padded to
    their longest member. The forward pass runs without gradient tracking and
    every batch result is moved to the CPU before concatenation.

    Args:
        texts (list of str): Input texts to encode.
        batch_size (int, optional): Batch size for encoding. Default is 64.

    Returns:
        torch.Tensor: Tensor of shape (len(texts), hidden_size) containing
        embeddings. For empty ``texts`` an empty (0, hidden_size) tensor is
        returned.
    """
    if len(texts) == 0:
        # torch.cat raises on an empty list, so return a correctly shaped
        # empty result instead of failing after the loop.
        return torch.empty((0, bert_model.config.hidden_size))

    all_embeddings = []

    # Process texts in mini-batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]

        # Tokenize and move to the model's device
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(bert_model.device)

        # Forward pass through BERT (inference only, no autograd graph)
        with torch.no_grad():
            output = bert_model(**encoded)

        # Mean pooling (exclude padding tokens); keep results on the CPU so
        # device memory is not held across batches
        embeddings = mean_pooling(output, encoded["attention_mask"])
        all_embeddings.append(embeddings.cpu())

    # Concatenate all batch embeddings
    return torch.cat(all_embeddings, dim=0)

In [10]:
# Encode both corpora in dict order. encode_texts already disables gradients
# for its forward pass; the outer no_grad context is kept as in the original.
with torch.no_grad():
    train_embeddings = encode_texts([*train_pid2text.values()])
    test_embeddings = encode_texts([*test_pid2text.values()])

100%|██████████| 461/461 [12:48<00:00,  1.67s/it]
100%|██████████| 308/308 [08:29<00:00,  1.65s/it]


In [13]:
# Directory where the embedding files are written. It has to be a Path rather
# than a plain str, because the file names below are appended with the "/"
# operator (str / str raises TypeError).
EMBEDDINGS_SAVE_DIR = Path("/content/drive/MyDrive/BDA Project/Embeddings")

os.makedirs(EMBEDDINGS_SAVE_DIR, exist_ok=True)

# train_embeddings / test_embeddings must already exist. encode_texts was fed
# list(<dict>.values()), and dicts preserve insertion order, so keys() lines up
# row-for-row with the embedding matrices.
train_pids = list(train_pid2text.keys())
test_pids = list(test_pid2text.keys())

# Convert to numpy arrays (memory note: the resulting files are large for big corpora).
train_arr = train_embeddings.detach().cpu().numpy() if isinstance(train_embeddings, torch.Tensor) else np.array(train_embeddings)
test_arr  = test_embeddings.detach().cpu().numpy()  if isinstance(test_embeddings, torch.Tensor)  else np.array(test_embeddings)

# zip() silently truncates to the shorter input, so verify the alignment first.
assert len(train_pids) == len(train_arr), "train pid count does not match number of embeddings"
assert len(test_pids) == len(test_arr), "test pid count does not match number of embeddings"

# Build the pid -> embedding (as a plain list) mappings and save them as JSON.
train_map = {pid: emb.tolist() for pid, emb in zip(train_pids, train_arr)}
test_map  = {pid: emb.tolist() for pid, emb in zip(test_pids,  test_arr)}

with open(EMBEDDINGS_SAVE_DIR / "train_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(train_map, f, ensure_ascii=False)

with open(EMBEDDINGS_SAVE_DIR / "test_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(test_map, f, ensure_ascii=False)

print(f"Saved train_embeddings -> {EMBEDDINGS_SAVE_DIR}/train_embeddings.json (items={len(train_map)})")
print(f"Saved test_embeddings  -> {EMBEDDINGS_SAVE_DIR}/test_embeddings.json  (items={len(test_map)})")

Saved train_embeddings -> /content/drive/MyDrive/BDA Project/Amazon_products/Embeddings/train_embeddings.json (items=29487)
Saved test_embeddings  -> /content/drive/MyDrive/BDA Project/Amazon_products/Embeddings/test_embeddings.json  (items=19658)
