In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import json, os
from tqdm import tqdm
from pathlib import Path
import numpy as np
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
# Default paths
ROOT = Path("/content/drive/MyDrive/BDA Project/Amazon_products")
TRAIN_PATH = ROOT / "train"
TEST_PATH = ROOT / "test"

TRAIN_CORPUS_PATH = TRAIN_PATH / "train_corpus.txt"
TEST_CORPUS_PATH = TEST_PATH  / "test_corpus.txt"
CLASSES_PATH = ROOT / "classes.txt"
HIERARCHY_PATH = ROOT / "class_hierarchy.txt"
REL_KEYWORDS_PATH = ROOT / "class_related_keywords.txt"

In [7]:
# ---------- Function for loads ----------

def load_lines(p: Path):
    with p.open("r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

def load_pid2text(p: Path):
    """TSV: pid \\t text  -> dict[pid]=text"""
    pid2text = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

def load_classes_int(p: Path):
    class_dict = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            label_int, label_str = line.rstrip("\n").split("\t")
            class_dict[int(label_int)] = label_str
    return class_dict

def load_classes_str(p: Path):
    class_dict = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            label_int, label_str = line.rstrip("\n").split("\t")
            class_dict[label_str] = int(label_int)
    return class_dict

def load_keywords(p: Path):
    keywords = {}
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            key, items = line.rstrip("\n").split(":")
            item_list = [item for item in items.split(",")]
            keywords[key] = item_list
    return keywords

def load_class_graph(p: Path):
    edges = []
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            p, c = map(int, line.rstrip("\n").split("\t"))
            edges.append((p, c))
    return edges

In [8]:
# ---------- Read-only loads ----------

train_pid2text    = load_pid2text(TRAIN_CORPUS_PATH)
test_pid2text     = load_pid2text(TEST_CORPUS_PATH)
classes_int       = load_classes_int(CLASSES_PATH)
classes_str       = load_classes_str(CLASSES_PATH)
rel_keywords      = load_keywords(REL_KEYWORDS_PATH)
class_graph_edges = load_class_graph(HIERARCHY_PATH)

print(f"#train={len(train_pid2text):,}  #test={len(test_pid2text):,}")

#train=29,487  #test=19,658


## Using BERT to make embeddings

In [9]:
# ===== BERT Tokenizer and Model =====
import torch
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [10]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state          # (B, L, H)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    embeddings = (token_embeddings * input_mask_expanded).sum(dim=1) / input_mask_expanded.sum(dim=1).clamp(min=1e-9)
    return embeddings

@torch.no_grad()
def encode_texts_mpnet(texts, batch_size=64):
    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=128,   # 필요하면 더 길게
            return_tensors="pt"
        ).to(device)

        outputs = model(**enc)
        sent_emb = mean_pooling(outputs, enc["attention_mask"])
        # 검색용이면 정규화
        sent_emb = torch.nn.functional.normalize(sent_emb, p=2, dim=1)
        all_embs.append(sent_emb.cpu())
    return torch.cat(all_embs, dim=0)  # (N, hidden_size)

In [12]:
with torch.no_grad():
    train_embeddings = encode_texts_mpnet(list(train_pid2text.values()))
    test_embeddings = encode_texts_mpnet(list(test_pid2text.values()))

KeyboardInterrupt: 

In [None]:
# 새 셀: 생성된 BERT 임베딩을 JSON 파일로 저장

EMBEDDINGS_SAVE_DIR = Path("/content/drive/MyDrive/BDA Project/Embeddings2")

os.makedirs(EMBEDDINGS_SAVE_DIR, exist_ok=True)

# train_embeddings, test_embeddings 변수와 pid 순서가 존재해야 합니다.
# train_pid2text, test_pid2text의 keys() 순서가 encode_texts 호출 순서와 일치하는지 확인하세요.
train_pids = list(train_pid2text.keys())
test_pids = list(test_pid2text.keys())

# numpy list 변환 (메모리 주의: 대규모일 경우 파일 크기 큼)
train_arr = train_embeddings.detach().numpy() if isinstance(train_embeddings, torch.Tensor) else np.array(train_embeddings)
test_arr  = test_embeddings.detach().numpy()  if isinstance(test_embeddings, torch.Tensor)  else np.array(test_embeddings)

# pid -> embedding(리스트) 매핑 생성 및 저장
train_map = {pid: emb.tolist() for pid, emb in zip(train_pids, train_arr)}
test_map  = {pid: emb.tolist() for pid, emb in zip(test_pids,  test_arr)}

with open(EMBEDDINGS_SAVE_DIR / "train_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(train_map, f, ensure_ascii=False)

with open(EMBEDDINGS_SAVE_DIR / "test_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(test_map, f, ensure_ascii=False)

print(f"Saved train_embeddings -> {EMBEDDINGS_SAVE_DIR}/train_embeddings.json (items={len(train_map)})")
print(f"Saved test_embeddings  -> {EMBEDDINGS_SAVE_DIR}/test_embeddings.json  (items={len(test_map)})")

In [13]:
class_texts = []
for i in range(len(list(classes_int.values()))):
  class_i = classes_int[i]
  text = class_i + " : " + ", ".join(rel_keywords[class_i])
  class_texts.append(text)

In [15]:
with torch.no_grad():
    class_embeddings = encode_texts_mpnet(class_texts)

In [16]:
EMBEDDINGS_SAVE_DIR = Path("/content/drive/MyDrive/BDA Project/Embeddings2")

class_pids = list(classes_int.keys())

# numpy list 변환 (메모리 주의: 대규모일 경우 파일 크기 큼)
class_arr = class_embeddings.detach().numpy() if isinstance(class_embeddings, torch.Tensor) else np.array(class_embeddings)

# pid -> embedding(리스트) 매핑 생성 및 저장
class_map = {pid: emb.tolist() for pid, emb in zip(class_pids, class_arr)}

with open(EMBEDDINGS_SAVE_DIR / "class_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(class_map, f, ensure_ascii=False)

print(f"Saved class_embeddings  -> {EMBEDDINGS_SAVE_DIR}/class_embeddings.json  (items={len(class_map)})")

Saved class_embeddings  -> /content/drive/MyDrive/BDA Project/Embeddings2/class_embeddings.json  (items=531)
