In [None]:
# Step 1: Install Required Packages (run once, before the rest of the notebook)
#   In a notebook cell: %pip install "bertopic[all]" transformers torch tqdm
#   In a terminal:      pip install "bertopic[all]" transformers torch tqdm

# Step 2: Import Libraries
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [None]:
# Step 3: Load Pretrained Indonesian BERT
# One checkpoint identifier is used for both the tokenizer and the model so
# the two always come from the same pretrained source.
model_name = "cahya/bert-base-indonesian-522M"
bert_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Put the model in evaluation mode before it is used to compute embeddings.
bert_model.eval()

In [None]:
# Step 4: Text Preprocessing Function (simple version)
def preprocess_text(text):
    """Normalize a raw text string before it is embedded.

    Steps, in order: lowercase the text; delete links (``http...``),
    ``@`` mentions and ``#`` hashtags; delete every remaining character that
    is not an ASCII letter, an ASCII digit or whitespace; trim leading and
    trailing whitespace.

    Matched spans are deleted, not replaced by a space, so interior runs of
    whitespace are left untouched and text on either side of a removed
    character is joined (``"a,b"`` becomes ``"ab"``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, which may be empty.
    """
    noise_pattern = r"http\S+|@\S+|#[A-Za-z0-9_]+"  # links, mentions, hashtags
    symbol_pattern = r"[^a-zA-Z0-9\s]"  # punctuation and any other symbol
    cleaned = text.lower()
    # Order matters: links/mentions/hashtags must be removed while their
    # marker characters (":", "@", "#") are still present.
    for pattern in (noise_pattern, symbol_pattern):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

In [None]:
# Step 5: Load Your Data
# Read the CSV into a DataFrame, then show its first rows as the cell output.
csv_path = "./data/indonesian_texts_labeled_relaxed.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Clean text
# Missing values are replaced with "" *before* the string cast: a bare
# .astype(str) turns a missing value into literal text such as "nan", which
# would then be cleaned and embedded as if it were a real document.
# Filling (rather than dropping) keeps `docs` aligned one-to-one with the
# rows of `df`.
docs = (
    df["normalized_text"]
    .fillna("")
    .astype(str)
    .apply(preprocess_text)
    .tolist()
)


In [None]:
# Step 6: Define CLS-based Embedding Function
def get_cls_embedding(text):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``bert_model``.

    The text is tokenized into PyTorch tensors (truncated to at most 128
    tokens), passed through the model with gradient tracking disabled, and
    the last-layer hidden state at sequence position 0 (the CLS token) is
    returned.

    Args:
        text: The text to embed (the notebook passes one string per call).

    Returns:
        A NumPy array built from ``last_hidden_state[:, 0, :]`` with all
        size-1 dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():  # inference only; no gradients are needed
        hidden_states = bert_model(**encoded).last_hidden_state
    first_token_state = hidden_states[:, 0, :]  # position 0 is the CLS token
    return first_token_state.squeeze().numpy()

In [None]:
# Step 7: Generate Embeddings
# Embed the documents one at a time (tqdm reports progress over `docs`) and
# collect the per-document results into a single NumPy array.
embeddings = np.array(list(map(get_cls_embedding, tqdm(docs))))