
# IndoBERT Sentiment — Inference-Only Notebook (V1)

## Process Overview
1) **Install Dependencies** — Make sure required libraries are available.
2) **Mount Google Drive & Set Paths** — Use the trained model at `/content/drive/MyDrive/Proyek/Sentiment_IndoBERT/best_model`.
3) **Load Model & Label Maps** — Load the tokenizer and model from the saved directory, then build the `id2label` mapping (from `label_maps.json` if present, otherwise from the model config).
4) **Define Batch Inference Helper** — `predict_texts()` for fast batched predictions.
5) **(Optional) CSV Batch Prediction** — Read a CSV with a text column, write predictions to a new CSV.
6) **Quick Demo** — Run a few example sentences.


In [None]:

# === Process 1: Install Dependencies ===
# IPython shell escape (not plain Python): quietly (-q) installs or upgrades
# the listed packages in the notebook runtime.
# NOTE(review): torch is imported by a later cell but is not installed here —
# presumably it is preinstalled in the runtime; confirm.
!pip install -q --upgrade transformers datasets accelerate scikit-learn sentencepiece


In [None]:

# === Process 2: Mount Google Drive & Set Paths ===
from pathlib import Path

from google.colab import drive

# Make Drive contents visible under /content/drive before touching the paths below.
drive.mount('/content/drive')

# Saved model directory, and the folder that receives prediction outputs.
MODEL_DIR = Path("/content/drive/MyDrive/Proyek/Sentiment_IndoBERT/best_model")
OUTPUT_DIR = Path("/content/drive/MyDrive/Proyek/Sentiment_IndoBERT")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong path is visible right away.
for _label, _path in (("MODEL_DIR:", MODEL_DIR), ("OUTPUT_DIR:", OUTPUT_DIR)):
    print(_label, _path)


In [None]:

# === Process 3: Load Model & Label Maps ===
import json

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR.as_posix())
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR.as_posix())
# Run on the GPU when the runtime has one; predict_texts() moves every batch
# to model.device, so nothing else needs to change.
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Resolve the class-index -> label-name mapping, in order of preference:
#   1) "id2label" from label_maps.json in OUTPUT_DIR,
#   2) model.config.id2label,
#   3) stringified class indices.
# An existing JSON file without a usable "id2label" entry must not leave the
# mapping empty, otherwise every prediction lookup would raise KeyError.
id2label = {}
label_maps_path = OUTPUT_DIR / "label_maps.json"
if label_maps_path.exists():
    with open(label_maps_path, "r", encoding="utf-8") as f:
        maps = json.load(f)
    # JSON object keys are always strings; convert them back to int indices.
    id2label = {int(k): v for k, v in (maps.get("id2label") or {}).items()}

if not id2label:
    config_id2label = getattr(model.config, "id2label", None)
    if isinstance(config_id2label, dict) and config_id2label:
        id2label = {int(k): v for k, v in config_id2label.items()}
    else:
        id2label = {i: str(i) for i in range(model.config.num_labels)}
print("id2label:", id2label)


In [None]:

# === Process 4: Define Batch Inference Helper ===
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import DataCollatorWithPadding

# Maximum number of tokens kept per text; predict_texts() passes this as
# `max_length` with truncation enabled.
MAX_LEN = 128
# Collator handed to the DataLoader in predict_texts(); tokenization there uses
# padding=False, so padding is left to this collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def predict_texts(texts, batch_size=64):
    """Predict a label for each input text using the loaded model.

    Args:
        texts: Iterable of strings to classify.
        batch_size: Number of texts per forward pass.

    Returns:
        A list of label names (values of ``id2label``), one per input text,
        in input order. An empty input yields an empty list.
    """
    texts = list(texts)
    if not texts:
        # Nothing to tokenize; skip building a dataset that would have no
        # token columns at all.
        return []

    ds = Dataset.from_dict({"text": texts}).map(
        lambda b: tokenizer(b["text"], truncation=True, max_length=MAX_LEN, padding=False),
        batched=True,
        # Drop the raw strings: the collator can only pad/tensorize token
        # fields, and the model's forward() does not accept a `text` argument.
        remove_columns=["text"],
    )
    ds.set_format(type="torch")
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=data_collator)
    preds = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            # Inputs must live on the same device as the model weights.
            batch = {k: v.to(model.device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    return [id2label[int(i)] for i in preds]


In [None]:

# === Process 5 (Optional): CSV Batch Prediction ===
import pandas as pd
from datetime import datetime

# Input CSV and the name of the column holding the text to classify.
CSV_PATH = "/content/drive/MyDrive/Proyek/Sentiment_IndoBERT/Data/tweet.csv"
TEXT_COL = "tweet"

df = pd.read_csv(CSV_PATH)
if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found in {CSV_PATH}. Available: {list(df.columns)}")

# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace and
# the model would classify the word "nan".
texts = df[TEXT_COL].fillna("").astype(str).tolist()
preds = predict_texts(texts, batch_size=128)

# Timestamped file name so repeated runs do not overwrite earlier results.
out_name = f"predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
out_path = (OUTPUT_DIR / out_name).as_posix()

# Keep every original column and append the predicted label.
df_out = df.copy()
df_out["prediction"] = preds
df_out.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved predictions to:", out_path)


In [None]:

# === Process 6: Quick Demo ===
# A few sample sentences; the call is the last expression of the cell, so the
# returned labels are shown as the cell output.
demo_sentences = [
    "Pelayanannya sangat memuaskan, terima kasih!",
    "Biasa saja sih, tidak terlalu istimewa.",
    "Sangat buruk, saya kecewa.",
]
predict_texts(demo_sentences)
