
# BERT2BERT Indonesian Summarization ‚Äî Liputan6 (Colab, T4-ready)

**Features:**
- Fixed column config (no autodetect): `id`, `title`, `content` (customizable)
- Works with Google Drive or direct upload
- IndoBART summarization via Hugging Face
- Batched inference, chunking for long articles
- Optional ROUGE evaluation (if references available)
- Saves results to Drive (`summarized_results.csv`) and sample previews


**Model:** `indobenchmark/indobart-v2`  
**Notes:** Tuned for T4, float16 on GPU, ROUGE enabled by default.

**Model:** `cahya/bert2bert-indonesian-summarization`  
**Notes:** T4-ready, float16 on GPU, ROUGE enabled by default.

In [1]:
#@title ‚öôÔ∏è Config ‚Äî paths & parameters (pre-wired to your Drive)
from pathlib import Path
import os

# === I/O PATHS ===
PROJECT_DIR_IN_DRIVE = "/content/drive/MyDrive/Proyek/Liputan6"  #@param {"type":"string"}

# Default dataset path (Drive). Change if your file is elsewhere.
DATA_PATH = f"{PROJECT_DIR_IN_DRIVE}/data/liputan6_clean_ready.csv"
OUTPUT_DIR = f"{PROJECT_DIR_IN_DRIVE}/outputs"

# and set DATA_PATH to that local path.

# === COLUMN NAMES ===
ID_COL = "id"  # optional; will be auto-created if missing        #@param {"type":"string"}
TITLE_COL = "clean_summary_text"  # optional; used only for previews if present  #@param {"type":"string"}
CONTENT_COL = "clean_article_text"  # main article text  #@param {"type":"string"}

# === DATA SUBSET (for quick runs) ===
MAX_DOCS = 200  #@param {"type":"integer"}

# === SUMMARIZATION PARAMS ===
MODEL_NAME = "cahya/bert2bert-indonesian-summarization"  #@param { "type":"string" }  # Valid example. Avoid using "-cnn" variants.

MODEL_DIR = f"{PROJECT_DIR_IN_DRIVE}/models/indobart-large"  # local cache dir for model (optional)
BATCH_SIZE = 6   #@param {"type":"integer"}
MAX_INPUT_WORDS = 900   # hard cap per chunk to avoid OOM
MAX_SUMMARY_TOKENS = 140  #@param {"type":"integer"}
MIN_SUMMARY_TOKENS = 32   #@param {"type":"integer"}
USE_GPU = True  #@param {"type":"boolean"}

# === EVALUATION ===
RUN_ROUGE = True  #@param {"type":"boolean"}

# Create output directory if needed
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print("Configured. OUTPUT_DIR =", OUTPUT_DIR)

Configured. OUTPUT_DIR = /content/drive/MyDrive/Proyek/Liputan6/outputs


In [2]:
#@title ü©∫ Quick diagnostics
from pathlib import Path
import os
print("PROJECT_DIR_IN_DRIVE:", PROJECT_DIR_IN_DRIVE)
print("DATA_PATH exists?:", Path(DATA_PATH).exists())
print("OUTPUT_DIR exists?:", Path(OUTPUT_DIR).exists())
print("MODEL_DIR exists?:", Path(MODEL_DIR).exists())
print("HF_HOME:", os.environ.get("HF_HOME"))
print("HF_DATASETS_CACHE:", os.environ.get("HF_DATASETS_CACHE"))

PROJECT_DIR_IN_DRIVE: /content/drive/MyDrive/Proyek/Liputan6
DATA_PATH exists?: False
OUTPUT_DIR exists?: True
MODEL_DIR exists?: False
HF_HOME: None
HF_DATASETS_CACHE: None


In [3]:
!fusermount -u /content/drive  # unmount if mounted
!rm -rf /content/drive         # remove the folder if it exists
from google.colab import drive
drive.mount("/content/drive", force_remount=False)


fusermount: failed to unmount /content/drive: Invalid argument
Mounted at /content/drive


In [4]:
#@title ‚ö° GPU check & settings ‚Äî Prefer NVIDIA T4 on Colab
import os, torch, subprocess

def show_smi():
    try:
        print("nvidia-smi:")
        _ = subprocess.run(["nvidia-smi"], check=False, text=True, capture_output=False)
    except Exception as e:
        print("Could not run nvidia-smi:", e)

show_smi()

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print("Detected GPU:", gpu_name)
    if "T4" not in gpu_name.upper():
        print("‚ö†Ô∏è This notebook is tuned for NVIDIA T4. Reducing BATCH_SIZE to be safe.")
        try:
            BATCH_SIZE = min(6, BATCH_SIZE if 'BATCH_SIZE' in globals() else 6)
            print("BATCH_SIZE set to:", BATCH_SIZE)
        except Exception:
            pass
    else:
        print("‚úÖ T4 detected. Using optimized defaults.")
    try:
        torch.set_float32_matmul_precision("high")
        import torch.backends.cuda as cuda_back
        if hasattr(cuda_back.matmul, "allow_tf32"):
            cuda_back.matmul.allow_tf32 = True
    except Exception:
        pass
else:
    print("‚ö†Ô∏è CUDA GPU not detected. Proceeding on CPU (will be slower).")

nvidia-smi:
Detected GPU: Tesla T4
‚úÖ T4 detected. Using optimized defaults.


In [5]:
#@title üì¶ Install dependencies (CUDA 12.1 / Colab)
import os, sys, subprocess
from pathlib import Path

# Use project caches to persist models/datasets
HF_HOME = f"{PROJECT_DIR_IN_DRIVE}/models"
HF_DATASETS_CACHE = f"{PROJECT_DIR_IN_DRIVE}/datasets_cache"
os.environ["HF_HOME"] = HF_HOME
os.environ["HF_DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["TOKENIZERS_PARALLELISM"] = "false"

Path(HF_HOME).mkdir(parents=True, exist_ok=True)
Path(HF_DATASETS_CACHE).mkdir(parents=True, exist_ok=True)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

def pip_install(args):
    print(">>", " ".join(args))
    return subprocess.run(args, check=False)

# Try CUDA 12.1 wheels; if fail, fallback to CPU wheels.
r = pip_install([sys.executable, "-m", "pip", "install", "-U",
                 "torch", "torchvision", "torchaudio",
                 "--index-url", "https://download.pytorch.org/whl/cu121"])
if r.returncode != 0:
    print("CUDA wheels failed, trying CPU-only wheels...")
    pip_install([sys.executable, "-m", "pip", "install", "-U",
                 "torch", "torchvision", "torchaudio"])

pip_install([sys.executable, "-m", "pip", "install", "-U", "transformers>=4.44.0", "accelerate", "sentencepiece", "huggingface_hub>=0.23.0",
             "datasets", "evaluate", "rouge-score"])

import transformers, torch
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
# IndoBART v2 requires `trust_remote_code=True` for IndoNLGTokenizer.


>> /usr/bin/python3 -m pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
>> /usr/bin/python3 -m pip install -U transformers>=4.44.0 accelerate sentencepiece huggingface_hub>=0.23.0 datasets evaluate rouge-score
transformers: 4.57.1
torch: 2.8.0+cu126


In [6]:
#@title üì• Load dataset (CSV)
import pandas as pd
from pathlib import Path

assert Path(DATA_PATH).exists(), f"File not found: {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print("Columns:", list(df.columns))

# --- Column handling ---
# CONTENT is required
if CONTENT_COL not in df.columns:
    raise ValueError(f"Required content column '{CONTENT_COL}' not found in CSV. Found: {list(df.columns)}")

# TITLE and ID are optional
has_title = TITLE_COL in df.columns if isinstance(TITLE_COL, str) and len(TITLE_COL)>0 else False
has_id = ID_COL in df.columns if isinstance(ID_COL, str) and len(ID_COL)>0 else False

# Keep needed columns if present
keep_cols = [c for c in [ID_COL if has_id else None, TITLE_COL if has_title else None, CONTENT_COL] if c]
df = df[keep_cols].copy()

# Fill/derive optional columns
df[CONTENT_COL] = df[CONTENT_COL].fillna("").astype(str)

if not has_id:
    df["auto_id"] = range(1, len(df)+1)
    ID_COL = "auto_id"

if not has_title:
    # Create a simple title from the first 12 words of content (for preview only)
    df["auto_title"] = df[CONTENT_COL].str.split().str[:12].str.join(" ")
    TITLE_COL = "auto_title"

# Drop too-short content
df = df[df[CONTENT_COL].str.strip().str.len() > 30].reset_index(drop=True)

# Subset for quick runs
if MAX_DOCS and MAX_DOCS > 0:
    df = df.head(MAX_DOCS).copy()

print("Loaded rows:", len(df))
df.head(3)

Columns: ['clean_article_text', 'clean_summary_text']
Loaded rows: 200


Unnamed: 0,clean_summary_text,clean_article_text,auto_id
0,"menurut presiden susilo bambang yudhoyono, kem...",jakarta: presiden susilo bambang yudhoyono men...,1
1,pada masa silam jepang terlalu ambisius untuk ...,jakarta: perdana menteri jepang junichiro koiz...,2
2,puluhan hektare areal persawahan yang sebagian...,kutai: banjir dengan ketinggian dua meter di k...,3


In [7]:

#@title üßπ Basic text cleaning & length control
import re

def clean_text(t: str) -> str:
    t = str(t)
    # Remove multiple spaces and weird whitespace
    t = re.sub(r"\s+", " ", t).strip()
    return t

def cap_words(text: str, max_words: int) -> str:
    words = text.split()
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words])

df[CONTENT_COL] = df[CONTENT_COL].map(clean_text)
print("After cleaning. Example length:", len(df.iloc[0][CONTENT_COL].split()), "words")

# Just preview lengths
lengths = df[CONTENT_COL].str.split().map(len)
print("Avg words:", int(lengths.mean()), "| 95th pct:", int(lengths.quantile(0.95)))


After cleaning. Example length: 293 words
Avg words: 209 | 95th pct: 471


In [8]:

#@title ü§ñ Load model & tokenizer for summarization (BERT2BERT; prefers local MODEL_DIR)
import torch, os, sys, traceback
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device==0 else "CPU")

DEFAULT_MODEL = "cahya/bert2bert-indonesian-summarization"
user_model = MODEL_NAME.strip()
load_path = MODEL_DIR if (os.path.exists(MODEL_DIR) and len(os.listdir(MODEL_DIR))>0) else user_model
cache_dir = os.environ.get("HF_HOME", MODEL_DIR)

def try_load(model_id_or_path, try_fp16=True):
    tok = AutoTokenizer.from_pretrained(
        model_id_or_path,
        use_fast=True,
        cache_dir=cache_dir
    )
    if try_fp16 and device==0:
        try:
            mdl = AutoModelForSeq2SeqLM.from_pretrained(
                model_id_or_path,
                torch_dtype=torch.float16,
                cache_dir=cache_dir
            )
        except Exception:
            mdl = AutoModelForSeq2SeqLM.from_pretrained(
                model_id_or_path,
                cache_dir=cache_dir
            )
    else:
        mdl = AutoModelForSeq2SeqLM.from_pretrained(
            model_id_or_path,
            cache_dir=cache_dir
        )
    return tok, mdl

try:
    print("Attempting to load:", load_path if load_path==MODEL_DIR else user_model)
    tokenizer, model = try_load(load_path if load_path==MODEL_DIR else user_model)
except Exception as e_user:
    print("‚ö†Ô∏è Could not load the requested model:", user_model)
    print("Error:", repr(e_user))
    print("Falling back to default model:", DEFAULT_MODEL)
    tokenizer, model = try_load(DEFAULT_MODEL)

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=device
)
print("Summarizer ready.")


Using device: GPU
Attempting to load: cahya/bert2bert-indonesian-summarization


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/999M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/999M [00:00<?, ?B/s]

Device set to use cuda:0


Summarizer ready.


In [30]:
#@title üß© Chunking & Batched Summarization Helpers
from typing import List, Dict
import torch # Import torch for tensor operations

def chunk_by_words(text: str, max_words: int) -> List[str]:
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i:i+max_words]))
    return chunks if chunks else [""]

def summarize_text_long(text: str) -> str:
    """Summarize long text by chunking then merging summaries."""
    chunks = chunk_by_words(text, MAX_INPUT_WORDS)
    partial_summaries = []
    for ch in chunks:
        ch = ch.strip()
        if not ch:
            continue
        # Tokenize with padding and truncation
        inputs = tokenizer(ch, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        # Move input tensor to the correct device
        input_ids = inputs["input_ids"].to(summarizer.device)
        attention_mask = inputs["attention_mask"].to(summarizer.device) if "attention_mask" in inputs else None

        # Generate summary using the model directly with prepared inputs
        # Note: Using model.generate directly gives more control over inputs
        # and might be more robust than relying solely on the pipeline's
        # internal handling for complex cases like chunking.
        try:
            output_ids = summarizer.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=MAX_SUMMARY_TOKENS,
                min_length=MIN_SUMMARY_TOKENS,
                do_sample=False,
                # Add other generation parameters as needed, e.g., num_beams=4
            )
            out = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            partial_summaries.append(out.strip())
        except Exception as e:
            print(f"Error during summarization of a chunk: {e}")
            partial_summaries.append("") # Append empty string or handle error appropriately

    if not partial_summaries:
        return ""

    # Optionally summarize the concatenated partial summaries for a tighter final result
    merged = " ".join(partial_summaries)
    if len(merged.split()) > MAX_INPUT_WORDS:
         # Tokenize merged text with padding and truncation
        merged_inputs = tokenizer(merged, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        merged_input_ids = merged_inputs["input_ids"].to(summarizer.device)
        merged_attention_mask = merged_inputs["attention_mask"].to(summarizer.device) if "attention_mask" in merged_inputs else None

        try:
             merged_output_ids = summarizer.model.generate(
                input_ids=merged_input_ids,
                attention_mask=merged_attention_mask,
                max_length=MAX_SUMMARY_TOKENS,
                min_length=MIN_SUMMARY_TOKENS,
                do_sample=False,
                # Add other generation parameters as needed
            )
             merged = tokenizer.decode(merged_output_ids[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error during summarization of merged text: {e}")
            merged = "" # Handle error

    return merged.strip()


def batched_summarize(texts: List[str], batch_size: int) -> List[str]:
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_out = []
        for t in batch:
            batch_out.append(summarize_text_long(t))
        results.extend(batch_out)
        print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)}")
    return results

In [31]:

#@title ‚ñ∂Ô∏è Run summarization
texts = df[CONTENT_COL].tolist()
summaries = batched_summarize(texts, BATCH_SIZE)

df_out = df.copy()
df_out["summary"] = summaries
print("Summarization done. Preview:")
df_out.head(3)


Processed 6/200
Processed 12/200
Processed 18/200
Processed 24/200
Processed 30/200
Processed 36/200
Processed 42/200
Processed 48/200
Processed 54/200
Processed 60/200
Processed 66/200
Processed 72/200
Processed 78/200
Processed 84/200
Processed 90/200
Processed 96/200
Processed 102/200
Processed 108/200
Processed 114/200
Processed 120/200
Processed 126/200
Processed 132/200
Processed 138/200
Processed 144/200
Processed 150/200
Processed 156/200
Processed 162/200
Processed 168/200
Processed 174/200
Processed 180/200
Processed 186/200
Processed 192/200
Processed 198/200
Processed 200/200
Summarization done. Preview:


Unnamed: 0,clean_summary_text,clean_article_text,auto_id,summary
0,"menurut presiden susilo bambang yudhoyono, kem...",jakarta: presiden susilo bambang yudhoyono men...,1,presiden susilo bambang yudhoyono menekankan b...
1,pada masa silam jepang terlalu ambisius untuk ...,jakarta: perdana menteri jepang junichiro koiz...,2,pm jepang junichiro koizumi meminta maaf atas ...
2,puluhan hektare areal persawahan yang sebagian...,kutai: banjir dengan ketinggian dua meter di k...,3,puluhan hektare areal persawahan di kabupaten ...


In [32]:
#@title üíæ Save outputs
import time
from pathlib import Path

ts = time.strftime("%Y%m%d-%H%M%S")

# Fixed filenames in outputs/
fixed_csv = Path(OUTPUT_DIR) / "summarized_results.csv"
ts_csv = Path(OUTPUT_DIR) / f"summarized_results_{ts}.csv"
df_out.to_csv(fixed_csv, index=False, encoding="utf-8")
df_out.to_csv(ts_csv, index=False, encoding="utf-8")
print("Saved:", fixed_csv)
print("Saved:", ts_csv)

# save 5 examples as text files (title + summary)
examples_dir = Path(OUTPUT_DIR) / "examples"
examples_dir.mkdir(parents=True, exist_ok=True)
for i, row in df_out.head(5).iterrows():
    fname = examples_dir / f"sample_article_{i+1}_summary.txt"
    with open(fname, "w", encoding="utf-8") as f:
        title = str(row.get(TITLE_COL, ""))
        content = str(row.get(CONTENT_COL, ""))[:500]
        summary = str(row.get("summary",""))
        f.write(f"TITLE: {title}\n\nSUMMARY:\n{summary}\n\nCONTENT (first 500 chars):\n{content}")
print("Saved examples to:", examples_dir)

Saved: /content/drive/MyDrive/Proyek/Liputan6/outputs/summarized_results.csv
Saved: /content/drive/MyDrive/Proyek/Liputan6/outputs/summarized_results_20251101-140453.csv
Saved examples to: /content/drive/MyDrive/Proyek/Liputan6/outputs/examples


In [33]:
#@title üìä Evaluate with ROUGE if reference column exists (auto, best-effort)
REFERENCE_COL = "clean_summary_text"  # default reference for Liputan6
do_eval = RUN_ROUGE

if do_eval:
    try:
        import evaluate, json, time, pandas as pd
        from pathlib import Path
        if REFERENCE_COL in df_out.columns:
            rouge = evaluate.load("rouge")
            references = df_out[REFERENCE_COL].fillna("").astype(str).tolist()
            predictions = df_out["summary"].fillna("").astype(str).tolist()
            scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
            print("ROUGE:", scores)
            ts = __import__("time").strftime("%Y%m%d-%H%M%S")
            json_path = Path(OUTPUT_DIR) / f"rouge_scores_{ts}.json"
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(scores, f, ensure_ascii=False, indent=2)
            print("Saved ROUGE scores JSON:", json_path)
            pd.DataFrame([scores]).to_csv(Path(OUTPUT_DIR) / "rouge_scores.csv", index=False)
            print("Saved ROUGE scores CSV.")
        else:
            print(f"Reference column '{REFERENCE_COL}' not found in output. Skipping ROUGE.")
    except Exception as eval_err:
        print("‚ö†Ô∏è ROUGE evaluation failed (skipping). Error:", eval_err)
else:
    print("ROUGE disabled. Set RUN_ROUGE=True to enable.")

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE: {'rouge1': np.float64(0.559009325418721), 'rouge2': np.float64(0.4060605064334391), 'rougeL': np.float64(0.4901725090907041), 'rougeLsum': np.float64(0.4904836101793354)}
Saved ROUGE scores JSON: /content/drive/MyDrive/Proyek/Liputan6/outputs/rouge_scores_20251101-140504.json
Saved ROUGE scores CSV.


In [34]:

#@title üßæ Environment & Version Info (for reproducibility)
import sys, platform, transformers
import pandas as pd

print("Python:", sys.version)
print("Platform:", platform.platform())
print("transformers:", transformers.__version__)

print("Params snapshot:")
print({
    "MODEL_NAME": MODEL_NAME,
    "BATCH_SIZE": BATCH_SIZE,
    "MAX_INPUT_WORDS": MAX_INPUT_WORDS,
    "MAX_SUMMARY_TOKENS": MAX_SUMMARY_TOKENS,
    "MIN_SUMMARY_TOKENS": MIN_SUMMARY_TOKENS,
    "MAX_DOCS": MAX_DOCS,
    "DATA_PATH": DATA_PATH,
    "OUTPUT_DIR": OUTPUT_DIR,
})


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
transformers: 4.57.1
Params snapshot:
{'MODEL_NAME': 'cahya/bert2bert-indonesian-summarization', 'BATCH_SIZE': 6, 'MAX_INPUT_WORDS': 900, 'MAX_SUMMARY_TOKENS': 140, 'MIN_SUMMARY_TOKENS': 32, 'MAX_DOCS': 200, 'DATA_PATH': '/content/drive/MyDrive/Proyek/Liputan6/data/liputan6_clean_ready.csv', 'OUTPUT_DIR': '/content/drive/MyDrive/Proyek/Liputan6/outputs'}
