### Cell 1: Instalasi & Imports

In [None]:
# Install library yang dibutuhkan
%pip install pandas requests beautifulsoup4 pdfminer.six lxml tqdm > /dev/null 2>&1

In [None]:
import os
import time
import logging
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Directories (the notebook lives in notebooks/, so every path goes up one level)
HTML_DIR = os.path.join("..", "data", "raw", "html")  # downloaded HTML pages
TEXT_DIR = os.path.join("..", "data", "raw", "text") # raw text extracted from the HTML (raw_*.txt)
LOG_DIR  = os.path.join("..", "logs")
# Create all three directories up front; exist_ok makes re-running the cell safe.
for d in (HTML_DIR, TEXT_DIR, LOG_DIR):
    os.makedirs(d, exist_ok=True)

# Logging setup: INFO and above is written to logs/cleaning.log
logging.basicConfig(
    filename=os.path.join(LOG_DIR, "cleaning.log"),
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s"
)

# Scraping configuration
# Search-results URL; pagination is added later as "&page=N".
# NOTE(review): presumably court=098111PN340 selects PN Surabaya and t_put=2022
# the decision year -- confirm against the site.
SEARCH_URL = (
    "https://putusan3.mahkamahagung.go.id/search.html"
    "?q=&jenis_doc=putusan&cat=8dff1a19444a2f2d63becf72c08c2fdd"
    "&court=098111PN340&t_put=2022"
)
# Prefix for hrefs that do not start with "http".
BASE_URL = "https://putusan3.mahkamahagung.go.id"
# Sent with every request.
HEADERS  = {"User-Agent": "Mozilla/5.0"}

### Cell 2: Seleksi & Unduh HTML

In [None]:
def get_case_links(search_url: str, max_pages: int = 10, target: int = None,
                   timeout: float = 30.0) -> list:
    """
    Collect decision-detail links from the paginated search results.

    Pages 1..max_pages of `search_url` are requested in order (the page number
    is appended as `&page=N`). Every anchor whose href contains
    '/direktori/putusan/' is kept once, in first-seen order.

    Args:
        search_url: Search URL that already carries a query string.
        max_pages: Highest page number to request.
        target: Stop as soon as this many links are collected; None or 0 means
            no limit.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        List of absolute URLs without duplicates.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    links = []
    seen = set()  # O(1) duplicate check; `links` keeps the order
    for page in range(1, max_pages + 1):
        url = f"{search_url}&page={page}"
        # Without a timeout a stalled connection would block the notebook forever.
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        print(f"[Page {page}] status={resp.status_code}")
        # Only parse successful responses; an HTTP error page is not a result list.
        if resp.ok:
            soup = BeautifulSoup(resp.text, "lxml")
            for a in soup.select("a[href*='/direktori/putusan/']"):
                href = a['href']
                full = href if href.startswith("http") else BASE_URL + href
                if full in seen:
                    continue
                seen.add(full)
                links.append(full)
                if target and len(links) >= target:
                    return links
        # Pause between pages to stay polite to the server.
        time.sleep(1)
    return links

# Collect up to 50 decision links and save each page's HTML as case_NNN.html.
links = get_case_links(SEARCH_URL, max_pages=10, target=50)
print(f"Found {len(links)} links")
failed_downloads = []
for idx, url in enumerate(tqdm(links, desc="Downloading HTML"), 1):
    try:
        # Bounded wait so one stalled connection cannot hang the whole loop.
        resp = requests.get(url, headers=HEADERS, timeout=60)
        # Turn HTTP errors into exceptions so an error page is never saved
        # as if it were a decision.
        resp.raise_for_status()
    except requests.RequestException as exc:
        logging.warning(f"Gagal mengunduh {url}: {exc}")
        failed_downloads.append(url)
    else:
        with open(os.path.join(HTML_DIR, f"case_{idx:03d}.html"), "w", encoding="utf-8") as f:
            f.write(resp.text)
    # Pause after every request, successful or not.
    time.sleep(0.5)

if failed_downloads:
    print(f"{len(failed_downloads)} unduhan gagal (cek cleaning.log)")

[Page 1] status=200
[Page 2] status=200
[Page 3] status=200
Found 50 links


Downloading HTML: 100%|██████████| 50/50 [06:32<00:00,  7.85s/it]


### Cell 3: Ekstraksi Teks Mentah dari HTML

In [5]:
from bs4 import BeautifulSoup

def extract_raw_text(html_path: str) -> str:
    """
    Return the text of a saved HTML page, before any cleaning.

    The file is opened as UTF-8 and parsed with the lxml parser; the text of
    the whole document is returned with "\\n" as the separator between pieces.
    """
    with open(html_path, mode="r", encoding="utf-8") as html_file:
        document = BeautifulSoup(html_file, "lxml")
    raw_text = document.get_text(separator="\n")
    return raw_text

# Save the raw text of every downloaded page as raw_NNN.txt.
# Only case_*.html files are read, so stray entries in HTML_DIR (hidden files,
# checkpoint folders) cannot break the loop.
html_files = sorted(
    fname for fname in os.listdir(HTML_DIR)
    if fname.startswith("case_") and fname.endswith(".html")
)
for idx, fname in enumerate(html_files, 1):
    raw = extract_raw_text(os.path.join(HTML_DIR, fname))
    with open(os.path.join(TEXT_DIR, f"raw_{idx:03d}.txt"), "w", encoding="utf-8") as f:
        f.write(raw)

### Cell 4: Pembersihan Teks

In [18]:
# Make sure the output directory for cleaned text exists.
CLEAN_TEXT_DIR = os.path.join("..", "data", "raw", "clean_text")
os.makedirs(CLEAN_TEXT_DIR, exist_ok=True)

In [29]:
import os
from bs4 import BeautifulSoup
import re, html

# Logging setup (same arguments as the call in Cell 1).
# logging.basicConfig does nothing when the root logger already has handlers,
# so this call has no effect once the Cell 1 setup has run.
logging.basicConfig(
    filename=os.path.join(LOG_DIR, "cleaning.log"),
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s"
)

def clean_text(raw_html: str, start_marker: str = "putusan pn surabaya",
               end_marker: str = "statistik", min_line_len: int = 5) -> "tuple[str, int]":
    """
    Clean one scraped decision page and cut it down to the decision body.

    Steps: drop noise tags, take the text of `div#tabs-1` (or of the whole
    document when that element is absent), rewrite "nopember" as "november",
    keep only the span between the two markers, strip leading "N. " numbering,
    discard short or boilerplate lines, normalise whitespace and lowercase.

    Args:
        raw_html: Page content to clean.
        start_marker: Text at which the kept span starts (first occurrence,
            matched case-insensitively).
        end_marker: Text before which the kept span ends (last occurrence,
            matched case-insensitively).
        min_line_len: Lines shorter than this, after stripping, are dropped.

    Returns:
        (cleaned_text, bullets_removed): the lowercased cleaned text and the
        number of lines from which a leading "N. " prefix was removed.
    """
    # 1) Remove noise tags
    soup = BeautifulSoup(raw_html, "lxml")
    for tag in soup(["script","style","noscript","header","footer","nav","iframe","table"]):
        tag.decompose()

    # 2) Take the main content, falling back to the whole document
    main = soup.select_one("div#tabs-1")
    text = main.get_text("\n") if main else soup.get_text("\n")
    text = html.unescape(text)

    # 3) Normalise the month spelling
    text = re.sub(r"\bnopember\b", "november", text, flags=re.I)

    # 4) Slice from the start marker up to (not including) the end marker.
    #    If either marker is missing, or they are out of order, the text is
    #    left unsliced. Offsets are found on the lowercased copy; they line up
    #    with `text` as long as lowercasing does not change its length.
    low = text.lower()
    s = low.find(start_marker.lower())
    e = low.rfind(end_marker.lower())
    if s != -1 and e != -1 and e > s:
        text = text[s:e]

    # 5) Strip leading list numbers and filter noise lines
    lines = []
    bullets_removed = 0
    for ln in text.splitlines():
        original = ln.strip()
        # remove "1. ", "2. ", etc. at the start of the line
        cleaned_ln = re.sub(r"^\d+\.\s*", "", original)
        if cleaned_ln != original:
            bullets_removed += 1
        ln = cleaned_ln
        # drop lines that are too short or contain site boilerplate
        if len(ln) < min_line_len:
            continue
        if re.search(r"http|download pdf|mahkamah agung", ln, re.I):
            continue
        lines.append(ln)

    # 6) Normalise whitespace
    joined  = "\n".join(lines)
    cleaned = re.sub(r"\n+", "\n", joined)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)

    # 7) Ensure a single space between ")" and a following lowercase letter.
    #    NOTE(review): this runs before lowercasing, so ")" followed by an
    #    uppercase letter is left as is, and \s* also swallows a line break
    #    after ")" -- confirm both are intended.
    cleaned = re.sub(r"\)\s*([a-z])", r") \1", cleaned)

    return cleaned.strip().lower(), bullets_removed

# ─── Run cleaning ──────────────────────────────────────────────────────────
# Every raw_*.txt that mentions "penipuan" (fraud) is cleaned and written to
# CLEAN_TEXT_DIR. Output files are numbered by out_idx, which counts kept files
# only, so clean_NNN does not necessarily match raw_NNN.
raw_files = sorted(f for f in os.listdir(TEXT_DIR) if f.startswith("raw_"))
out_idx = 0

for raw_fname in raw_files:
    raw_path = os.path.join(TEXT_DIR, raw_fname)
    # Context manager closes the handle right away instead of leaving it open.
    with open(raw_path, encoding="utf-8") as raw_file:
        raw_html = raw_file.read()

    # Skip anything that is not a fraud case
    if not re.search(r"\bpenipuan\b", raw_html, re.I):
        logging.info(f"Skip {raw_fname}: bukan kasus penipuan")
        continue

    out_idx += 1
    raw_len = len(raw_html)
    cleaned, bullets_removed = clean_text(raw_html)
    clean_len = len(cleaned)

    out_fname = f"clean_{out_idx:03d}.txt"
    out_path  = os.path.join(CLEAN_TEXT_DIR, out_fname)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    logging.info(
        f"Berkas {out_fname} tersimpan (raw: {raw_len} → clean: {clean_len}); "
        f"bullet dihapus: {bullets_removed}"
    )

print(f"Selesai membersihkan {out_idx} berkas penipuan. Cek logs/cleaning.log untuk detail.")

Selesai membersihkan 47 berkas penipuan. Cek logs/cleaning.log untuk detail.


### Cell 5: Validasi & Logging

In [24]:
def validate_text(text: str, min_len: int = 100) -> bool:
    """
    Check that `text` has at least `min_len` characters.

    A warning with the actual length is logged when the text is too short.

    Returns:
        True when the text is long enough, False otherwise.
    """
    length = len(text)
    long_enough = length >= min_len
    if not long_enough:
        logging.warning(f"Validasi gagal (panjang: {length})")
    return long_enough

# Validate every cleaned file and collect the names of those that fail.
failed = []
for fname in sorted(os.listdir(CLEAN_TEXT_DIR)):
    if not fname.startswith("clean_"):
        continue
    # Context manager closes each file as soon as it has been read.
    with open(os.path.join(CLEAN_TEXT_DIR, fname), encoding="utf-8") as clean_file:
        txt = clean_file.read()
    if not validate_text(txt):
        # validate_text logs only the length; also log the file name so the
        # entry in cleaning.log can be traced back to a file.
        logging.warning(f"Berkas gagal validasi: {fname}")
        failed.append(fname)

print(f"Validasi selesai: {len(failed)} \nberkas gagal (cek cleaning.log).")

Validasi selesai: 0 
berkas gagal (cek cleaning.log).
