In [1]:
# Install the third-party packages used below: pdfminer.six (PDF text extraction),
# requests (HTTP), beautifulsoup4 (HTML parsing) and lxml (the parser passed to BeautifulSoup).
# NOTE(review): versions are not pinned — consider pinning for reproducible runs.
%pip install pdfminer.six requests beautifulsoup4 lxml

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Part 1: Import Libraries
import os
import re
import time
import urllib.request
from datetime import date
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait
from pdfminer.high_level import extract_text
from io import BytesIO
import logging
import shutil
import threading

# Configuration: Define base directory and paths
BASE_DIR = os.getcwd()  # Use current working directory
PATH_PDF = os.path.join(BASE_DIR, 'PDF', 'Kepailitan')  # Where PDFs are stored
PATH_OUTPUT = os.path.join(BASE_DIR, 'data', 'raw')    # Where processed text files are saved
LOG_DIR = os.path.join(BASE_DIR, 'logs')
LOG_PATH = os.path.join(LOG_DIR, 'cleaning.log')

# Number of PDFs downloaded so far in this kernel session.
# It is shared module state: read and update it only while holding file_lock.
processed_files = 0
file_lock = threading.Lock()

# Ensure directories exist
for path in [PATH_PDF, PATH_OUTPUT, LOG_DIR]:
    os.makedirs(path, exist_ok=True)

# Ensure log path is a file.
# The warning is only remembered here and emitted after basicConfig() below:
# calling logging.warning() on an unconfigured root logger installs a default
# handler, after which basicConfig() does nothing and the file handler would
# never be attached.
log_path_was_directory = os.path.isdir(LOG_PATH)
if log_path_was_directory:
    print(f"Error: {LOG_PATH} is a directory. Removing and recreating as file.")
    shutil.rmtree(LOG_PATH, ignore_errors=True)

# Start each run of this cell with an empty log file.
with open(LOG_PATH, 'w'):
    pass

# Initialize logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_PATH),
        logging.StreamHandler()  # Also log to console
    ]
)
if log_path_was_directory:
    logging.warning(f"Error: {LOG_PATH} is a directory. Removing and recreating as file.")
logging.info("Logging initialized at %s", LOG_PATH)

2025-06-21 14:21:13,827 - Logging initialized at e:\Praktikum sem 6\COMPUTER_REASONING_CBR\CBR\logs\cleaning.log


In [3]:
# Part 2: Utility Functions for Scraping
def open_page(link, retries=3, timeout=30, retry_delay=5):
    """Fetch a page and parse it with BeautifulSoup, retrying on failure.

    Args:
        link: URL of the page to fetch.
        retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt; without a
            timeout a stalled connection would block the scraper forever.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        A BeautifulSoup tree built with the "lxml" parser, or None when every
        attempt failed (HTTP error status, network error, parse error).
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.text, "lxml")
        except Exception as e:
            # Best effort: any failure counts as a failed attempt.
            logging.warning(f"Attempt {attempt} failed for {link}: {e}")
            # No point in sleeping after the last attempt; we are giving up.
            if attempt < retries:
                time.sleep(retry_delay)
    logging.error(f"Could not open {link} after {retries} attempts")
    return None

def get_pdf_url(soup):
    """Return the absolute URL of the first anchor whose href contains "/pdf/".

    Relative hrefs are prefixed with the putusan3.mahkamahagung.go.id origin.
    Any failure (no matching anchor, missing href, unusable ``soup``) is
    logged as an error and reported as None.
    """
    try:
        anchor = soup.find("a", href=re.compile(r"/pdf/"))
        href = anchor["href"]
        if href.startswith("http"):
            return href
        return f"https://putusan3.mahkamahagung.go.id{href}"
    except Exception as err:
        logging.error(f"Failed to get PDF URL: {err}")
        return None

def is_url_already_processed(url, path_pdf):
    """Report whether a PDF whose name contains the URL's last path segment exists.

    Args:
        url: URL to check; only its last non-empty "/"-separated segment is used.
        path_pdf: Directory that holds the downloaded ``.pdf`` files.

    Returns:
        True if some ``.pdf`` file name in ``path_pdf`` contains that segment,
        otherwise False. Also False when the URL has no usable segment or the
        directory does not exist.
    """
    # Strip trailing slashes first: an empty segment is a substring of every
    # file name and would flag every URL as already processed.
    token = url.rstrip('/').split('/')[-1]
    if not token:
        return False
    try:
        names = os.listdir(path_pdf)
    except FileNotFoundError:
        # Nothing has been downloaded yet.
        return False
    # NOTE(review): this only detects duplicates when the saved file name embeds
    # the URL segment — confirm that matches how the PDFs are named on disk.
    return any(name.endswith('.pdf') and token in name for name in names)

def download_pdf(url, path_pdf, timeout=60):
    """Download ``url`` into ``path_pdf`` and return the saved file name.

    The server-provided file name (or, failing that, the last URL segment) is
    shortened to its ``putusan_<digits>_<token>_<year>`` prefix plus today's
    date. When it does not match that pattern, a timestamp-based name is used.

    Args:
        url: Location of the PDF.
        path_pdf: Existing directory to save into.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The name (not the full path) of the file written.

    Raises:
        Exception: any download or write error is logged and re-raised so the
            caller can stop further processing.
    """
    try:
        # The context manager closes the connection even when reading fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            file_name = response.info().get_filename() or url.split('/')[-1]
            # Read everything before creating the output file so a failed
            # transfer does not leave an empty PDF behind.
            content = response.read()

        today = date.today().strftime('%Y-%m-%d')
        # Simplify file name to avoid length issues
        base_name = re.match(r'putusan_\d+_[^_]+_\d{4}', file_name)
        if base_name:
            file_name = f"{base_name.group(0)}_{today}.pdf"
        else:
            stamp = int(time.time())
            file_name = f"putusan_{stamp}_{today}.pdf"
            # The timestamp has one-second resolution: two downloads within the
            # same second would otherwise overwrite each other.
            attempt = 1
            while os.path.exists(os.path.join(path_pdf, file_name)):
                file_name = f"putusan_{stamp}-{attempt}_{today}.pdf"
                attempt += 1

        save_path = os.path.join(path_pdf, file_name)
        with open(save_path, "wb") as out_file:
            out_file.write(content)
        logging.info(f"Successfully downloaded: {file_name}")
        return file_name
    except Exception as e:
        logging.error(f"Failed to download {url}: {e}")
        raise  # Raise exception to stop further processing on failure

In [4]:
# Part 3: Data Extraction Function
def extract_data(link, keyword_url, max_files):
    """Download the PDF linked from one decision page and update the counter.

    Args:
        link: URL of the decision detail page.
        keyword_url: Accepted for interface compatibility; not used here.
        max_files: Upper bound on the shared ``processed_files`` counter.

    Returns:
        False when the caller should stop (limit already reached, limit reached
        by this download, or the download failed); True when it may continue
        (duplicate, unreachable page, page without PDF, or successful download
        below the limit).
    """
    global processed_files

    with file_lock:
        limit_reached = processed_files >= max_files
    if limit_reached:
        return False

    if is_url_already_processed(link, PATH_PDF):
        logging.info(f"Skipping duplicate URL: {link}")
        return True

    page = open_page(link)
    if not page:
        return True

    pdf_url = get_pdf_url(page)
    if not pdf_url:
        logging.info(f"No PDF found for {link}")
        return True

    try:
        saved_name = download_pdf(pdf_url, PATH_PDF)
        with file_lock:
            processed_files += 1
            logging.info(f"Processed file {processed_files}/{max_files}: {saved_name}")
            # Keep going only while the quota has not been used up.
            may_continue = not (processed_files >= max_files)
    except Exception as err:
        logging.error(f"Download failed, stopping: {err}")
        return False
    return may_continue

In [5]:
# Part 4: Page Processing Function
def run_process(keyword_url, page, sort_page, max_files):
    """Process one search-results page: visit each decision link and download its PDF.

    Args:
        keyword_url: Either a full search URL (starting with "https") or a bare
            query that is inserted into the site's search URL.
        page: Page number appended as the ``page`` query parameter.
        sort_page: When truthy, append the ``obf``/``obm`` sort parameters.
        max_files: Upper bound on the shared ``processed_files`` counter.

    Returns:
        False when processing should stop (limit reached, results page could
        not be opened, or ``extract_data`` asked to stop); True otherwise.
    """
    global processed_files

    with file_lock:
        limit_reached = processed_files >= max_files
    if limit_reached:
        return False

    search_url = (
        f"{keyword_url}&page={page}"
        if keyword_url.startswith("https")
        else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    )
    if sort_page:
        search_url = f"{search_url}&obf=TANGGAL_PUTUS&obm=desc"

    results_page = open_page(search_url)
    if not results_page:
        return False

    for anchor in results_page.find_all("a", {"href": re.compile("/direktori/putusan")}):
        # Re-check the quota before every decision page.
        with file_lock:
            limit_reached = processed_files >= max_files
        if limit_reached:
            return False

        detail_url = anchor["href"]
        if not detail_url.startswith("http"):
            detail_url = f"https://putusan3.mahkamahagung.go.id{detail_url}"

        if not extract_data(detail_url, keyword_url, max_files):
            return False
    return True

In [6]:
# Part 5: Main Scraper Function
def run_scraper(url=None, max_files=50):
    """Crawl every results page of a search URL and download up to ``max_files`` PDFs.

    Args:
        url: Search-results URL; must start with "https://".
        max_files: Upper bound on the shared ``processed_files`` counter. The
            counter is not reset here, so files downloaded earlier in the same
            session count towards the limit.

    Returns:
        None. Progress and the final count are reported through logging.
    """
    global processed_files
    if not url or not url.startswith("https://"):
        logging.error("Please provide a valid URL")
        return

    soup = open_page(url)
    if not soup:
        return

    try:
        pager_links = soup.find_all("a", {"class": "page-link"})
        last_page = int(pager_links[-1].get("data-ci-pagination-page"))
    except (IndexError, TypeError, ValueError, AttributeError):
        # No pager, or its last link carries no usable page number: treat the
        # result set as a single page. Catching only these errors keeps
        # KeyboardInterrupt/SystemExit from being swallowed.
        last_page = 1
    # NOTE(review): the factor 20 presumably is the number of results per page — confirm.
    logging.info(f"Scraping with url: {url} - {20 * last_page} potential data - {last_page} pages")

    with file_lock:
        if processed_files >= max_files:
            logging.info(f"Already have {processed_files} files, max limit reached")
            return

    with ThreadPoolExecutor(max_workers=1) as executor:  # Reduced to 1 worker to avoid race conditions
        page_by_future = {}
        for page in range(1, last_page + 1):
            with file_lock:
                if processed_files >= max_files:
                    break
            future = executor.submit(run_process, url, page, True, max_files)
            page_by_future[future] = page
        wait(list(page_by_future))

        # Exceptions raised inside a worker stay hidden in the future unless
        # they are retrieved; log them instead of dropping them.
        for future, page in page_by_future.items():
            error = future.exception()
            if error is not None:
                logging.error(f"Page {page} failed: {error}")

    with file_lock:
        final_count = processed_files
    logging.info(f"Scraping complete. Downloaded {final_count} PDFs.")

In [7]:
# Part 6: Utility Functions for PDF Processing
def extract_pdf_text(pdf_path):
    """Return the text extracted from the PDF at ``pdf_path``.

    The file is read fully into memory and handed to ``extract_text``. Any
    failure (unreadable file, extraction error) is logged and results in an
    empty string.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            raw_bytes = handle.read()
            content = extract_text(BytesIO(raw_bytes))
        logging.info(f"Extracted text from {os.path.basename(pdf_path)}")
        return content
    except Exception as err:
        logging.error(f"Failed to extract text from {pdf_path}: {err}")
        return ""

def clean_text(text, min_integrity=80):
    """Strip court-site boilerplate and normalise text to lowercase ``[a-z0-9 ]``.

    Steps: remove the fixed watermark/disclaimer fragments and "Halaman <n>"
    page markers, lowercase, replace every character outside ``a-z0-9`` with a
    space, then collapse whitespace.

    Args:
        text: Raw text extracted from a PDF (may be empty or None).
        min_integrity: Minimum percentage of non-whitespace characters that
            must survive the character filter for the text to be kept.

    Returns:
        The cleaned text, or "" when there is no input or the integrity is
        below ``min_integrity``.
    """
    if not text:
        logging.info("No text to clean")
        return ""

    # Order matters: longer phrases must be removed before their substrings.
    boilerplate = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
        "Direktori Putusan Mahkamah Agung Republik Indonesia",
        "putusan.mahkamahagung.go.id",
        "Pid.I.A.3",
        "Mahkamah Agung Republik Indonesia",
    )
    for fragment in boilerplate:
        text = text.replace(fragment, "")
    text = re.sub(r'Halaman \d+', '', text)
    text = text.lower()

    # Baseline for the integrity check, taken BEFORE the character filter.
    # Measuring it on the already-filtered text would always yield 100%.
    original_length = len(re.sub(r'\s+', '', text))

    text = re.sub(r'[^a-z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    cleaned_length = len(text.replace(" ", ""))
    integrity = (cleaned_length / original_length) * 100 if original_length > 0 else 0
    if integrity < min_integrity:
        logging.info(f"Text integrity below {min_integrity}% for document: {integrity}%")
        return ""
    logging.info(f"Cleaned text successfully, integrity: {integrity}%")
    return text

def save_text_file(text, index, path):
    """Write ``text`` to ``case_<index, zero-padded to 3 digits>.txt`` inside ``path``.

    Returns the file name written, or None (writing nothing) when ``text`` is
    empty. An existing file with the same name is overwritten.
    """
    if text:
        file_name = f"case_{index:03d}.txt"
        with open(os.path.join(path, file_name), 'w', encoding='utf-8') as handle:
            handle.write(text)
        logging.info(f"Saved/overwritten cleaned text: {file_name}")
        return file_name
    return None

def get_next_index(pdf_files):
    """Return the 1-based index that follows the last item of ``pdf_files``."""
    existing_count = len(pdf_files)
    return existing_count + 1

In [8]:
# Part 7: Main Processing Function
def process_pdfs(max_files=50):
    """Convert the downloaded PDFs into cleaned ``case_NNN.txt`` files.

    Every regular file in PATH_OUTPUT is deleted first. The PDFs in PATH_PDF
    are then processed in sorted name order until ``max_files`` text files have
    been written.

    Args:
        max_files: Maximum number of text files to generate.

    Returns:
        None. The outcome is reported through logging and ``print``.
    """
    logging.info("Starting PDF processing with max_files=%d", max_files)

    # Clear existing files in output directory
    for file in os.listdir(PATH_OUTPUT):
        file_path = os.path.join(PATH_OUTPUT, file)
        if os.path.isfile(file_path):
            os.remove(file_path)
    logging.info("Cleared all existing files in %s", PATH_OUTPUT)

    # os.listdir() returns names in arbitrary order; sorting keeps the
    # PDF -> case number mapping identical from run to run.
    pdf_files = sorted(f for f in os.listdir(PATH_PDF) if f.endswith('.pdf'))
    logging.info(f"Found {len(pdf_files)} PDF files to process")

    if not pdf_files:
        logging.info("No PDF files found in %s", PATH_PDF)
        print("No PDF files found.")
        return

    processed_files_text = 0
    # The case number is the PDF's position in the list, so a PDF that yields
    # no usable text leaves a gap in the numbering.
    for index, pdf_file in enumerate(pdf_files, start=1):
        if processed_files_text >= max_files:
            logging.info("Max file limit reached, stopping processing")
            break

        pdf_path = os.path.join(PATH_PDF, pdf_file)
        logging.info(f"Processing PDF: {pdf_file}")
        text = extract_pdf_text(pdf_path)
        cleaned_text = clean_text(text)
        if cleaned_text:
            saved_file = save_text_file(cleaned_text, index, PATH_OUTPUT)
            if saved_file:
                processed_files_text += 1
        else:
            logging.info(f"No valid text after cleaning for {pdf_file}")

    final_count = len([f for f in os.listdir(PATH_OUTPUT) if f.startswith('case_') and f.endswith('.txt')])
    logging.info(f"Processing complete. Generated {final_count} text files.")
    print(f"Processing complete. Generated {final_count} text files.")

In [9]:
# Part 8: Run the Scraper and Processor
# One cap shared by both stages: PDFs to download and text files to generate.
MAX_FILES = 50

# Search-results URL to crawl; its query string carries the filters
# (jenis_doc, cat, court) applied on the court-decision site.
scraper_url = "https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=35ba6ef3a483c9085cd94932ea0ada61&jd=&court=8bb6198cd9528aaac4199a1d5627bbb9&t_put=&t_reg=&t_upl=&t_pr="

run_scraper(url=scraper_url, max_files=MAX_FILES)
process_pdfs(max_files=MAX_FILES)

2025-06-21 14:21:22,695 - Scraping with url: https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=35ba6ef3a483c9085cd94932ea0ada61&jd=&court=8bb6198cd9528aaac4199a1d5627bbb9&t_put=&t_reg=&t_upl=&t_pr= - 2320 potential data - 116 pages
2025-06-21 14:21:37,716 - Successfully downloaded: putusan_1750490497_2025-06-21.pdf
2025-06-21 14:21:37,719 - Processed file 1/50: putusan_1750490497_2025-06-21.pdf
2025-06-21 14:21:51,080 - Successfully downloaded: putusan_1750490510_2025-06-21.pdf
2025-06-21 14:21:51,081 - Processed file 2/50: putusan_1750490510_2025-06-21.pdf
2025-06-21 14:22:04,797 - Successfully downloaded: putusan_1750490524_2025-06-21.pdf
2025-06-21 14:22:04,798 - Processed file 3/50: putusan_1750490524_2025-06-21.pdf
2025-06-21 14:22:17,202 - Successfully downloaded: putusan_1750490537_2025-06-21.pdf
2025-06-21 14:22:17,204 - Processed file 4/50: putusan_1750490537_2025-06-21.pdf
2025-06-21 14:22:24,865 - Successfully downloaded: putusan_1750490544_2025-06-21

Processing complete. Generated 50 text files.
