In [10]:
%pip install pdfminer.six requests beautifulsoup4 lxml

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Part 1: Import Libraries & Directory Initialization

In [None]:
import os
import re
import time
import urllib.request
from datetime import date
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait
from pdfminer.high_level import extract_text
from io import BytesIO
import logging
import shutil
import threading

# Configuration: all locations are derived from BASE_DIR, the parent of the
# current working directory (assumes the kernel is started inside the
# project's 'notebooks' folder - TODO confirm for other launch locations).
BASE_DIR = os.path.dirname(os.getcwd())  # Parent directory of the current working directory
PATH_PDF = os.path.join(BASE_DIR, 'PDF', 'Korupsi')  # Where downloaded PDFs are stored
PATH_OUTPUT = os.path.join(BASE_DIR, 'data', 'raw')    # Where cleaned text files are saved
LOG_DIR = os.path.join(BASE_DIR, 'logs')  # Directory holding the log file
LOG_PATH = os.path.join(LOG_DIR, 'cleaning.log')  # Full path of the log file

# Module-level count of PDFs downloaded successfully in this kernel session.
# It is read and incremented only while holding `file_lock`, and it is never
# reset by the code in this notebook.
processed_files = 0
file_lock = threading.Lock()

# Upper bound (in characters) applied to every path this notebook creates;
# 260 is the classic Windows path-length limit.
MAX_PATH_LENGTH = 260


def validate_path(path):
    """Return ``path`` unchanged when it fits within ``MAX_PATH_LENGTH``.

    Args:
        path: Path string whose character count is checked.

    Returns:
        The same ``path`` object that was passed in.

    Raises:
        ValueError: If ``path`` is longer than ``MAX_PATH_LENGTH`` characters.
    """
    if len(path) <= MAX_PATH_LENGTH:
        return path
    raise ValueError(f"Path {path} exceeds Windows maximum length of {MAX_PATH_LENGTH} characters")

# Configure logging BEFORE anything is logged. If a module-level logging call
# runs while the root logger has no handlers, logging installs a default
# stderr handler at WARNING level, so INFO messages emitted at that point never
# reach the log file.
validate_path(LOG_PATH)
log_dir = LOG_DIR  # kept so later cells that reference `log_dir` keep working
os.makedirs(log_dir, exist_ok=True)

# mode='w' starts a fresh log file on every run; force=True removes handlers
# left over from a previous execution of this cell in the same kernel.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_PATH, mode='w', encoding='utf-8'),
    ],
    force=True
)
logging.info("Logging initialized at %s", LOG_PATH)

# Ensure the working directories exist and validate their paths. Failures are
# logged and re-raised because nothing below can work without these folders.
for path in [PATH_PDF, PATH_OUTPUT, LOG_DIR]:
    try:
        validate_path(path)
        os.makedirs(path, exist_ok=True)
        logging.info(f"Directory ensured: {path}")
    except ValueError as e:
        logging.error(f"Path validation failed: {e}")
        raise
    except Exception as e:
        logging.error(f"Failed to create directory {path}: {e}")
        raise

# Part 2: Utility Functions for Scraping

In [None]:
def open_page(link, retries=3, delay=5, timeout=30):
    """Fetch ``link`` and parse the response body with BeautifulSoup (lxml).

    Args:
        link: URL to request.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep between two consecutive attempts.
        timeout: Timeout in seconds passed to ``requests.get`` so that a
            stalled server cannot block the scraper indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every attempt
        failed (HTTP error status, network error, or parsing error).
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.text, "lxml")
        except Exception as e:
            # Best-effort by design: any failure counts as a failed attempt.
            logging.warning(f"Attempt {attempt} failed for {link}: {e}")
            # Sleeping after the last attempt would only delay the failure.
            if attempt < retries:
                time.sleep(delay)
    logging.error(f"Could not open {link} after {retries} attempts")
    return None

def get_pdf_url(soup):
    """Return the absolute URL of the first link whose href contains ``/pdf/``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The PDF URL as a string, or ``None`` when the page has no such link
        or the lookup fails for any other reason (the failure is logged).
    """
    from urllib.parse import urljoin

    base_url = "https://putusan3.mahkamahagung.go.id/"
    try:
        anchor = soup.find("a", href=re.compile(r"/pdf/"))
        if anchor is None:
            # Report the real cause instead of a "'NoneType' is not
            # subscriptable" message from indexing a missing element.
            logging.error("Failed to get PDF URL: no link containing '/pdf/' found on the page")
            return None
        pdf_link = anchor["href"]
        if not pdf_link.startswith("http"):
            # urljoin copes with '/path', 'path' and '//host/path' alike,
            # where plain concatenation only handled the first form.
            pdf_link = urljoin(base_url, pdf_link)
        return pdf_link
    except Exception as e:
        logging.error(f"Failed to get PDF URL: {e}")
        return None

def is_url_already_processed(url, path_pdf):
    """Report whether a PDF derived from ``url`` already exists in ``path_pdf``.

    The last path segment of ``url`` (query string and fragment ignored, a
    trailing ``.html``/``.htm``/``.pdf`` extension removed) is sanitised with
    the same character rule ``download_pdf`` applies to file names and then
    searched for inside the names of the ``.pdf`` files in ``path_pdf``.

    Args:
        url: Page URL that is about to be processed.
        path_pdf: Directory holding previously downloaded PDFs.

    Returns:
        ``True`` if some stored PDF name contains the token, else ``False``.
        Also ``False`` when the URL has no usable last segment or the
        directory does not exist.
    """
    from urllib.parse import urlsplit

    segment = urlsplit(url).path.rstrip('/').rsplit('/', 1)[-1]
    stem, extension = os.path.splitext(segment)
    # NOTE(review): presumably stored names never carry the page's own
    # extension, so it is dropped before matching - confirm against the site.
    if extension.lower() in ('.html', '.htm', '.pdf'):
        segment = stem
    token = re.sub(r'[^\w\-_\.]', '_', segment)
    if not token:
        # An empty string is a substring of every name; without this guard a
        # URL ending in '/' would mark every page as already processed.
        return False
    try:
        stored_names = os.listdir(path_pdf)
    except FileNotFoundError:
        return False
    return any(name.endswith('.pdf') and token in name for name in stored_names)

def download_pdf(url, path_pdf, keyword_url, timeout=60):
    """Download ``url`` into ``path_pdf`` under a sanitised, dated file name.

    The stored name is ``<base>_<keyword>_<YYYY-MM-DD>.pdf`` where ``<base>``
    comes from the response's suggested file name (or the last URL segment)
    and ``<keyword>`` is ``'kerugian keuangan negara'`` when that phrase occurs
    in ``keyword_url`` and ``'case'`` otherwise.

    Args:
        url: Address of the PDF to fetch.
        path_pdf: Destination directory.
        keyword_url: Search URL or keyword the download originates from.
        timeout: Timeout in seconds for opening the URL.

    Returns:
        The file name written inside ``path_pdf``, or ``None`` on any failure
        (the failure is logged).
    """
    from urllib.parse import unquote_plus

    try:
        stamp = date.today().strftime('%Y-%m-%d')
        # Decode '+' / '%20' first: the phrase contains spaces and therefore
        # can never appear literally inside an encoded search URL.
        decoded_keyword_source = unquote_plus(keyword_url).lower()
        keyword = 'kerugian keuangan negara' if 'kerugian keuangan negara' in decoded_keyword_source else 'case'
        suffix = f"_{keyword}_{stamp}.pdf"

        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            raw_name = response.info().get_filename() or url.split('/')[-1]
            file_content = response.read()

        # Drop '.pdf' (any letter case), then replace characters that are not
        # word characters, '-', '_' or '.'.
        base_name = re.sub(r'\.pdf', '', raw_name, flags=re.IGNORECASE)
        base_name = re.sub(r'[^\w\-_\.]', '_', base_name)
        # Keep the full path within the limit (10 characters of headroom).
        # Clamp at 0: a negative slice bound would cut from the wrong end.
        max_base_length = MAX_PATH_LENGTH - len(path_pdf) - len(suffix) - 10
        base_name = base_name[:max(max_base_length, 0)]

        file_name = f"{base_name}{suffix}"
        save_path = os.path.join(path_pdf, file_name)
        # Fall back to a timestamped name when the path is still too long or
        # when no base name is left (which would give '_<keyword>_<date>.pdf'
        # for every such download).
        if not base_name or len(save_path) > MAX_PATH_LENGTH:
            file_name = f"putusan_{int(time.time())}{suffix}"
            save_path = os.path.join(path_pdf, file_name)

        with open(save_path, "wb") as out_file:
            out_file.write(file_content)
        logging.info(f"Successfully downloaded: {file_name}")
        return file_name
    except Exception as e:
        logging.error(f"Failed to download {url}: {e}")
        return None

# Part 3: Data Extraction Function

In [None]:
def extract_data(link, keyword_url, max_files):
    """Download the PDF behind one decision page and update the global count.

    Args:
        link: URL of the decision page.
        keyword_url: Search URL/keyword, forwarded to ``download_pdf``.
        max_files: Download limit shared by the whole scraping run.

    Returns:
        ``False`` when the limit was already reached or is reached by this
        download, telling the caller to stop; ``True`` in every other case,
        including skipped duplicates and failed pages or downloads.
    """
    global processed_files

    # Nothing to do once the quota is used up.
    with file_lock:
        if processed_files >= max_files:
            logging.info("Max file limit reached, stopping extraction")
            return False

    if is_url_already_processed(link, PATH_PDF):
        logging.info(f"Skipping duplicate URL: {link}")
        return True

    soup = open_page(link)
    if not soup:
        logging.warning(f"Failed to open page: {link}")
        return True

    link_pdf = get_pdf_url(soup)
    if not link_pdf:
        logging.info(f"No PDF found for {link}")
        return True

    file_name = download_pdf(link_pdf, PATH_PDF, keyword_url)
    if not file_name:
        logging.info(f"Continuing extraction despite download failure for {link_pdf}")
        return True

    # Count the download and report whether there is still room for more.
    with file_lock:
        processed_files += 1
        logging.info(f"Processed file {processed_files}/{max_files}: {file_name}")
        return processed_files < max_files

# Part 4: Page Processing Function

In [None]:
def run_process(keyword_url, page, sort_page, max_files):
    """Process one search-result page and hand every decision link to ``extract_data``.

    Args:
        keyword_url: Full ``https`` search URL, or a bare keyword that is
            inserted into the site's search URL.
        page: Page number appended as the ``page`` query parameter.
        sort_page: When truthy, the ordering parameters
            ``obf=TANGGAL_PUTUS&obm=desc`` are appended.
        max_files: Download limit shared by the whole scraping run.

    Returns:
        ``True`` if the page was processed to the end; ``False`` if the limit
        was reached or the page could not be opened.
    """
    with file_lock:
        if processed_files >= max_files:
            logging.info("Max file limit reached, stopping page processing")
            return False

    site = "https://putusan3.mahkamahagung.go.id"
    page_url = (
        f"{keyword_url}&page={page}"
        if keyword_url.startswith("https")
        else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    )
    if sort_page:
        page_url = f"{page_url}&obf=TANGGAL_PUTUS&obm=desc"

    logging.info(f"Processing page {page}: {page_url}")
    soup = open_page(page_url)
    if not soup:
        logging.warning(f"Failed to open page {page}: {page_url}")
        return False

    for anchor in soup.find_all("a", {"href": re.compile("/direktori/putusan")}):
        # Check the quota again before each link.
        with file_lock:
            if processed_files >= max_files:
                logging.info("Max file limit reached during link processing")
                return False
        href = anchor["href"]
        full_link = href if href.startswith("http") else f"{site}{href}"
        if not extract_data(full_link, keyword_url, max_files):
            logging.info("Stopping page processing due to max file limit")
            return False

    logging.info(f"Completed processing page {page}")
    return True

# Part 5: Main Scraper Function

In [None]:
def run_scraper(url=None, max_files=50):
    """Scrape the search results at ``url`` and download up to ``max_files`` PDFs.

    The first result page is opened to read the last page number from the
    pagination links (falling back to 1), then every page is handed to
    ``run_process`` on a single worker thread.

    Args:
        url: Search URL; must start with ``https://``.
        max_files: Upper bound for the module-level ``processed_files`` count.

    Returns:
        None. Progress and the final count are written to the log.
    """
    global processed_files
    if not url or not url.startswith("https://"):
        logging.error("Please provide a valid URL")
        return

    logging.info(f"Starting scraper with URL: {url}, max_files: {max_files}")
    soup = open_page(url)
    if not soup:
        logging.error("Failed to open initial URL, stopping scraper")
        return

    try:
        # The last pagination link carries the highest page number.
        last_page = int(soup.find_all("a", {"class": "page-link"})[-1].get("data-ci-pagination-page"))
    except Exception as e:
        logging.warning(f"Could not determine last page, defaulting to 1: {e}")
        last_page = 1
    logging.info(f"Scraping {last_page} pages, potential {20 * last_page} files")

    with file_lock:
        if processed_files >= max_files:
            logging.info(f"Already have {processed_files} files, max limit reached")
            return

    page_by_future = {}
    with ThreadPoolExecutor(max_workers=1) as executor:
        for page in range(1, last_page + 1):
            with file_lock:
                if processed_files >= max_files:
                    logging.info("Max file limit reached, stopping page submission")
                    break
            future = executor.submit(run_process, url, page, True, max_files)
            page_by_future[future] = page
        wait(page_by_future)

    # An exception raised inside a worker is stored on its future; unless it
    # is read back here it would vanish without any trace in the log.
    for future, page in page_by_future.items():
        error = future.exception()
        if error is not None:
            logging.error(f"Page {page} failed with an unhandled exception: {error!r}")

    with file_lock:
        final_count = processed_files
    logging.info(f"Scraping complete. Downloaded {final_count} PDFs.")

# Part 6: Utility Functions for PDF Processing

In [None]:
def extract_pdf_text(pdf_path):
    """Read the PDF at ``pdf_path`` and return its text via ``extract_text``.

    Args:
        pdf_path: Location of the PDF file on disk.

    Returns:
        The extracted text, or an empty string if the file could not be read
        or parsed (the failure is logged).
    """
    try:
        with open(pdf_path, 'rb') as handle:
            # The whole file is buffered in memory before it is parsed.
            buffer = BytesIO(handle.read())
            text = extract_text(buffer)
    except Exception as e:
        logging.error(f"Failed to extract text from {pdf_path}: {e}")
        return ""
    else:
        logging.info(f"Extracted text from {os.path.basename(pdf_path)}")
        return text

def clean_text(text, min_integrity=80):
    """Strip site boilerplate from ``text`` and normalise it to ``[a-z0-9 ]``.

    Processing steps:
      1. Remove the fixed watermark/disclaimer strings and ``Halaman <n>``
         page markers.
      2. Lower-case, replace every character outside ``a-z0-9`` with a space
         and collapse runs of whitespace.
      3. Compute an integrity score: the percentage of non-whitespace
         characters present after step 1 that are still present after step 2.

    The score is measured against the text *before* the lossy step 2.
    Measuring the normalised text against itself would always give 100%.

    Args:
        text: Raw text extracted from a PDF.
        min_integrity: Minimum integrity percentage required to keep the text.

    Returns:
        The cleaned text, or ``""`` if ``text`` is empty or the integrity
        score is below ``min_integrity``.
    """
    if not text:
        logging.info("No text to clean")
        return ""

    # Order matters: longer phrases are removed before the shorter phrases
    # they contain.
    boilerplate = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
        "Direktori Putusan Mahkamah Agung Republik Indonesia",
        "putusan.mahkamahagung.go.id",
        "Pid.I.A.3",
        "Mahkamah Agung Republik Indonesia",
    )
    for phrase in boilerplate:
        text = text.replace(phrase, "")
    text = re.sub(r'Halaman \d+', '', text)

    # Baseline for the integrity score: content left after boilerplate
    # removal, counted before any character is discarded.
    original_length = sum(1 for ch in text if not ch.isspace())

    text = text.lower()
    text = re.sub(r'[^a-z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    cleaned_length = len(text.replace(" ", ""))
    integrity = (cleaned_length / original_length) * 100 if original_length > 0 else 0
    if integrity < min_integrity:
        logging.warning(f"Text integrity below {min_integrity}% for document: {integrity:.1f}%")
        return ""
    logging.info(f"Cleaned text successfully, integrity: {integrity:.1f}%")
    return text

def save_text_file(text, index, path):
    """Write ``text`` to ``case_<index>.txt`` (zero-padded to 3 digits) in ``path``.

    Args:
        text: Cleaned text to store; nothing is written when it is empty.
        index: Integer used to number the output file.
        path: Destination directory.

    Returns:
        The file name that was written, or ``None`` when there was no text or
        the resulting path is longer than ``MAX_PATH_LENGTH``.
    """
    if not text:
        logging.info("No text to save")
        return None

    file_name = f"case_{index:03d}.txt"
    save_path = os.path.join(path, file_name)
    too_long = len(save_path) > MAX_PATH_LENGTH
    if too_long:
        logging.error(f"Output path too long: {save_path}")
        return None

    # An existing file with the same name is overwritten.
    with open(save_path, 'w', encoding='utf-8') as handle:
        handle.write(text)
    logging.info(f"Saved/overwritten cleaned text: {file_name}")
    return file_name

def get_next_index(pdf_files):
    """Return the 1-based index that follows the last item of ``pdf_files``.

    Args:
        pdf_files: Sized collection of already known PDF files.

    Returns:
        ``len(pdf_files) + 1``.
    """
    known_count = len(pdf_files)
    return known_count + 1

# Part 7: Main Processing Function

In [None]:
def process_pdfs(max_files=50):
    """Convert the downloaded PDFs into cleaned ``case_NNN.txt`` files.

    Every regular file in ``PATH_OUTPUT`` is deleted first. Then each ``.pdf``
    in ``PATH_PDF`` is extracted, cleaned and saved until ``max_files`` text
    files have been written. PDFs are visited in sorted name order so the
    ``case_NNN`` numbering is the same on every run; ``NNN`` is the PDF's
    position in that order, so PDFs that yield no text leave gaps.

    Args:
        max_files: Maximum number of text files to write.

    Returns:
        None. A summary is logged and printed.
    """
    logging.info("Starting PDF processing with max_files=%d", max_files)

    # Clear existing files in the output directory.
    for entry in os.listdir(PATH_OUTPUT):
        file_path = os.path.join(PATH_OUTPUT, entry)
        if os.path.isfile(file_path):
            if len(file_path) > MAX_PATH_LENGTH:
                logging.warning(f"Skipping file deletion due to long path: {file_path}")
                continue
            os.remove(file_path)
            logging.info(f"Deleted existing file: {file_path}")
    logging.info("Cleared all existing files in %s", PATH_OUTPUT)

    # os.listdir returns names in arbitrary order; sorting makes the mapping
    # from PDF to case number reproducible.
    pdf_files_list = sorted(f for f in os.listdir(PATH_PDF) if f.endswith('.pdf'))
    logging.info(f"Found {len(pdf_files_list)} PDF files to process")

    if not pdf_files_list:
        print("No PDF files found.")
        logging.warning(f"No PDF files found in {PATH_PDF}")
        return

    processed_files_text = 0
    for index, pdf_file in enumerate(pdf_files_list, start=1):
        if processed_files_text >= max_files:
            logging.info("Max file limit reached, stopping processing")
            break

        pdf_path = os.path.join(PATH_PDF, pdf_file)
        logging.info(f"Started processing PDF: {pdf_file}")
        cleaned_text = clean_text(extract_pdf_text(pdf_path))
        if not cleaned_text:
            logging.info(f"No valid text after cleaning for {pdf_file}")
            continue
        if save_text_file(cleaned_text, index, PATH_OUTPUT):
            processed_files_text += 1

    final_count = len([f for f in os.listdir(PATH_OUTPUT) if f.startswith('case_') and f.endswith('.txt')])
    logging.info(f"Processing complete. Generated {final_count} text files.")
    print(f"Processing completed. Generated {final_count} text files.")

# Part 8: Run the Scraper and Processor

In [None]:
def main():
    """Run the whole pipeline: scrape the PDFs, then convert them to text.

    The search URL and the limit of 50 files are fixed inside this function;
    start and end times are written to the log.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    file_limit = 50
    scraper_url = "https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=9e41907a1cfc34e0b1c265e262d35e44&jd=&tp=&court=&t_put=&t_reg=&t_upl=&t_pr="

    logging.info("Starting main execution at %s", time.strftime(time_format))
    # Step 1: download the PDFs; step 2: extract and clean their text.
    run_scraper(url=scraper_url, max_files=file_limit)
    process_pdfs(max_files=file_limit)
    logging.info("Main execution completed at %s", time.strftime(time_format))

# Entry point: run the pipeline only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()

Processing completed. Generated 50 text files.
