In [7]:
%pip install pdfminer.six pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Part 1: Import Libraries
import os
import re
from datetime import datetime
from pdfminer.high_level import extract_text
from io import BytesIO
import logging
import shutil
import pandas as pd
import threading

# Configuration: every location is resolved relative to the current working directory
BASE_DIR = os.getcwd()
PATH_PDF = os.path.join(BASE_DIR, 'PDF', 'Kepailitan')     # input PDF directory
PATH_OUTPUT = os.path.join(BASE_DIR, 'data', 'processed')  # output directory
LOG_DIR = os.path.join(BASE_DIR, 'logs')
LOG_PATH = os.path.join(LOG_DIR, 'cleaning.log')

# Shared counter of processed files, plus the lock used when it is updated
file_lock = threading.Lock()
processed_files = 0

# Create the output and log directories up front (no-op when they already exist)
os.makedirs(PATH_OUTPUT, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# A directory sitting at the log file's path would make open() fail, so remove it first
if os.path.isdir(LOG_PATH):
    print(f"Error: {LOG_PATH} is a directory. Removing and recreating as file.")
    shutil.rmtree(LOG_PATH, ignore_errors=True)

# Start from an empty log file
with open(LOG_PATH, 'w') as log_file:
    log_file.write('')

# Send log records both to the log file and to the stream handler (notebook output)
log_handlers = [logging.FileHandler(LOG_PATH), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=log_handlers,
)
logging.info("Logging initialized at %s", LOG_PATH)

2025-06-21 14:41:28,209 - Logging initialized at e:\Praktikum sem 6\COMPUTER_REASONING_CBR\CBR\logs\cleaning.log


In [9]:
# Part 2: Utility Functions
def extract_pdf_text(pdf_path):
    """Return the text extracted from the PDF at ``pdf_path``.

    The file is read fully into memory and passed to ``extract_text``
    through a ``BytesIO`` buffer.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or ``""`` if anything goes wrong (the failure is
        logged at ERROR level rather than raised).
    """
    try:
        with open(pdf_path, 'rb') as pdf_handle:
            pdf_bytes = pdf_handle.read()
        extracted = extract_text(BytesIO(pdf_bytes))
        pdf_name = os.path.basename(pdf_path)
        logging.info(f"Extracted text from {pdf_name}")
    except Exception as err:
        # Best effort: a single unreadable PDF must not abort the caller.
        logging.error(f"Failed to extract text from {pdf_path}: {err}")
        return ""
    return extracted

def extract_metadata(text):
    """Pull labelled metadata fields out of the extracted PDF text.

    Each field is located with a regular expression anchored on its label
    (e.g. ``Nomor Perkara :``). Fields whose label is not found stay ``None``.
    ``case_id`` is always generated from the current timestamp and the
    module-level ``processed_files`` counter plus one.

    Args:
        text: Full text of one document.

    Returns:
        dict with the keys ``case_id``, ``no_perkara``, ``tanggal``,
        ``jenis_perkara``, ``pasal`` and ``pihak``.
    """
    # Fields captured by a single group: (metadata key, pattern).
    # The patterns are tied to the expected label layout of the PDFs.
    single_value_patterns = (
        ('no_perkara', r'Nomor Perkara\s*:\s*(\d+/\w+\.\w+/\d+)'),
        ('tanggal', r'Tanggal Putusan\s*:\s*(\d{2}-\d{2}-\d{4})'),
        ('jenis_perkara', r'Jenis Perkara\s*:\s*(\w+\s*\w+)'),
        ('pasal', r'Pasal\s*:\s*(\d+\s*\w+)'),
    )

    metadata = dict.fromkeys(
        ('case_id', 'no_perkara', 'tanggal', 'jenis_perkara', 'pasal', 'pihak')
    )

    for field, pattern in single_value_patterns:
        found = re.search(pattern, text)
        if found is not None:
            metadata[field] = found.group(1)

    # The parties need two captures (plaintiff and defendant) joined into one string.
    parties = re.search(
        r'Pihak Penggugat\s*:\s*(\w+\s*\w+)\s*Pihak Tergugat\s*:\s*(\w+\s*\w+)',
        text,
    )
    if parties is not None:
        plaintiff, defendant = parties.groups()
        metadata['pihak'] = f"{plaintiff} vs. {defendant}"

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    metadata['case_id'] = f"CASE_{timestamp}_{processed_files + 1}"
    return metadata

def extract_key_content(text):
    """Extract the labelled fact summary and legal reasoning sections.

    A section starts right after its label and colon and runs up to the next
    blank line (two consecutive newlines) or the end of the text; surrounding
    whitespace is stripped.

    Args:
        text: Full text of one document.

    Returns:
        dict with ``ringkasan_fakta`` (text after ``Ringkasan Fakta :``) and
        ``argumen_hukum_utama`` (text after ``Pertimbangan Hukum :``); a value
        is ``None`` when its label does not occur in ``text``.
    """
    section_patterns = {
        'ringkasan_fakta': r'Ringkasan Fakta\s*:(.*?)(?=\n\n|\Z)',
        'argumen_hukum_utama': r'Pertimbangan Hukum\s*:(.*?)(?=\n\n|\Z)',
    }

    content = {}
    for key, pattern in section_patterns.items():
        # DOTALL lets a section span several lines until the blank-line terminator.
        section = re.search(pattern, text, re.DOTALL)
        content[key] = section.group(1).strip() if section else None
    return content

def feature_engineering(text):
    """Derive a few simple features from the document text.

    Args:
        text: Full text of one document.

    Returns:
        dict with:
          * ``length``: number of whitespace-separated tokens;
          * ``bag_of_words``: the first 10 unique tokens in sorted order,
            joined by single spaces (sorted order, not frequency);
          * ``qa_pairs``: a one-element list holding a fixed question paired
            with the first 50 characters of ``text`` followed by ``'...'``.
    """
    tokens = text.split()
    first_sorted_unique = sorted(set(tokens))[:10]
    preview = text[:50] + '...'

    return {
        'length': len(tokens),
        'bag_of_words': ' '.join(first_sorted_unique),
        'qa_pairs': [('What is the case about?', preview)],
    }

In [10]:
# Part 3: Main Processing Function
def process_pdfs(max_files=50):
    """Build case representations from the PDFs in ``PATH_PDF`` and save them.

    PDFs are visited in sorted filename order. For each one the text is
    extracted, then metadata, key content and simple features are derived and
    combined into one record. PDFs whose extraction returns an empty string
    are skipped and do not count towards ``max_files``. All records are
    written to ``cases.csv`` and ``cases.json`` inside ``PATH_OUTPUT``.

    The module-level ``processed_files`` counter is reset at the start of
    every call, so calling this function again (e.g. re-running the notebook
    cell in the same kernel) processes the files again instead of hitting the
    limit immediately and overwriting the outputs with an empty table.

    Args:
        max_files: Maximum number of PDFs to turn into case records per call.

    Returns:
        None. Results are written to disk; progress is logged and a summary
        line is printed.
    """
    global processed_files
    logging.info("Starting case representation with max_files=%d", max_files)

    # Start each run from zero: the counter is module-level state and would
    # otherwise carry the previous run's total into this one.
    # NOTE(review): assumes process_pdfs is not called concurrently from
    # several threads; a reset here would disturb another run in progress.
    with file_lock:
        processed_files = 0

    # Prepare data list
    cases_data = []

    # A missing input directory is handled like "nothing to process" rather
    # than surfacing as an unhandled FileNotFoundError from os.listdir.
    if not os.path.isdir(PATH_PDF):
        logging.error("PDF directory not found: %s", PATH_PDF)
        print("PDF directory not found.")
        return

    # os.listdir returns entries in arbitrary order; sorting keeps the case
    # numbering reproducible. The extension check ignores case so that
    # files named *.PDF are picked up as well.
    pdf_files = sorted(
        name for name in os.listdir(PATH_PDF) if name.lower().endswith('.pdf')
    )
    logging.info(f"Found {len(pdf_files)} PDF files to process")

    if not pdf_files:
        logging.info("No PDF files found in %s", PATH_PDF)
        print("No PDF files found.")
        return

    for pdf_file in pdf_files:
        with file_lock:
            if processed_files >= max_files:
                logging.info("Max file limit reached, stopping processing")
                break

        pdf_path = os.path.join(PATH_PDF, pdf_file)
        logging.info(f"Processing PDF: {pdf_file}")
        text = extract_pdf_text(pdf_path)
        if not text:
            # Extraction failed or produced nothing; the failure is logged by
            # extract_pdf_text, so just move on to the next file.
            continue

        metadata = extract_metadata(text)
        key_content = extract_key_content(text)
        features = feature_engineering(text)

        case_entry = {
            'case_id': metadata['case_id'],
            'no_perkara': metadata['no_perkara'],
            'tanggal': metadata['tanggal'],
            'jenis_perkara': metadata['jenis_perkara'],
            'pasal': metadata['pasal'],
            'pihak': metadata['pihak'],
            'ringkasan_fakta': key_content['ringkasan_fakta'],
            'argumen_hukum_utama': key_content['argumen_hukum_utama'],
            'length': features['length'],
            'bag_of_words': features['bag_of_words'],
            # Stored as its string representation so it fits in one CSV cell.
            'qa_pairs': str(features['qa_pairs']),
            'text_full': text
        }
        cases_data.append(case_entry)
        with file_lock:
            processed_files += 1
            logging.info(f"Processed case {processed_files}/{max_files}: {metadata['case_id']}")

    # Save to CSV and JSON
    os.makedirs(PATH_OUTPUT, exist_ok=True)
    csv_path = os.path.join(PATH_OUTPUT, 'cases.csv')
    json_path = os.path.join(PATH_OUTPUT, 'cases.json')

    df = pd.DataFrame(cases_data)
    df.to_csv(csv_path, index=False, encoding='utf-8')
    df.to_json(json_path, orient='records', indent=2)

    logging.info(f"Saved {len(cases_data)} cases to {csv_path} and {json_path}")
    print(f"Processing complete. Generated {len(cases_data)} case representations.")

In [11]:
# Part 4: Run the Processor
# Process up to 50 PDFs from PATH_PDF; process_pdfs writes the resulting
# records to cases.csv and cases.json under PATH_OUTPUT.
process_pdfs(max_files=50)

2025-06-21 14:41:28,337 - Starting case representation with max_files=50
2025-06-21 14:41:28,339 - Found 50 PDF files to process
2025-06-21 14:41:28,340 - Processing PDF: putusan_1750490497_2025-06-21.pdf
2025-06-21 14:41:29,703 - Extracted text from putusan_1750490497_2025-06-21.pdf
2025-06-21 14:41:29,708 - Processed case 1/50: CASE_20250621_144129_1
2025-06-21 14:41:29,709 - Processing PDF: putusan_1750490510_2025-06-21.pdf
2025-06-21 14:41:30,887 - Extracted text from putusan_1750490510_2025-06-21.pdf
2025-06-21 14:41:30,890 - Processed case 2/50: CASE_20250621_144130_2
2025-06-21 14:41:30,892 - Processing PDF: putusan_1750490524_2025-06-21.pdf
2025-06-21 14:41:32,007 - Extracted text from putusan_1750490524_2025-06-21.pdf
2025-06-21 14:41:32,010 - Processed case 3/50: CASE_20250621_144132_3
2025-06-21 14:41:32,011 - Processing PDF: putusan_1750490537_2025-06-21.pdf
2025-06-21 14:41:34,813 - Extracted text from putusan_1750490537_2025-06-21.pdf
2025-06-21 14:41:34,818 - Processed c

Processing complete. Generated 50 case representations.
