In [8]:
# Install pdfminer.six and pandas through the %pip magic.
# NOTE(review): versions are unpinned — consider pinning them so reruns are reproducible.
%pip install pdfminer.six pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import re
import csv
import logging
from datetime import datetime

# Resolve the project root: scripts expose __file__, notebooks and REPLs do not.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # For .py scripts
except NameError:
    BASE_DIR = os.getcwd()  # For Jupyter Notebooks or interactive sessions

# Input text directory, output CSV and log file, all relative to BASE_DIR
PATH_OUTPUT = os.path.join(BASE_DIR, 'data', 'raw')
path_csv = os.path.join(BASE_DIR, 'data', 'processed', 'cases.csv')
log_dir = os.path.join(BASE_DIR, 'logs')
log_path = os.path.join(log_dir, 'metadata_extraction.log')

# Ensure the log directory and the CSV's parent directory exist
os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.dirname(path_csv), exist_ok=True)

# Log to both a file and the console.
# Note: basicConfig does nothing if the root logger already has handlers
# (e.g. when this cell is re-run in the same kernel).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        # Explicit UTF-8: warnings embed raw excerpts of the processed texts,
        # which may not be encodable in the platform's default encoding.
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# astimezone() attaches the local time zone; on a naive datetime %Z renders
# as an empty string and leaves a gap in the message.
logging.info("Starting metadata extraction process at %s", datetime.now().astimezone().strftime('%H:%M:%S %Z on %A, %B %d, %Y'))

# Map Indonesian month names (lowercase) to the English names strptime's %B expects
month_map = {
    'januari': 'January',
    'februari': 'February',
    'maret': 'March',
    'april': 'April',
    'mei': 'May',
    'juni': 'June',
    'juli': 'July',
    'agustus': 'August',
    'september': 'September',
    'oktober': 'October',
    'november': 'November',
    'desember': 'December'
}

# Date phrases tried in order: "tanggal"-anchored forms first, any bare
# "<day> <word> <year>" sequence last.
_DATE_PATTERNS = (
    r'\w+\s+tanggal\s+(\d{1,2}\s+\w+\s+\d{4})',
    r'tanggal\s+(\d{1,2}\s+\w+\s+\d{4})',
    r'(\d{1,2}\s+\w+\s+\d{4})',
)
_DATE_FORMATS = ('%d %B %Y', '%d %b %Y', '%d %m %Y')

# Degree/signature tokens removed from a judge's name. The \b anchors keep the
# pattern from matching inside or across real name words
# (without them "agus hartono" loses its "s h" and becomes "agu artono").
_NAME_NOISE_RE = re.compile(r'\b(?:s\s*h|m\s*h|cla|ccfa|cbc|ttd)\b', re.IGNORECASE)


def _clean_name(raw_name):
    """Strip degree/signature tokens from a name and collapse whitespace."""
    if not raw_name:
        return ''
    return ' '.join(_NAME_NOISE_RE.sub(' ', raw_name).split())


def _parse_decision_date(text):
    """Return the first date in ``text`` that parses, as a datetime, or None.

    For each pattern only its first match is considered; if that match does not
    parse with any known format, the next pattern is tried.
    """
    for pattern in _DATE_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        candidate = match.group(1)
        # Translate Indonesian month names so strptime's %B can read them
        for id_month, en_month in month_map.items():
            candidate = re.sub(rf'\b{id_month}\b', en_month, candidate, flags=re.IGNORECASE)
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                continue
    return None


def extract_metadata(text, file_name):
    """Extract case metadata from the text of one court decision.

    Args:
        text: Full text of the decision. The patterns below expect tokens
            separated by whitespace only (e.g. "10 pk pdt sus pailit 2025").
        file_name: Name of the source file; a trailing '.txt' is dropped to
            form ``case_id``.

    Returns:
        dict with the keys case_id, nomor_perkara, tahun_putusan,
        bulan_putusan, tanggal_putusan, jenis_perkara, tingkat_pemeriksaan,
        lembaga_peradilan, hakim_ketua, pasal, ringkasan_fakta,
        jumlah_kata_putusan and full_text. Fields that cannot be found stay
        as empty strings; a warning is logged for missing nomor_perkara, date,
        hakim_ketua and ringkasan_fakta.
    """
    suffix = '.txt'
    # Remove only the trailing extension; str.replace would drop every '.txt'
    # occurrence inside the name as well.
    case_id = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name

    # Lower-cased once and reused by all keyword checks below
    lowered = text.lower()

    metadata = {
        'case_id': case_id,
        'nomor_perkara': '',
        'tahun_putusan': '',
        'bulan_putusan': '',
        'tanggal_putusan': '',
        'jenis_perkara': '',
        'tingkat_pemeriksaan': '',
        'lembaga_peradilan': '',
        'hakim_ketua': '',
        'pasal': '',
        'ringkasan_fakta': '',
        'jumlah_kata_putusan': 0,
        'full_text': text
    }

    # Extract nomor_perkara (first "nomor <n> pk|k pdt sus pailit <year>" in the text)
    match = re.search(r'nomor\s+(\d+\s+(?:pk|k)\s+pdt\s+sus\s+pailit\s+\d{4})', text, re.IGNORECASE)
    if match:
        metadata['nomor_perkara'] = match.group(1)
        # Logged only on success so the log never reports an empty "extracted" value
        logging.info(f"Extracted nomor_perkara: {metadata['nomor_perkara']} for {file_name}")
    else:
        logging.warning(f"No nomor_perkara found in {file_name}")

    # Extract tanggal/bulan/tahun putusan
    date_obj = _parse_decision_date(text)
    if date_obj is not None:
        metadata['tahun_putusan'] = str(date_obj.year)
        metadata['bulan_putusan'] = date_obj.strftime('%B')
        metadata['tanggal_putusan'] = str(date_obj.day)
    else:
        logging.warning(f"No valid date found in {file_name}. Sample text: {text[:200]}")

    # Jenis perkara
    if 'kepailitan' in lowered:
        metadata['jenis_perkara'] = 'Kepailitan'

    # Tingkat pemeriksaan
    # NOTE(review): keyword heuristic — a kasasi decision that merely mentions
    # "peninjauan kembali" is classified as Peninjauan Kembali; confirm whether
    # the pk/k code in nomor_perkara should take precedence.
    if 'peninjauan kembali' in lowered:
        metadata['tingkat_pemeriksaan'] = 'Peninjauan Kembali'
    elif 'kasasi' in lowered:
        metadata['tingkat_pemeriksaan'] = 'Kasasi'
    else:
        metadata['tingkat_pemeriksaan'] = 'Pertama'

    # Lembaga peradilan
    if 'mahkamah agung' in lowered:
        metadata['lembaga_peradilan'] = 'Mahkamah Agung'
    elif 'pengadilan negeri' in lowered:
        metadata['lembaga_peradilan'] = 'Pengadilan Negeri'

    # Hakim ketua: the name between "ketua majelis" and "dan ... hakim hakim anggota"
    match = re.search(r'ketua\s+majelis\s+([^\s][^,]+?)(?=\s+dan\s+[^\s].*?\s+hakim\s+hakim\s+anggota)', text, re.IGNORECASE)
    if match:
        metadata['hakim_ketua'] = _clean_name(match.group(1))
    else:
        logging.warning(f"No hakim_ketua found in {file_name}")

    # Pasal: the "<n> tahun <year>" of the statute cited after "memperhatikan"
    match = re.search(r'memperhatikan\s+undang\s+undang\s+nomor\s+(\d+\s+tahun\s+\d{4})', text, re.IGNORECASE)
    if match:
        metadata['pasal'] = match.group(1)

    # Ringkasan fakta: text after the letter-spaced "m e n g a d i l i" heading,
    # up to "demikian", "menimbang" or the end of the text
    match = re.search(r'm\s+e\s+n\s+g\s+a\s+d\s+i\s+l\s+i\s+((?:\d+\s+)?[^\s].*?)(?=\s+demikian|\s+menimbang|\s*$)', text, re.IGNORECASE)
    if match:
        metadata['ringkasan_fakta'] = ' '.join(match.group(1).split())
    else:
        logging.warning(f"No ringkasan_fakta found in {file_name}. Sample text: {text[-200:]}")

    metadata['jumlah_kata_putusan'] = len(text.split())

    logging.info(f"Extracted metadata from {file_name}")
    return metadata

def save_to_csv(metadata_list):
    """Write the metadata records to the CSV file at ``path_csv``.

    The file is opened in write mode, so any existing content is replaced.
    A header row is written first, followed by one row per dict in
    ``metadata_list``, with columns in the fixed order listed below.
    """
    columns = [
        'case_id',
        'nomor_perkara',
        'tahun_putusan',
        'bulan_putusan',
        'tanggal_putusan',
        'jenis_perkara',
        'tingkat_pemeriksaan',
        'lembaga_peradilan',
        'hakim_ketua',
        'pasal',
        'ringkasan_fakta',
        'jumlah_kata_putusan',
        'full_text',
    ]
    with open(path_csv, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(metadata_list)
    logging.info(f"Saved metadata to {path_csv}")

def process_text_files():
    """Extract metadata from every .txt file in PATH_OUTPUT and save it as CSV.

    Files are processed in sorted name order. A file that fails to read or
    parse is logged and skipped; the remaining files are still processed.
    If PATH_OUTPUT does not exist, an error is logged and nothing is written.
    Returns None in all cases.
    """
    try:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # CSV row order reproducible across runs and platforms.
        text_files = sorted(f for f in os.listdir(PATH_OUTPUT) if f.endswith('.txt'))
        logging.info(f"Found {len(text_files)} text files to process in {PATH_OUTPUT}")
    except FileNotFoundError:
        logging.error(f"Directory {PATH_OUTPUT} does not exist")
        return

    metadata_list = []
    for text_file in text_files:
        file_path = os.path.join(PATH_OUTPUT, text_file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            metadata = extract_metadata(text, text_file)
            metadata_list.append(metadata)
            logging.info(f"Processed text file: {text_file}")
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the whole batch
            logging.error(f"Error processing {text_file}: {str(e)}")

    if metadata_list:
        save_to_csv(metadata_list)
        logging.info(f"Processed {len(metadata_list)} files and saved to {path_csv}")
    else:
        logging.warning("No files processed or no metadata extracted")

# Run the processor
if __name__ == "__main__":
    process_text_files()


2025-06-22 20:37:20,423 - Starting metadata extraction process at 20:37:20  on Sunday, June 22, 2025
2025-06-22 20:37:20,426 - Found 60 text files to process in d:\AL FITRA\STUDI UMM\SEMESTER 6\TEORI\PENALARAN KOMPUTER\SOURCE CODE\PENALARAN KOMPUTER\COMPUTER_REASONING_CBR\CBR\data\raw
2025-06-22 20:37:20,427 - Extracted nomor_perkara: 10 pk pdt sus pailit 2025 for case_001.txt
2025-06-22 20:37:20,431 - Extracted metadata from case_001.txt
2025-06-22 20:37:20,431 - Processed text file: case_001.txt
2025-06-22 20:37:20,433 - Extracted nomor_perkara: 1178 k pdt sus pailit 2024 for case_002.txt
2025-06-22 20:37:20,434 - Extracted metadata from case_002.txt
2025-06-22 20:37:20,435 - Processed text file: case_002.txt
2025-06-22 20:37:20,436 - Extracted nomor_perkara: 11 pk pdt sus pailit 2025 for case_003.txt
2025-06-22 20:37:20,437 - Extracted metadata from case_003.txt
2025-06-22 20:37:20,438 - Processed text file: case_003.txt
2025-06-22 20:37:20,439 - Extracted nomor_perkara: 1227 k pdt 