### Cell 1: Instalasi & Imports

In [None]:
%pip install pandas python-dateutil scikit-learn 

In [16]:
import os
import re
import pandas as pd
import pickle
from dateutil import parser
from sklearn.feature_extraction.text import TfidfVectorizer

# Direktori clean teks dari Tahap 1
CLEAN_TEXT_DIR = os.path.join("..", "data", "raw","clean_text")  # pastikan sesuai definisi Cell 1 Tahap 1
CLEAN_DIR = CLEAN_TEXT_DIR

# Direktori output
PROCESSED_DIR = os.path.join("..", "data", "processed")
META_CSV  = os.path.join(PROCESSED_DIR, "cases.csv")
VEC_DIR   = os.path.join("..", "data", "vectors")

# Buat direktori jika belum ada
os.makedirs(VEC_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

### Cell 2: Parsing Metadata

In [22]:
# Pola untuk ekstraksi
PATTERNS = {
    'case_id':        r'putusan pn surabaya nomor\s*([\w\./-]+)',
    'decision_date':  r'tanggal\s*(\d{1,2}\s+\w+\s+\d{4})',
    'prosecutor':     r'penuntut umum:\s*([\s\S]+?)terdakwa',
    'defendants':     r'terdakwa:\s*([\s\S]+?)nomor',
    'classification': r'klasifikasi\s*([\w ]+)',
    'keywords':       r'kata kunci\s*([\w, ]+)',
    'charge':         r'bersalah melakukan tindak pidana\s*([\s\S]+?)\;',
    'ringkasan_fakta':r'barang bukti berupa:\s*([\s\S]+?)(?=\n\s*\w|$)',
    'full_text':      r'(putusan pn surabaya[\s\S]+?)statistik'
}

def parse_date_id(s: str) -> str:
    parts = s.lower().split()
    if len(parts) == 3:
        day, mon, year = parts
        month_map = {
            'januari':'January','februari':'February','maret':'March','april':'April',
            'mei':'May','juni':'June','juli':'July','agustus':'August',
            'september':'September','oktober':'October','november':'November','desember':'December'
        }
        eng = month_map.get(mon, mon)
        try:
            return parser.parse(f"{day} {eng} {year}", dayfirst=True).date().isoformat()
        except:
            return None
    return None

records = []
for fn in sorted(os.listdir(CLEAN_DIR)):
    if not fn.startswith("clean_"): continue
    txt = open(os.path.join(CLEAN_DIR, fn), encoding="utf-8").read().lower()
    rec = { 'file': fn }
    for field, pat in PATTERNS.items():
        m = re.search(pat, txt)
        rec[field] = m.group(1).strip() if m else None
    rec['decision_date'] = parse_date_id(rec.get('decision_date') or "")
    rec['prosecutor'] = [p.strip(' .:') for p in re.split(r'\d+\.\s*', rec.get('prosecutor') or "") if p.strip()]
    rec['defendants'] = [d.strip(' .:,') for d in re.split(r'\d+\.\s*', rec.get('defendants') or "") if d.strip()]
    rec['ringkasan_fakta'] = [e.strip(' .;') for e in re.split(r';|\n', rec.get('ringkasan_fakta') or "") if e.strip()]
    rec['charge'] = " ".join(rec['charge'].split()) if rec.get('charge') else None
    rec['full_text'] = re.sub(r'\n+', ' ', rec['full_text']).strip() if rec.get('full_text') else None
    records.append(rec)

df = pd.DataFrame(records)
df.to_csv(META_CSV, index=False, encoding="utf-8")
print(f"[ii] Ekstraksi konten kunci selesai: {len(df)} records disimpan di {META_CSV}")

[ii] Ekstraksi konten kunci selesai: 47 records disimpan di ..\data\processed\cases.csv


### Cell 3: Feature Extraction & Vectorization

In [24]:
df = pd.read_csv(META_CSV)
if 'full_text' in df.columns and 'text_full' not in df.columns:
    df['text_full'] = df['full_text']

# Hitung panjang teks
df['text_length'] = df['text_full'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# TF-IDF dengan placeholder
templates = [t if t.strip() else 'kosong' for t in df['text_full'].fillna('')]
vec = TfidfVectorizer(max_features=1000)
X   = vec.fit_transform(templates)

# QA-pairs
def make_qa(row):
    return {
        'pertanyaan': f"Pasal berapa yang diputus dalam kasus {row['case_id']}?",
        'jawaban': row.get('charge') or ''
    }

df['qa_pair'] = df.apply(make_qa, axis=1)

# Simpan vectorizer & matriks
os.makedirs(VEC_DIR, exist_ok=True)
with open(os.path.join(VEC_DIR, 'tfidf_fulltext_vec.pkl'), 'wb') as f:
    pickle.dump(vec, f)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out()).to_csv(
    os.path.join(VEC_DIR, 'tfidf_fulltext_matrix.csv'), index=False
)
print("[iii] Feature engineering selesai: text_length, TF-IDF, QA-pairs")

[iii] Feature engineering selesai: text_length, TF-IDF, QA-pairs


### Cell 4: Penyimpanan

In [34]:
# Cell 4 (revisi): Format Ulang & Simpan Bersih
import os
import pandas as pd
import re

PROCESSED_DIR = os.path.join("..", "data", "processed")
IN_CSV        = os.path.join(PROCESSED_DIR, "cases.csv")
OUT_CSV       = os.path.join(PROCESSED_DIR, "cases_cleaned.csv")
CLEAN_DIR     = os.path.join("..", "data", "raw", "clean_text")

# 1) Muat metadata
df = pd.read_csv(IN_CSV)

# 2) Isi full_text dari file clean_XXX.txt
full_texts = []
for fn in df['file']:
    path = os.path.join(CLEAN_DIR, fn)
    with open(path, encoding='utf-8') as f:
        full_texts.append(f.read().strip())
df['text_full'] = full_texts

# 3) Hitung text_length
df['text_length'] = df['text_full'].apply(lambda t: len(t.split()))

# 4) Format ringkasan_fakta & pasal seperti sebelumnya
def fmt_fakta(x, full):
    items = eval(x) if isinstance(x, str) and x.startswith('[') else (x if isinstance(x,list) else [])
    return "; ".join(items) if items else full.split('.')[0]

df['ringkasan_fakta'] = df.apply(lambda r: fmt_fakta(r['ringkasan_fakta'], r['text_full']), axis=1)

def fmt_pasal(chg, full):
    m = re.search(r'(\d+)', str(chg))
    if m: return m.group(1)
    m2 = re.search(r'pasal\s+(\d+)', full.lower())
    return m2.group(1) if m2 else ''

df['pasal'] = df.apply(lambda r: fmt_pasal(r['charge'], r['text_full']), axis=1)

# 5) Format pihak
def fmt_pihak(r):
    pros = eval(r['prosecutor']) if isinstance(r['prosecutor'], str) and r['prosecutor'].startswith('[') else r['prosecutor']
    defs = eval(r['defendants']) if isinstance(r['defendants'], str) and r['defendants'].startswith('[') else r['defendants']
    j = pros[0] if isinstance(pros,(list,tuple)) and pros else ""
    t = defs[0] if isinstance(defs,(list,tuple)) and defs else ""
    return f"{j} vs. {t}"

df['pihak'] = df.apply(fmt_pihak, axis=1)

# 6) Bangun final DataFrame
df_final = pd.DataFrame({
    'case_id':        range(1, len(df)+1),
    'no_perkara':     df['case_id'],
    'tanggal':        df['decision_date'],
    'ringkasan_fakta':df['ringkasan_fakta'],
    'pasal':          df['pasal'],
    'pihak':          df['pihak'],
    'text_full':      df['text_full'],
    'text_length':    df['text_length'],
})

# 7) Simpan CSV bersih
os.makedirs(PROCESSED_DIR, exist_ok=True)
df_final.to_csv(OUT_CSV, index=False, encoding='utf-8')
print(f"[iv] Dataset final dengan full_text diisi disimpan di {OUT_CSV}")


[iv] Dataset final dengan full_text diisi disimpan di ..\data\processed\cases_cleaned.csv


### Cell 5: Output

In [35]:
# Cell 5: Preview & Validasi (revisi)
from IPython.display import display

display(df_final.head())

# Cek kelengkapan
if (df_final['text_length'] == 0).any():
    print(f"⚠️ {(df_final['text_length']==0).sum()} kasus full_text masih kosong.")
else:
    print("✅ Semua full_text terisi.")

print("[v] Format akhir lengkap, silakan cek cases_cleaned.csv.")


Unnamed: 0,case_id,no_perkara,tanggal,ringkasan_fakta,pasal,pihak,text_full,text_length
0,1,2361/pid.b/2022/pn,2022-12-19,putusan pn surabaya\n2361/pid,,"diah ratri hapsari, sh, mh vs. hendri cavendis...",putusan pn surabaya\n2361/pid.b/2022/pn sby\np...,267
1,2,1379/pid.b/2022/pn,2022-09-08,1 (satu) buku pencatatan bulanan warna hijau p...,1.0,"bunari, sh\nanoek ekawati, sh., mh vs. i ketut...",putusan pn surabaya\n1379/pid.b/2022/pn sby\np...,1101
2,3,1696/pid.b/2022/pn,2022-12-06,putusan pn surabaya\n1696/pid,,didik k. w vs. h achmad sofi'ie bin sudarto alm,putusan pn surabaya\n1696/pid.b/2022/pn sby\np...,598
3,4,562/pid.b/2022/pn,2022-04-11,putusan pn surabaya\n562/pid,,"siska christina, sh vs. yuliardi kurniawan bin...",putusan pn surabaya\n562/pid.b/2022/pn sby\npu...,213
4,5,2582/pid.b/2021/pn,2022-01-20,1 (satu) buah bpkb sepeda motor honda beat war...,,"parlin manullang, sh vs. zainal arifin bin muh...",putusan pn surabaya\n2582/pid.b/2021/pn sby\np...,212


✅ Semua full_text terisi.
[v] Format akhir lengkap, silakan cek cases_cleaned.csv.
