# Data Cleaning

In [1]:
import pandas as pd

df = pd.read_csv('data/processed/cases.csv')

In [2]:
df.head()

Unnamed: 0,case_id,nomor_perkara,tahun_putusan,bulan_putusan,tanggal_putusan,jenis_perkara,tingkat_pemeriksaan,lembaga_peradilan,hakim_ketua,pasal,ringkasan_fakta,jumlah_kata_putusan,full_text
0,case_001,10 pk pdt sus pailit 2025,2024,October,23,Kepailitan,Peninjauan Kembali,Mahkamah Agung,dr ibrahim ll m,37 tahun 2004,1 menolak permohonan pemeriksaan peninjauan ke...,1735,dari 7 hal put nomor 10 pk pdt sus pailit 2025...
1,case_002,1178 k pdt sus pailit 2024,2024,June,3,Kepailitan,Kasasi,Mahkamah Agung,prof dr h hamdi um,37 tahun 2004,1 menolak permohonan kasasi dari pemohon kasas...,1795,dari 7 hal put nomor 1178 k pdt sus pailit 202...
2,case_003,11 pk pdt sus pailit 2025,2017,July,20,Kepailitan,Peninjauan Kembali,Mahkamah Agung,dr nurul elmiyah,37 tahun 2004,1 menolak permohonan pemeriksaan peninjauan ke...,3405,dari 13 hal put nomor 11 pk pdt sus pailit 202...
3,case_004,1227 k pdt sus pailit 2024,2024,June,10,Kepailitan,Peninjauan Kembali,Mahkamah Agung,dr pri pambudi teguh,37 tahun 2004,menolak permohonan kasasi dari pemohon kasasi ...,2334,dari 9 hal put nomor 1227 k pdt sus pailit 202...
4,case_005,123 k pdt sus pailit 2025,2024,October,30,Kepailitan,Kasasi,Mahkamah Agung,dr h panji widagdo,37 tahun 2004,1 menolak permohonan kasasi dari para pemohon ...,2519,dari 10 hal put nomor 123 k pdt sus pailit 202...


In [3]:
df.isnull().sum()   

case_id                0
nomor_perkara          2
tahun_putusan          0
bulan_putusan          0
tanggal_putusan        0
jenis_perkara          1
tingkat_pemeriksaan    0
lembaga_peradilan      0
hakim_ketua            1
pasal                  0
ringkasan_fakta        1
jumlah_kata_putusan    0
full_text              0
dtype: int64

In [4]:
df['nomor_perkara'].fillna('unknown', inplace=True)
df['jenis_perkara'].fillna(df['jenis_perkara'].mode()[0], inplace=True)
df['hakim_ketua'].fillna(df['hakim_ketua'].mode()[0], inplace=True)
df['ringkasan_fakta'].fillna(df['ringkasan_fakta'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['nomor_perkara'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['jenis_perkara'].fillna(df['jenis_perkara'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [5]:
df.isnull().sum()   

case_id                0
nomor_perkara          0
tahun_putusan          0
bulan_putusan          0
tanggal_putusan        0
jenis_perkara          0
tingkat_pemeriksaan    0
lembaga_peradilan      0
hakim_ketua            0
pasal                  0
ringkasan_fakta        0
jumlah_kata_putusan    0
full_text              0
dtype: int64

In [6]:
df.drop(columns=['nomor_perkara', 'tahun_putusan', 'bulan_putusan', 'tanggal_putusan', 'jenis_perkara', 'tingkat_pemeriksaan', 'lembaga_peradilan', 'hakim_ketua', 'pasal', 'jumlah_kata_putusan', 'full_text'], inplace=True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   case_id          60 non-null     object
 1   ringkasan_fakta  60 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


# Text Representation & Modeling

## Kolom ringkasan_fakta & case_id

### TF-IDF (SVM)

In [8]:
import pandas as pd
from typing import List
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
import json
import re

# === Basic Text Preprocessing ===
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# === Feature Extraction ===
def extract_features(query_vec, doc_vec, query_text, doc_text):
    query_vec = query_vec.toarray()[0]
    doc_vec = doc_vec.toarray()[0]
    cos_sim = cosine_similarity([query_vec], [doc_vec])[0][0]
    query_words = set(query_text.split())
    doc_words = set(doc_text.split())
    overlap = len(query_words.intersection(doc_words)) / max(len(query_words), 1)
    coverage = len(query_words.intersection(doc_words)) / max(len(query_words), 1)
    combined_vec = np.concatenate([query_vec, doc_vec, [cos_sim, overlap, coverage]])
    return combined_vec

# === Load Dataset ===
df = pd.read_csv("data/processed/cases.csv")
texts = df["ringkasan_fakta"].fillna("").apply(preprocess_text)
case_ids = df["case_id"].tolist()

# === TF-IDF Vectorization ===
id_stop_words = [
    "dan", "di", "dari", "ke", "pada", "dengan", "untuk", "yang", "ini", "itu",
    "adalah", "tersebut", "sebagai", "oleh", "atau", "tetapi", "karena", "jika",
    "dalam", "bagi", "tentang", "melalui", "serta", "maka", "lagi", "sudah",
    "belum", "hanya", "saja", "bahwa", "apa", "siapa", "bagaimana", "kapan",
    "dimana", "kenapa", "sejak", "hingga", "agar", "supaya", "meskipun", "walau",
    "kecuali", "terhadap", "antara", "selain", "setiap", "sebelum", "sesudah"
]
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 3),
    stop_words=id_stop_words
)
tfidf_matrix = vectorizer.fit_transform(texts)

# === Load Evaluation Queries ===
with open("data/eval/queries.json", "r", encoding="utf-8") as f:
    eval_queries = json.load(f)

# === Prepare Training Data for Pairwise SVM ===
X_train = []
y_train = []
for item in eval_queries:
    query = preprocess_text(item["query"])
    query_vec = vectorizer.transform([query])
    true_id = item["case_id"]
    true_idx = case_ids.index(true_id)
    
    true_vec = tfidf_matrix[true_idx]
    pos_features = extract_features(query_vec, true_vec, query, texts[true_idx])
    
    neg_indices = [i for i in range(len(case_ids)) if i != true_idx]
    neg_samples = np.random.choice(neg_indices, size=min(10, len(neg_indices)), replace=False)
    
    for neg_idx in neg_samples:
        neg_vec = tfidf_matrix[neg_idx]
        neg_features = extract_features(query_vec, neg_vec, query, texts[neg_idx])
        X_train.append(pos_features - neg_features)
        y_train.append(1)
        X_train.append(neg_features - pos_features)
        y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

print(f"Training data shape: {X_train.shape}")
print(f"Class distribution: {np.bincount(y_train)}")

# === Train SVM ===
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
svm = GridSearchCV(
    LinearSVC(max_iter=5000, class_weight='balanced'),
    param_grid,
    cv=3,
    scoring='accuracy'
)
svm.fit(X_train, y_train)
print(f"Best SVM Parameters: {svm.best_params_}")
print(f"Best CV Accuracy: {svm.best_score_:.2f}")

# === Retrieval Function ===
def retrieve(query: str, k: int = 5) -> List[tuple]:
    query = preprocess_text(query)
    query_vec = vectorizer.transform([query])
    
    scores = []
    for i in range(tfidf_matrix.shape[0]):
        doc_vec = tfidf_matrix[i]
        features = extract_features(query_vec, doc_vec, query, texts[i])
        score = svm.decision_function([features])[0]
        scores.append((case_ids[i], score))
    
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:k]

# === Evaluation ===
correct_5 = 0
correct_10 = 0
for item in eval_queries:
    query = item["query"]
    true_id = item["case_id"]
    results = retrieve(query, k=10)
    top_5 = results[:5]
    
    print(f"\nQuery: {query}")
    print(f"Top 5 Results (ID, Score): {top_5}")
    if any(true_id == result[0] for result in top_5):
        print("✅ FOUND in Top-5:", true_id)
        correct_5 += 1
    else:
        print("❌ NOT FOUND in Top-5:", true_id)
        true_idx = case_ids.index(true_id)
        true_text = texts[true_idx][:300] + "..." if len(texts[true_idx]) > 300 else texts[true_idx]
        print(f"True Case Text: {true_text}")
        query_words = set(preprocess_text(query).split())
        true_words = set(true_text.split())
        common_words = query_words.intersection(true_words)
        print(f"Common Words: {common_words}")
        query_vec = vectorizer.transform([query])
        true_vec = tfidf_matrix[true_idx]
        cos_sim = cosine_similarity(query_vec, true_vec)[0][0]
        print(f"Cosine Similarity with True Doc: {cos_sim:.4f}")
    
    if any(true_id == result[0] for result in results):
        correct_10 += 1

print(f"\nSVM Pairwise Accuracy@5: {correct_5}/{len(eval_queries)} = {correct_5 / len(eval_queries):.2f}")
print(f"SVM Pairwise Recall@10: {correct_10}/{len(eval_queries)} = {correct_10 / len(eval_queries):.2f}")

Training data shape: (200, 8003)
Class distribution: [100 100]




Best SVM Parameters: {'C': 10}
Best CV Accuracy: 0.76

Query: Permohonan peninjauan kembali dalam perkara kepailitan yang diajukan oleh debitur karena menganggap adanya kekeliruan nyata dalam putusan pengadilan niaga yang membatalkan kesepakatan perdamaian dan menyatakan debitur pailit berdasarkan ketentuan Pasal 285 ayat (2) huruf d UU No. 37 Tahun 2004
Top 5 Results (ID, Score): [('case_009', 2.5875060576070226), ('case_003', 2.5574895600781655), ('case_006', 2.545796143787113), ('case_045', 2.4219774958219644), ('case_013', 2.338976842317841)]
❌ NOT FOUND in Top-5: case_001
True Case Text: 1 menolak permohonan pemeriksaan peninjauan kembali dari pemohon peninjauan kembali dahulu termohon pkpu jeje supriatna tersebut 2 menghukum pemohon peninjauan kembali dahulu termohon pkpu untuk membayar biaya perkara dalam pemeriksaan peninjauan kembali ini yang ditetapkan sebesar rp10 000 000 00 s...
Common Words: {'dalam', '2', 'kembali', 'peninjauan', 'perkara', 'yang', 'permohonan'}
Cosine Si

### Indo-BERT

In [14]:
%pip install sentence-transformers
%pip install pandas numpy sklearn sentence-transformers torch tqdm
%pip install transformers torch
%pip install tf-keras
%pip install rank-bm25

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
import numpy as np
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import json
import re
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize

# Download tokenizer NLTK jika belum ada
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# === Load Model ===
model_name = "indobenchmark/indobert-base-p1"
model = SentenceTransformer(model_name)
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# === Load Dataset Putusan ===
df = pd.read_csv("data/processed/cases.csv")
texts = df["ringkasan_fakta"].fillna("").tolist()
case_ids = df["case_id"].tolist()

# === Preprocessing Text ===
def preprocess_text(text: str) -> str:
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'\b(?:putusan|nomor|tahun|pengadilan|hakim)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'uu\s+no', 'undang-undang nomor', text, flags=re.IGNORECASE)
    text = re.sub(r'pasal\s+\d+', 'pasal', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return ' '.join(tokens) if tokens else "empty"

texts = [preprocess_text(text) for text in texts]
tokenized_texts = [text.split() for text in texts]
bm25 = BM25Okapi(tokenized_texts)

# === Encode Dokumen ===
def encode_documents(texts: List[str], batch_size: int = 16) -> np.ndarray:
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding documents"):
        batch = texts[i:i + batch_size]
        try:
            batch_embeddings = model.encode(
                batch,
                convert_to_numpy=True,
                show_progress_bar=False,
                batch_size=batch_size,
                normalize_embeddings=True,
                max_length=512
            )
            embeddings.append(batch_embeddings)
        except Exception as e:
            print(f"Error on batch {i // batch_size}: {e}")
    if not embeddings:
        raise ValueError("No document embeddings generated.")
    return np.vstack(embeddings)

doc_embeddings = encode_documents(texts)

# === Encode Query ===
def encode_query(text: str) -> np.ndarray:
    text = preprocess_text(text)
    embedding = model.encode(
        [text],
        convert_to_numpy=True,
        show_progress_bar=False,
        normalize_embeddings=True,
        max_length=512
    )
    return embedding[0]

# === Retrieval Function ===
def retrieve(query: str, k: int = 10, alpha: float = 0.6) -> List[tuple]:
    query_clean = preprocess_text(query)
    query_vec = encode_query(query_clean)
    similarities = cosine_similarity([query_vec], doc_embeddings)[0]
    
    tokenized_query = query_clean.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_scores = bm25_scores / (np.max(bm25_scores) + 1e-10)

    combined_scores = alpha * similarities + (1 - alpha) * bm25_scores
    top_k_idx = combined_scores.argsort()[-k:][::-1]
    top_k_cases = [case_ids[i] for i in top_k_idx]
    pairs = [[query_clean, texts[i]] for i in top_k_idx]
    
    rerank_scores = cross_encoder.predict(pairs)
    reranked_idx = np.argsort(rerank_scores)[::-1][:5]
    return [(top_k_cases[i], float(rerank_scores[i])) for i in reranked_idx]

# === Load Query Evaluasi ===
with open("data/eval/queries.json", "r", encoding="utf-8") as f:
    eval_queries = json.load(f)

# === Proses Evaluasi ===
correct_5 = 0
correct_10 = 0
for item in eval_queries:
    query = item["query"]
    true_id = item["case_id"]
    results = retrieve(query, k=10)
    top_5 = results[:5]
    
    print(f"\nQuery: {query}")
    print(f"Top 5 Results (ID, Score): {top_5}")
    if any(true_id == result[0] for result in top_5):
        print("✅ FOUND in Top-5:", true_id)
        correct_5 += 1
    else:
        print("❌ NOT FOUND in Top-5:", true_id)
        true_idx = case_ids.index(true_id)
        true_text = texts[true_idx][:300] + "..." if len(texts[true_idx]) > 300 else texts[true_idx]
        print(f"True Case Text: {true_text}")
        query_words = set(preprocess_text(query).split())
        true_words = set(true_text.split())
        common_words = query_words.intersection(true_words)
        print(f"Common Words: {common_words}")
        query_vec = encode_query(query)
        true_vec = doc_embeddings[true_idx]
        cos_sim = cosine_similarity([query_vec], [true_vec])[0][0]
        print(f"Cosine Similarity with True Doc: {cos_sim:.4f}")
    
    if any(true_id == result[0] for result in results):
        correct_10 += 1

print(f"\nIndoBERT Pairwise Accuracy@5: {correct_5}/{len(eval_queries)} = {correct_5 / len(eval_queries):.2f}")
print(f"IndoBERT Pairwise Recall@10: {correct_10}/{len(eval_queries)} = {correct_10 / len(eval_queries):.2f}")




No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.
Encoding documents: 100%|██████████| 4/4 [00:17<00:00,  4.36s/it]



Query: Permohonan peninjauan kembali dalam perkara kepailitan yang diajukan oleh debitur karena menganggap adanya kekeliruan nyata dalam putusan pengadilan niaga yang membatalkan kesepakatan perdamaian dan menyatakan debitur pailit berdasarkan ketentuan Pasal 285 ayat (2) huruf d UU No. 37 Tahun 2004
Top 5 Results (ID, Score): [('case_045', 4.82124662399292), ('case_003', 4.748922348022461), ('case_060', 4.723039627075195), ('case_016', 4.633030414581299), ('case_001', 4.590864181518555)]
✅ FOUND in Top-5: case_001

Query: Upaya hukum kasasi oleh debitur dalam perkara kepailitan yang menolak putusan pengadilan niaga atas dalil ketidaksesuaian pencocokan piutang kreditur dengan nilai tagihan yang diajukan, sehingga tidak memenuhi unsur pailit sebagaimana diatur dalam UU Kepailitan
Top 5 Results (ID, Score): [('case_031', 3.0407636165618896), ('case_034', 2.670930862426758), ('case_017', 1.9597721099853516), ('case_016', 1.2848641872406006), ('case_025', 1.012903094291687)]
❌ NOT FOUND 

### Logistic Regression

In [11]:
import pandas as pd
from typing import List
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
import json
import re

# === Basic Text Preprocessing ===
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# === Feature Extraction ===
def extract_features(query_vec, doc_vec, query_text, doc_text):
    query_vec = query_vec.toarray()[0]
    doc_vec = doc_vec.toarray()[0]
    cos_sim = cosine_similarity([query_vec], [doc_vec])[0][0]
    query_words = set(query_text.split())
    doc_words = set(doc_text.split())
    overlap = len(query_words.intersection(doc_words)) / max(len(query_words), 1)
    coverage = len(query_words.intersection(doc_words)) / max(len(query_words), 1)
    combined_vec = np.concatenate([query_vec, doc_vec, [cos_sim, overlap, coverage]])
    return combined_vec

# === Load Dataset ===
df = pd.read_csv("data/processed/cases.csv")
texts = df["ringkasan_fakta"].fillna("").apply(preprocess_text)
case_ids = df["case_id"].tolist()

# === TF-IDF Vectorization ===
id_stop_words = [
    "dan", "di", "dari", "ke", "pada", "dengan", "untuk", "yang", "ini", "itu",
    "adalah", "tersebut", "sebagai", "oleh", "atau", "tetapi", "karena", "jika",
    "dalam", "bagi", "tentang", "melalui", "serta", "maka", "lagi", "sudah",
    "belum", "hanya", "saja", "bahwa", "apa", "siapa", "bagaimana", "kapan",
    "dimana", "kenapa", "sejak", "hingga", "agar", "supaya", "meskipun", "walau",
    "kecuali", "terhadap", "antara", "selain", "setiap", "sebelum", "sesudah"
]
vectorizer = TfidfVectorizer(
    max_features=4000,
    ngram_range=(1, 3),
    stop_words=id_stop_words
)
tfidf_matrix = vectorizer.fit_transform(texts)

# === Load Evaluation Queries ===
with open("data/eval/queries.json", "r", encoding="utf-8") as f:
    eval_queries = json.load(f)

# === Prepare Training Data ===
X_train = []
y_train = []
for item in eval_queries:
    query = preprocess_text(item["query"])
    query_vec = vectorizer.transform([query])
    true_id = item["case_id"]
    true_idx = case_ids.index(true_id)

    true_vec = tfidf_matrix[true_idx]
    pos_features = extract_features(query_vec, true_vec, query, texts[true_idx])

    neg_indices = [i for i in range(len(case_ids)) if i != true_idx]
    neg_samples = np.random.choice(neg_indices, size=min(10, len(neg_indices)), replace=False)

    for neg_idx in neg_samples:
        neg_vec = tfidf_matrix[neg_idx]
        neg_features = extract_features(query_vec, neg_vec, query, texts[neg_idx])
        
        diff_vec = pos_features - neg_features
        X_train.append(diff_vec)
        y_train.append(1)
        
        diff_vec_neg = neg_features - pos_features
        X_train.append(diff_vec_neg)
        y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

print(f"Training data shape: {X_train.shape}")
print(f"Class distribution: {np.bincount(y_train)}")

# === Train Logistic Regression with Tuning ===
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
logreg = GridSearchCV(
    LogisticRegression(max_iter=5000, class_weight='balanced'),
    param_grid,
    cv=3,
    scoring='accuracy'
)
logreg.fit(X_train, y_train)
print(f"Best Logistic Regression Parameters: {logreg.best_params_}")
print(f"Best CV Accuracy: {logreg.best_score_:.2f}")

# === Retrieval Function ===
def retrieve(query: str, k: int = 5) -> List[tuple]:
    query = preprocess_text(query)
    query_vec = vectorizer.transform([query])
    
    scores = []
    for i in range(tfidf_matrix.shape[0]):
        doc_vec = tfidf_matrix[i]
        features = extract_features(query_vec, doc_vec, query, texts[i])
        score = logreg.decision_function([features])[0]
        scores.append((case_ids[i], score))
    
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:k]

# === Evaluation ===
correct_5 = 0
correct_10 = 0
for item in eval_queries:
    query = item["query"]
    true_id = item["case_id"]
    results = retrieve(query, k=10)
    top_5 = results[:5]
    
    print(f"\nQuery: {query}")
    print(f"Top 5 Results (ID, Score): {top_5}")
    if any(true_id == result[0] for result in top_5):
        print("✅ FOUND in Top-5:", true_id)
        correct_5 += 1
    else:
        print("❌ NOT FOUND in Top-5:", true_id)
        true_idx = case_ids.index(true_id)
        true_text = texts[true_idx][:300] + "..." if len(texts[true_idx]) > 300 else texts[true_idx]
        print(f"True Case Text: {true_text}")
        query_words = set(preprocess_text(query).split())
        true_words = set(true_text.split())
        common_words = query_words.intersection(true_words)
        print(f"Common Words: {common_words}")
        query_vec = vectorizer.transform([query])
        true_vec = tfidf_matrix[true_idx]
        cos_sim = cosine_similarity(query_vec, true_vec)[0][0]
        print(f"Cosine Similarity with True Doc: {cos_sim:.4f}")
    
    if any(true_id == result[0] for result in results):
        correct_10 += 1

print(f"\nLogReg Pairwise Accuracy@5: {correct_5}/{len(eval_queries)} = {correct_5 / len(eval_queries):.2f}")
print(f"LogReg Pairwise Recall@10: {correct_10}/{len(eval_queries)} = {correct_10 / len(eval_queries):.2f}")


Training data shape: (200, 8003)
Class distribution: [100 100]
Best Logistic Regression Parameters: {'C': 1}
Best CV Accuracy: 0.60

Query: Permohonan peninjauan kembali dalam perkara kepailitan yang diajukan oleh debitur karena menganggap adanya kekeliruan nyata dalam putusan pengadilan niaga yang membatalkan kesepakatan perdamaian dan menyatakan debitur pailit berdasarkan ketentuan Pasal 285 ayat (2) huruf d UU No. 37 Tahun 2004
Top 5 Results (ID, Score): [('case_007', 2.6605188995203863), ('case_005', 2.355685891266474), ('case_006', 2.310528859224962), ('case_001', 2.151710884136517), ('case_010', 2.1175203462234284)]
✅ FOUND in Top-5: case_001

Query: Upaya hukum kasasi oleh debitur dalam perkara kepailitan yang menolak putusan pengadilan niaga atas dalil ketidaksesuaian pencocokan piutang kreditur dengan nilai tagihan yang diajukan, sehingga tidak memenuhi unsur pailit sebagaimana diatur dalam UU Kepailitan
Top 5 Results (ID, Score): [('case_007', 2.681701650163197), ('case_005',