### Cell 1: Instalasi & Imports

In [None]:
# Install the third-party packages this notebook imports (pandas, scikit-learn, faiss-cpu).
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
%pip install pandas scikit-learn faiss-cpu

In [14]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import pickle

# Directories and file paths (relative to the notebook's working directory)
PROCESSED_DIR   = os.path.join("..", "data", "processed")
VEC_DIR         = os.path.join("..", "data", "vectors")
EVAL_DIR        = os.path.join("..", "data", "eval")
CSV_PATH        = os.path.join(PROCESSED_DIR, "cases_cleaned.csv")
TFIDF_VEC_PATH  = os.path.join(VEC_DIR, "tfidf_fulltext_vec.pkl")
QUERIES_PATH    = os.path.join(EVAL_DIR, "queries.json")

# Load the cleaned cases and collect the full text of every row.
# Missing texts become "" in `texts` only; `df['text_full']` itself keeps its NaN values.
os.makedirs(EVAL_DIR, exist_ok=True)
df = pd.read_csv(CSV_PATH)
texts = df['text_full'].fillna("").tolist()

# Fit TF-IDF on the whole corpus, keeping at most 5000 vocabulary terms
vec = TfidfVectorizer(max_features=5000)
sparse_X = vec.fit_transform(texts)
# Persist the fitted vectorizer so it can be reloaded without refitting
os.makedirs(VEC_DIR, exist_ok=True)
with open(TFIDF_VEC_PATH, 'wb') as f:
    pickle.dump(vec, f)

# Densify to a float32 array for use with Faiss
X = sparse_X.toarray().astype('float32')
print(f"[i] TF-IDF vector ready: shape={X.shape}")

[i] TF-IDF vector ready: shape=(47, 2551)


### Cell 2: Splitting Data

In [16]:
from sklearn.model_selection import train_test_split

# Labels: the case_id of every row
y = df['case_id'].values

# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"[ii] Data split: {n_train} train, {n_test} test")

[ii] Data split: 37 train, 10 test


### Cell 3: Model Retrieval

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as consecutive integer codes.
# NOTE(review): the encoder is fitted on y_train only, so labels that occur
# solely in y_test are unknown to it — confirm this is intended.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)

# Choose the model: 'nb' or 'svm'
def train_model(method='nb'):
    """Fit a classifier on the notebook's training split.

    Reads the module-level ``X_train`` and ``y_train_enc`` defined in earlier
    cells.

    Parameters
    ----------
    method : str, default 'nb'
        ``'nb'`` for ``MultinomialNB`` or ``'svm'`` for a linear-kernel
        ``SVC`` with probability estimates enabled.

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'nb'`` nor ``'svm'``. Previously any
        unrecognised value (e.g. a typo) silently trained Naive Bayes.
    """
    if method == 'svm':
        clf = SVC(kernel='linear', probability=True, random_state=42)
    elif method == 'nb':
        clf = MultinomialNB()
    else:
        raise ValueError(f"Unknown method {method!r}; expected 'nb' or 'svm'")
    clf.fit(X_train, y_train_enc)
    return clf

# Train one classifier per supported method, Naive Bayes first and then SVM
clf_nb, clf_svm = (train_model(name) for name in ('nb', 'svm'))
print("[iii] Models trained: NB and SVM")

[iii] Models trained: NB and SVM


### Cell 4: Fungsi Retrieval

In [18]:
import faiss
from sklearn.metrics.pairwise import cosine_similarity

# Build a Faiss inner-product index; on L2-normalised vectors the inner
# product equals cosine similarity.
# Normalise a copy: faiss.normalize_L2 works in place, and X is shared with
# other cells (train/test split, retrieve_cosine), so it must not be mutated here.
X_unit = X.copy()
faiss.normalize_L2(X_unit)
index = faiss.IndexFlatIP(X_unit.shape[1])
index.add(X_unit)

# Retrieve the top-k cases with Faiss
def retrieve_faiss(query: str, k: int = 5):
    """Return the rows of ``df`` most similar to ``query`` according to the Faiss index.

    The query is vectorised with the fitted TF-IDF ``vec``, L2-normalised and
    searched against the module-level ``index``.

    Parameters
    ----------
    query : str
        Free-text query.
    k : int, default 5
        Maximum number of hits. Values larger than the number of indexed
        vectors are clamped.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching ``df`` rows (index reset), best hit first, with
        an added ``score`` column holding the inner-product score. Empty when
        ``k <= 0`` or the index holds no vectors.
    """
    # Never ask for more neighbours than exist: Faiss pads missing results
    # with label -1, and df.iloc[-1] would silently return the LAST row.
    k_eff = min(k, index.ntotal)
    if k_eff <= 0:
        empty = df.iloc[0:0].copy().reset_index(drop=True)
        empty['score'] = pd.Series(dtype='float32')
        return empty

    q_vec = vec.transform([query]).toarray().astype('float32')
    faiss.normalize_L2(q_vec)
    D, I = index.search(q_vec, k_eff)

    # Defensive: drop any padded (-1) labels before positional indexing.
    valid = I[0] >= 0
    res = df.iloc[I[0][valid]].copy().reset_index(drop=True)
    res['score'] = D[0][valid]
    return res

# Alternative top-k retrieval using scikit-learn's cosine_similarity

def retrieve_cosine(query: str, k: int = 5):
    """Return the ``k`` rows of ``df`` with the highest cosine similarity to ``query``.

    The query is vectorised with the fitted TF-IDF ``vec`` and compared against
    every row of the dense matrix ``X``. The result is a copy of the selected
    rows (index reset), best match first, with an added ``score`` column.
    """
    query_matrix = vec.transform([query]).toarray()
    scores = cosine_similarity(query_matrix, X)[0]

    # Ascending argsort reversed gives best-first order; keep the first k
    ranked = scores.argsort()[::-1]
    top = ranked[:k]

    hits = df.iloc[top].copy().reset_index(drop=True)
    return hits.assign(score=scores[top])

# Example usage: run the same query through both retrieval methods
query = "penipuan buku pencatatan genset"
res_faiss = retrieve_faiss(query, k=5)
res_cosine = retrieve_cosine(query, k=5)

# Show the same columns for both result sets so they can be compared side by side
shown_cols = ['case_id', 'no_perkara', 'tanggal', 'ringkasan_fakta', 'score']
for title, frame in (
    ("=== Faiss Retrieval ===", res_faiss),
    ("=== Cosine Retrieval ===", res_cosine),
):
    print(title)
    display(frame[shown_cols])

=== Faiss Retrieval ===


Unnamed: 0,case_id,no_perkara,tanggal,ringkasan_fakta,score
0,2,1379/pid.b/2022/pn,2022-09-08,1 (satu) buku pencatatan bulanan warna hijau p...,0.195149
1,7,2225/pid.b/2022/pn,2022-12-19,putusan pn surabaya\n2225/pid,0.04514
2,31,1891/pid.b/2022/pn,2022-10-31,putusan pn surabaya\n1891/pid,0.028963
3,47,881/pid.b/2022/pn,2022-06-23,putusan pn surabaya\n881/pid,0.027381
4,32,2678/pid.b/2021/pn,2022-02-14,putusan pn surabaya\n2678/pid,0.022293


=== Cosine Retrieval ===


Unnamed: 0,case_id,no_perkara,tanggal,ringkasan_fakta,score
0,2,1379/pid.b/2022/pn,2022-09-08,1 (satu) buku pencatatan bulanan warna hijau p...,0.195149
1,7,2225/pid.b/2022/pn,2022-12-19,putusan pn surabaya\n2225/pid,0.04514
2,31,1891/pid.b/2022/pn,2022-10-31,putusan pn surabaya\n1891/pid,0.028963
3,47,881/pid.b/2022/pn,2022-06-23,putusan pn surabaya\n881/pid,0.027381
4,32,2678/pid.b/2021/pn,2022-02-14,putusan pn surabaya\n2678/pid,0.022293


### Cell 5: Pengujian Awal

In [28]:
import os
import json

# Location of the JSON file that will hold the evaluation queries;
# the eval directory is created if it does not exist yet.
os.makedirs(EVAL_DIR, exist_ok=True)
QUERIES_PATH = os.path.join(EVAL_DIR, "queries.json")

# Prepare 5 test queries from the first sentence of each sampled case
def get_first_sentence(text, max_len=100):
    """Return the text before the first '.', truncated and stripped.

    Parameters
    ----------
    text : str or missing value
        Source text. Non-string input (``None``, or the float NaN that pandas
        uses for missing cells) yields ``""`` instead of raising.
    max_len : int, default 100
        Maximum number of characters kept before surrounding whitespace is
        stripped.

    Returns
    -------
    str
        The leading fragment of ``text``; may be empty.
    """
    # df['text_full'] can still contain NaN (only the `texts` list was filled),
    # and NaN has no .split().
    if not isinstance(text, str):
        return ""
    return text.split('.', 1)[0][:max_len].strip()

# One record per sampled case: the query text plus the case_id expected back
sampled_cases = df.sample(5, random_state=42)
records = [
    {
        'query': get_first_sentence(full_text),
        'ground_truth': int(case_id),
    }
    for full_text, case_id in zip(sampled_cases['text_full'], sampled_cases['case_id'])
]

# Save the records as JSON
with open(QUERIES_PATH, 'w', encoding='utf-8') as out_file:
    json.dump(records, out_file, ensure_ascii=False, indent=2)

print(f"[v] {len(records)} queries saved to {QUERIES_PATH}")

[v] 5 queries saved to ..\data\eval\queries.json
