### Cell 1: Ekstrak Solusi 

In [12]:
import os
import pandas as pd

# Input locations
PROCESSED_DIR = os.path.join("..", "data", "processed")
CSV_PATH      = os.path.join(PROCESSED_DIR, "cases_cleaned.csv")

# Read the cleaned case table
df = pd.read_csv(CSV_PATH)

# Solution extraction: the fact summary column (ringkasan_fakta) is reused
# verbatim as the solution text
df['solution_text'] = df['ringkasan_fakta']

# Lookup table case_id -> solution_text (a repeated case_id keeps its last row)
case_solutions = {
    case_id: solution
    for case_id, solution in zip(df['case_id'], df['solution_text'])
}

print(f"[i] Ekstraksi solusi: {len(case_solutions)} kasus di-mapping.")

[i] Ekstraksi solusi: 47 kasus di-mapping.


### Cell 2: Algoritma Prediksi

In [13]:
from collections import Counter

# Majority vote over the retrieved case ids
def predict_outcome_majority(topk_ids):
    """Return the case id that occurs most often in `topk_ids`.

    Ties are resolved in favour of the id that appears first in the input,
    so a list of distinct ids always yields its first element.

    Args:
        topk_ids: Sequence of hashable case ids, best match first.

    Returns:
        The most frequent case id.

    Raises:
        IndexError: If `topk_ids` is empty.
    """
    ranked = Counter(topk_ids).most_common(1)
    winner, _votes = ranked[0]
    return winner

# Similarity-weighted vote: every occurrence of an id contributes its score
def predict_outcome_weighted(topk_ids, topk_scores):
    """Return the case id with the largest summed score.

    Ids and scores are paired positionally; if the two sequences differ in
    length the extra items of the longer one are ignored. Ties are resolved
    in favour of the id that appears first in `topk_ids`.

    Args:
        topk_ids: Sequence of hashable case ids.
        topk_scores: Sequence of numeric scores aligned with `topk_ids`.

    Returns:
        The case id whose scores add up to the highest total.

    Raises:
        IndexError: If no (id, score) pair is available.
    """
    totals = {}
    for case_id, score in zip(topk_ids, topk_scores):
        totals[case_id] = totals.get(case_id, 0) + score
    best = Counter(totals).most_common(1)
    return best[0][0]


### Cell 3: Implementasi Fungsi

In [14]:
import os
import pickle
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer

# Location of the persisted vectorizer
VEC_DIR     = os.path.join("..", "data", "vectors")
VEC_PATH    = os.path.join(VEC_DIR, "tfidf_fulltext_vec.pkl")

# Restore the pickled TF-IDF vectorizer
with open(VEC_PATH, mode='rb') as f:
    vec = pickle.load(f)

# Vectorise every case text (missing text is treated as "") into a dense
# float32 matrix
texts = df['text_full'].fillna("").tolist()
X = vec.transform(texts).toarray().astype('float32')

# L2-normalise the rows in place, then load them into an inner-product index
faiss.normalize_L2(X)
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

# Faiss-backed retrieval
def retrieve_faiss(query: str, k: int = 5):
    """Return the cases most similar to `query`, best match first.

    The query is vectorised with the module-level TF-IDF vectorizer `vec`,
    L2-normalised and searched against the module-level Faiss `index`.

    Args:
        query: Free-text query.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A copy of the matching rows of `df` with a fresh 0..n-1 index and an
        added `score` column holding the similarity value reported by Faiss.
        Fewer than `k` rows come back when the index cannot supply `k`
        neighbours.
    """
    qv = vec.transform([query]).toarray().astype('float32')
    faiss.normalize_L2(qv)
    D, I = index.search(qv, k)
    # Faiss pads missing neighbours (e.g. k larger than the index) with
    # label -1. Passing -1 to df.iloc would silently select the LAST row,
    # so those placeholder slots are dropped together with their scores.
    found = I[0] >= 0
    res = df.iloc[I[0][found]].copy().reset_index(drop=True)
    res['score'] = D[0][found]
    return res

# End-to-end prediction: retrieve neighbours, vote, look up the solution text
def predict_outcome(query: str, k: int = 5, method: str = 'majority') -> str:
    """Predict the solution text for `query` from its nearest cases.

    Args:
        query: Free-text query passed to `retrieve_faiss`.
        k: Number of neighbours to retrieve.
        method: 'weighted' selects the score-weighted vote; any other value
            selects the majority vote.

    Returns:
        The `case_solutions` entry of the winning case id, or "" when that
        id has no entry.
    """
    neighbours = retrieve_faiss(query, k)
    candidate_ids = neighbours['case_id'].tolist()
    candidate_scores = neighbours['score'].tolist()
    winner = (
        predict_outcome_weighted(candidate_ids, candidate_scores)
        if method == 'weighted'
        else predict_outcome_majority(candidate_ids)
    )
    return case_solutions.get(winner, "")

# Example: run the same query through both voting strategies
q = "penipuan buku pencatatan genset"
for label, voting in (("Majority:", 'majority'), ("Weighted:", 'weighted')):
    print(label, predict_outcome(q, method=voting))


Majority: 1 (satu) buku pencatatan bulanan warna hijau piutang ppn 2017 (bukti po dari pt. srikandi jawara dunia atas pemesanan 10 (sepuluh) unit genset)
Weighted: 1 (satu) buku pencatatan bulanan warna hijau piutang ppn 2017 (bukti po dari pt. srikandi jawara dunia atas pemesanan 10 (sepuluh) unit genset)


### Cell 4: Demo Manual

In [15]:
# Demo on up to 5 sampled cases (fixed seed so the sample is repeatable).
# The sample size is capped at the table size because DataFrame.sample
# raises when asked for more rows than exist.
for _, row in df.sample(min(5, len(df)), random_state=0).iterrows():
    text = row['text_full']
    # The query is the text before the first '.', capped at 100 characters.
    # text_full may be missing (NaN is a float, so .split would raise
    # AttributeError); such rows produce an empty query instead.
    query = text.split('.')[0][:100] if isinstance(text, str) else ""
    if not query.strip():
        # An empty query carries no terms to match on, so skip the row.
        print(f"[!] Kasus {row['case_id']} dilewati: query kosong.\n")
        continue
    true_sol = case_solutions[row['case_id']]
    maj = predict_outcome(query, method='majority')
    wtd = predict_outcome(query, method='weighted')
    print(f"Query       : {query}")
    print(f"Ground-truth: {true_sol}")
    print(f"Majority    : {maj}")
    print(f"Weighted    : {wtd}\n")


Query       : putusan pn surabaya
6/pid
Ground-truth: 3 (tiga) lembar nota penjualan, 6 (enam) lembar sales order (so), 5 (lima) lembar copy surat jalan serta 1 (satu) lembar faktur penjualan pt damai sejahtera abadi (ufo elektronik) jl. kertajaya no. 149 surabaya, 5 (lima) tv led merk lg type 32lm550bpta,1 (satu) lembar surat jalan nomor : sjgbhq21110/ 00178, invoice : invktj2110/00469
Majority    : putusan pn surabaya
679/pid
Weighted    : putusan pn surabaya
679/pid

Query       : putusan pn surabaya
1470/pid
Ground-truth: 1 (satu) unit sepeda motor yamaha n-max 2dp tahun 2015 warna merah nopol. l-4460-zz noka. mh3sg3120fk023974 nosin. g3e4e0054520 stnk an. agus mulyadi alamat, griya benowo indah ii blok q-10 surabaya
Majority    : 1 (satu) unit sepeda motor yamaha n-max 2dp tahun 2015 warna merah nopol. l-4460-zz noka. mh3sg3120fk023974 nosin. g3e4e0054520 stnk an. agus mulyadi alamat, griya benowo indah ii blok q-10 surabaya
Weighted    : 1 (satu) unit sepeda motor yamaha n-max 2d

### Cell 5: Simpan Predictions

In [1]:
import os, json, pickle, faiss
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Paths
EVAL_DIR       = os.path.join("..","data","eval")
RESULTS_DIR    = os.path.join("..","data","results")
VEC_DIR        = os.path.join("..","data","vectors")
QUERIES_PATH   = os.path.join(EVAL_DIR, "queries.json")
VECTOR_PATH    = os.path.join(VEC_DIR, "tfidf_fulltext_vec.pkl")
PRED_CSV       = os.path.join(RESULTS_DIR, "predictions.csv")

# Load the evaluation queries
with open(QUERIES_PATH, 'r', encoding='utf-8') as f:
    queries = json.load(f)

# Load the pickled TF-IDF vectorizer. The context manager closes the file
# handle immediately instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open(VECTOR_PATH, 'rb') as f:
    vec = pickle.load(f)

# Load the cases and vectorise their full text (missing text becomes "")
# into a dense float32 matrix
df = pd.read_csv(os.path.join("..","data","processed","cases_cleaned.csv"))
X = vec.transform(df["text_full"].fillna("")).toarray().astype("float32")

# Build the inner-product index over the L2-normalised rows
index = faiss.IndexFlatIP(X.shape[1])
faiss.normalize_L2(X)
index.add(X)

def retrieve_faiss(q, k=5):
    """Return the case ids of the cases most similar to `q`, best match first.

    NOTE: this redefines the `retrieve_faiss` helper from the earlier cell,
    which returns a DataFrame with scores; this version returns only ids.

    Args:
        q: Free-text query.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of `case_id` values taken from `df`. It holds fewer than `k`
        items when the index cannot supply `k` neighbours.
    """
    v = vec.transform([q]).toarray().astype("float32")
    faiss.normalize_L2(v)
    _, I = index.search(v, k)
    # Faiss pads missing neighbours with label -1. Passing -1 to df.iloc
    # would silently select the LAST row, so placeholder slots are dropped.
    hits = I[0][I[0] >= 0]
    return df.iloc[hits]["case_id"].tolist()

# Build one prediction record per evaluation query and persist them as CSV
os.makedirs(RESULTS_DIR, exist_ok=True)

preds = []
for qid, q in enumerate(queries, start=1):
    query, gt = q["query"], q["ground_truth"]
    top5 = retrieve_faiss(query, k=5)
    record = dict(
        query_id=qid,
        ground_truth=gt,
        predicted_case=top5[0],   # best-ranked case id, taken as is
        top_5_case_ids=top5,
    )
    preds.append(record)

pred_table = pd.DataFrame(preds)
pred_table.to_csv(PRED_CSV, index=False, encoding="utf-8")
print("Predictions saved:", PRED_CSV)

Predictions saved: ..\data\results\predictions.csv
