# **BLOK 0 – Setup umum (install & import)**

In [19]:
# BLOK 0 – Setup umum

!pip install rank-bm25 sentence-transformers --quiet

import pandas as pd
import numpy as np
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util


OBAT_PATH = "final_clean_data_20112024_halodoc_based.csv"
GT_PATH   = "Ground Truth NLP - Sheet1.csv"

# --- Konfigurasi evaluasi ---
K_LIST = [5, 10]          # nilai k untuk top-k
RRF_K = 60                # konstanta RRF
N_CANDIDATES_HYBRID = 100 # banyak kandidat awal dari masing-masing model


# **BLOK 2 – Eksplorasi Data Obat Halodoc**

In [20]:
# BLOK 2 – Baca & eksplorasi data obat Halodoc

obat_df = pd.read_csv(OBAT_PATH)

print("Shape data obat:", obat_df.shape)
print("\n5 baris pertama:")
display(obat_df.head())

print("\nInfo kolom:")
display(obat_df.dtypes)

print("\nCek missing value per kolom:")
display(obat_df.isna().sum())

print("\nCek duplikasi id:")
print("Jumlah id duplikat:", obat_df['id'].duplicated().sum())

print("\nDistribusi tipe obat (top 10):")
print(obat_df['tipe'].value_counts().head(10))


Shape data obat: (3137, 7)

5 baris pertama:


Unnamed: 0,id,nomor_registrasi,nama,tipe,komposisi,indikasi_umum,indikasi
0,7,DKL1333528917A1,ericfil,tablet,sildenafil citrate,ericfil diindikasikan untuk mengobati disfungs...,disfungsi ereksi
1,154,GKL1433531717B1,sildenafil citrate,tablet,sildenafil citrate,terapi disfungsi ereksi pada pria dewasa,disfungsi ereksi
2,238,DKI1690401417B1,viagra,tablet,sildenafil citrate,terapi disfungsi ereksi pada pria dewasa,disfungsi ereksi
3,241,DKI1973401817D1,cialis,tablet,tadalafil,pengobatan ketidakmampuan mencapai atau memper...,hipertensi pulmonal| bph (benign prostatic hyp...
4,245,DKL1307919609A1,topgra,kaplet,sildenafil citrate,terapi disfungsi ereksi pada pria dewasa,disfungsi ereksi



Info kolom:


Unnamed: 0,0
id,int64
nomor_registrasi,object
nama,object
tipe,object
komposisi,object
indikasi_umum,object
indikasi,object



Cek missing value per kolom:


Unnamed: 0,0
id,0
nomor_registrasi,0
nama,0
tipe,69
komposisi,0
indikasi_umum,0
indikasi,71



Cek duplikasi id:
Jumlah id duplikat: 47

Distribusi tipe obat (top 10):
tipe
tablet      964
sirup       425
kaplet      390
kapsul      379
krim        201
tetes       153
suspensi    107
minyak       78
larutan      54
sachet       48
Name: count, dtype: int64


**Interpretasi**

Hasil eksplorasi menunjukkan bahwa dataset obat Halodoc terdiri dari **3.137 baris** dengan 7 kolom, dengan `id` bertipe numerik dan kolom lain bertipe teks sehingga `id` sesuai digunakan sebagai identifier. Terdapat **missing value** pada kolom `tipe` (69 baris) dan `indikasi` (71 baris), serta **47 nilai `id` duplikat** yang perlu ditangani pada tahap pra-pemrosesan. Distribusi bentuk sediaan didominasi oleh **tablet, sirup, kaplet, dan kapsul**, sedangkan bentuk lain seperti krim, tetes, dan suspensi muncul dalam jumlah yang relatif lebih sedikit.


In [3]:


# Buang duplikasi berdasarkan id (ambil kemunculan pertama)
obat_df = obat_df.drop_duplicates(subset="id", keep="first").reset_index(drop=True)

# Isi missing value kolom yang dipakai ke teks dengan string kosong
obat_df['tipe'] = obat_df['tipe'].fillna("")
obat_df['indikasi'] = obat_df['indikasi'].fillna("")


In [4]:
print("Shape setelah buang duplikat:", obat_df.shape)
print("Duplikat id:", obat_df['id'].duplicated().sum())


Shape setelah buang duplikat: (3090, 7)
Duplikat id: 0


# **BLOK 3 – Eksplorasi & Perapian Ground Truth**

In [5]:
# BLOK 3 – Baca, rapikan, dan ubah Ground Truth ke long format

gt_df = pd.read_csv(GT_PATH)

print("Shape GT asli:", gt_df.shape)
display(gt_df.head())

# Fungsi bantu untuk parse list id dari string
def parse_id_list(s):
    if pd.isna(s):
        return []
    parts = [p.strip() for p in str(s).split(",") if str(p).strip() != ""]
    ids = []
    for p in parts:
        try:
            ids.append(int(p))
        except:
            pass
    return ids

# Ubah ke long format: 1 baris = 1 query
rows = []
for _, row in gt_df.iterrows():
    # single-hop
    if isinstance(row.get('query'), str) and row['query'].strip() != "":
        rows.append({
            "query_text": row['query'].strip(),
            "query_type": (row['type'].strip() if isinstance(row.get('type'), str) else "single-hop"),
            "relevant_ids_raw": row['relavant_ids']  # kolom salah tulis
        })
    # multi-hop
    if isinstance(row.get('query.1'), str) and row['query.1'].strip() != "":
        rows.append({
            "query_text": row['query.1'].strip(),
            "query_type": (row['type.1'].strip() if isinstance(row.get('type.1'), str) else "multi-hop"),
            "relevant_ids_raw": row['relevant_ids']
        })

gt_long_df = pd.DataFrame(rows)
gt_long_df['relevant_ids'] = gt_long_df['relevant_ids_raw'].apply(parse_id_list)

print("Shape GT long:", gt_long_df.shape)
print("\nDistribusi jenis query:")
print(gt_long_df['query_type'].value_counts())

# Cek konsistensi id dengan data obat
corpus_ids = set(obat_df['id'].tolist())
gt_long_df['relevant_ids_in_corpus'] = gt_long_df['relevant_ids'].apply(
    lambda lst: [i for i in lst if i in corpus_ids]
)

print("\nJumlah query yang relevant_ids-nya jadi kosong (setelah dicek ke data obat):",
      (gt_long_df['relevant_ids_in_corpus'].str.len() == 0).sum())

print("\nContoh 5 baris GT long:")
display(gt_long_df.head())


Shape GT asli: (100, 3)


Unnamed: 0,query,type,relavant_ids
0,Obat apa yang biasanya digunakan untuk meredak...,single-hop,"634,637,639,644,654,658,666,682,759,780,805,81..."
1,Obat untuk mengobati luka memar atau lebam,single-hop,"1737, 5553, 5564, 5578, 6865, 6895, 6898, 7079..."
2,Obat untuk kerusakan kantung udara paru-paru a...,single-hop,"455, 456, 458, 459, 464, 464, 471, 473, 483, 4..."
3,Obat untuk infeksi jamur candida,single-hop,"5072, 5162, 5187, 5189, 5240, 5244, 5403, 5426..."
4,Obat untuk infeksi tulang atau osteomielitis,single-hop,"5628, 5659, 5714, 5754, 5856, 5858, 5904, 5907..."


Shape GT long: (100, 4)

Distribusi jenis query:
query_type
single-hop    50
multi-hop     50
Name: count, dtype: int64

Jumlah query yang relevant_ids-nya jadi kosong (setelah dicek ke data obat): 1

Contoh 5 baris GT long:


Unnamed: 0,query_text,query_type,relevant_ids_raw,relevant_ids,relevant_ids_in_corpus
0,Obat apa yang biasanya digunakan untuk meredak...,single-hop,"634,637,639,644,654,658,666,682,759,780,805,81...","[634, 637, 639, 644, 654, 658, 666, 682, 759, ...","[634, 637, 639, 644, 654, 658, 666, 682, 759, ..."
1,Obat untuk mengobati luka memar atau lebam,single-hop,"1737, 5553, 5564, 5578, 6865, 6895, 6898, 7079...","[1737, 5553, 5564, 5578, 6865, 6895, 6898, 707...","[1737, 5553, 5564, 5578, 6865, 6895, 6898, 707..."
2,Obat untuk kerusakan kantung udara paru-paru a...,single-hop,"455, 456, 458, 459, 464, 464, 471, 473, 483, 4...","[455, 456, 458, 459, 464, 464, 471, 473, 483, ...","[455, 456, 458, 459, 464, 464, 471, 473, 483, ..."
3,Obat untuk infeksi jamur candida,single-hop,"5072, 5162, 5187, 5189, 5240, 5244, 5403, 5426...","[5072, 5162, 5187, 5189, 5240, 5244, 5403, 542...","[5072, 5162, 5187, 5189, 5240, 5244, 5403, 542..."
4,Obat untuk infeksi tulang atau osteomielitis,single-hop,"5628, 5659, 5714, 5754, 5856, 5858, 5904, 5907...","[5628, 5659, 5714, 5754, 5856, 5858, 5904, 590...","[5628, 5659, 5714, 5754, 5856, 5858, 5904, 590..."


# **BLOK 4 – Menyusun Representasi Dokumen Obat**

In [6]:
# BLOK 4 – Susun teks dokumen (corpus obat) untuk indexing

text_cols = ['nama', 'tipe', 'komposisi', 'indikasi_umum', 'indikasi']

def build_document_text(row):
    parts = []
    for col in text_cols:
        if col in row and pd.notna(row[col]):
            parts.append(str(row[col]))
    return " ".join(parts)

corpus_df = obat_df.copy()
corpus_df['document_text'] = corpus_df.apply(build_document_text, axis=1)

print("Shape corpus:", corpus_df.shape)
display(corpus_df[['id', 'document_text']].head())

# Kalau mau versi fokus indikasi saja:
corpus_df['document_text_indikasi'] = (
    corpus_df[['indikasi_umum', 'indikasi']]
    .fillna("")
    .agg(" ".join, axis=1)
)


Shape corpus: (3090, 8)


Unnamed: 0,id,document_text
0,7,ericfil tablet sildenafil citrate ericfil diin...
1,154,sildenafil citrate tablet sildenafil citrate t...
2,238,viagra tablet sildenafil citrate terapi disfun...
3,241,cialis tablet tadalafil pengobatan ketidakmamp...
4,245,topgra kaplet sildenafil citrate terapi disfun...


# **BLOK 5 – EDA Singkat**

In [7]:
# BLOK 5 – EDA singkat corpus & GT

# Panjang dokumen (dalam token)
corpus_df['doc_len_tokens'] = corpus_df['document_text'].str.split().str.len()
print("Statistik panjang dokumen (token):")
display(corpus_df['doc_len_tokens'].describe())

# Kata yang paling sering muncul (sederhana, pakai split)
from collections import Counter

all_tokens = []
for doc in corpus_df['document_text'].astype(str).str.lower():
    all_tokens.extend(doc.split())

token_counts = Counter(all_tokens)
print("\n30 kata paling sering muncul:")
for word, cnt in token_counts.most_common(30):
    print(f"{word:20s} {cnt}")

# EDA Ground Truth: panjang query dan jumlah relevant_ids
gt_long_df['n_relevant'] = gt_long_df['relevant_ids'].str.len()
gt_long_df['query_len_tokens'] = gt_long_df['query_text'].str.split().str.len()

print("\nStatistik jumlah relevant_ids per query:")
display(gt_long_df['n_relevant'].describe())

print("\nStatistik panjang query (token):")
display(gt_long_df['query_len_tokens'].describe())


Statistik panjang dokumen (token):


Unnamed: 0,doc_len_tokens
count,3090.0
mean,25.800647
std,15.588148
min,4.0
25%,16.0
50%,23.0
75%,32.0
max,367.0



30 kata paling sering muncul:
,                    3293
infeksi              1889
dan                  1371
sakit                1220
nyeri                1118
saluran              1003
tablet               963
membantu             664
meredakan            592
yang                 576
kulit                504
pada                 503
akut                 468
gangguan             465
terapi               461
batuk                442
-                    440
penyakit             439
untuk                438
hydrochloride        422
gejala               414
sirup                413
kaplet               385
kapsul               378
pengobatan           355
tukak                343
demam                316
pasien               311
meringankan          309
diabetes             306

Statistik jumlah relevant_ids per query:


Unnamed: 0,n_relevant
count,100.0
mean,30.97
std,39.878212
min,0.0
25%,9.0
50%,22.5
75%,36.0
max,270.0



Statistik panjang query (token):


Unnamed: 0,query_len_tokens
count,100.0
mean,7.74
std,3.529443
min,3.0
25%,5.75
50%,7.0
75%,9.0
max,23.0


# **BLOK 6 – Fungsi Metrik Evaluasi (Precision, Recall, MRR, nDCG)**

In [8]:
# BLOK 6 – Definisi fungsi metrik evaluasi

def precision_at_k(retrieved_ids, relevant_ids, k):
    if k == 0:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    if len(retrieved_k) == 0:
        return 0.0
    rel_set = set(relevant_ids)
    hits = sum(1 for doc_id in retrieved_k if doc_id in rel_set)
    return hits / len(retrieved_k)

def recall_at_k(retrieved_ids, relevant_ids, k):
    rel_set = set(relevant_ids)
    if len(rel_set) == 0:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    hits = sum(1 for doc_id in retrieved_k if doc_id in rel_set)
    return hits / len(rel_set)

def reciprocal_rank(retrieved_ids, relevant_ids, k):
    rel_set = set(relevant_ids)
    for rank, doc_id in enumerate(retrieved_ids[:k], start=1):
        if doc_id in rel_set:
            return 1.0 / rank
    return 0.0

def dcg_at_k(retrieved_ids, relevant_ids, k):
    rel_set = set(relevant_ids)
    dcg = 0.0
    for i, doc_id in enumerate(retrieved_ids[:k], start=1):
        if doc_id in rel_set:
            dcg += 1.0 / math.log2(i + 1)
    return dcg

def idcg_at_k(relevant_ids, k):
    ideal_hits = min(len(relevant_ids), k)
    idcg = 0.0
    for i in range(1, ideal_hits + 1):
        idcg += 1.0 / math.log2(i + 1)
    return idcg

def ndcg_at_k(retrieved_ids, relevant_ids, k):
    idcg = idcg_at_k(relevant_ids, k)
    if idcg == 0:
        return 0.0
    return dcg_at_k(retrieved_ids, relevant_ids, k) / idcg


# **BLOK 7 – Model Leksikal 1: TF-IDF**

In [9]:
# BLOK 7 – Index & retrieval TF-IDF

# Bangun index TF-IDF
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_df['document_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

def tfidf_retrieve(query_text, top_k=10):
    """Kembalikan list (doc_id, score) urut dari skor tertinggi."""
    q_vec = tfidf_vectorizer.transform([query_text])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    top_idx = np.argsort(sims)[::-1][:top_k]
    ids = corpus_df['id'].iloc[top_idx].tolist()
    scores = sims[top_idx]
    return list(zip(ids, scores))

# Tes
sample_query = gt_long_df.iloc[0]['query_text']
print("Contoh query:", sample_query)
print("Top-5 TF-IDF:")
print(tfidf_retrieve(sample_query, top_k=5))


TF-IDF matrix shape: (3090, 31597)
Contoh query: Obat apa yang biasanya digunakan untuk meredakan sakit kepala ringan?
Top-5 TF-IDF:
[(2244, np.float64(0.3113355843969192)), (2086, np.float64(0.28051422700905504)), (2568, np.float64(0.2720200835992548)), (2691, np.float64(0.2700493076187848)), (2814, np.float64(0.2637547012747775))]


## Jaccard Similarity

In [10]:
# ============================================================
# BLOK X – Lexical Retrieval: Jaccard Similarity (Fixed)
# ============================================================

import re

# Ambil corpus text dari corpus_df (INI YANG BENAR)
corpus_texts = corpus_df['document_text'].tolist()

def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return set(text.split())

def jaccard_similarity(query, document):
    q_tokens = tokenize(query)
    d_tokens = tokenize(document)

    union = q_tokens.union(d_tokens)
    if len(union) == 0:
        return 0.0

    return len(q_tokens.intersection(d_tokens)) / len(union)

def jaccard_retrieve(query, top_k=10):
    scores = []
    for idx, doc in enumerate(corpus_texts):
        score = jaccard_similarity(query, doc)
        scores.append((idx, score))

    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    return scores[:top_k]

# Tes cepat
print("\nTop-5 Jaccard Similarity:")
print(jaccard_retrieve(sample_query, top_k=5))



Top-5 Jaccard Similarity:
[(760, 0.3125), (583, 0.2916666666666667), (856, 0.2916666666666667), (635, 0.2857142857142857), (839, 0.28)]


# **BLOK 8 – Model Leksikal 2: BM25**

In [11]:
# BLOK 8 – Index & retrieval BM25

# Tokenisasi sederhana (split spasi)
tokenized_corpus = [doc.split() for doc in corpus_df['document_text'].astype(str).tolist()]
bm25 = BM25Okapi(tokenized_corpus)

def bm25_retrieve(query_text, top_k=10):
    """Kembalikan list (doc_id, score) urut dari skor tertinggi."""
    tokenized_query = query_text.split()
    scores = bm25.get_scores(tokenized_query)
    top_idx = np.argsort(scores)[::-1][:top_k]
    ids = corpus_df['id'].iloc[top_idx].tolist()
    top_scores = np.array(scores)[top_idx]
    return list(zip(ids, top_scores))

# Tes cepat
print("\nTop-5 BM25:")
print(bm25_retrieve(sample_query, top_k=5))



Top-5 BM25:
[(2568, np.float64(15.36417860453346)), (2244, np.float64(13.077993185346392)), (2094, np.float64(12.733673548743639)), (2861, np.float64(11.934524549395631)), (2798, np.float64(11.782066555724864))]


## BM25L

In [12]:
# ===== BM25L (mengikuti format & corpus BM25 existing) =====
from rank_bm25 import BM25L
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Add this line to download the specific missing resource

# gunakan document_text sebagai corpus, sama seperti BM25
corpus_bm25l = corpus_df['document_text'].astype(str).tolist()

# tokenisasi dokumen
tokenized_docs_bm25l = [nltk.word_tokenize(doc.lower()) for doc in corpus_bm25l]

# inisiasi model BM25L
bm25l_model = BM25L(tokenized_docs_bm25l)


def bm25l_search(query, topk=5):
    query_tokens = nltk.word_tokenize(query.lower())
    scores = bm25l_model.get_scores(query_tokens)
    ranked_idx = scores.argsort()[::-1][:topk]

    # output tuple (index dokumen, skor)
    return [(int(idx), float(scores[idx])) for idx in ranked_idx]

print("Top-5 BM25L:")
print(bm25l_search("obat untuk pusing", topk=5))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Top-5 BM25L:
[(1101, 17.375960974431102), (1358, 15.906040339363336), (2670, 13.791337244481493), (881, 13.587731318586828), (2197, 13.531934303601961)]


# **BLOK 9 – Model Semantik**

In [13]:
# BLOK 9 – Index & retrieval semantik berbasis embedding

SEM_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
sem_model = SentenceTransformer(SEM_MODEL_NAME)

# Encode semua dokumen obat
doc_texts = corpus_df['document_text'].astype(str).tolist()
doc_embeddings = sem_model.encode(
    doc_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_tensor=True
)

import torch

def semantic_retrieve(query_text, top_k=10):
    """Kembalikan list (doc_id, score_cosine) urut dari skor tertinggi."""
    q_emb = sem_model.encode([query_text], convert_to_tensor=True)
    cos_scores = util.cos_sim(q_emb, doc_embeddings)[0]  # shape: (n_docs,)
    top_k = min(top_k, len(doc_texts))
    top_scores, top_idx = torch.topk(cos_scores, k=top_k)
    top_idx = top_idx.cpu().numpy()
    top_scores = top_scores.cpu().numpy()
    ids = corpus_df['id'].iloc[top_idx].tolist()
    return list(zip(ids, top_scores))

# Tes cepat
print("\nTop-5 Semantik:")
print(semantic_retrieve(sample_query, top_k=5))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/49 [00:00<?, ?it/s]


Top-5 Semantik:
[(2568, np.float32(0.87950903)), (2327, np.float32(0.825724)), (2613, np.float32(0.8067676)), (2209, np.float32(0.80654925)), (1758, np.float32(0.7919038))]


## IndoBERT

In [14]:
# ============================================================
# BLOK Y – Semantic Retrieval: IndoBERT
# ============================================================

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Ambil corpus text (KONSISTEN dengan BLOK 4)
corpus_texts = corpus_df['document_text'].tolist()

# Load IndoBERT sentence model
indobert_model = SentenceTransformer("indobenchmark/indobert-base-p1")

# Encode seluruh corpus (sekali saja)
doc_embeddings_indobert = indobert_model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_numpy=True
)




config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [15]:
def indobert_retrieve(query, top_k=10):
    query_emb = indobert_model.encode(
        [query],
        convert_to_numpy=True
    )

    scores = cosine_similarity(
        query_emb,
        doc_embeddings_indobert
    )[0]

    ranked = sorted(
        enumerate(scores),
        key=lambda x: x[1],
        reverse=True
    )

    return ranked[:top_k]
# Tes cepat
print("\nTop-5 IndoBERT Semantic Retrieval:")
print(indobert_retrieve(sample_query, top_k=5))



Top-5 IndoBERT Semantic Retrieval:
[(576, np.float32(0.6588209)), (760, np.float32(0.6498837)), (744, np.float32(0.6399039)), (579, np.float32(0.6382374)), (886, np.float32(0.6375767))]


## MiniLM

In [16]:
# ============================================================
# BLOK Z – Semantic Retrieval: MiniLM
# ============================================================

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Ambil corpus text (konsisten)
corpus_texts = corpus_df['document_text'].tolist()

# Load MiniLM model
minilm_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode seluruh corpus
doc_embeddings_minilm = minilm_model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_numpy=True
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [17]:
def minilm_retrieve(query, top_k=10):
    query_emb = minilm_model.encode(
        [query],
        convert_to_numpy=True
    )

    scores = cosine_similarity(
        query_emb,
        doc_embeddings_minilm
    )[0]

    ranked = sorted(
        enumerate(scores),
        key=lambda x: x[1],
        reverse=True
    )

    return ranked[:top_k]
# Tes cepat
print("\nTop-5 MiniLM Semantic Retrieval:")
print(minilm_retrieve(sample_query, top_k=5))



Top-5 MiniLM Semantic Retrieval:
[(760, np.float32(0.6949805)), (337, np.float32(0.68823266)), (42, np.float32(0.6706643)), (2568, np.float32(0.6693573)), (2569, np.float32(0.6609724))]


## MPNet

In [21]:
# ============================================================
# BLOK AA – Semantic Retrieval: MPNet
# ============================================================

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Ambil corpus text (konsisten)
corpus_texts = corpus_df['document_text'].tolist()

# Load MPNet model
mpnet_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode seluruh corpus
doc_embeddings_mpnet = mpnet_model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_numpy=True
)


Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [22]:
def mpnet_retrieve(query, top_k=10):
    query_emb = mpnet_model.encode(
        [query],
        convert_to_numpy=True
    )

    scores = cosine_similarity(
        query_emb,
        doc_embeddings_mpnet
    )[0]

    ranked = sorted(
        enumerate(scores),
        key=lambda x: x[1],
        reverse=True
    )

    return ranked[:top_k]
# Tes cepat
print("\nTop-5 MPNet Semantic Retrieval:")
print(mpnet_retrieve(sample_query, top_k=5))



Top-5 MPNet Semantic Retrieval:
[(760, np.float32(0.7271973)), (455, np.float32(0.72324467)), (2321, np.float32(0.69815147)), (132, np.float32(0.69610125)), (462, np.float32(0.6852256))]


# **BLOK 10 – Model Hybrid**

TF-IDF + MiniLM

In [28]:
# Mapping id -> index corpus (WAJIB untuk dense score)
id_to_idx = {
    doc_id: idx
    for idx, doc_id in enumerate(corpus_df['id'].tolist())
}


In [29]:
# ============================================================
# BLOK AD – Hybrid Retrieval: TF-IDF + MiniLM
# ============================================================

ALPHA = 0.6  # bobot MiniLM (semantic), TF-IDF = 1 - ALPHA

def hybrid_tfidf_minilm_retrieve(query_text, top_k=10):
    # ---------- TF-IDF ----------
    tfidf_results = tfidf_retrieve(
        query_text,
        top_k=len(corpus_df)
    )
    tfidf_dict = {doc_id: score for doc_id, score in tfidf_results}

    # Normalisasi TF-IDF (0–1)
    if len(tfidf_dict) > 0:
        max_tfidf = max(tfidf_dict.values())
    else:
        max_tfidf = 1.0

    # ---------- MiniLM ----------
    query_emb = minilm_model.encode(
        [query_text],
        convert_to_numpy=True
    )

    minilm_scores = cosine_similarity(
        query_emb,
        doc_embeddings_minilm
    )[0]

    # ---------- Hybrid Scoring ----------
    hybrid_scores = []

    for doc_id, tfidf_score in tfidf_dict.items():
        idx = id_to_idx[doc_id]
        dense_score = minilm_scores[idx]

        tfidf_norm = tfidf_score / max_tfidf if max_tfidf > 0 else 0.0

        final_score = (
            ALPHA * dense_score +
            (1 - ALPHA) * tfidf_norm
        )

        hybrid_scores.append((doc_id, final_score))

    # Urutkan skor akhir
    hybrid_scores = sorted(
        hybrid_scores,
        key=lambda x: x[1],
        reverse=True
    )

    return hybrid_scores[:top_k]

print("\nTop-5 Hybrid TF-IDF + MiniLM:")
print(hybrid_tfidf_minilm_retrieve(sample_query, top_k=5))



Top-5 Hybrid TF-IDF + MiniLM:
[(2568, np.float64(0.7664762576952513)), (2086, np.float64(0.7368112684299688)), (2244, np.float64(0.7226348459720612)), (2301, np.float64(0.6835432712759099)), (2805, np.float64(0.6634614453713139))]


## BM25 + IndoBERT

In [30]:
# ============================================================
# BLOK AC – Hybrid Retrieval: BM25 + IndoBERT
# ============================================================

ALPHA = 0.7  # bobot IndoBERT (bisa kamu tuning)

def hybrid_bm25_indobert_retrieve(query, top_k=10):
    # ---------- BM25 ----------
    bm25_results = bm25_retrieve(query, top_k=len(corpus_df))
    bm25_dict = {doc_id: score for doc_id, score in bm25_results}

    # Normalisasi BM25 (0–1)
    if len(bm25_dict) > 0:
        max_bm25 = max(bm25_dict.values())
    else:
        max_bm25 = 1.0

    # ---------- IndoBERT ----------
    query_emb = indobert_model.encode(
        [query],
        convert_to_numpy=True
    )

    indobert_scores = cosine_similarity(
        query_emb,
        doc_embeddings_indobert
    )[0]

    # ---------- Hybrid Scoring ----------
    hybrid_scores = []

    for doc_id, bm25_score in bm25_dict.items():
        idx = id_to_idx[doc_id]
        dense_score = indobert_scores[idx]

        bm25_norm = bm25_score / max_bm25 if max_bm25 > 0 else 0.0

        final_score = (
            ALPHA * dense_score +
            (1 - ALPHA) * bm25_norm
        )

        hybrid_scores.append((doc_id, final_score))

    # Urutkan
    hybrid_scores = sorted(
        hybrid_scores,
        key=lambda x: x[1],
        reverse=True
    )

    return hybrid_scores[:top_k]
# Tes cepat
print("\nTop-5 Hybrid BM25 + IndoBERT:")
print(hybrid_bm25_indobert_retrieve(sample_query, top_k=5))



Top-5 Hybrid BM25 + IndoBERT:
[(2568, np.float64(0.7549185633659363)), (2094, np.float64(0.6910356481157462)), (2244, np.float64(0.6625448349194731)), (2861, np.float64(0.6553758923796462)), (2798, np.float64(0.6501822804947262))]


## BM25 + MPNet

In [31]:
# ============================================================
# BLOK AF – Hybrid Retrieval: BM25 + MPNet
# ============================================================

ALPHA = 0.7  # bobot MPNet (disarankan > 0.5)

def hybrid_bm25_mpnet_retrieve(query, top_k=10):
    # ---------- BM25 ----------
    bm25_results = bm25_retrieve(query, top_k=len(corpus_df))
    bm25_dict = {doc_id: score for doc_id, score in bm25_results}

    max_bm25 = max(bm25_dict.values()) if bm25_dict else 1.0

    # ---------- MPNet ----------
    query_emb = mpnet_model.encode([query], convert_to_numpy=True)
    mpnet_scores = cosine_similarity(
        query_emb, doc_embeddings_mpnet
    )[0]

    # ---------- Hybrid scoring ----------
    hybrid_scores = []

    for doc_id, bm25_score in bm25_dict.items():
        idx = id_to_idx[doc_id]

        bm25_norm = bm25_score / max_bm25 if max_bm25 > 0 else 0.0
        dense_score = mpnet_scores[idx]

        final_score = (
            ALPHA * dense_score +
            (1 - ALPHA) * bm25_norm
        )

        hybrid_scores.append((doc_id, final_score))

    hybrid_scores.sort(key=lambda x: x[1], reverse=True)
    return hybrid_scores[:top_k]
# Tes cepat
print("\nTop-5 Hybrid BM25 + MPNet:")
print(hybrid_bm25_mpnet_retrieve(sample_query, top_k=5))



Top-5 Hybrid BM25 + MPNet:
[(2568, np.float64(0.8090380907058716)), (7170, np.float64(0.6263272805981733)), (2861, np.float64(0.5935966615465926)), (7179, np.float64(0.5783770588085042)), (7174, np.float64(0.5654069825001223))]


## IndoBERT + TF-IDF

In [26]:
# ============================================================
# BLOK AH – Hybrid Retrieval: IndoBERT + TF-IDF
# ============================================================

ALPHA = 0.6  # bobot IndoBERT (bisa kamu tuning)

def hybrid_indobert_tfidf_retrieve(query, top_k=10):
    # ---------- TF-IDF ----------
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_scores = cosine_similarity(
        query_tfidf, tfidf_matrix
    )[0]

    max_tfidf = tfidf_scores.max() if tfidf_scores.max() != 0 else 1.0
    tfidf_scores_norm = tfidf_scores / max_tfidf

    # ---------- IndoBERT ----------
    query_emb = indobert_model.encode(
        [query], convert_to_numpy=True
    )

    indobert_scores = cosine_similarity(
        query_emb, doc_embeddings_indobert
    )[0]

    max_indobert = indobert_scores.max() if indobert_scores.max() != 0 else 1.0
    indobert_scores_norm = indobert_scores / max_indobert

    # ---------- Hybrid ----------
    final_scores = (
        ALPHA * indobert_scores_norm +
        (1 - ALPHA) * tfidf_scores_norm
    )

    # ---------- Ranking ----------
    ranked_idx = np.argsort(final_scores)[::-1][:top_k]

    results = [
        (corpus_df.iloc[idx]['id'], final_scores[idx])
        for idx in ranked_idx
    ]

    return results
# Tes cepat
print("\nTop-5 Hybrid IndoBERT + TF-IDF:")
print(hybrid_indobert_tfidf_retrieve(sample_query, top_k=5))



Top-5 Hybrid IndoBERT + TF-IDF:
[(np.int64(2568), np.float64(0.9413486560717161)), (np.int64(2244), np.float64(0.9297577977180481)), (np.int64(2086), np.float64(0.917842406134818)), (np.int64(2058), np.float64(0.9088671969756199)), (np.int64(2094), np.float64(0.8854508985727182))]


# **BLOK 11 – Evaluasi Kuantitatif 4 Model (TF-IDF, BM25, Semantik, Hybrid)**

In [32]:
import numpy as np
import pandas as pd
from collections import OrderedDict

K_LIST = [5, 10, 15, 50]

gt_eval_df = gt_long_df[gt_long_df['relevant_ids_in_corpus'].str.len() > 0].copy()

print("Jumlah query untuk evaluasi:", gt_eval_df.shape[0])
print(gt_eval_df['query_type'].value_counts())

_corpus_id_values = set(corpus_df['id'].tolist())
_corpus_len = len(corpus_df)

def _normalize_docid(raw_id):
    try:
        if isinstance(raw_id, (np.integer, int)):
            if raw_id in _corpus_id_values:
                return raw_id
            if 0 <= int(raw_id) < _corpus_len:
                return corpus_df['id'].iloc[int(raw_id)]
            return raw_id
        return raw_id
    except Exception:
        return raw_id


def _normalize_results(results):
    normalized = []
    for doc_id, score in results:
        nid = _normalize_docid(doc_id)
        normalized.append((nid, float(score)))
    return normalized


def wrap_retriever_to_ids(func):
    def wrapped(query_text, top_k):
        try:
            raw = func(query_text, top_k=top_k)
        except TypeError:
            raw = func(query_text, topk=top_k)

        norm = _normalize_results(raw)
        return [doc_id for doc_id, _ in norm]
    return wrapped

# -------- LEXICAL --------
tfidf_ids   = wrap_retriever_to_ids(tfidf_retrieve)
bm25_ids    = wrap_retriever_to_ids(bm25_retrieve)

try:
    bm25l_ids = wrap_retriever_to_ids(bm25l_retrieve)
except NameError:
    bm25l_ids = wrap_retriever_to_ids(bm25l_search)

jaccard_ids = wrap_retriever_to_ids(jaccard_retrieve)


# -------- SEMANTIC --------
indobert_ids = wrap_retriever_to_ids(indobert_retrieve)
minilm_ids   = wrap_retriever_to_ids(minilm_retrieve)
mpnet_ids    = wrap_retriever_to_ids(mpnet_retrieve)


# -------- HYBRID --------
hybrid_bm25_indobert_ids = wrap_retriever_to_ids(
    hybrid_bm25_indobert_retrieve
)

hybrid_bm25_mpnet_ids = wrap_retriever_to_ids(
    hybrid_bm25_mpnet_retrieve
)

hybrid_indobert_tfidf_ids = wrap_retriever_to_ids(
    hybrid_indobert_tfidf_retrieve
)

# -------- HYBRID (TAMBAHAN) --------
hybrid_tfidf_minilm_ids = wrap_retriever_to_ids(
    hybrid_tfidf_minilm_retrieve
)


models = OrderedDict([

    # -------- LEXICAL --------
    ("TF-IDF", tfidf_ids),
    ("BM25", bm25_ids),
    ("BM25L", bm25l_ids),
    ("Jaccard", jaccard_ids),

    # -------- SEMANTIC --------
    ("IndoBERT", indobert_ids),
    ("MiniLM", minilm_ids),
    ("MPNet", mpnet_ids),

    # -------- HYBRID --------
    ("Hybrid_BM25+IndoBERT", hybrid_bm25_indobert_ids),
    ("Hybrid_BM25+MPNet", hybrid_bm25_mpnet_ids),
    ("Hybrid_IndoBERT+TFIDF", hybrid_indobert_tfidf_ids),
    ("Hybrid_TFIDF+MiniLM", hybrid_tfidf_minilm_ids),  # ✅ TAMBAHAN
])

print("\nModels to be evaluated:")
for m in models:
    print(" -", m)


def evaluate_retriever(gt_df, retriever, model_name, k_list=K_LIST):
    rows = []

    for _, row in gt_df.iterrows():
        relevant_ids = row['relevant_ids_in_corpus']
        if len(relevant_ids) == 0:
            continue

        query_text = row['query_text']
        query_type = row['query_type']

        max_k = max(k_list)
        retrieved_ids = retriever(query_text, top_k=max_k)

        metric_row = {
            "model": model_name,
            "query_text": query_text,
            "query_type": query_type
        }

        for k in k_list:
            metric_row[f"P@{k}"]    = precision_at_k(retrieved_ids, relevant_ids, k)
            metric_row[f"R@{k}"]    = recall_at_k(retrieved_ids, relevant_ids, k)
            metric_row[f"MRR@{k}"]  = reciprocal_rank(retrieved_ids, relevant_ids, k)
            metric_row[f"nDCG@{k}"] = ndcg_at_k(retrieved_ids, relevant_ids, k)

        rows.append(metric_row)

    result_df = pd.DataFrame(rows)

    summary_overall = (
        result_df.groupby("model")
        .mean(numeric_only=True)
        .reset_index()
    )

    summary_by_type = (
        result_df.groupby(["model", "query_type"])
        .mean(numeric_only=True)
        .reset_index()
    )

    return result_df, summary_overall, summary_by_type

all_results = {}
summary_overall_list = []
summary_by_type_list = []

for model_name, retriever in models.items():
    print(f"\n=== Evaluating: {model_name} ===")

    res_df, sum_overall, sum_by_type = evaluate_retriever(
        gt_eval_df,
        retriever,
        model_name,
        k_list=K_LIST
    )

    all_results[model_name] = res_df
    summary_overall_list.append(sum_overall)
    summary_by_type_list.append(sum_by_type)

summary_overall_df = pd.concat(summary_overall_list, ignore_index=True)
summary_by_type_df = pd.concat(summary_by_type_list, ignore_index=True)


metric_cols = []
for k in K_LIST:
    metric_cols += [f"P@{k}", f"R@{k}", f"MRR@{k}", f"nDCG@{k}"]

summary_overall_df = summary_overall_df[
    ["model"] + metric_cols
]

summary_by_type_df = summary_by_type_df[
    ["model", "query_type"] + metric_cols
]

print("\n>> Overall summary:")
display(summary_overall_df)

print("\n>> Summary by query type:")
display(summary_by_type_df)

summary_overall_df.to_csv(
    "evaluation_summary_overall_all_models.csv",
    index=False
)

summary_by_type_df.to_csv(
    "evaluation_summary_by_type_all_models.csv",
    index=False
)

print("\n✅ Evaluasi selesai & CSV tersimpan!")



Jumlah query untuk evaluasi: 99
query_type
single-hop    50
multi-hop     49
Name: count, dtype: int64

Models to be evaluated:
 - TF-IDF
 - BM25
 - BM25L
 - Jaccard
 - IndoBERT
 - MiniLM
 - MPNet
 - Hybrid_BM25+IndoBERT
 - Hybrid_BM25+MPNet
 - Hybrid_IndoBERT+TFIDF
 - Hybrid_TFIDF+MiniLM

=== Evaluating: TF-IDF ===

=== Evaluating: BM25 ===

=== Evaluating: BM25L ===

=== Evaluating: Jaccard ===

=== Evaluating: IndoBERT ===

=== Evaluating: MiniLM ===

=== Evaluating: MPNet ===

=== Evaluating: Hybrid_BM25+IndoBERT ===

=== Evaluating: Hybrid_BM25+MPNet ===

=== Evaluating: Hybrid_IndoBERT+TFIDF ===

=== Evaluating: Hybrid_TFIDF+MiniLM ===

>> Overall summary:


Unnamed: 0,model,P@5,R@5,MRR@5,nDCG@5,P@10,R@10,MRR@10,nDCG@10,P@15,R@15,MRR@15,nDCG@15,P@50,R@50,MRR@50,nDCG@50
0,TF-IDF,0.444444,0.172279,0.572054,0.463603,0.417172,0.294992,0.584155,0.471892,0.382492,0.367661,0.587169,0.470989,0.246061,0.642479,0.590109,0.533732
1,BM25,0.373737,0.157431,0.561953,0.413052,0.345455,0.260552,0.574547,0.418811,0.317845,0.327589,0.575269,0.422835,0.195758,0.540731,0.57916,0.464814
2,BM25L,0.2,0.090806,0.305051,0.203366,0.184848,0.140369,0.319665,0.210617,0.16835,0.171279,0.32397,0.21086,0.131515,0.327269,0.329134,0.26499
3,Jaccard,0.151515,0.061167,0.282155,0.166179,0.128283,0.090499,0.292043,0.159814,0.119192,0.134825,0.298484,0.16668,0.095758,0.269247,0.307652,0.211471
4,IndoBERT,0.153535,0.062166,0.333838,0.179958,0.123232,0.083693,0.343246,0.1636,0.113805,0.108448,0.345706,0.163447,0.079596,0.198154,0.352167,0.178587
5,MiniLM,0.193939,0.086339,0.348316,0.213013,0.163636,0.118445,0.353127,0.201612,0.150168,0.1513,0.354887,0.201734,0.08202,0.239313,0.359529,0.211611
6,MPNet,0.157576,0.053345,0.267172,0.162216,0.156566,0.098467,0.27678,0.171711,0.139394,0.126185,0.281477,0.168413,0.077576,0.204889,0.284384,0.175605
7,Hybrid_BM25+IndoBERT,0.381818,0.159263,0.565825,0.419233,0.320202,0.229993,0.576387,0.397896,0.290236,0.302185,0.583762,0.39874,0.192323,0.518385,0.587199,0.446273
8,Hybrid_BM25+MPNet,0.369697,0.137257,0.578956,0.409506,0.331313,0.224004,0.593947,0.400625,0.298316,0.287067,0.599378,0.396384,0.180404,0.512684,0.601777,0.437156
9,Hybrid_IndoBERT+TFIDF,0.438384,0.170488,0.633502,0.481821,0.376768,0.266344,0.64315,0.463087,0.338721,0.332136,0.643992,0.455981,0.216566,0.568086,0.648513,0.498204



>> Summary by query type:


Unnamed: 0,model,query_type,P@5,R@5,MRR@5,nDCG@5,P@10,R@10,MRR@10,nDCG@10,P@15,R@15,MRR@15,nDCG@15,P@50,R@50,MRR@50,nDCG@50
0,TF-IDF,multi-hop,0.302041,0.193508,0.447619,0.328588,0.283673,0.338487,0.458244,0.362249,0.240816,0.394207,0.462875,0.36951,0.144898,0.675093,0.466845,0.461619
1,TF-IDF,single-hop,0.584,0.151475,0.694,0.595919,0.548,0.252366,0.707548,0.579342,0.521333,0.341646,0.708976,0.570439,0.3452,0.610518,0.710907,0.604402
2,BM25,multi-hop,0.281633,0.210815,0.505442,0.3385,0.27551,0.346756,0.522668,0.384113,0.25034,0.427794,0.524125,0.41023,0.137551,0.66553,0.528333,0.486645
3,BM25,single-hop,0.464,0.105114,0.617333,0.486113,0.414,0.176071,0.625389,0.452815,0.384,0.229388,0.625389,0.435188,0.2528,0.418428,0.62897,0.44342
4,BM25L,multi-hop,0.183673,0.1188,0.294558,0.192328,0.155102,0.180065,0.304956,0.200599,0.133333,0.212306,0.309475,0.203998,0.086939,0.356812,0.31484,0.257852
5,BM25L,single-hop,0.216,0.063372,0.315333,0.214183,0.214,0.101467,0.334079,0.220434,0.202667,0.131073,0.338175,0.217584,0.1752,0.298317,0.343143,0.271985
6,Jaccard,multi-hop,0.142857,0.090661,0.314626,0.176372,0.114286,0.122083,0.32602,0.171357,0.107483,0.193146,0.332735,0.190955,0.071429,0.354351,0.342665,0.243534
7,Jaccard,single-hop,0.16,0.032262,0.250333,0.15619,0.142,0.059546,0.258746,0.148503,0.130667,0.07767,0.264918,0.142891,0.1196,0.185844,0.273339,0.18005
8,IndoBERT,multi-hop,0.097959,0.081501,0.242177,0.134289,0.069388,0.097886,0.247643,0.126801,0.059864,0.12586,0.250914,0.134063,0.040408,0.207957,0.260589,0.161643
9,IndoBERT,single-hop,0.208,0.043218,0.423667,0.224712,0.176,0.069785,0.436937,0.199662,0.166667,0.091385,0.438603,0.192244,0.118,0.188547,0.441914,0.195193



✅ Evaluasi selesai & CSV tersimpan!


Berdasarkan hasil evaluasi terhadap berbagai model retrieval yang mencakup pendekatan lexical, semantic, dan hybrid, dapat disimpulkan bahwa model Hybrid IndoBERT + TF-IDF menunjukkan kinerja terbaik secara keseluruhan. Model ini secara konsisten memperoleh nilai tertinggi pada metrik Precision@K, nDCG@K, serta performa yang sangat kompetitif pada Recall@K dan MRR@K di berbagai nilai K (5, 10, 15, dan 50). Hal ini menunjukkan bahwa penggabungan representasi semantic dari IndoBERT dengan kekuatan pencocokan kata secara eksplisit dari TF-IDF mampu menghasilkan hasil retrieval yang lebih presisi dan peringkat dokumen yang lebih optimal dibandingkan model tunggal.

Model-model lexical seperti TF-IDF dan BM25 menunjukkan performa yang kuat terutama pada nilai Recall@K yang besar, menandakan kemampuannya dalam menjangkau lebih banyak dokumen relevan. Namun, pendekatan ini masih memiliki keterbatasan dalam memahami konteks dan makna semantik dari query. Sebaliknya, model semantic murni seperti IndoBERT, MiniLM, dan MPNet belum mampu melampaui performa model lexical, yang mengindikasikan bahwa pemahaman semantik saja belum cukup efektif tanpa dukungan pencocokan leksikal pada domain data yang digunakan.

Pendekatan hybrid secara umum memberikan peningkatan performa yang signifikan dibandingkan model tunggal. Model Hybrid BM25 + MPNet menunjukkan keunggulan pada metrik MRR, yang menandakan kemampuannya dalam menempatkan dokumen relevan pada peringkat teratas. Namun demikian, Hybrid IndoBERT + TF-IDF tetap menjadi pendekatan paling seimbang, karena mampu mempertahankan kualitas ranking secara global sekaligus meningkatkan presisi hasil teratas. Oleh karena itu, model ini direkomendasikan sebagai proposed method untuk sistem retrieval yang membutuhkan akurasi tinggi dan stabilitas performa pada berbagai skenario query.

# **BLOK 12 – Analisis Kualitatif & Error Analysis**

In [33]:
# BLOK 12 – Helper analisis kualitatif

import pandas as pd

def inspect_query(query_text, top_k=10):
    print("=== QUERY ===")
    print(query_text)
    print("\nGround truth IDs (di dalam corpus):")

    gt_row = gt_eval_df[gt_eval_df['query_text'] == query_text]
    if gt_row.empty:
        print("Tidak ditemukan di gt_eval_df.")
        return

    rel_ids = gt_row.iloc[0]['relevant_ids_in_corpus']
    rel_set = set(rel_ids)
    print(f"Total GT relevan: {len(rel_ids)}")
    # Jika daftar terlalu panjang, bisa dipersingkat
    print(rel_ids[:20], "...")  # tampilkan 20 pertama saja

    for name, retriever in models.items():
        print(f"\n--- {name} (top-{top_k}) ---")
        retrieved_ids = retriever(query_text, top_k=top_k)

        rows = []
        for rank, doc_id in enumerate(retrieved_ids, start=1):
            relevan = doc_id in rel_set
            nama_obat = id_to_nama.get(doc_id, "(nama tidak ditemukan)")
            rows.append({
                "Rank": rank,
                "DocID": doc_id,
                "Relevan_GT": "Ya" if relevan else "Tidak",
                "Nama_Obat": nama_obat
            })

        df_view = pd.DataFrame(rows)
        # Hitung jumlah relevan di top-k
        hits = (df_view["Relevan_GT"] == "Ya").sum()
        print(f"Relevan dalam top-{top_k}: {hits} dari {len(rel_ids)} dokumen GT")
        display(df_view)


In [34]:
id_to_nama = corpus_df.set_index('id')['nama'].to_dict()
example_query = "Obat apa yang biasanya digunakan untuk meredakan sakit kepala ringan?"
inspect_query(example_query, top_k=10)

=== QUERY ===
Obat apa yang biasanya digunakan untuk meredakan sakit kepala ringan?

Ground truth IDs (di dalam corpus):
Total GT relevan: 267
[634, 637, 639, 644, 654, 658, 666, 682, 759, 780, 805, 812, 854, 858, 860, 862, 893, 908, 930, 948] ...

--- TF-IDF (top-10) ---
Relevan dalam top-10: 10 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2244,Ya,bodrex extra
1,2,2086,Ya,panadol
2,3,2568,Ya,migranal
3,4,2691,Ya,rodemol
4,5,2814,Ya,tempra forte rasa anggur
5,6,2266,Ya,emturnas forte
6,7,2863,Ya,tempra forte rasa strawberry
7,8,2809,Ya,tempra forte rasa orange
8,9,2798,Ya,tempra rasa anggur
9,10,2861,Ya,tempra forte bubblegum



--- BM25 (top-10) ---
Relevan dalam top-10: 10 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2568,Ya,migranal
1,2,2244,Ya,bodrex extra
2,3,2094,Ya,panadol extra
3,4,2861,Ya,tempra forte bubblegum
4,5,2798,Ya,tempra rasa anggur
5,6,2691,Ya,rodemol
6,7,2814,Ya,tempra forte rasa anggur
7,8,2266,Ya,emturnas forte
8,9,2863,Ya,tempra forte rasa strawberry
9,10,2809,Ya,tempra forte rasa orange



--- BM25L (top-10) ---
Relevan dalam top-10: 6 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,6397,Tidak,sitro
1,2,2589,Tidak,natures health fuco
2,3,805,Ya,demacolin
3,4,583,Tidak,velutine plus
4,5,842,Tidak,bisolvon kids
5,6,2863,Ya,tempra forte rasa strawberry
6,7,2809,Ya,tempra forte rasa orange
7,8,644,Ya,balsem telon tresno joyo
8,9,2798,Ya,tempra rasa anggur
9,10,2861,Ya,tempra forte bubblegum



--- Jaccard (top-10) ---
Relevan dalam top-10: 6 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,760,Tidak,tremenza
1,2,583,Tidak,velutine plus
2,3,2861,Ya,tempra forte bubblegum
3,4,2244,Ya,bodrex extra
4,5,2798,Ya,tempra rasa anggur
5,6,585,Tidak,seretide diskus
6,7,694,Tidak,konicare minyak kayu putih hot
7,8,2748,Ya,trifamol
8,9,952,Ya,dextral
9,10,2828,Ya,panadol anakanak



--- IndoBERT (top-10) ---
Relevan dalam top-10: 5 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2058,Ya,sumagesic
1,2,760,Tidak,tremenza
2,3,744,Tidak,breathy nasal spray
3,4,2072,Ya,sanmol forte
4,5,2973,Tidak,paracetamol
5,6,860,Ya,procold flu
6,7,840,Tidak,nalgestan
7,8,585,Tidak,seretide diskus
8,9,2828,Ya,panadol anakanak
9,10,2301,Ya,mirasic forte



--- MiniLM (top-10) ---
Relevan dalam top-10: 1 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,760,Tidak,tremenza
1,2,337,Tidak,fertin
2,3,412,Tidak,he man
3,4,2568,Ya,migranal
4,5,7689,Tidak,larutan penyegar cap badak rasa anggur
5,6,2374,Tidak,artrilox
6,7,2638,Tidak,pirofel
7,8,727,Tidak,minyak telon tresno joyo
8,9,707,Tidak,minyak kayu putih balpirik
9,10,2555,Tidak,meflam



--- MPNet (top-10) ---
Relevan dalam top-10: 5 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,760,Tidak,tremenza
1,2,455,Tidak,lasal ekspektoran
2,3,7179,Tidak,ammeltz yokoyoko
3,4,679,Tidak,minyak telon cap gajah
4,5,1751,Ya,minyak angin cap kapak
5,6,2554,Ya,mefix
6,7,1817,Ya,minyak angin jahe
7,8,654,Ya,minyak kayu putih cap lang
8,9,7200,Tidak,minyak obat gosok anak mas
9,10,1761,Ya,balsem hijau cap kaki tiga



--- Hybrid_BM25+IndoBERT (top-10) ---
Relevan dalam top-10: 10 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2568,Ya,migranal
1,2,2094,Ya,panadol extra
2,3,2244,Ya,bodrex extra
3,4,2861,Ya,tempra forte bubblegum
4,5,2798,Ya,tempra rasa anggur
5,6,2828,Ya,panadol anakanak
6,7,2058,Ya,sumagesic
7,8,2691,Ya,rodemol
8,9,2814,Ya,tempra forte rasa anggur
9,10,2086,Ya,panadol



--- Hybrid_BM25+MPNet (top-10) ---
Relevan dalam top-10: 7 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2568,Ya,migranal
1,2,7170,Ya,minyak gosok cap tawon
2,3,2861,Ya,tempra forte bubblegum
3,4,7179,Tidak,ammeltz yokoyoko
4,5,7174,Tidak,minyak kutus kutus
5,6,2244,Ya,bodrex extra
6,7,1751,Ya,minyak angin cap kapak
7,8,2618,Ya,ostarin
8,9,1761,Ya,balsem hijau cap kaki tiga
9,10,7296,Tidak,tjing tjau balsem



--- Hybrid_IndoBERT+TFIDF (top-10) ---
Relevan dalam top-10: 9 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2568,Ya,migranal
1,2,2058,Ya,sumagesic
2,3,2086,Ya,panadol
3,4,2244,Ya,bodrex extra
4,5,2094,Ya,panadol extra
5,6,2868,Tidak,analpim
6,7,2805,Ya,sanmol
7,8,2072,Ya,sanmol forte
8,9,2301,Ya,mirasic forte
9,10,2828,Ya,panadol anakanak



--- Hybrid_TFIDF+MiniLM (top-10) ---
Relevan dalam top-10: 9 dari 267 dokumen GT


Unnamed: 0,Rank,DocID,Relevan_GT,Nama_Obat
0,1,2568,Ya,migranal
1,2,2086,Ya,panadol
2,3,2244,Ya,bodrex extra
3,4,2301,Ya,mirasic forte
4,5,2805,Ya,sanmol
5,6,2868,Tidak,analpim
6,7,2798,Ya,tempra rasa anggur
7,8,2514,Ya,itamol
8,9,2748,Ya,trifamol
9,10,2629,Ya,paramol forte


In [35]:
# 1) Semua query jalan ke model (untuk lihat hasil)
gt_all_df = gt_long_df.copy()
gt_all_df

Unnamed: 0,query_text,query_type,relevant_ids_raw,relevant_ids,relevant_ids_in_corpus,n_relevant,query_len_tokens
0,Obat apa yang biasanya digunakan untuk meredak...,single-hop,"634,637,639,644,654,658,666,682,759,780,805,81...","[634, 637, 639, 644, 654, 658, 666, 682, 759, ...","[634, 637, 639, 644, 654, 658, 666, 682, 759, ...",270,10
1,Obat untuk mengobati luka memar atau lebam,single-hop,"1737, 5553, 5564, 5578, 6865, 6895, 6898, 7079...","[1737, 5553, 5564, 5578, 6865, 6895, 6898, 707...","[1737, 5553, 5564, 5578, 6865, 6895, 6898, 707...",36,7
2,Obat untuk kerusakan kantung udara paru-paru a...,single-hop,"455, 456, 458, 459, 464, 464, 471, 473, 483, 4...","[455, 456, 458, 459, 464, 464, 471, 473, 483, ...","[455, 456, 458, 459, 464, 464, 471, 473, 483, ...",44,8
3,Obat untuk infeksi jamur candida,single-hop,"5072, 5162, 5187, 5189, 5240, 5244, 5403, 5426...","[5072, 5162, 5187, 5189, 5240, 5244, 5403, 542...","[5072, 5162, 5187, 5189, 5240, 5244, 5403, 542...",39,5
4,Obat untuk infeksi tulang atau osteomielitis,single-hop,"5628, 5659, 5714, 5754, 5856, 5858, 5904, 5907...","[5628, 5659, 5714, 5754, 5856, 5858, 5904, 590...","[5628, 5659, 5714, 5754, 5856, 5858, 5904, 590...",28,6
...,...,...,...,...,...,...,...
95,Obat kapsul atau tablet apa yang mengandung vi...,multi-hop,"2151, 2163, 2223, 2303, 2305, 2307, 2324, 2349...","[2151, 2163, 2223, 2303, 2305, 2307, 2324, 234...","[2151, 2163, 2223, 2303, 2305, 2307, 2324, 234...",47,17
96,Obat salep atau krim antivirus apa yang dipaka...,multi-hop,"5148, 5238, 5299, 5316, 5350, 5351, 5369, 5462","[5148, 5238, 5299, 5316, 5350, 5351, 5369, 5462]","[5148, 5238, 5299, 5316, 5350, 5351, 5369, 5462]",8,17
97,Obat tablet apa untuk nyeri haid yang juga mem...,multi-hop,"1195, 2062, 2076, 2080, 2090, 2095, 2104, 2121...","[1195, 2062, 2076, 2080, 2090, 2095, 2104, 212...","[1195, 2062, 2076, 2080, 2090, 2095, 2104, 212...",68,16
98,Obat antibiotik tetes mata apa yang bisa digun...,multi-hop,"4582, 4880, 4901","[4582, 4880, 4901]","[4582, 4880, 4901]",3,15


In [36]:
    import shutil
    import os

    # Replace 'my_folder' with the actual name of your folder in Colab
    folder_to_zip = 'semantic_model_export'
    zip_filename = f'{folder_to_zip}.zip'

    # Create the zip archive
    shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)
    print(f"Folder '{folder_to_zip}' zipped as '{zip_filename}'")

FileNotFoundError: [Errno 2] No such file or directory: 'semantic_model_export'

## **BLOK X – Ekspor Model Hybrid TF-IDF + MiniLM**

In [37]:
import os
import joblib
import numpy as np
import json
from scipy.sparse import save_npz, load_npz
from sentence_transformers import SentenceTransformer

# Define export directory
export_dir = 'hybrid_tfidf_minilm_export'
os.makedirs(export_dir, exist_ok=True)
print(f"Direktori '{export_dir}' dibuat atau sudah ada.")

# 1. Save TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, os.path.join(export_dir, 'tfidf_vectorizer.joblib'))
print("tfidf_vectorizer disimpan.")

# 2. Save TF-IDF Matrix
save_npz(os.path.join(export_dir, 'tfidf_matrix.npz'), tfidf_matrix)
print("tfidf_matrix disimpan.")

# 3. Save MiniLM Model
minilm_model.save(os.path.join(export_dir, 'minilm_model'))
print("minilm_model disimpan.")

# 4. Save MiniLM Document Embeddings
np.save(os.path.join(export_dir, 'doc_embeddings_minilm.npy'), doc_embeddings_minilm)
print("doc_embeddings_minilm disimpan.")

# 5. Save id_to_idx mapping
with open(os.path.join(export_dir, 'id_to_idx.json'), 'w') as f:
    json.dump(id_to_idx, f)
print("id_to_idx disimpan.")

print(f"\nSemua komponen model hybrid TF-IDF + MiniLM berhasil diekspor ke '{export_dir}'.")

Direktori 'hybrid_tfidf_minilm_export' dibuat atau sudah ada.
tfidf_vectorizer disimpan.
tfidf_matrix disimpan.
minilm_model disimpan.
doc_embeddings_minilm disimpan.
id_to_idx disimpan.

Semua komponen model hybrid TF-IDF + MiniLM berhasil diekspor ke 'hybrid_tfidf_minilm_export'.


## **BLOK Y – Muat & Uji Model Hybrid TF-IDF + MiniLM yang Diekspor**

In [38]:
import os
import joblib
import numpy as np
import json
from scipy.sparse import load_npz
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import math

# Define export directory (harus sama dengan saat ekspor)
export_dir = 'hybrid_tfidf_minilm_export'

# 1. Load TF-IDF Vectorizer
loaded_tfidf_vectorizer = joblib.load(os.path.join(export_dir, 'tfidf_vectorizer.joblib'))
print("tfidf_vectorizer dimuat.")

# 2. Load TF-IDF Matrix
loaded_tfidf_matrix = load_npz(os.path.join(export_dir, 'tfidf_matrix.npz'))
print("tfidf_matrix dimuat.")

# 3. Load MiniLM Model
loaded_minilm_model = SentenceTransformer(os.path.join(export_dir, 'minilm_model'))
print("minilm_model dimuat.")

# 4. Load MiniLM Document Embeddings
loaded_doc_embeddings_minilm = np.load(os.path.join(export_dir, 'doc_embeddings_minilm.npy'))
print("doc_embeddings_minilm dimuat.")

# 5. Load id_to_idx mapping
with open(os.path.join(export_dir, 'id_to_idx.json'), 'r') as f:
    loaded_id_to_idx = json.load(f)
print("id_to_idx dimuat.")

# Re-create helper functions and the hybrid retrieval logic
# We need the original corpus_df for ID lookup, assuming it's available or reloaded.
# For this demonstration, we'll assume `corpus_df` is still in the kernel.

# TF-IDF retrieve function (modified to use loaded components)
def loaded_tfidf_retrieve(query_text, top_k=10):
    q_vec = loaded_tfidf_vectorizer.transform([query_text])
    sims = cosine_similarity(q_vec, loaded_tfidf_matrix)[0]
    # Ensure we use corpus_df that was used during training/indexing for correct ID mapping
    top_idx = np.argsort(sims)[::-1][:top_k]
    ids = corpus_df['id'].iloc[top_idx].tolist()
    scores = sims[top_idx]
    return list(zip(ids, scores))

# Hybrid TF-IDF + MiniLM retrieve function (modified to use loaded components)
ALPHA = 0.6 # This should be consistent with the exported model's ALPHA if tuned

def loaded_hybrid_tfidf_minilm_retrieve(query_text, top_k=10):
    # ---------- TF-IDF ----------
    # Get all results for normalization
    tfidf_results = loaded_tfidf_retrieve(
        query_text,
        top_k=len(corpus_df) # Retrieve all for proper normalization
    )
    tfidf_dict = {doc_id: score for doc_id, score in tfidf_results}

    # Normalisasi TF-IDF (0–1)
    if len(tfidf_dict) > 0:
        max_tfidf = max(tfidf_dict.values())
    else:
        max_tfidf = 1.0

    # ---------- MiniLM ----------
    q_emb = loaded_minilm_model.encode([query_text], convert_to_tensor=True)
    cos_scores = util.cos_sim(q_emb, loaded_doc_embeddings_minilm)[0]  # shape: (n_docs,)
    minilm_scores = cos_scores.cpu().numpy() # Convert to numpy for consistent scoring

    # ---------- Hybrid Scoring ----------
    hybrid_scores = []

    # Iterate through all documents (or a subset for efficiency)
    # We need to consider all documents for proper dense score application
    for idx, doc_id in enumerate(corpus_df['id'].tolist()):
        tfidf_score = tfidf_dict.get(doc_id, 0.0) # Get TFIDF score, or 0 if not in top N_CANDIDATES_HYBRID
        dense_score = minilm_scores[idx]

        tfidf_norm = tfidf_score / max_tfidf if max_tfidf > 0 else 0.0

        final_score = (
            ALPHA * dense_score +
            (1 - ALPHA) * tfidf_norm
        )

        hybrid_scores.append((doc_id, final_score))

    # Urutkan skor akhir
    hybrid_scores = sorted(
        hybrid_scores,
        key=lambda x: x[1],
        reverse=True
    )

    return hybrid_scores[:top_k]


print("\n--- Tes Model yang Dimuat ---")
sample_query = "Obat apa yang biasanya digunakan untuk meredakan sakit kepala ringan?"
print("Contoh query:", sample_query)
print("Top-5 Hybrid TF-IDF + MiniLM (dari model yang dimuat):")
print(loaded_hybrid_tfidf_minilm_retrieve(sample_query, top_k=5))

print("\n✅ Model hybrid TF-IDF + MiniLM berhasil dimuat dan diuji.")

tfidf_vectorizer dimuat.
tfidf_matrix dimuat.
minilm_model dimuat.
doc_embeddings_minilm dimuat.
id_to_idx dimuat.

--- Tes Model yang Dimuat ---
Contoh query: Obat apa yang biasanya digunakan untuk meredakan sakit kepala ringan?
Top-5 Hybrid TF-IDF + MiniLM (dari model yang dimuat):
[(2568, np.float64(0.7664762576952513)), (2086, np.float64(0.7368112684299688)), (2244, np.float64(0.7226348161697388)), (2301, np.float64(0.6835432712759099)), (2805, np.float64(0.6634614751736363))]

✅ Model hybrid TF-IDF + MiniLM berhasil dimuat dan diuji.


## **BLOK Z – Zip Folder Ekspor Model**

In [39]:
import shutil
import os

folder_to_zip = 'hybrid_tfidf_minilm_export'
zip_filename = f'{folder_to_zip}.zip'

# Create the zip archive
shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)
print(f"Folder '{folder_to_zip}' zipped as '{zip_filename}'")

Folder 'hybrid_tfidf_minilm_export' zipped as 'hybrid_tfidf_minilm_export.zip'
