# BPS Question Deduplication — Interactive UI Notebook
### Semantic Similarity • Duplicate Detection • Clustering • UI Controls

Notebook ini menyediakan:
- Slider threshold
- Dropdown model embedding
- Upload CSV (pemisah titik koma `;`, wajib memiliki kolom `question_id` dan `question_text`)
- Tombol menjalankan deduplication
- Output: tabel duplicate pairs, clusters, dan heatmap


In [7]:
%pip install sentence-transformers seaborn matplotlib scikit-learn networkx ipywidgets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import networkx as nx

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from ipywidgets import (
    FloatSlider, Dropdown, Button, FileUpload,
    VBox, HBox, Output
)

print("Libraries loaded.")


Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
     -- -------------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Libraries loaded.


## UI Components: Slider, Dropdown, Upload, Run Button


In [8]:
# Similarity cutoff: pairs scoring at or above this value are reported as
# duplicates by the pipeline.
threshold_slider = FloatSlider(
    description='Threshold:',
    value=0.78,
    min=0.60,
    max=0.95,
    step=0.01,
    layout=dict(width='380px'),
    style=dict(description_width='110px'),
)

# Embedding model selector; each entry is a (label shown, model name) pair.
model_dropdown = Dropdown(
    description="Embedding Model:",
    options=[
        ("MiniLM-L6-v2 (Default)", "sentence-transformers/all-MiniLM-L6-v2"),
        (
            "Multilingual-MPNet",
            "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        ),
        ("MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L12-v2"),
    ],
    value="sentence-transformers/all-MiniLM-L6-v2",
    layout=dict(width='480px'),
    style=dict(description_width='140px'),
)

# Single-file CSV upload control.
upload = FileUpload(
    accept='.csv',
    multiple=False,
    description="Upload CSV",
)

# Button that triggers the deduplication run.
run_button = Button(
    description="Run Deduplication",
    button_style='success',
    layout=dict(width='200px', height='40px'),
)

# Shared area where status messages, tables and plots are rendered.
output = Output()


## Handler untuk Upload CSV


In [None]:
# Most recently uploaded CSV as a DataFrame; stays None until a file has been
# parsed successfully.
df = None


def _uploaded_csv_bytes(value):
    """Return the raw bytes of the first uploaded file, or None if there is none.

    ipywidgets 7 exposes ``FileUpload.value`` as a dict keyed by file name,
    whereas ipywidgets 8 exposes a tuple of per-file dicts whose ``"content"``
    is a memoryview. Both layouts are accepted here.
    """
    if not value:
        return None
    entries = list(value.values()) if isinstance(value, dict) else list(value)
    # bytes() normalises the ipywidgets 8 memoryview and is a no-op copy for
    # the ipywidgets 7 bytes payload.
    return bytes(entries[0]["content"])


def on_upload_change(change):
    """Parse the uploaded CSV into the module-level ``df``.

    Registered as an observer of ``upload.value``. The file is read as a
    semicolon-separated, UTF-8 (BOM-tolerant) CSV. Status messages and a
    preview are written to the shared ``output`` widget, because bare prints
    from a widget callback are not reliably shown in the notebook.

    Args:
        change: The change notification passed by the widget; unused, the
            current ``upload.value`` is read directly.
    """
    global df
    payload = _uploaded_csv_bytes(upload.value)
    if payload is None:
        return

    with output:
        try:
            loaded = pd.read_csv(
                io.BytesIO(payload),
                sep=";",
                encoding="utf-8-sig",
            )
        except (
            UnicodeDecodeError,
            pd.errors.ParserError,
            pd.errors.EmptyDataError,
        ) as exc:
            # Keep the previously loaded frame (if any) and tell the user why
            # this file was rejected.
            print(f"⚠️ Gagal membaca CSV: {exc}")
            return

        df = loaded
        print("CSV loaded! Rows:", len(df))
        display(df.head())


upload.observe(on_upload_change, names='value')


## Run Deduplication Logic (Triggered by Button)


In [None]:
# Loaded SentenceTransformer instances keyed by model name, so repeated runs
# with the same model skip the expensive reload.
_MODEL_CACHE = {}

# Columns the uploaded CSV must provide for the pipeline to run.
_REQUIRED_COLUMNS = ("question_id", "question_text")

# Drop the handler left behind by a previous execution of this cell; otherwise
# a single click would run the pipeline once per past execution.
if "run_pipeline" in globals():
    run_button.on_click(run_pipeline, remove=True)


def _load_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def _find_duplicate_pairs(data, sim_matrix, threshold):
    """Return a DataFrame of all row pairs whose similarity is >= ``threshold``.

    Each unordered pair (i < j) is considered once, in row-major order. Rows
    are addressed by position, so ``data`` does not need a default index.
    Columns: id1, id2, question1, question2, similarity.
    """
    sim_matrix = np.asarray(sim_matrix)
    # Strict upper triangle: every pair once, no self-comparisons.
    first, second = np.triu_indices(len(data), k=1)
    keep = sim_matrix[first, second] >= threshold
    first, second = first[keep], second[keep]

    ids = data["question_id"].to_numpy()
    texts = data["question_text"].to_numpy()
    return pd.DataFrame({
        "id1": ids[first],
        "id2": ids[second],
        "question1": texts[first],
        "question2": texts[second],
        "similarity": sim_matrix[first, second],
    })


def _cluster_ids(pairs_df):
    """Group question ids into clusters: connected components of the pair graph."""
    graph = nx.Graph()
    graph.add_edges_from(zip(pairs_df["id1"], pairs_df["id2"]))
    return [sorted(component) for component in nx.connected_components(graph)]


def run_pipeline(_):
    """Run the deduplication on the uploaded CSV and render results in ``output``.

    Steps: validate the uploaded data, embed ``question_text`` with the model
    chosen in the dropdown, compute the cosine-similarity matrix, draw it as a
    heatmap, list every pair at or above the slider threshold, and print the
    clusters formed by those pairs.

    Args:
        _: The clicked button, passed by the widget; unused.
    """
    with output:
        output.clear_output()

        if df is None:
            print("⚠️ Upload CSV dulu.")
            return

        missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            print(f"⚠️ Kolom wajib tidak ditemukan: {', '.join(missing)}")
            return

        # Rows without text cannot be embedded; work on a cleaned copy so the
        # uploaded frame itself is left untouched.
        data = df.dropna(subset=["question_text"]).reset_index(drop=True)
        skipped = len(df) - len(data)
        if skipped:
            print(f"⚠️ {skipped} baris tanpa question_text dilewati.")
        if len(data) < 2:
            print("⚠️ Butuh minimal 2 pertanyaan untuk dibandingkan.")
            return

        threshold = threshold_slider.value
        model_name = model_dropdown.value

        print(f"📌 Threshold: {threshold}")
        print(f"📌 Model: {model_name}\n")

        model = _load_model(model_name)

        print("🔄 Generating embeddings...")
        texts = data["question_text"].astype(str).tolist()
        emb = model.encode(texts, show_progress_bar=True)

        print("🔄 Computing similarity matrix...")
        sim_matrix = cosine_similarity(emb)

        print("\n🔥 Similarity Matrix Heatmap")
        plt.figure(figsize=(10, 7))
        sns.heatmap(sim_matrix, cmap="viridis")
        plt.show()

        print("\n📌 Duplicate Pairs (≥ threshold)")
        pairs_df = _find_duplicate_pairs(data, sim_matrix, threshold)
        display(pairs_df)

        print("\n🧩 Clusters")
        if pairs_df.empty:
            print("Tidak ada pasangan mirip pada threshold ini.")
            return

        for number, members in enumerate(_cluster_ids(pairs_df), start=1):
            print(f"Cluster {number}: {members}")


run_button.on_click(run_pipeline)


## UI Panel


In [None]:
# Stack the controls top to bottom, with the shared output area last.
ui = VBox(
    children=(upload, model_dropdown, threshold_slider, run_button, output)
)

# Leaving the widget as the cell's final expression makes the notebook render it.
ui
