# Model 4 - Rangkuman Aspek - Opini

## Import Library

In [2]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import ast

## Aspect Constraint

In [3]:
# Aspect labels that generate_summary() iterates over when building the
# per-product "aspect_summary"; a finding whose 'aspect' value is not one of
# these strings never gets a summary entry.
ASPECT_LABELS_INFER = [
    "Kualitas Barang",
    "Pelayanan Penjual",
    "Kemasan Barang",
    "Harga Barang",
    "Sesuai Deskripsi",
    "Pengiriman",
    "Lainnya"
]

## Helper for JSON Parsing on "absa_predict" Column

In [4]:
def parse_absa_predict(label_str):
    """Parse one ``absa_predict`` cell into a list of label entries.

    The cell is first parsed as a Python literal (``ast.literal_eval``) and,
    if that fails, as JSON (``json.loads``).

    Args:
        label_str: Raw cell value. May be a string, a missing value
            (``None``/``NaN``), or an already-parsed list/tuple/dict.

    Returns:
        A list of label entries. An empty list is returned when the value is
        missing, cannot be parsed, or parses to something that is neither a
        list/tuple nor a dict.
    """
    # Already-parsed values are handled before pd.isna(): pd.isna() on a list
    # returns an array, whose truth value is ambiguous and raises ValueError.
    if isinstance(label_str, (list, tuple)):
        return list(label_str)
    if isinstance(label_str, dict):
        return [label_str]
    if pd.isna(label_str):
        return []

    text = str(label_str)
    for parser in (ast.literal_eval, json.loads):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass; try the next parser.
            continue
        break
    else:
        return []

    # Callers iterate over the result, so always hand back a list.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, dict):
        return [parsed]
    return []

## Maximal Marginal Relevance (MMR) Algorithm

In [5]:
def maximal_marginal_relevance(sentence_vector, phrase_vectors, m=0.8, top_k=3):
    """Select evidence rows with Maximal Marginal Relevance (MMR).

    The first pick is the candidate most similar to the query. Every later
    pick maximises
    ``m * sim(candidate, query) - (1 - m) * max sim(candidate, selected)``.

    Args:
        sentence_vector: Query vector, passed as the first argument of
            ``cosine_similarity`` (a single-row 2-D array in the caller).
        phrase_vectors: Candidate matrix with one row per candidate; must
            support ``.shape`` and row indexing with a list of indices.
        m: Weight of relevance to the query; ``1 - m`` weights the penalty
            for similarity to already selected candidates.
        top_k: Maximum number of indices to return (default 3).

    Returns:
        A list of plain ``int`` row indices into ``phrase_vectors`` in
        selection order; empty when there are no candidates or
        ``top_k <= 0``.
    """
    # .shape[0] is used instead of len() because len() is ambiguous for
    # scipy sparse matrices.
    n_candidates = phrase_vectors.shape[0]

    # With no candidates there is nothing to index, so return no indices.
    if n_candidates == 0 or top_k <= 0:
        return []
    # A single candidate is trivially the only choice.
    if n_candidates == 1:
        return [0]

    # Similarity of every candidate to the query (topic).
    sim_to_doc = cosine_similarity(sentence_vector, phrase_vectors)[0]

    # Seed the selection with the most relevant candidate.
    first_idx = int(np.argmax(sim_to_doc))
    selected_indices = [first_idx]
    candidate_indices = [i for i in range(n_candidates) if i != first_idx]

    limit = min(n_candidates, top_k)

    while len(selected_indices) < limit and candidate_indices:
        selected_vectors = phrase_vectors[selected_indices]
        remaining_vectors = phrase_vectors[candidate_indices]

        # Redundancy: highest similarity of each remaining candidate to
        # anything that has already been selected.
        sim_to_selected = cosine_similarity(remaining_vectors, selected_vectors)
        max_sim_to_selected = np.max(sim_to_selected, axis=1)

        # MMR score: reward relevance, penalise redundancy.
        current_sim_to_doc = sim_to_doc[candidate_indices]
        mmr_score = (m * current_sim_to_doc) - ((1 - m) * max_sim_to_selected)

        # argmax is local to candidate_indices; map back to a global row index.
        best_idx = candidate_indices[int(np.argmax(mmr_score))]

        selected_indices.append(best_idx)
        candidate_indices.remove(best_idx)

    return selected_indices

## Generate Summary Function

In [6]:
# Keys every ABSA label must carry to be usable in a summary.
_REQUIRED_LABEL_KEYS = ('aspect', 'sentiment', 'opinion_span')


def _collect_aspect_findings(group):
    """Flatten the parsed ``absa_predict`` labels of one product's rows."""
    findings = []
    for _, row in group.iterrows():
        labels = parse_absa_predict(row['absa_predict'])
        if isinstance(labels, dict):
            labels = [labels]
        if not isinstance(labels, (list, tuple)):
            continue

        for label in labels:
            # Skip malformed entries instead of aborting the whole run on a
            # single bad label.
            if not isinstance(label, dict):
                continue
            if any(key not in label for key in _REQUIRED_LABEL_KEYS):
                continue

            findings.append({
                'review_id': row['review_id'],
                'full_text': row['text_norm'],
                'aspect': label['aspect'],
                'sentiment': label['sentiment'],
                'opinion': label['opinion_span'],
                'sent_id': label.get('sent_id', 0),
            })
    return findings


def _compose_aspect_summary(aspect, pos_count, neg_count, opini_pos_str, opini_neg_str):
    """Return ``(dominant_sentiment, summary_sentence)`` for one aspect."""
    # NOTE(review): a tie (including 0 vs 0) is reported as positive.
    if pos_count >= neg_count:
        base = f"Mayoritas ulasan menilai {aspect} positif, menyoroti {opini_pos_str}"
        if neg_count > 0:
            return 'pos', f"{base}; namun sebagian kecil mengeluhkan {opini_neg_str}."
        return 'pos', f"{base}."

    base = f"Mayoritas ulasan menilai {aspect} negatif, terutama keluhan {opini_neg_str}"
    if pos_count > 0:
        return 'neg', f"{base}; meskipun ada yang memuji {opini_pos_str}."
    return 'neg', f"{base}."


def _select_evidence(df_pool, m_param):
    """Pick evidence records from ``df_pool`` using TF-IDF vectors and MMR."""
    unique_evidence = df_pool.drop_duplicates(subset=['full_text'])
    candidates = unique_evidence['full_text'].tolist()
    if not candidates:
        return []

    candidate_data = unique_evidence[['review_id', 'sent_id', 'full_text']].to_dict('records')

    try:
        # Dense matrix so rows can be indexed with plain index lists.
        tfidf_matrix = TfidfVectorizer(min_df=1).fit_transform(candidates).toarray()

        # The query is the mean TF-IDF vector, reshaped to (1, n_features).
        query_vec = np.mean(tfidf_matrix, axis=0).reshape(1, -1)

        selected_idxs = maximal_marginal_relevance(query_vec, tfidf_matrix, m=m_param)
        return [candidate_data[i] for i in selected_idxs]
    except Exception:
        # Deliberate best-effort: if vectorisation or selection fails, fall
        # back to the first candidate rather than dropping the evidence.
        return [candidate_data[0]]


def generate_summary(df_input, m_param=0.7):
    """Build a per-product, per-aspect opinion summary with evidence.

    Args:
        df_input: DataFrame with columns ``product_id``, ``product_name``,
            ``text_norm`` and ``absa_predict``. If ``review_id`` is missing,
            the index is used on a copy of the frame.
        m_param: Value forwarded as ``m`` to ``maximal_marginal_relevance``.

    Returns:
        A list with one dict per product: ``product_id`` (str),
        ``product_name`` and ``aspect_summary``. ``aspect_summary`` maps each
        aspect in ``ASPECT_LABELS_INFER`` that has findings to a dict with
        ``summary`` (sentence) and ``evidence`` (list of records holding
        ``review_id``, ``sent_id`` and ``full_text``).
    """
    # Fall back to the index when review_id is absent (e.g. dummy data).
    if 'review_id' not in df_input.columns:
        df_input = df_input.copy()
        df_input['review_id'] = df_input.index

    results = []

    for product_id, group in df_input.groupby('product_id'):
        product_summary = {
            "product_id": str(product_id),
            "product_name": group.iloc[0]['product_name'],
            "aspect_summary": {},
        }

        df_aspects = pd.DataFrame(_collect_aspect_findings(group))
        if df_aspects.empty:
            results.append(product_summary)
            continue

        for aspect in ASPECT_LABELS_INFER:
            df_curr = df_aspects[df_aspects['aspect'] == aspect]
            if df_curr.empty:
                continue

            # 1. Statistics: split once and reuse for counts, opinions and
            #    the evidence pool.
            df_pos = df_curr[df_curr['sentiment'] == 'pos']
            df_neg = df_curr[df_curr['sentiment'] == 'neg']
            pos_count = len(df_pos)
            neg_count = len(df_neg)

            # 2. Dominant opinion phrase per polarity, with fixed defaults.
            top_op_pos = Counter(df_pos['opinion'].tolist()).most_common(1)
            top_op_neg = Counter(df_neg['opinion'].tolist()).most_common(1)
            opini_pos_str = top_op_pos[0][0] if top_op_pos else "baik"
            opini_neg_str = top_op_neg[0][0] if top_op_neg else "kurang"

            # 3. Template sentence and dominant sentiment.
            dom_sentiment, summary_text = _compose_aspect_summary(
                aspect, pos_count, neg_count, opini_pos_str, opini_neg_str
            )

            # 4. Evidence comes from reviews with the dominant sentiment,
            #    or from all reviews of the aspect when that pool is empty.
            df_evidence_pool = df_pos if dom_sentiment == 'pos' else df_neg
            if df_evidence_pool.empty:
                df_evidence_pool = df_curr

            product_summary["aspect_summary"][aspect] = {
                "summary": summary_text,
                "evidence": _select_evidence(df_evidence_pool, m_param),
            }

        results.append(product_summary)

    return results

## Membuat CSV Summary Output

In [7]:
# Where the review dataset is read from and where the summary is written.
SOURCE_CSV_URL = 'https://drive.google.com/uc?id=1wV9Q680atztyeSNYgYyyNE_NDFfjWr4k'
OUTPUT_CSV_PATH = "summary_output.csv"

# Load the input reviews and summarise them per product and aspect.
df = pd.read_csv(SOURCE_CSV_URL, encoding='latin-1')
output_json = generate_summary(df, m_param=0.7)

# Persist the summaries as CSV (one row per product).
df_output = pd.DataFrame(output_json)
df_output.to_csv(OUTPUT_CSV_PATH, index=False, encoding="utf-8-sig")
print("CSV berhasil dibuat!")

# Echo the complete result as pretty-printed JSON.
print(json.dumps(output_json, indent=2, ensure_ascii=False))

CSV berhasil dibuat!
[
  {
    "product_id": "1505048",
    "product_name": "NOKIA Charger AC-50E Original",
    "aspect_summary": {
      "Kualitas Barang": {
        "summary": "Mayoritas ulasan menilai Kualitas Barang positif, menyoroti barang berkualitas; namun sebagian kecil mengeluhkan barang susah diterima.",
        "evidence": [
          {
            "review_id": 7386,
            "sent_id": 0,
            "full_text": "barang susah diterima packing rapi dengan barang berkualitas terima kasih"
          },
          {
            "review_id": 7388,
            "sent_id": 0,
            "full_text": "barang susah sampai dengan diterima dengan baik rekomen karena barang original dengan biasa dipakai untuk perangkat perangkat lain juga"
          }
        ]
      },
      "Kemasan Barang": {
        "summary": "Mayoritas ulasan menilai Kemasan Barang positif, menyoroti packing rapi.",
        "evidence": [
          {
            "review_id": 7386,
            "sent_id": 0,
  

## Lambda Scenario Testing

In [8]:
import pandas as pd
import json

# Run the summariser once per lambda setting, announcing each run first.
lambda_scenarios = (
    ("Sedang menjalankan Model dengan Lambda 0.3...", 0.3),
    ("Sedang menjalankan Model dengan Lambda 0.7...", 0.7),
    ("Sedang menjalankan Model dengan Lambda 1.0...", 1.0),
)

scenario_results = []
for banner, lambda_value in lambda_scenarios:
    print(banner)
    scenario_results.append(generate_summary(df, m_param=lambda_value))

# Results are in the same order as lambda_scenarios.
res_03, res_07, res_10 = scenario_results

# Comparison helper for the three lambda runs.
def compare_experiments(r1, r2, r3):
    """Report every (product, aspect) whose evidence differs between runs.

    Args:
        r1: Result list reported under ``ev_03``.
        r2: Result list reported under ``ev_07``.
        r3: Result list reported under ``ev_10``.
            Each result list holds dicts with ``product_id`` and
            ``aspect_summary``; every aspect entry has an ``evidence`` list
            of dicts containing ``full_text``.

    Returns:
        A list of dicts with keys ``product_id``, ``aspect``, ``ev_03``,
        ``ev_07`` and ``ev_10``, one per (product, aspect) pair whose sorted
        evidence texts are not identical in all three runs. A pair that is
        missing from a run is treated as having no evidence there.
    """
    def map_results(res_list):
        # (product_id, aspect) -> sorted evidence texts; sorting makes the
        # comparison independent of selection order.
        mapper = {}
        for item in res_list:
            pid = item['product_id']
            for aspect, val in item['aspect_summary'].items():
                mapper[(pid, aspect)] = sorted(e['full_text'] for e in val['evidence'])
        return mapper

    map_03 = map_results(r1)
    map_07 = map_results(r2)
    map_10 = map_results(r3)

    # Union of keys from all three runs, so a pair missing from the middle
    # run is still compared. dict.fromkeys de-duplicates while keeping
    # first-seen order (the middle run's keys come first).
    all_keys = dict.fromkeys([*map_07, *map_03, *map_10])

    differences = []
    for key in all_keys:
        ev_03 = map_03.get(key, [])
        ev_07 = map_07.get(key, [])
        ev_10 = map_10.get(key, [])

        if ev_03 == ev_07 == ev_10:
            continue

        differences.append({
            "product_id": key[0],
            "aspect": key[1],
            "ev_03": ev_03,
            "ev_07": ev_07,
            "ev_10": ev_10,
        })

    return differences

# Results: collect the (product, aspect) pairs whose evidence changes with lambda.
diff_cases = compare_experiments(res_03, res_07, res_10)

print(f"\nDitemukan {len(diff_cases)} kasus di mana Lambda mempengaruhi hasil evidence.")
print("="*60)

# Only the first 10 cases are shown to keep the output short.
for i, case in enumerate(diff_cases[:10]):
    report_lines = [
        f"KASUS #{i+1}",
        f"Produk ID : {case['product_id']}",
        f"Aspek     : {case['aspect']}",
        "-" * 20,
        f"[Lambda 0.3 - Variatif] : {case['ev_03']}",
        f"[Lambda 0.7 - Balanced] : {case['ev_07']}",
        f"[Lambda 1.0 - Mirip]    : {case['ev_10']}",
        "="*60,
    ]
    print("\n".join(report_lines))

Sedang menjalankan Model dengan Lambda 0.3...
Sedang menjalankan Model dengan Lambda 0.7...
Sedang menjalankan Model dengan Lambda 1.0...

Ditemukan 1784 kasus di mana Lambda mempengaruhi hasil evidence.
KASUS #1
Produk ID : 1828413
Aspek     : Kualitas Barang
--------------------
[Lambda 0.3 - Variatif] : ['barang diterima mantap terima kasih kasih ya', 'barang susah sampai gan sama dengan original nya', 'ngebut amat gan barang mantap sama seperti standar jangan ragu deh']
[Lambda 0.7 - Balanced] : ['barang diterima mantap terima kasih kasih ya', 'barang susah sampai gan sama dengan original nya', 'ngebut amat gan barang mantap sama seperti standar jangan ragu deh']
[Lambda 1.0 - Mirip]    : ['barang susah mendarat dengan aman pengiriman oke kondisi barang juga oke', 'barang susah sampai gan sama dengan original nya', 'ngebut amat gan barang mantap sama seperti standar jangan ragu deh']
KASUS #2
Produk ID : 2881935
Aspek     : Kualitas Barang
--------------------
[Lambda 0.3 - Variati