In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# CONFIGURATION
# CSV file holding the annotated ground truth.
FILE_GT = 'development_set_anotasi.csv'

# Aspect labels. The order here fixes the position of each label in the
# binary vectors built by get_binary_vec, and the names are also used as the
# one-hot column names looked up in the ground-truth file.
KATEGORI_ASPEK = [
    'Kamera',
    'Baterai',
    'Performa & Gaming',
    'Layar',
    'Desain & Kualitas Bodi',
    'Harga',
    'Pengiriman',
    'Layanan Penjual',
    'UI',
]

# Prompting techniques under evaluation; each one is read from a
# `prediksi_<teknik>` column of the experiment files.
TEKNIK_LIST = [
    'zero_shot',
    'few_shot',
    'zero_shot_cot',
    'few_shot_cot',
]

# Text-length buckets, used to iterate over and order the result tables.
ORDER_KATEGORI = [
    'Sangat Pendek',
    'Pendek',
    'Sedang',
    'Panjang',
    'Sangat Panjang',
]

# FUNGSI
def parse_list(text):
    """Parse a stringified list such as "['Kamera', 'Baterai']" into a list of labels.

    Args:
        text: The raw cell value. A list is returned unchanged; a string of
            the form ``[...]`` is split on commas; anything else (None, NaN,
            a string without surrounding brackets) yields an empty list.

    Returns:
        A list of label strings with surrounding whitespace and quote
        characters removed. Empty items are dropped.

    Note:
        Items are split on every comma, so a label that itself contains a
        comma would be broken into several items.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    text = text.strip()
    if not (text.startswith('[') and text.endswith(']')):
        return []

    labels = []
    for part in text[1:-1].split(','):
        # Strip both quote characters in one pass, then whitespace again, so
        # that "' Kamera '" or '"\'Kamera\'"' still normalise to "Kamera" and
        # can match a category name exactly.
        label = part.strip().strip('"\'').strip()
        if label:
            labels.append(label)
    return labels

def get_binary_vec(label_list, categories=None):
    """Convert a collection of labels into a binary indicator vector.

    Args:
        label_list: Labels present for one sample; membership is tested with
            the ``in`` operator.
        categories: Ordered category names defining the vector positions.
            Defaults to the module-level ``KATEGORI_ASPEK`` when omitted, so
            existing single-argument calls behave as before.

    Returns:
        A list of 0/1 ints, one per category, in the order of ``categories``.
        Labels that are not in ``categories`` are ignored.
    """
    if categories is None:
        categories = KATEGORI_ASPEK
    return [1 if k in label_list else 0 for k in categories]

def kategorikan_panjang(panjang):
    """Map a text length to its length bucket.

    Buckets use inclusive upper bounds: <=7 'Sangat Pendek', <=12 'Pendek',
    <=19 'Sedang', <=32 'Panjang'; anything else is 'Sangat Panjang'.
    """
    batas_atas = (
        (7, 'Sangat Pendek'),
        (12, 'Pendek'),
        (19, 'Sedang'),
        (32, 'Panjang'),
    )
    # Return the first bucket whose upper bound covers the length.
    for batas, label in batas_atas:
        if panjang <= batas:
            return label
    return 'Sangat Panjang'

def calculate_metrics(y_true_all, y_pred_all):
    """Compute label-based accuracy and macro F1 for multi-label predictions.

    Args:
        y_true_all: Sequence of ground-truth binary vectors, one per sample,
            indexed by position in ``KATEGORI_ASPEK``.
        y_pred_all: Sequence of predicted binary vectors, same layout.

    Returns:
        A ``(lba, f1)`` tuple: ``lba`` is the mean of the per-label accuracy
        scores, ``f1`` is sklearn's ``f1_score`` with ``average='macro'`` and
        ``zero_division=0``.
    """
    # Label-based accuracy: score each label column on its own, then average.
    per_label_acc = [
        accuracy_score(
            [vec[idx] for vec in y_true_all],
            [vec[idx] for vec in y_pred_all],
        )
        for idx, _ in enumerate(KATEGORI_ASPEK)
    ]
    lba = np.mean(per_label_acc)

    # Macro F1 over the full indicator matrices.
    macro_f1 = f1_score(y_true_all, y_pred_all, average='macro', zero_division=0)

    return lba, macro_f1

def get_mode(series):
    """Return the most frequent predicted label set (majority vote).

    Each prediction is reduced to a sorted tuple of its unique labels before
    counting, so label order and repeated labels inside a single prediction
    do not split the vote.

    Args:
        series: Iterable of predictions, each an iterable of hashable,
            mutually comparable labels (e.g. a list of strings).

    Returns:
        The winning label set as a sorted list. On a tie, the label set seen
        first wins (Counter.most_common keeps first-encountered order for
        equal counts). An empty ``series`` yields an empty list.
    """
    counts = Counter(tuple(sorted(set(pred))) for pred in series)
    if not counts:
        # Nothing to vote on; avoid indexing into an empty most_common().
        return []
    return list(counts.most_common(1)[0][0])


In [3]:
# Load the annotated ground truth.
df_gt = pd.read_csv(FILE_GT)

# Collapse the one-hot aspect columns back into a list of label names per row,
# which is the representation the rest of the notebook works with.
df_gt['ground_truth'] = df_gt.apply(
    lambda rec: [aspek for aspek in KATEGORI_ASPEK if rec.get(aspek) == 1],
    axis=1,
)

# Use the ORIGINAL panjang_teks column from the file when it is present
# (IMPORTANT: do not recompute it in that case).
if 'panjang_teks' not in df_gt.columns:
    # Fallback when the column is missing: whitespace-separated token count.
    df_gt['panjang_teks_calc'] = df_gt['comment'].apply(lambda teks: len(str(teks).split()))
    df_gt['kategori_panjang'] = df_gt['panjang_teks_calc'].apply(kategorikan_panjang)
else:
    df_gt['kategori_panjang'] = df_gt['panjang_teks'].apply(kategorikan_panjang)

# Minimal frame that the evaluation cells merge predictions onto.
kolom_base = ['comment', 'kategori_panjang', 'ground_truth']
df_base = df_gt[kolom_base].copy()

In [4]:
# AVERAGE ANALYSIS (AVERAGE PER RUN)
# For each of the 5 experiment files, score every (length bucket, technique)
# pair against the ground truth, then average the scores across runs.
print("Menghitung Kinerja Rata-Rata (5 Run)...")
run_metrics = []

# Only these columns of a run file are needed. Restricting the merge to them
# (as the majority-vote cell already does) prevents pandas from suffixing
# 'kategori_panjang' / 'ground_truth' with _x/_y if a run file happens to
# carry columns with the same names, which would break the lookups below.
cols = ['comment'] + [f'prediksi_{t}' for t in TEKNIK_LIST]

for i in range(1, 6):
    df_run = pd.read_csv(f'eksperimen_no_{i}.csv')

    # Inner merge with the ground truth: comments present in only one side
    # are dropped.
    # NOTE(review): duplicate 'comment' values on either side would multiply
    # rows here - confirm comments are unique in both files.
    df_merged = pd.merge(df_base, df_run[cols], on='comment', how='inner')

    for kategori in ORDER_KATEGORI:
        subset = df_merged[df_merged['kategori_panjang'] == kategori]
        if subset.empty:
            continue

        # Iterate the column directly instead of iterrows(); same values,
        # without building a Series per row.
        y_true = [get_binary_vec(labels) for labels in subset['ground_truth']]

        for teknik in TEKNIK_LIST:
            # Parse the stringified predictions into label lists.
            preds_raw = subset[f'prediksi_{teknik}'].apply(parse_list)
            y_pred = [get_binary_vec(p) for p in preds_raw]

            acc, f1 = calculate_metrics(y_true, y_pred)

            run_metrics.append({
                'kategori': kategori,
                'teknik': teknik,
                'accuracy': acc,
                'f1_score': f1
            })

# Aggregate: mean of each metric over the runs, per (bucket, technique).
df_runs = pd.DataFrame(run_metrics)
df_avg = df_runs.groupby(['kategori', 'teknik'])[['accuracy', 'f1_score']].mean().reset_index()

# Format the averaged output: one row per bucket, one column per metric/technique.
pivot_avg = df_avg.pivot(index='kategori', columns='teknik', values=['accuracy', 'f1_score'])
pivot_avg.columns = [f'{col[0]}_{col[1]}' for col in pivot_avg.columns]  # Flatten columns
pivot_avg = pivot_avg.reindex(ORDER_KATEGORI)

print("\n=== HASIL RATA-RATA 5 RUN (AVERAGE)")
print(pivot_avg.round(4))
pivot_avg.to_csv('hasil_evaluasi_rata_rata.csv')

Menghitung Kinerja Rata-Rata (5 Run)...

=== HASIL RATA-RATA 5 RUN (AVERAGE)
                accuracy_few_shot  accuracy_few_shot_cot  accuracy_zero_shot  \
kategori                                                                       
Sangat Pendek              0.8852                 0.8813              0.7951   
Pendek                     0.8113                 0.9237              0.7234   
Sedang                     0.7933                 0.9006              0.7056   
Panjang                    0.7610                 0.8961              0.6906   
Sangat Panjang             0.7202                 0.8100              0.6689   

                accuracy_zero_shot_cot  f1_score_few_shot  \
kategori                                                    
Sangat Pendek                   0.7599             0.2876   
Pendek                          0.8262             0.3671   
Sedang                          0.8071             0.4292   
Panjang                         0.7805             0.4141

In [5]:
# CONSENSUS ANALYSIS (MAJORITY VOTE)
# Combine the 5 runs into one prediction per comment and technique (the most
# frequent label set), then score that consensus against the ground truth.
print("\nMenghitung Kinerja Majority Vote...")

# 1. Load every run, keeping only the needed columns, and stack them.
cols = ['comment'] + [f'prediksi_{t}' for t in TEKNIK_LIST]
dfs = [pd.read_csv(f'eksperimen_no_{i}.csv')[cols] for i in range(1, 6)]
df_stack = pd.concat(dfs, ignore_index=True)

# 2. Parse the stringified lists once, up front.
for t in TEKNIK_LIST:
    kolom_pred = f'prediksi_{t}'
    df_stack[kolom_pred] = df_stack[kolom_pred].apply(parse_list)

# 3. Group by comment and take the mode of each technique's predictions.
agg_dict = dict.fromkeys([f'prediksi_{t}' for t in TEKNIK_LIST], get_mode)
df_majority_preds = df_stack.groupby('comment').agg(agg_dict).reset_index()

# 4. Attach the ground truth (inner join: unmatched comments are dropped).
df_maj_final = pd.merge(df_base, df_majority_preds, on='comment', how='inner')

# 5. Score the consensus predictions per length bucket and technique.
maj_results = []
for kategori in ORDER_KATEGORI:
    subset = df_maj_final[df_maj_final['kategori_panjang'] == kategori]
    if not subset.empty:
        y_true = [get_binary_vec(labels) for labels in subset['ground_truth']]

        for teknik in TEKNIK_LIST:
            y_pred = [get_binary_vec(labels) for labels in subset[f'prediksi_{teknik}']]

            acc, f1 = calculate_metrics(y_true, y_pred)

            maj_results.append({
                'kategori': kategori,
                'teknik': teknik,
                'accuracy': acc,
                'f1_score': f1
            })

# Format the majority-vote output: one row per bucket, flattened metric columns.
df_maj_res = pd.DataFrame(maj_results)
pivot_maj = df_maj_res.pivot(index='kategori', columns='teknik', values=['accuracy', 'f1_score'])
pivot_maj.columns = ['_'.join(col) for col in pivot_maj.columns]
pivot_maj = pivot_maj.reindex(ORDER_KATEGORI)

print("\nHASIL KONSENSUS (MAJORITY VOTE)")
print(pivot_maj.round(4))
pivot_maj.to_csv('hasil_evaluasi_majority_vote.csv')

print("\nSemua hasil tersimpan.")


Menghitung Kinerja Majority Vote...

HASIL KONSENSUS (MAJORITY VOTE)
                accuracy_few_shot  accuracy_few_shot_cot  accuracy_zero_shot  \
kategori                                                                       
Sangat Pendek              0.8920                 0.8908              0.7896   
Pendek                     0.8281                 0.9236              0.7461   
Sedang                     0.8063                 0.9038              0.7080   
Panjang                    0.7721                 0.9031              0.6916   
Sangat Panjang             0.7397                 0.8127              0.6899   

                accuracy_zero_shot_cot  f1_score_few_shot  \
kategori                                                    
Sangat Pendek                   0.7647             0.3175   
Pendek                          0.8510             0.3806   
Sedang                          0.8291             0.4420   
Panjang                         0.8041             0.4341   
San