In [46]:
import pandas as pd
from collections import defaultdict

Load data dan mapping label yang digunakan

In [47]:
df = pd.read_excel("MATCH_final-anotasi-pos-tagging.xlsx")

label_mapping = {
    'PAD':'PAD',
    'ADV': 'ADV',
    'ASP': 'ADV', 
    'B--':'B--',
    'CC-': 'CC-', 
    'CCA': 'CC-',   
    'CCS': 'CC-', 
    'CD': 'CD',
    'X--': 'F--', 
    'F--': 'F--', 
    'DET': 'DET', 
    'NSB': 'NSB', 
    'NSD': 'NSD',
    'NSM': 'NSB',  
    'NSV': 'NSB',
    'NSP': 'NSB',
    'NSP': 'NSD', 
    'NUM':'NUM',
    'ORD':'NUM', 
    'Z--':'Z--',
    'O--':'O--',
    'S--': 'S--',
    'T--': 'T--', 
    'R--':'ADV', 
    'PRD': 'PRD',
    'PRF': 'PRF',
    'PRP': 'PRP',
    'PRT': 'PRT',
    'PR-': 'PRD',
    'PRN': 'PRN',
    'W--': 'PRQ',
    'PRQ': 'PRQ',
    'PP1': 'PRP',
    'PS2': 'PRP',
    'PS3': 'PRF', 
    'G--':'PRF',
    'ASS':'ASS',
    'VSB': 'VSB',
    'VSP': 'VSP',
    'VBI': 'VBI',
    'VBL': 'VBL',
    'VBT': 'VBT',
    'VSA': 'VSA',
    'CO': 'CO', 
    'H--': 'H--',
    'M--': 'M--',
}


Split token beserta labelnya

In [48]:
def parse_tagged_sentence(tagged_text):
    """
    Mengubah string kalimat bertag menjadi list tuple (kata, tag).
    Mengabaikan token yang tidak sesuai format kata/tag.
    """
    tokens = []
    for w in tagged_text.split():
        parts = w.rsplit('/', 1)  # Split dari kanan, sekali saja
        if len(parts) == 2:
            tokens.append((parts[0], parts[1]))
    return tokens

Penyesuaian tag asli ke tag sesuai dengan label mapping yang telah diinisialisasi sebelumnya

In [49]:
def map_tag(tag):
    """Mapping tag asli ke tag final sesuai label_mapping"""
    return label_mapping.get(tag, tag)

Mengambil dan membandingkan tiap anotasi (stanza x manual & gpt x manual) dengan manual sebagai acuannya. 
Setiap index merepresentasikan urutan token pada anotasi.

In [50]:
def evaluate_tags(gpt_tags, stanza_tags, manual_tags):
    results = []
    len_gpt = len(gpt_tags)
    len_stanza = len(stanza_tags)
    len_manual = len(manual_tags)

    max_len = max(len_gpt, len_stanza, len_manual)

    for idx in range(max_len):
        gpt_tok, gpt_tag = gpt_tags[idx] if idx < len_gpt else (None, None)
        stanza_tok, stanza_tag = stanza_tags[idx] if idx < len_stanza else (None, None)
        manual_tok, manual_tag = manual_tags[idx] if idx < len_manual else (None, None)

        # Bandingkan token (kata) dan tag setelah mapping
        gpt_match = (gpt_tok == manual_tok) and (map_tag(gpt_tag) == map_tag(manual_tag))
        stanza_match = (stanza_tok == manual_tok) and (map_tag(stanza_tag) == map_tag(manual_tag))

        result = {
            'index': idx,
            'manual_token': manual_tok,
            'manual_tag': map_tag(manual_tag) if manual_tag else None,
            'gpt_token': gpt_tok,
            'gpt_tag': map_tag(gpt_tag) if gpt_tag else None,
            'stanza_token': stanza_tok,
            'stanza_tag': map_tag(stanza_tag) if stanza_tag else None,
            'gpt_correct': gpt_match,
            'stanza_correct': stanza_match
        }
        results.append(result)
    return results

Mengambil semua kalimat (soal) untuk dibandingkan

In [51]:
# Loop tiap baris (kalimat)
for i, row in df.iterrows():
    gpt = parse_tagged_sentence(row['gpt'])
    stanza = parse_tagged_sentence(row['stanza'])
    manual = parse_tagged_sentence(row['manual'])

    evaluated = evaluate_tags(gpt, stanza, manual)

    print(f"\n=== Kalimat ke-{i+1} ===")
    for r in evaluated:
        print(f"Index {r['index']}: manual=({r['manual_token']}/{r['manual_tag']}), "
              f"gpt=({r['gpt_token']}/{r['gpt_tag']}) [{'✅' if r['gpt_correct'] else '❌'}], "
              f"stanza=({r['stanza_token']}/{r['stanza_tag']}) [{'✅' if r['stanza_correct'] else '❌'}]")


=== Kalimat ke-1 ===
Index 0: manual=(Bagian/NSD), gpt=(Bagian/NSD) [✅], stanza=(Bagian/NSD) [✅]
Index 1: manual=(dari/ADV), gpt=(dari/ADV) [✅], stanza=(dari/ADV) [✅]
Index 2: manual=(bidang/NSD), gpt=(bidang/NSD) [✅], stanza=(bidang/NSD) [✅]
Index 3: manual=(biologi/NSD), gpt=(biologi/NSD) [✅], stanza=(biologi/NSD) [✅]
Index 4: manual=(yang/S--), gpt=(yang/S--) [✅], stanza=(yang/S--) [✅]
Index 5: manual=(membutuhkan/VSA), gpt=(membutuhkan/VSA) [✅], stanza=(membutuhkan/VSA) [✅]
Index 6: manual=(penggunaan/NSD), gpt=(penggunaan/NSD) [✅], stanza=(penggunaan/NSD) [✅]
Index 7: manual=(mikroskop/NSD), gpt=(mikroskop/NSD) [✅], stanza=(mikroskop/F--) [❌]
Index 8: manual=(,/Z--), gpt=(,/Z--) [✅], stanza=(,/Z--) [✅]
Index 9: manual=(sebagai/ADV), gpt=(sebagai/ADV) [✅], stanza=(sebagai/ADV) [✅]
Index 10: manual=(contoh/NSD), gpt=(contoh/NSD) [✅], stanza=(contoh/NSD) [✅]

=== Kalimat ke-2 ===
Index 0: manual=(Dalam/ADV), gpt=(Dalam/ADV) [✅], stanza=(Dalam/NSD) [❌]
Index 1: manual=(studi/NSD), gp

Melihat dan membandingkan akurasi tiap metode anotasi dengan label spesifik

In [52]:
print("PERBANDINGAN AKURASI SECARA KESELURUHAN")

total_manual_tokens = 0
total_gpt_correct = 0
total_stanza_correct = 0

for i, row in df.iterrows():
    gpt = parse_tagged_sentence(row['gpt'])
    stanza = parse_tagged_sentence(row['stanza'])
    manual = parse_tagged_sentence(row['manual'])

    evaluated = evaluate_tags(gpt, stanza, manual)

    for r in evaluated:
        if r['manual_token'] is not None:
            total_manual_tokens += 1
            if r['gpt_correct']:
                total_gpt_correct += 1
            if r['stanza_correct']:
                total_stanza_correct += 1

gpt_accuracy = total_gpt_correct / total_manual_tokens if total_manual_tokens else 0
stanza_accuracy = total_stanza_correct / total_manual_tokens if total_manual_tokens else 0


print(f"Akurasi GPT: {gpt_accuracy:.2%}")
print(f"Akurasi Stanza: {stanza_accuracy:.2%}")
print(f"Selisih akurasi GPT dan Stanza: {(gpt_accuracy - stanza_accuracy):.2%}")


PERBANDINGAN AKURASI SECARA KESELURUHAN
Akurasi GPT: 63.13%
Akurasi Stanza: 61.88%
Selisih akurasi GPT dan Stanza: 1.25%


### Membandingkan akurasi secara umum per jenis kata

Mapping label untuk pengelombokan kata berdasarkan huruf pertama pada label

In [53]:
def get_general_category(tag):
    """
    Mengelompokkan tag ke dalam kategori umum berdasarkan huruf pertama
    """
    if tag is None:
        return None
    
    # Kategori berdasarkan huruf pertama
    if tag.startswith('N'):
        return 'Noun'
    elif tag.startswith('V'):
        return 'Verb'
    elif tag.startswith('P'):
        return 'Pronomina'
    elif tag.startswith('C'):
        return 'Coordinating'
    elif tag.startswith('ADV') or tag.startswith('R'):
        return 'Adverb'
    elif tag.startswith('DET'):
        return 'Determiner'
    elif tag.startswith('NUM') or tag.startswith('CD'):
        return 'Number'
    elif tag.startswith('ASS'):
        return 'Assertion'
    elif tag.startswith('B'):
        return 'Interjection'
    elif tag.startswith('F'):
        return 'Foreign'
    elif tag.startswith('H'):
        return 'Honorific'
    elif tag.startswith('M'):
        return 'Modal'
    elif tag.startswith('O'):
        return 'Onomatopoeia'
    elif tag.startswith('S'):
        return 'Suffix'
    elif tag.startswith('T'):
        return 'Particle'
    elif tag.startswith('Z'):
        return 'Punctuation'
    elif tag == 'PAD':
        return 'Padding'
    else:
        return 'Other'


Split tiap kata dengan tokennya masing-masing

In [54]:
def parse_tagged_sentence(tagged_text):
    """
    Mengubah string kalimat bertag menjadi list tuple (kata, tag).
    Mengabaikan token yang tidak sesuai format kata/tag.
    """
    tokens = []
    for w in tagged_text.split():
        parts = w.rsplit('/', 1)  # Split dari kanan, sekali saja
        if len(parts) == 2:
            tokens.append((parts[0], parts[1]))
    return tokens


In [55]:
def map_tag(tag):
    """Mapping tag asli ke tag final sesuai label_mapping"""
    return label_mapping.get(tag, tag)


Melihat dan membandingkan akurasi tiap metode anotasi menggunakan pengelompokan kata

In [56]:
def evaluate_tags_general(gpt_tags, stanza_tags, manual_tags):
    """
    Evaluasi dengan perbandingan kategori umum dan spesifik
    """
    results = []
    len_gpt = len(gpt_tags)
    len_stanza = len(stanza_tags)
    len_manual = len(manual_tags)

    max_len = max(len_gpt, len_stanza, len_manual)

    for idx in range(max_len):
        gpt_tok, gpt_tag = gpt_tags[idx] if idx < len_gpt else (None, None)
        stanza_tok, stanza_tag = stanza_tags[idx] if idx < len_stanza else (None, None)
        manual_tok, manual_tag = manual_tags[idx] if idx < len_manual else (None, None)

        # Mapping tag ke tag final
        gpt_mapped = map_tag(gpt_tag) if gpt_tag else None
        stanza_mapped = map_tag(stanza_tag) if stanza_tag else None
        manual_mapped = map_tag(manual_tag) if manual_tag else None

        # Kategori umum
        gpt_general = get_general_category(gpt_mapped)
        stanza_general = get_general_category(stanza_mapped)
        manual_general = get_general_category(manual_mapped)

        # Perbandingan spesifik (seperti sebelumnya)
        gpt_specific_match = (gpt_tok == manual_tok) and (gpt_mapped == manual_mapped)
        stanza_specific_match = (stanza_tok == manual_tok) and (stanza_mapped == manual_mapped)

        # Perbandingan kategori umum
        gpt_general_match = (gpt_tok == manual_tok) and (gpt_general == manual_general)
        stanza_general_match = (stanza_tok == manual_tok) and (stanza_general == manual_general)

        result = {
            'index': idx,
            'manual_token': manual_tok,
            'manual_tag': manual_mapped,
            'manual_category': manual_general,
            'gpt_token': gpt_tok,
            'gpt_tag': gpt_mapped,
            'gpt_category': gpt_general,
            'stanza_token': stanza_tok,
            'stanza_tag': stanza_mapped,
            'stanza_category': stanza_general,
            'gpt_specific_correct': gpt_specific_match,
            'stanza_specific_correct': stanza_specific_match,
            'gpt_general_correct': gpt_general_match,
            'stanza_general_correct': stanza_general_match
        }
        results.append(result)
    return results

Analisis akurasi per kalimat

In [57]:
print("=== ANALISIS PER KALIMAT ===")
for i, row in df.iterrows():
    if i >= 3:  # Hanya tampilkan 3 kalimat pertama untuk contoh
        break
        
    gpt = parse_tagged_sentence(row['gpt'])
    stanza = parse_tagged_sentence(row['stanza'])
    manual = parse_tagged_sentence(row['manual'])

    evaluated = evaluate_tags_general(gpt, stanza, manual)

    print(f"\n=== Kalimat ke-{i+1} ===")
    for r in evaluated:
        if r['manual_token'] is not None:
            print(f"Index {r['index']}: manual=({r['manual_token']}/{r['manual_tag']}/{r['manual_category']})")
            print(f"  GPT=({r['gpt_token']}/{r['gpt_tag']}/{r['gpt_category']}) "
                  f"[Spesifik: {'✅' if r['gpt_specific_correct'] else '❌'}, "
                  f"Umum: {'✅' if r['gpt_general_correct'] else '❌'}]")
            print(f"  Stanza=({r['stanza_token']}/{r['stanza_tag']}/{r['stanza_category']}) "
                  f"[Spesifik: {'✅' if r['stanza_specific_correct'] else '❌'}, "
                  f"Umum: {'✅' if r['stanza_general_correct'] else '❌'}]")


=== ANALISIS PER KALIMAT ===

=== Kalimat ke-1 ===
Index 0: manual=(Bagian/NSD/Noun)
  GPT=(Bagian/NSD/Noun) [Spesifik: ✅, Umum: ✅]
  Stanza=(Bagian/NSD/Noun) [Spesifik: ✅, Umum: ✅]
Index 1: manual=(dari/ADV/Adverb)
  GPT=(dari/ADV/Adverb) [Spesifik: ✅, Umum: ✅]
  Stanza=(dari/ADV/Adverb) [Spesifik: ✅, Umum: ✅]
Index 2: manual=(bidang/NSD/Noun)
  GPT=(bidang/NSD/Noun) [Spesifik: ✅, Umum: ✅]
  Stanza=(bidang/NSD/Noun) [Spesifik: ✅, Umum: ✅]
Index 3: manual=(biologi/NSD/Noun)
  GPT=(biologi/NSD/Noun) [Spesifik: ✅, Umum: ✅]
  Stanza=(biologi/NSD/Noun) [Spesifik: ✅, Umum: ✅]
Index 4: manual=(yang/S--/Suffix)
  GPT=(yang/S--/Suffix) [Spesifik: ✅, Umum: ✅]
  Stanza=(yang/S--/Suffix) [Spesifik: ✅, Umum: ✅]
Index 5: manual=(membutuhkan/VSA/Verb)
  GPT=(membutuhkan/VSA/Verb) [Spesifik: ✅, Umum: ✅]
  Stanza=(membutuhkan/VSA/Verb) [Spesifik: ✅, Umum: ✅]
Index 6: manual=(penggunaan/NSD/Noun)
  GPT=(penggunaan/NSD/Noun) [Spesifik: ✅, Umum: ✅]
  Stanza=(penggunaan/NSD/Noun) [Spesifik: ✅, Umum: ✅]
In

Analisis Keseluruhan setiap anotasi

In [58]:
print("\n\n=== ANALISIS KESELURUHAN ===")

# Counters untuk akurasi
total_manual_tokens = 0
total_gpt_specific_correct = 0
total_stanza_specific_correct = 0
total_gpt_general_correct = 0
total_stanza_general_correct = 0

# Counters per kategori
category_stats = defaultdict(lambda: {
    'total': 0,
    'gpt_specific': 0,
    'stanza_specific': 0,
    'gpt_general': 0,
    'stanza_general': 0
})

# Loop semua kalimat
for i, row in df.iterrows():
    gpt = parse_tagged_sentence(row['gpt'])
    stanza = parse_tagged_sentence(row['stanza'])
    manual = parse_tagged_sentence(row['manual'])

    evaluated = evaluate_tags_general(gpt, stanza, manual)

    for r in evaluated:
        if r['manual_token'] is not None:
            total_manual_tokens += 1
            category = r['manual_category']
            
            # Update category stats
            category_stats[category]['total'] += 1
            
            # Akurasi spesifik
            if r['gpt_specific_correct']:
                total_gpt_specific_correct += 1
                category_stats[category]['gpt_specific'] += 1
            if r['stanza_specific_correct']:
                total_stanza_specific_correct += 1
                category_stats[category]['stanza_specific'] += 1
                
            # Akurasi umum
            if r['gpt_general_correct']:
                total_gpt_general_correct += 1
                category_stats[category]['gpt_general'] += 1
            if r['stanza_general_correct']:
                total_stanza_general_correct += 1
                category_stats[category]['stanza_general'] += 1

# Hitung akurasi keseluruhan
gpt_specific_accuracy = total_gpt_specific_correct / total_manual_tokens if total_manual_tokens else 0
stanza_specific_accuracy = total_stanza_specific_correct / total_manual_tokens if total_manual_tokens else 0
gpt_general_accuracy = total_gpt_general_correct / total_manual_tokens if total_manual_tokens else 0
stanza_general_accuracy = total_stanza_general_correct / total_manual_tokens if total_manual_tokens else 0

print("AKURASI KESELURUHAN:")
print(f"Total token: {total_manual_tokens}")
print(f"\nAkurasi Spesifik:")
print(f"  GPT: {gpt_specific_accuracy:.2%}")
print(f"  Stanza: {stanza_specific_accuracy:.2%}")
print(f"  Selisih: {(gpt_specific_accuracy - stanza_specific_accuracy):.2%}")

print(f"\nAkurasi Kategori Umum:")
print(f"  GPT: {gpt_general_accuracy:.2%}")
print(f"  Stanza: {stanza_general_accuracy:.2%}")
print(f"  Selisih: {(gpt_general_accuracy - stanza_general_accuracy):.2%}")

print(f"\nPerbandingan Peningkatan dengan Kategori Umum:")
print(f"  GPT: {(gpt_general_accuracy - gpt_specific_accuracy):.2%}")
print(f"  Stanza: {(stanza_general_accuracy - stanza_specific_accuracy):.2%}")





=== ANALISIS KESELURUHAN ===
AKURASI KESELURUHAN:
Total token: 2167

Akurasi Spesifik:
  GPT: 63.13%
  Stanza: 61.88%
  Selisih: 1.25%

Akurasi Kategori Umum:
  GPT: 76.19%
  Stanza: 71.25%
  Selisih: 4.94%

Perbandingan Peningkatan dengan Kategori Umum:
  GPT: 13.06%
  Stanza: 9.37%


Akurasi per kategori kata

In [59]:
print("\n\nAKURASI PER KATEGORI:")
print("-" * 80)
print(f"{'Kategori':<15} {'Total':<8} {'GPT Spesifik':<12} {'GPT Umum':<10} {'Stanza Spesifik':<15} {'Stanza Umum':<12}")
print("-" * 80)

for category in sorted(category_stats.keys()):
    stats = category_stats[category]
    total = stats['total']
    
    if total > 0:
        gpt_spec_pct = stats['gpt_specific'] / total * 100
        gpt_gen_pct = stats['gpt_general'] / total * 100
        stanza_spec_pct = stats['stanza_specific'] / total * 100
        stanza_gen_pct = stats['stanza_general'] / total * 100
        
        print(f"{category:<15} {total:<8} {gpt_spec_pct:<12.1f}% {gpt_gen_pct:<10.1f}% {stanza_spec_pct:<15.1f}% {stanza_gen_pct:<12.1f}%")

# Tabel perbandingan untuk visualisasi
print("\n\nTABEL PERBANDINGAN AKURASI:")
print("-" * 60)
print(f"{'Metrik':<25} {'GPT':<15} {'Stanza':<15}")
print("-" * 60)
print(f"{'Akurasi Spesifik':<25} {gpt_specific_accuracy:<15.2%} {stanza_specific_accuracy:<15.2%}")
print(f"{'Akurasi Kategori Umum':<25} {gpt_general_accuracy:<15.2%} {stanza_general_accuracy:<15.2%}")
print(f"{'Peningkatan':<25} {(gpt_general_accuracy - gpt_specific_accuracy):<15.2%} {(stanza_general_accuracy - stanza_specific_accuracy):<15.2%}")
print("-" * 60)



AKURASI PER KATEGORI:
--------------------------------------------------------------------------------
Kategori        Total    GPT Spesifik GPT Umum   Stanza Spesifik Stanza Umum 
--------------------------------------------------------------------------------
Adverb          260      82.3        % 82.3      % 73.1           % 73.1        %
Coordinating    66       45.5        % 45.5      % 0.0            % 51.5        %
Determiner      4        0.0         % 0.0       % 0.0            % 0.0         %
Foreign         48       50.0        % 50.0      % 100.0          % 100.0       %
Interjection    1        0.0         % 0.0       % 100.0          % 100.0       %
Noun            882      80.8        % 84.9      % 75.2           % 77.9        %
Onomatopoeia    6        0.0         % 0.0       % 100.0          % 100.0       %
Other           104      0.0         % 0.0       % 0.0            % 3.8         %
Particle        4        0.0         % 0.0       % 100.0          % 100.0       

Analisis Akurasi Per soal

In [60]:
def calculate_sentence_accuracy(gpt_tags, stanza_tags, manual_tags):
    """Menghitung akurasi untuk satu kalimat"""
    len_gpt = len(gpt_tags)
    len_stanza = len(stanza_tags)
    len_manual = len(manual_tags)
    
    max_len = max(len_gpt, len_stanza, len_manual)
    
    gpt_correct_specific = 0
    stanza_correct_specific = 0
    gpt_correct_general = 0
    stanza_correct_general = 0
    valid_tokens = 0
    
    for idx in range(max_len):
        gpt_tok, gpt_tag = gpt_tags[idx] if idx < len_gpt else (None, None)
        stanza_tok, stanza_tag = stanza_tags[idx] if idx < len_stanza else (None, None)
        manual_tok, manual_tag = manual_tags[idx] if idx < len_manual else (None, None)
        
        if manual_tok is not None:
            valid_tokens += 1
            
            # Mapping tag
            gpt_mapped = map_tag(gpt_tag) if gpt_tag else None
            stanza_mapped = map_tag(stanza_tag) if stanza_tag else None
            manual_mapped = map_tag(manual_tag) if manual_tag else None
            
            # Kategori umum
            gpt_general = get_general_category(gpt_mapped)
            stanza_general = get_general_category(stanza_mapped)
            manual_general = get_general_category(manual_mapped)
            
            # Hitung kebenaran spesifik
            if (gpt_tok == manual_tok) and (gpt_mapped == manual_mapped):
                gpt_correct_specific += 1
            if (stanza_tok == manual_tok) and (stanza_mapped == manual_mapped):
                stanza_correct_specific += 1
                
            # Hitung kebenaran kategori umum
            if (gpt_tok == manual_tok) and (gpt_general == manual_general):
                gpt_correct_general += 1
            if (stanza_tok == manual_tok) and (stanza_general == manual_general):
                stanza_correct_general += 1
    
    results = {
        'total_tokens': valid_tokens,
        'gpt_specific_correct': gpt_correct_specific,
        'stanza_specific_correct': stanza_correct_specific,
        'gpt_general_correct': gpt_correct_general,
        'stanza_general_correct': stanza_correct_general,
        'gpt_specific_accuracy': gpt_correct_specific / valid_tokens if valid_tokens > 0 else 0,
        'stanza_specific_accuracy': stanza_correct_specific / valid_tokens if valid_tokens > 0 else 0,
        'gpt_general_accuracy': gpt_correct_general / valid_tokens if valid_tokens > 0 else 0,
        'stanza_general_accuracy': stanza_correct_general / valid_tokens if valid_tokens > 0 else 0
    }
    
    return results

In [61]:
# Analisis per soal
print("=== ANALISIS AKURASI PER SOAL ===")
print("=" * 120)
print(f"{'No':<4} {'Tokens':<8} {'GPT Spesifik':<12} {'GPT Umum':<10} {'Stanza Spesifik':<15} {'Stanza Umum':<12} {'Selisih Spesifik':<15} {'Selisih Umum':<12} {'Soal':<15}")
print("=" * 120)

# Untuk menyimpan statistik per soal
question_stats = []
soal_performance = defaultdict(list)

for i, row in df.iterrows():
    # Ambil informasi soal (asumsi ada kolom 'soal', sesuaikan dengan nama kolom actual)
    soal_id = row.get('soal', f'Soal_{i+1}')  # Jika tidak ada kolom soal, gunakan nomor urut
    
    gpt = parse_tagged_sentence(row['gpt'])
    stanza = parse_tagged_sentence(row['stanza'])
    manual = parse_tagged_sentence(row['manual'])
    
    accuracy = calculate_sentence_accuracy(gpt, stanza, manual)
    
    # Simpan statistik
    stats = {
        'sentence_id': i + 1,
        'soal': soal_id,
        'total_tokens': accuracy['total_tokens'],
        'gpt_specific_accuracy': accuracy['gpt_specific_accuracy'],
        'stanza_specific_accuracy': accuracy['stanza_specific_accuracy'],
        'gpt_general_accuracy': accuracy['gpt_general_accuracy'],
        'stanza_general_accuracy': accuracy['stanza_general_accuracy'],
        'diff_specific': accuracy['gpt_specific_accuracy'] - accuracy['stanza_specific_accuracy'],
        'diff_general': accuracy['gpt_general_accuracy'] - accuracy['stanza_general_accuracy']
    }
    
    question_stats.append(stats)
    soal_performance[soal_id].append(stats)
    
    # Tampilkan hasil per kalimat
    print(f"{i+1:<4} {accuracy['total_tokens']:<8} "
          f"{accuracy['gpt_specific_accuracy']:<12.1%} {accuracy['gpt_general_accuracy']:<10.1%} "
          f"{accuracy['stanza_specific_accuracy']:<15.1%} {accuracy['stanza_general_accuracy']:<12.1%} "
          f"{stats['diff_specific']:<15.1%} {stats['diff_general']:<12.1%} {str(soal_id):<15}")

print("=" * 120)

# Agregasi per soal (jika ada beberapa kalimat per soal)
print("\n\n=== RANGKUMAN AKURASI PER SOAL ===")
print("=" * 100)
print(f"{'Kalimat':<8} {'Rata-rata GPT Spesifik':<20} {'Rata-rata GPT Umum':<18} {'Rata-rata Stanza Spesifik':<22} {'Rata-rata Stanza Umum':<20} {'Soal':<15}")
print("=" * 100)

soal_summary = []
for soal_id, stats_list in soal_performance.items():
    num_sentences = len(stats_list)
    avg_gpt_specific = sum(s['gpt_specific_accuracy'] for s in stats_list) / num_sentences
    avg_gpt_general = sum(s['gpt_general_accuracy'] for s in stats_list) / num_sentences
    avg_stanza_specific = sum(s['stanza_specific_accuracy'] for s in stats_list) / num_sentences
    avg_stanza_general = sum(s['stanza_general_accuracy'] for s in stats_list) / num_sentences
    
    summary = {
        'soal': soal_id,
        'num_sentences': num_sentences,
        'avg_gpt_specific': avg_gpt_specific,
        'avg_gpt_general': avg_gpt_general,
        'avg_stanza_specific': avg_stanza_specific,
        'avg_stanza_general': avg_stanza_general,
        'avg_diff_specific': avg_gpt_specific - avg_stanza_specific,
        'avg_diff_general': avg_gpt_general - avg_stanza_general
    }
    
    soal_summary.append(summary)
    
    print(f"{num_sentences:<8} {avg_gpt_specific:<20.1%} {avg_gpt_general:<18.1%} "
          f"{avg_stanza_specific:<22.1%} {avg_stanza_general:<20.1%} {str(soal_id):<15}")

print("=" * 100)

# Ranking soal berdasarkan performa
print("\n\n=== RANKING SOAL BERDASARKAN PERFORMA GPT ===")
print("(Berdasarkan akurasi spesifik GPT)")
gpt_ranking = sorted(soal_summary, key=lambda x: x['avg_gpt_specific'], reverse=True)

print("=" * 80)
print(f"{'Ranking':<8} {'GPT Spesifik':<15} {'GPT Umum':<15} {'Selisih vs Stanza':<20} {'Soal':<15}")
print("=" * 80)

for rank, summary in enumerate(gpt_ranking, 1):
    print(f"{rank:<8} {summary['avg_gpt_specific']:<15.1%} "
          f"{summary['avg_gpt_general']:<15.1%} {summary['avg_diff_specific']:<20.1%} {str(summary['soal']):<15}")

print("\n\n=== RANKING SOAL BERDASARKAN PERFORMA STANZA ===")
print("(Berdasarkan akurasi spesifik Stanza)")
stanza_ranking = sorted(soal_summary, key=lambda x: x['avg_stanza_specific'], reverse=True)

print("=" * 80)
print(f"{'Ranking':<8} {'Stanza Spesifik':<17} {'Stanza Umum':<15} {'Selisih vs GPT':<20} {'Soal':<15}")
print("=" * 80)

for rank, summary in enumerate(stanza_ranking, 1):
    print(f"{rank:<8} {summary['avg_stanza_specific']:<17.1%} "
          f"{summary['avg_stanza_general']:<15.1%} {-summary['avg_diff_specific']:<20.1%} {str(summary['soal']):<15}")

# Statistik keseluruhan
total_sentences = len(question_stats)
overall_gpt_specific = sum(s['gpt_specific_accuracy'] for s in question_stats) / total_sentences
overall_gpt_general = sum(s['gpt_general_accuracy'] for s in question_stats) / total_sentences
overall_stanza_specific = sum(s['stanza_specific_accuracy'] for s in question_stats) / total_sentences
overall_stanza_general = sum(s['stanza_general_accuracy'] for s in question_stats) / total_sentences

print(f"\n\n=== STATISTIK KESELURUHAN ===")
print(f"Total kalimat dianalisis: {total_sentences}")
print(f"Rata-rata akurasi GPT (spesifik): {overall_gpt_specific:.2%}")
print(f"Rata-rata akurasi GPT (umum): {overall_gpt_general:.2%}")
print(f"Rata-rata akurasi Stanza (spesifik): {overall_stanza_specific:.2%}")
print(f"Rata-rata akurasi Stanza (umum): {overall_stanza_general:.2%}")
print(f"Selisih rata-rata (GPT - Stanza) spesifik: {(overall_gpt_specific - overall_stanza_specific):.2%}")
print(f"Selisih rata-rata (GPT - Stanza) umum: {(overall_gpt_general - overall_stanza_general):.2%}")

# Identifikasi soal dengan performa ekstrem
print(f"\n=== SOAL DENGAN PERFORMA TERBAIK DAN TERBURUK ===")
best_gpt = max(soal_summary, key=lambda x: x['avg_gpt_specific'])
worst_gpt = min(soal_summary, key=lambda x: x['avg_gpt_specific'])
best_stanza = max(soal_summary, key=lambda x: x['avg_stanza_specific'])
worst_stanza = min(soal_summary, key=lambda x: x['avg_stanza_specific'])

print(f"Soal terbaik untuk GPT: {best_gpt['soal']} ({best_gpt['avg_gpt_specific']:.1%})")
print(f"Soal terburuk untuk GPT: {worst_gpt['soal']} ({worst_gpt['avg_gpt_specific']:.1%})")
print(f"Soal terbaik untuk Stanza: {best_stanza['soal']} ({best_stanza['avg_stanza_specific']:.1%})")
print(f"Soal terburuk untuk Stanza: {worst_stanza['soal']} ({worst_stanza['avg_stanza_specific']:.1%})")

# Ekspor hasil ke DataFrame untuk analisis lebih lanjut (opsional)
df_results = pd.DataFrame(question_stats)
print(f"\n=== DATA TERSIMPAN ===")
print("Data hasil analisis tersimpan dalam variabel 'df_results'")
print("Kolom yang tersedia:", list(df_results.columns))
print("\nContoh 5 baris pertama:")
print(df_results.head())

=== ANALISIS AKURASI PER SOAL ===
No   Tokens   GPT Spesifik GPT Umum   Stanza Spesifik Stanza Umum  Selisih Spesifik Selisih Umum Soal           
1    11       100.0%       100.0%     90.9%           90.9%        9.1%            9.1%         Bagian dari bidang biologi yang membutuhkan penggunaan mikroskop, sebagai contoh
2    11       72.7%        90.9%      63.6%           81.8%        9.1%            9.1%         Dalam studi biologi, penggunaan mikroskop diperlukan untuk mempelajari hal seperti
3    11       63.6%        90.9%      63.6%           90.9%        0.0%            0.0%         Ilmu biologi yang membutuhkan penggunaan mikroskop untuk mengkaji, sebagai contoh
4    14       64.3%        85.7%      64.3%           85.7%        0.0%            0.0%         Cabang biologi yang mendorong peneliti untuk menghasilkan bibit unggul melalui hibridisasi (persilangan) 
5    15       73.3%        93.3%      73.3%           93.3%        0.0%            0.0%         Cabang ilmu biologi y