# Comparison Results: OpenAI GPT-4o vs Grok vs Gemini

**Lưu ý về Grok API:**
- Hiện tại Grok API chưa có tài liệu chính thức công khai
- Code sử dụng URL giả định: `https://api.x.ai/v1/chat/completions`
- Bạn cần có `GROK_API_KEY` hợp lệ trong file `.env`
- Nếu endpoint của API khác với URL trên, hãy cập nhật URL trong hàm `call_grok_api()`

**Mục đích:**
So sánh hiệu suất của các mô hình LLM khác nhau với các kỹ thuật prompting khác nhau cho bài toán ABSA (Aspect-Based Sentiment Analysis).

In [None]:
import pandas as pd
import json
from os import path
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Data and result directories.
# Each one defaults to the original author's local Windows checkout; set the
# ABSA_DATA_DIR / ABSA_RESULT_DIR environment variables to point the notebook
# at another location without editing this cell.
DATA_DIR = os.environ.get("ABSA_DATA_DIR", r"c:\Users\Admin\Python\ABSA_Prompting\data")
RESULT_DIR = os.environ.get("ABSA_RESULT_DIR", r"c:\Users\Admin\Python\ABSA_Prompting\results")

In [None]:
# Score rows are collected here, one per experiment whose result file is found.
results_comparison = []

# Every (model, technique) combination to compare, together with the JSON file
# that holds its predictions.
experiments = [
    {"model": model_name, "technique": technique_name, "file": file_name}
    for model_name, technique_name, file_name in (
        # OpenAI GPT-4o
        ("GPT-4o", "Zero-shot", "ViABSA_BP_Zero-shot.json"),
        ("GPT-4o", "CoT", "ViABSA_BP_CoT.json"),
        ("GPT-4o", "Few-shot-Clustering", "ViABSA_BP_Few-shot-Clustering.json"),
        # Grok
        ("Grok", "Zero-shot", "ViABSA_BP_Zero-shot_Grok.json"),
        ("Grok", "CoT", "ViABSA_BP_CoT_Grok.json"),
        ("Grok", "Few-shot-Clustering", "ViABSA_BP_Few-shot-Clustering_Grok.json"),
        # Gemini
        ("Gemini", "Zero-shot", "ViABSA_BP_Zero-shot_Gemini.json"),
        ("Gemini", "CoT", "ViABSA_BP_CoT_Gemini.json"),
        ("Gemini", "Few-shot-Clustering", "ViABSA_BP_Few-shot-Clustering_Gemini.json"),
    )
]

# Load the ground-truth test split from <DATA_DIR>/ViABSA_BP/data_test.csv.
ViABSA_BP_dir = path.join(DATA_DIR, 'ViABSA_BP')
test_file = path.join(ViABSA_BP_dir, 'data_test.csv')
# Read with pandas defaults (comma-separated, first row as header).
test_df = pd.read_csv(test_file)

def transform_aspect_sentiment(df, start=0, end=None):
    """Convert a positional slice of the labelled frame into per-review records.

    For every row in ``df.iloc[start:end]`` a dict is produced with:

    * ``"id"`` - the row's index label, as a string;
    * ``"text"`` - the value of the ``data`` column;
    * ``"sentiments"`` - one ``{"aspect", "sentiment"}`` dict for each of the
      eight fixed aspects whose ``<aspect>_label`` column equals 1. The
      sentiment is the value of the ``<aspect>`` column, or ``"unknown"``
      when that value is the string ``'none'``.

    Args:
        df: DataFrame with a ``data`` column plus ``<aspect>`` and
            ``<aspect>_label`` columns for every aspect listed below.
        start: First row position (inclusive).
        end: Last row position (exclusive); ``None`` means ``len(df)``.

    Returns:
        A list of record dicts, in row order.
    """
    aspect_names = (
        "stayingpower",
        "texture",
        "smell",
        "price",
        "others",
        "colour",
        "shipping",
        "packing",
    )

    stop = len(df) if end is None else end

    records = []
    for row_label, row in df.iloc[start:stop].iterrows():
        record = {"id": str(row_label), "text": row['data'], "sentiments": []}

        for name in aspect_names:
            if row[f"{name}_label"] == 1:
                value = row[name]
                # A flagged aspect with no concrete polarity is reported as "unknown".
                record["sentiments"].append({
                    "aspect": name,
                    "sentiment": value if value != 'none' else "unknown",
                })

        records.append(record)

    return records

# Prepare the ground truth: missing aspect values become the string 'none',
# each aspect gets a 0/1 "<aspect>_label" presence column, and the first 100
# rows are converted to the record format used by the evaluator.
aspects = [
    'stayingpower',
    'texture',
    'smell',
    'price',
    'others',
    'colour',
    'shipping',
    'packing',
]
test_df[aspects] = test_df[aspects].fillna(value='none')

for aspect in aspects:
    test_df[f"{aspect}_label"] = test_df[aspect].ne('none').astype(int)

ground_truth = transform_aspect_sentiment(test_df, start=0, end=100)

In [None]:
def evaluate_aspect_sentiment(ground_truth, predictions):
    """Score predictions against the ground truth with micro-averaged F1.

    Entries are paired by position with ``zip``, so any surplus entries in the
    longer list are ignored. Two scores are computed from counts summed over
    all paired entries:

    * aspect detection - compares the sets of aspect names;
    * sentiment classification - compares the sets of
      ``(aspect, sentiment)`` pairs.

    Args:
        ground_truth: Sequence of dicts whose ``'sentiments'`` value is a list
            of ``{'aspect', 'sentiment'}`` dicts.
        predictions: Sequence of dicts whose ``'results'`` value is a list of
            ``{'aspect', 'sentiment'}`` dicts.

    Returns:
        Dict with the keys ``"Aspect Detection F1"`` and
        ``"Sentiment Classification F1"``.
    """
    eps = 1e-8  # keeps every division defined when a count is zero

    def micro_f1(gold_sets, predicted_sets):
        # Pool true/false positives and false negatives over all entries.
        tp = sum(len(gold & pred) for gold, pred in zip(gold_sets, predicted_sets))
        fp = sum(len(pred - gold) for gold, pred in zip(gold_sets, predicted_sets))
        fn = sum(len(gold - pred) for gold, pred in zip(gold_sets, predicted_sets))

        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        return 2 * precision * recall / (precision + recall + eps)

    gold_aspects, predicted_aspects = [], []
    gold_pairs, predicted_pairs = [], []

    for gold_entry, predicted_entry in zip(ground_truth, predictions):
        gold_items = gold_entry['sentiments']
        gold_aspects.append({item['aspect'] for item in gold_items})
        gold_pairs.append({(item['aspect'], item['sentiment']) for item in gold_items})

        predicted_items = predicted_entry['results']
        predicted_aspects.append({item['aspect'] for item in predicted_items})
        predicted_pairs.append(
            {(item['aspect'], item['sentiment']) for item in predicted_items}
        )

    return {
        "Aspect Detection F1": micro_f1(gold_aspects, predicted_aspects),
        "Sentiment Classification F1": micro_f1(gold_pairs, predicted_pairs),
    }

# Evaluate every experiment whose prediction file exists.
#
# The accumulator is reset here because it is otherwise only created in an
# earlier cell: re-running this cell on its own would append a second copy of
# every row, and duplicate (Technique, Model) pairs make the later
# `results_df.pivot(...)` call fail.
results_comparison = []

for exp in experiments:
    result_file = path.join(RESULT_DIR, exp["file"])

    # Missing result files are reported and skipped rather than treated as errors.
    if not os.path.exists(result_file):
        print(f"File not found: {result_file}")
        continue

    with open(result_file, 'r', encoding='utf-8') as f:
        predictions = json.load(f)

    scores = evaluate_aspect_sentiment(ground_truth, predictions)

    results_comparison.append({
        "Model": exp["model"],
        "Technique": exp["technique"],
        "Aspect Detection F1": scores["Aspect Detection F1"],
        "Sentiment Classification F1": scores["Sentiment Classification F1"],
        "File": exp["file"]
    })

# Build the results table. The columns are named explicitly so the frame keeps
# the same schema even when no result file was found.
results_df = pd.DataFrame(
    results_comparison,
    columns=[
        "Model",
        "Technique",
        "Aspect Detection F1",
        "Sentiment Classification F1",
        "File",
    ],
)
print("=== COMPARISON RESULTS ===")
print(results_df)

In [None]:
# Comparison bar charts: one panel per metric, techniques on the x-axis and
# one bar per model.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Left panel: Aspect Detection F1
pivot_aspect = results_df.pivot(
    index='Technique',
    columns='Model',
    values='Aspect Detection F1',
)
pivot_aspect.plot.bar(ax=axes[0], title='Aspect Detection F1 Score Comparison')
axes[0].set(ylabel='F1 Score')
axes[0].legend(title='Model')
axes[0].tick_params(axis='x', rotation=45)

# Right panel: Sentiment Classification F1
pivot_sentiment = results_df.pivot(
    index='Technique',
    columns='Model',
    values='Sentiment Classification F1',
)
pivot_sentiment.plot.bar(ax=axes[1], title='Sentiment Classification F1 Score Comparison')
axes[1].set(ylabel='F1 Score')
axes[1].legend(title='Model')
axes[1].tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Comparison heatmaps: the same Technique x Model tables as the bar charts,
# with each cell annotated to four decimals.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Aspect Detection heatmap (sns.heatmap returns the Axes it drew on)
sns.heatmap(
    pivot_aspect, ax=axes[0], cmap='YlOrRd', annot=True, fmt='.4f'
).set_title('Aspect Detection F1 Score Heatmap')

# Sentiment Classification heatmap
sns.heatmap(
    pivot_sentiment, ax=axes[1], cmap='YlOrRd', annot=True, fmt='.4f'
).set_title('Sentiment Classification F1 Score Heatmap')

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics (mean / std / max of both F1 scores), first grouped by
# model and then by prompting technique, rounded to four decimals.
print("\n=== SUMMARY BY MODEL ===")
summary_by_model = (
    results_df
    .groupby('Model')[['Aspect Detection F1', 'Sentiment Classification F1']]
    .agg(['mean', 'std', 'max'])
    .round(4)
)
print(summary_by_model)

print("\n=== SUMMARY BY TECHNIQUE ===")
summary_by_technique = (
    results_df
    .groupby('Technique')[['Aspect Detection F1', 'Sentiment Classification F1']]
    .agg(['mean', 'std', 'max'])
    .round(4)
)
print(summary_by_technique)

In [None]:
# Report the single best (model, technique) row for each metric; idxmax picks
# the first row when several share the top score.
print("\n=== BEST PERFORMANCE ===")
top_aspect_label = results_df['Aspect Detection F1'].idxmax()
best_aspect = results_df.loc[top_aspect_label]
top_sentiment_label = results_df['Sentiment Classification F1'].idxmax()
best_sentiment = results_df.loc[top_sentiment_label]

print(
    "Best Aspect Detection:",
    f"Model: {best_aspect['Model']}, Technique: {best_aspect['Technique']}, F1: {best_aspect['Aspect Detection F1']:.4f}",
    sep="\n",
)

print(
    "\nBest Sentiment Classification:",
    f"Model: {best_sentiment['Model']}, Technique: {best_sentiment['Technique']}, F1: {best_sentiment['Sentiment Classification F1']:.4f}",
    sep="\n",
)

In [None]:
# Write the comparison table to RESULT_DIR (without the index column) and
# report where it went.
comparison_csv = path.join(RESULT_DIR, 'comparison_results.csv')
results_df.to_csv(comparison_csv, index=False)
print(f"\nResults saved to: {comparison_csv}")