In [14]:
from os import path
import pandas as pd
import numpy as np
import json
import requests
from tqdm import tqdm
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os

# Define the data and result directories.
# The defaults are the original local paths; set ABSA_DATA_DIR / ABSA_RESULT_DIR
# in the process environment to run the notebook elsewhere without editing this
# cell. (load_dotenv() runs in a later cell, so values from .env are not seen here.)
DATA_DIR = os.getenv("ABSA_DATA_DIR", r"c:\Users\Admin\Python\ABSA_Prompting\data")
RESULT_DIR = os.getenv("ABSA_RESULT_DIR", r"c:\Users\Admin\Python\ABSA_Prompting\results")
# Create both directories up front so later reads/writes do not fail on a missing folder.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)

In [15]:
# Load variables from a local .env file into the process environment;
# call_grok_api later reads GROK_API_KEY through os.getenv.
load_dotenv()

True

In [16]:
# CONFIG
# Location of the ViABSA_BP dataset under DATA_DIR and of its test split.
ViABSA_BP_dir = path.join(DATA_DIR, 'ViABSA_BP')
test_file = path.join(ViABSA_BP_dir, 'data_test.csv')
# Loaded once here; a later cell fills missing aspect values and adds
# '<aspect>_label' columns to this same frame in place.
test_df = pd.read_csv(test_file)

In [17]:
def transform_aspect_sentiment(df, start=0, end=None, aspects=None):
    """Convert rows of a wide aspect/sentiment frame into per-review dicts.

    For every row in ``df.iloc[start:end]`` an entry of the form
    ``{"id": <index label as str>, "text": row['data'], "sentiments": [...]}``
    is produced. An aspect is listed in ``sentiments`` only when its
    ``<aspect>_label`` column equals 1.

    Args:
        df: DataFrame with a ``data`` column plus, for each aspect, a column
            named after the aspect (sentiment value) and an ``<aspect>_label``
            column (1 when the aspect is present).
        start: Positional index of the first row to convert.
        end: Positional index one past the last row; ``None`` means ``len(df)``.
        aspects: Aspect column names to read. ``None`` uses the eight aspects
            this function has always handled.

    Returns:
        A list of dicts, one per row, in row order.

    Raises:
        KeyError: If ``data``, an aspect column, or its label column is missing.
    """
    if aspects is None:
        aspects = [
            "stayingpower",
            "texture",
            "smell",
            "price",
            "others",
            "colour",
            "shipping",
            "packing"
        ]

    if end is None:
        end = len(df)

    result = []

    for idx, row in df.iloc[start:end].iterrows():
        sentiments = []

        for aspect in aspects:
            if row[f"{aspect}_label"] == 1:
                value = row[aspect]
                # A flagged aspect without a usable sentiment ('none' sentinel or a
                # missing value) is reported as "unknown" instead of leaking NaN.
                if pd.isna(value) or value == 'none':
                    value = "unknown"
                sentiments.append({"aspect": aspect, "sentiment": value})

        result.append({
            "id": str(idx),
            "text": row['data'],
            "sentiments": sentiments
        })

    return result

In [18]:
# SETUP DATA
aspects = ['stayingpower', 'texture', 'smell', 'price', 'others', 'colour', 'shipping', 'packing']
test_df[aspects] = test_df[aspects].fillna('none')

for aspect in aspects:
    test_df[aspect + '_label'] = (test_df[aspect] != 'none').astype(int)

test_json = transform_aspect_sentiment(test_df, 0, 100)
test_json[:5]

[{'id': '0',
  'text': 'H√†ng ƒë√≥ng g√≥i ƒë·∫πp v√† ch·∫Øc ch·∫Øn, nh√¨n r·∫•t d·ªÖ th∆∞∆°ng v√† ∆∞ng b·ª•ng ·∫°!',
  'sentiments': [{'aspect': 'packing', 'sentiment': 'positive'}]},
 {'id': '1',
  'text': 'C·∫£m gi√°c son b√™n trong r·∫•t l√† √≠t lu√¥n √≠, ƒë√≥ng g√≥i c·∫©n th·∫≠n, d·ªãch nh∆∞ng m√† giao h√†ng kh√° nhanh',
  'sentiments': [{'aspect': 'shipping', 'sentiment': 'positive'},
   {'aspect': 'packing', 'sentiment': 'positive'}]},
 {'id': '2',
  'text': 'Son si√™u ƒë·∫πp lu√¥n √Ω, m√† y h√¨nh m√† ch·ª•p c√≥ th·ªÉ kh√¥ng gi·ªëng l·∫Øm nh∆∞ng nh√¨n ngo√†i th√¨ gi·ªëng nha. Ch·∫•t son m·ªÅm m∆∞·ªõt n√≥i chung l√† r·∫•t th√≠chhhh',
  'sentiments': [{'aspect': 'texture', 'sentiment': 'positive'}]},
 {'id': '3',
  'text': 'Siu ƒë·∫πp lu√¥n \r\nShop ƒë√≥ng g√≥i c·∫©n th·∫≠n l·∫Øm lu√¥n \r\nL·∫°i th√™m c·∫£ qu√† n·ªØa\r\nN√≥i chung l√† th√≠ch l·∫Øm',
  'sentiments': [{'aspect': 'packing', 'sentiment': 'positive'}]},
 {'id': '4',
  'text': 'A√∫hihƒëcyihb gfxxth jj bhgfzƒëE G GHVHBTCE

**Ý tưởng chọn example cho few-shot**
1.   Chọn ra những câu đa dạng về aspect từ dataset -> gom cụm chúng lại để LLMs hiểu rõ hơn về pattern để học
2.   Chọn ra những câu khó, những câu phức tạp hơn

Áp dụng K-Means Clustering + Chọn Centroid + Sampling Example khó

In [19]:
'''
    Nếu dùng TF-IDF thì sẽ chỉ chọn dựa vào tần suất từ, không hiểu nghĩa của câu -> bị trùng lặp
    Semantic grouping để chọn example đa dạng về nội dung, không chỉ về các từ
'''
# Encode every review in the test frame with a sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(test_df['data'].tolist(), show_progress_bar=True)

# KMeans clustering (70% diverse, 30% hard)
n_total = 5
n_diverse = int(n_total * 0.7)  # 3 when n_total is 5
n_hard = n_total - n_diverse

# One cluster per diverse example to be selected.
kmeans = KMeans(n_clusters=n_diverse, random_state=42, n_init=10)
kmeans.fit(embeddings)

# Take, for each cluster centroid, the row whose embedding is nearest to it
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, embeddings)
diverse_examples = test_df.iloc[closest]

print(f"Selected {len(diverse_examples)} diverse examples:")
for i, row in diverse_examples.iterrows():
    print(f"\nExample {i}:\n{row['data']}")

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:12<00:00,  4.16it/s]


Selected 3 diverse examples:

Example 1524:
Son ƒë·∫πp, h√†ng h·ªãnüòÅ, m√™ l·∫Øm lu√¥n, tuy k l·ª≥ nh∆∞ng b√π l·∫°i b√¥i l√™n m√¥i n√≥ kh√¥ng b·ªã kh√¥ m√¥i, xinh l·∫Øm lu√¥n, ƒë√≥ng g√≥i r·∫•t ch·∫Øc ch·∫Øn, gi√° r·∫ª, giao h√†ng t·ª´ TQ v·ªÅ VN n√™n h∆°i l√¢u...nh∆∞ng kh√¥ng saoüòäxinh l√† th√≠chüòÇüòÇ n√™n mua ·ªßng h·ªô shop nh√© m.n

Example 1574:
M√†u son ƒë·∫πp nh∆∞ng h∆°i l√¢u kh√¥ t√≠... 
Ch·∫•t son c≈©ng t·∫°m ƒë∆∞·ª£c
V·ªè son ƒë·∫πp
Son th∆°m c·ª±c luoonnnn

Example 1591:
ƒë·∫πppppppppppp xƒ©uuuuuuuuuuuu hdjhsjsjsjdjdjjdjdjdjdjd


In [20]:
all_indices = np.arange(len(test_df))

# Row positions not already taken as diverse examples. Sorting fixes the candidate
# order, so the seeded sample below no longer depends on set iteration order.
remaining_indices = sorted(set(all_indices) - set(closest))

# Randomly draw n_hard examples from the remaining rows.
# NOTE(review): this is a plain random sample; nothing here measures difficulty.
hard_examples = test_df.iloc[remaining_indices].sample(n=n_hard, random_state=42)

print(f"Selected {len(hard_examples)} hard examples:")
for i, row in hard_examples.iterrows():
    print(f"\nExample {i}:\n{row['data']}")

Selected 2 hard examples:

Example 135:
son s·ªãn . m√†u ƒë·∫πp jaisidiisisjsjsjsjdiididdiidjsjdjjdididi

Example 479:
Giao h√†ng nhanh ch√≥ng. H∆°i nh·ªè so v·ªõi t∆∞·ªüng t∆∞·ª£ng c·ªßa m√¨nh. Z z z nh∆∞ng v·ªõi gi√° ti·ªÅn th√¨ ch·∫•p nh·∫≠n ·∫°. N√™n cho 5 sao


In [21]:
def select_few_shot_examples(df, text_column, n_total=5, model_name='all-MiniLM-L6-v2', random_state=42):
    """Pick few-shot examples: cluster-representative rows plus a random remainder.

    The texts are embedded with a SentenceTransformer model and clustered with
    KMeans; the row nearest each centroid becomes a "diverse" example. The
    remaining slots are filled by a seeded random sample of the other rows
    (the "hard" set).

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the review text.
        n_total: Total number of examples to return across both frames.
        model_name: SentenceTransformer model identifier.
        random_state: Seed used for both KMeans and the random sample.

    Returns:
        ``(diverse_df, hard_df)``, both with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``n_total`` is below 1 or exceeds the number of rows.
    """
    if n_total < 1:
        raise ValueError(f"n_total must be at least 1, got {n_total}")
    if n_total > len(df):
        raise ValueError(f"n_total ({n_total}) exceeds the number of rows ({len(df)})")

    # KMeans needs at least one cluster; int(1 * 0.7) would otherwise be 0.
    n_diverse = max(1, int(n_total * 0.7))

    model = SentenceTransformer(model_name)
    embeddings = model.encode(df[text_column].tolist(), show_progress_bar=True)

    kmeans = KMeans(n_clusters=n_diverse, random_state=random_state, n_init=10)
    kmeans.fit(embeddings)

    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, embeddings)
    # Two centroids can share the same nearest row; keep each position once,
    # preserving centroid order.
    diverse_positions = list(dict.fromkeys(int(pos) for pos in closest))
    diverse_df = df.iloc[diverse_positions]

    # Fill whatever is left of n_total from the unused rows. Candidates are listed
    # in ascending position so the seeded sample is reproducible.
    n_hard = n_total - len(diverse_positions)
    taken = set(diverse_positions)
    remaining_positions = [pos for pos in range(len(df)) if pos not in taken]
    hard_df = df.iloc[remaining_positions].sample(n=n_hard, random_state=random_state)

    return diverse_df.reset_index(drop=True), hard_df.reset_index(drop=True)

# SET-UP FEW-SHOT EXAMPLES
# NOTE(review): the examples are drawn from test_df, the same frame test_json is
# built from — confirm this is intended.
diverse, hard = select_few_shot_examples(test_df, text_column='data', n_total=5)

# Both frames come back with a fresh 0..n-1 index, so the 'id' values generated
# for `diverse` and `hard` each restart at '0' and can repeat across the two lists.
few_shot_json =  transform_aspect_sentiment(diverse, 0, 100) + transform_aspect_sentiment(hard, 0, 100)


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:12<00:00,  4.19it/s]


In [22]:
# Preview the assembled few-shot examples (at most the first 10).
few_shot_json[:10]

[{'id': '0',
  'text': 'Son ƒë·∫πp, h√†ng h·ªãnüòÅ, m√™ l·∫Øm lu√¥n, tuy k l·ª≥ nh∆∞ng b√π l·∫°i b√¥i l√™n m√¥i n√≥ kh√¥ng b·ªã kh√¥ m√¥i, xinh l·∫Øm lu√¥n, ƒë√≥ng g√≥i r·∫•t ch·∫Øc ch·∫Øn, gi√° r·∫ª, giao h√†ng t·ª´ TQ v·ªÅ VN n√™n h∆°i l√¢u...nh∆∞ng kh√¥ng saoüòäxinh l√† th√≠chüòÇüòÇ n√™n mua ·ªßng h·ªô shop nh√© m.n',
  'sentiments': [{'aspect': 'stayingpower', 'sentiment': 'negative'},
   {'aspect': 'texture', 'sentiment': 'positive'},
   {'aspect': 'price', 'sentiment': 'positive'},
   {'aspect': 'shipping', 'sentiment': 'negative'},
   {'aspect': 'packing', 'sentiment': 'positive'}]},
 {'id': '1',
  'text': 'M√†u son ƒë·∫πp nh∆∞ng h∆°i l√¢u kh√¥ t√≠... \nCh·∫•t son c≈©ng t·∫°m ƒë∆∞·ª£c\nV·ªè son ƒë·∫πp\nSon th∆°m c·ª±c luoonnnn',
  'sentiments': [{'aspect': 'texture', 'sentiment': 'positive'},
   {'aspect': 'smell', 'sentiment': 'positive'},
   {'aspect': 'colour', 'sentiment': 'positive'}]},
 {'id': '2',
  'text': 'ƒë·∫πppppppppppp xƒ©uuuuuuuuuuuu hdjhsjsjsjdjdjjdjdjdjdjd',


In [23]:
def _pairs_from_items(items):
    """Collect (aspect, sentiment) pairs from a list of dicts, skipping malformed items."""
    pairs = set()
    if not isinstance(items, list):
        return pairs
    for item in items:
        if not (isinstance(item, dict) and 'aspect' in item and 'sentiment' in item):
            continue
        try:
            pairs.add((item['aspect'], item['sentiment']))
        except TypeError:
            # Unhashable value (e.g. a list) from a malformed model reply.
            continue
    return pairs


def _micro_f1(true_sets, pred_sets, eps=1e-8):
    """Micro-averaged F1 over paired sets; eps keeps the divisions defined."""
    tp = sum(len(gt & pred) for gt, pred in zip(true_sets, pred_sets))
    fp = sum(len(pred - gt) for gt, pred in zip(true_sets, pred_sets))
    fn = sum(len(gt - pred) for gt, pred in zip(true_sets, pred_sets))

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    return 2 * precision * recall / (precision + recall + eps)


def evaluate_aspect_sentiment(ground_truth, predictions):
    """Score predictions against ground truth with micro-averaged F1.

    Two scores are computed: one over aspect names only, and one over
    (aspect, sentiment) pairs. Entries are paired positionally; if the two
    lists differ in length, the extra entries of the longer one are ignored.

    Args:
        ground_truth: List of dicts with a ``sentiments`` list of
            ``{"aspect": ..., "sentiment": ...}`` items.
        predictions: List of dicts with a ``results`` list of the same item
            shape. A prediction without a usable ``results`` list, or items
            lacking either key, count as predicting nothing.

    Returns:
        ``{"Aspect Detection F1": float, "Sentiment Classification F1": float}``.

    Raises:
        KeyError: If a ground-truth entry or item lacks an expected key.
    """
    true_aspects = []
    pred_aspects = []

    true_aspect_sentiments = []
    pred_aspect_sentiments = []

    for gt_entry, pred_entry in zip(ground_truth, predictions):
        # Ground truth is produced by this notebook, so it is read strictly.
        gt_pairs = {(item['aspect'], item['sentiment']) for item in gt_entry['sentiments']}
        true_aspects.append({aspect for aspect, _ in gt_pairs})
        true_aspect_sentiments.append(gt_pairs)

        # Predictions come from a model reply and may be malformed.
        raw_results = pred_entry.get('results') if isinstance(pred_entry, dict) else None
        pred_pairs = _pairs_from_items(raw_results)
        pred_aspects.append({aspect for aspect, _ in pred_pairs})
        pred_aspect_sentiments.append(pred_pairs)

    return {
        "Aspect Detection F1": _micro_f1(true_aspects, pred_aspects),
        "Sentiment Classification F1": _micro_f1(true_aspect_sentiments, pred_aspect_sentiments)
    }

In [24]:
# Setup Grok API
def call_grok_api(prompt, model="grok-3", timeout=30):
    """Send one chat-completion request and return the reply text.

    Args:
        prompt: User message content.
        model: Model name placed in the request payload.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Returns:
        The content of the first choice's message, or ``None`` when the API key
        is missing, the request fails, the status is not 200, or the payload
        does not have the expected shape. Failures are reported with ``print``.
    """
    api_key = os.getenv('GROK_API_KEY')
    if not api_key:
        # Without this check the header would literally read "Bearer None".
        print("Request Error: GROK_API_KEY is not set")
        return None

    url = "https://api.x.ai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an AI assistant that extracts aspects and their sentiments from text."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems, timeouts and similar transport failures.
        print(f"Request Error: {e}")
        return None

    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return None

    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Body was not JSON, or lacked choices[0].message.content.
        print(f"Request Error: unexpected response payload ({e})")
        return None

def create_clustering_prompt(text, few_shot_examples):
    """Assemble the few-shot prompt for one review.

    The prompt opens with the assistant description, then shows each few-shot
    example as a review followed by its expected JSON result, and ends with the
    target review, the allowed aspects (module-level ``aspects``), the allowed
    sentiments, and the required output format.

    Args:
        text: Review to analyse.
        few_shot_examples: Iterable of dicts with ``text`` and ``sentiments`` keys.

    Returns:
        The complete prompt string.
    """
    pieces = ["You are an AI assistant that extracts aspects and their sentiments from text.\n\n"]

    # Worked examples: review text followed by the expected JSON answer
    for example in few_shot_examples:
        expected = json.dumps({'results': example['sentiments']}, ensure_ascii=False)
        pieces.append(f"Extract aspects and sentiments from the following review:\n{example['text']}\n")
        pieces.append(f"Result: {expected}\n\n")

    # The review to label, plus the output constraints
    pieces.extend([
        f"Extract aspects and sentiments from the following review:\n{text}\n",
        f"Available aspects: {', '.join(aspects)}\n",
        "Available sentiments: positive, negative, neutral\n",
        "Return ONLY a valid JSON object in this exact format:\n",
        '{"results": [{"aspect": "aspect_name", "sentiment": "sentiment_value"}]}',
    ])

    return "".join(pieces)

def extract_with_grok_clustering(text, few_shot_examples):
    """Ask the model for aspect/sentiment pairs of one review and parse the reply.

    Args:
        text: Review to analyse.
        few_shot_examples: Examples forwarded to ``create_clustering_prompt``.

    Returns:
        The parsed JSON object when it is a dict holding a ``results`` list;
        otherwise ``{"results": []}`` (no reply, no JSON object in the reply,
        invalid JSON, or an unexpected structure).
    """
    prompt = create_clustering_prompt(text, few_shot_examples)
    response_text = call_grok_api(prompt)

    if not response_text:
        return {"results": []}

    # Locate the outermost JSON object in the reply. The closing index is checked
    # before adding 1; the previous `rfind(...) + 1 != -1` test could never fail.
    start_idx = response_text.find('{')
    end_idx = response_text.rfind('}')
    if start_idx == -1 or end_idx < start_idx:
        print(f"JSON Parse Error: {response_text}")
        return {"results": []}

    try:
        parsed_output = json.loads(response_text[start_idx:end_idx + 1])
    except json.JSONDecodeError:
        print(f"JSON Parse Error: {response_text}")
        return {"results": []}

    # Downstream scoring indexes ['results'] and iterates it, so enforce that shape here.
    if not isinstance(parsed_output, dict) or not isinstance(parsed_output.get("results"), list):
        print(f"JSON Parse Error: {response_text}")
        return {"results": []}

    return parsed_output

predictions = []

# Predict: one API call per review, appended in the same order as test_json
for data in tqdm(test_json):
    prediction = extract_with_grok_clustering(data['text'], few_shot_json)
    predictions.append(prediction)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [03:53<00:00,  2.34s/it]


In [25]:
# Score the predictions against the ground truth built from the first 100 test rows.
scores = evaluate_aspect_sentiment(test_json, predictions)
print(scores)

{'Aspect Detection F1': 0.9060240913734939, 'Sentiment Classification F1': 0.785542163668457}


In [27]:
result_file = path.join(RESULT_DIR, 'ViABSA_BP_Few-shot-BP-Grok.json')
# Explicit UTF-8: ensure_ascii=False writes non-ASCII characters verbatim, and the
# platform default encoding may be unable to encode them.
with open(result_file, 'w', encoding='utf-8') as f:
    json.dump(predictions, f, indent=4, ensure_ascii=False)