In [2]:
import os
import pickle
import time

import numpy as np
import torch
from transformers import AutoTokenizer,AutoModel

from ASTE_dataloader import load_vocab
from scheme.span_tagging import form_label_id_map, form_sentiment_id_map
from model import base_model


In [11]:
# Notebook configuration.
# Paths are assembled with os.path.join so they resolve on every OS and no
# longer contain the invalid "\A" escape that a plain backslash literal has.
# dataset_dir = os.path.join('data', 'ASTE-Data-V2-EMNLP2020')
dataset_dir = os.path.join('data', 'ASTE-Data-V2-EMNLP2020_TRANSLATED_TO_ARABIC')
dataset = '16res'  # sub-folder of dataset_dir that holds the data split
version = '3D'  # tagging-scheme version handed to form_label_id_map
bert_model = 'aubmindlab/bert-base-arabertv2'  # checkpoint name given to AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
modelPath = os.path.join('outputs', 'best_models', '16res_3D_True_best.pkl')

In [12]:
# Tokenizer for the BERT checkpoint selected in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Point dataset_dir at the dataset sub-folder. The guard makes this cell
# idempotent: without it every re-run appended the folder name again
# (".../16res/16res/...") and load_vocab then looked in a non-existent path.
if os.path.basename(os.path.normpath(dataset_dir)) != dataset:
    dataset_dir = os.path.join(dataset_dir, dataset)
vocab = load_vocab(dataset_dir=dataset_dir)

# Attach the label and sentiment id maps to the vocabulary dictionary.
label2id, id2label = form_label_id_map(version)
senti2id, id2senti = form_sentiment_id_map()
vocab['label_vocab'] = dict(label2id=label2id, id2label=id2label)
vocab['senti_vocab'] = dict(senti2id=senti2id, id2senti=id2senti)


In [13]:
def preprocess_sentence(sentence, tokenizer, vocab, max_len=128, lower=True):
    """
    Turn one raw sentence into a single-item batch for the model.

    The sentence is split on whitespace into words. Each word is broken into
    sub-word pieces by ``tokenizer`` while recording which word every piece
    came from. Pieces beyond ``max_len`` are dropped, and the word list is cut
    so that it ends at the word owning the last surviving piece.

    Args:
        sentence (str): Whitespace-separated input text.
        tokenizer: Object providing ``cls_token``, ``sep_token``,
            ``tokenize`` and ``convert_tokens_to_ids``.
        vocab (dict): Must contain ``'token_vocab'``, an object with a
            ``stoi`` mapping and an ``unk_index`` attribute.
        max_len (int): Maximum number of sub-word pieces kept; the CLS and
            SEP ids are added on top of this limit.
        lower (bool): Lowercase every word before tokenising.

    Returns:
        dict: ``token``, ``token_length``, ``bert_token``, ``bert_length``
        and ``bert_word_mapback`` as tensors built from one-element lists,
        plus ``golden_label`` which is always ``None``.
    """
    word_vocab = vocab['token_vocab']

    words = sentence.split()
    if lower:
        words = [w.lower() for w in words]

    # Collect the sub-word pieces and, for each piece, its source word index.
    pieces = []
    piece_to_word = []
    for word_idx, word in enumerate(words):
        sub_tokens = tokenizer.tokenize(word)
        pieces.extend(sub_tokens)
        piece_to_word.extend([word_idx] * len(sub_tokens))

    # Enforce the piece budget on both parallel lists.
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)[:max_len]
    piece_to_word = piece_to_word[:max_len]

    # Number of words still represented after truncation (0 if nothing left).
    n_words = piece_to_word[-1] + 1 if piece_to_word else 0
    word_ids = [
        word_vocab.stoi.get(w, word_vocab.unk_index) for w in words[:n_words]
    ]

    cls_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token])
    sep_ids = tokenizer.convert_tokens_to_ids([tokenizer.sep_token])

    # Every field is wrapped in an outer list to form a batch of size one.
    return {
        'token': torch.LongTensor([word_ids]),
        'token_length': torch.tensor([n_words]),
        'bert_token': torch.LongTensor([cls_ids + piece_ids + sep_ids]),
        'bert_length': torch.tensor([len(piece_to_word)]),
        'bert_word_mapback': torch.LongTensor([piece_to_word]),
        'golden_label': None,  # no gold labels at inference time
    }

In [14]:
def _decode_span_label(label):
    """Split one span label into ``(is_aspect, is_opinion, sentiment)``.

    ``sentiment`` is ``None`` when the label carries no sentiment. Labels with
    a number of ``-``-separated parts other than 1, 2 or 3 decode to nothing.
    """
    parts = label.split('-')
    if len(parts) == 3:
        # NOTE(review): assumes three-part labels are ordered
        # aspect-opinion-sentiment; confirm against form_label_id_map.
        aspect_part, opinion_part, sentiment_part = parts
        sentiment = sentiment_part if sentiment_part not in ('N', '') else None
        return aspect_part == 'A', opinion_part == 'O', sentiment
    if len(parts) == 2:
        role_part, sentiment_part = parts
        sentiment = sentiment_part if sentiment_part not in ('N', '') else None
        return role_part == 'A', role_part == 'O', sentiment
    if len(parts) == 1:
        sentiment = label if label in ('POS', 'NEG', 'NEU') else None
        return label == 'A', label == 'O', sentiment
    return False, False, None


def extract_triplets_from_output(outputs, tokens, id2label, threshold=0.5):
    """
    Extract aspect-opinion-sentiment triplets from model outputs.

    Only the first item of the batch is read. For every span ``(i, j)`` with
    ``i <= j`` the arg-max label is kept if its softmax score reaches
    ``threshold``; the label then marks the span as an aspect, an opinion
    and/or a sentiment carrier. Each aspect span is paired with each opinion
    span, and a pair becomes a triplet when a sentiment is found on, in order
    of preference, the span covering both, the aspect span, or the opinion
    span.

    Args:
        outputs: Mapping whose ``'logits'`` entry is a tensor indexed as
            ``[batch, i, j, label]``.
        tokens (list[str]): Words of the sentence. If there are more words
            than rows/columns in the score table (for example because the
            input was truncated during preprocessing), the extra words are
            ignored instead of raising ``IndexError``.
        id2label: Mapping or sequence from label id to label string.
        threshold (float): Minimum softmax score for a span label to count.

    Returns:
        list[tuple[str, str, str]]: ``(aspect_text, opinion_text, sentiment)``.
    """
    logits = outputs['logits']
    label_ids = logits.argmax(dim=-1).cpu().numpy()[0]
    scores = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    # Never index past the score table, whatever the caller's token count is.
    n = min(len(tokens), label_ids.shape[0], label_ids.shape[1])

    aspect_spans = []
    opinion_spans = []
    sentiment_spans = {}
    for i in range(n):
        for j in range(i, n):
            label_id = label_ids[i, j]
            if scores[i, j, label_id] < threshold:
                continue
            is_aspect, is_opinion, sentiment = _decode_span_label(id2label[label_id])
            if is_aspect:
                aspect_spans.append((i, j))
            if is_opinion:
                opinion_spans.append((i, j))
            if sentiment is not None:
                sentiment_spans[(i, j)] = sentiment

    # Greedy pairing of every aspect span with every opinion span.
    triplets = []
    for a_span in aspect_spans:
        for o_span in opinion_spans:
            combined_span = (min(a_span[0], o_span[0]), max(a_span[1], o_span[1]))
            # First span that carries a sentiment wins: combined, aspect, opinion.
            sentiment = next(
                (sentiment_spans[span]
                 for span in (combined_span, a_span, o_span)
                 if span in sentiment_spans),
                None,
            )
            if sentiment:
                aspect_text = " ".join(tokens[a_span[0]:a_span[1] + 1])
                opinion_text = " ".join(tokens[o_span[0]:o_span[1] + 1])
                triplets.append((aspect_text, opinion_text, sentiment))

    return triplets

In [15]:
def get_outputs(model, sentence, device):
    """
    Run the model on one raw sentence and return its outputs.

    The sentence is preprocessed with ``preprocess_sentence`` using the
    notebook-level ``tokenizer`` and ``vocab``, every tensor in the resulting
    batch is moved to ``device``, and the model is called on the batch with
    gradient tracking disabled.

    Args:
        model: The trained model. It is moved to ``device`` and switched to
            evaluation mode in place.
        sentence (str): Raw input sentence.
        device: Device to run the model on.

    Returns:
        Whatever ``model(batch)`` returns; downstream code in this notebook
        reads its ``'logits'`` entry.
    """
    model.to(device)
    # A model pickled during training may still be in training mode; eval()
    # disables dropout / batch-norm updates so predictions are deterministic.
    model.eval()

    input_batch = preprocess_sentence(
        sentence=sentence,
        tokenizer=tokenizer,
        vocab=vocab,
    )

    # Move tensors to the target device; non-tensor entries pass through.
    input_batch = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in input_batch.items()
    }

    with torch.no_grad():
        outputs = model(input_batch)

    return outputs

In [16]:
# Load the pickled model.
# WARNING: weights_only=False unpickles arbitrary Python objects, which can
# execute code stored in the file. Only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU machine also loads on a CPU-only one.
model = torch.load(modelPath, map_location=device, weights_only=False)


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [None]:
sentence = "كان النادل في المطعم غير منظم لكن الطعام لذيذ"
# perf_counter is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tokens = sentence.split()
outputs = get_outputs(model, sentence, device)
# Extract triplets
triplets = extract_triplets_from_output(
    outputs=outputs,
    tokens=tokens,
    id2label=vocab['label_vocab']['id2label'],
    threshold=0.5
)
end_time = time.perf_counter()
print(f"Inference time: {end_time - start_time:.4f} seconds")

# Display the extracted triplets
print(f"Input: {sentence}")
print("Extracted triplets:")
for aspect, opinion, sentiment in triplets:
    print(f"  • Aspect: '{aspect}', Opinion: '{opinion}', Sentiment: {sentiment}")

Inference time: 0.0496 seconds
Input: كان النادل في المطعم غير منظم لكن الطعام لذيذ
Extracted triplets:
  • Aspect: 'النادل', Opinion: 'غير منظم', Sentiment: NEG
  • Aspect: 'الطعام', Opinion: 'لذيذ', Sentiment: POS
