In [6]:
!pip install gdown
!pip install datasets
!pip install transformers


!gdown --folder "https://drive.google.com/drive/folders/17S2rfDOzBDvzBbZCrPMXgHashy6ob3Lz?usp=sharing"
!gdown --folder "https://drive.google.com/drive/folders/1-BRcY-RUeZe0gvUVAdekkzJlxF4Sdz43?usp=sharing"
!gdown --folder "https://drive.google.com/drive/folders/1GB_3zmE8j-iD7Fdg2wKtgv32IeiDxdMG?usp=sharing"
!gdown --folder "https://drive.google.com/drive/folders/1hMV9W-Gybh8t0e131G7b-BXzQL4kJrdY?usp=sharing"
!gdown --folder "https://drive.google.com/drive/folders/1RL5gUz_Sc_-EfNPmqKala47j3Mt_ODpn?usp=sharing"

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: PySocks, gdown
Successfully installed PySocks-1.7.1 gdown-5.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_

In [65]:
# Candidate "ENTITY,ATTRIBUTE" labels; each one is scored separately against the
# input sentence by predict_categories.
ENTITY_ATTRIBUTE_PAIRS = [
    'FOOD,QUALITY',
    'RESTAURANT,GENERAL',
    'FOOD,STYLE OPTIONS',
    'FOOD,PRICES',
    'DRINKS,STYLE OPTIONS',
    'SERVICE,GENERAL',
    'RESTAURANT,PRICES',
    'DRINKS,QUALITY',
    'DRINKS,PRICES',
    'LOCATION,GENERAL',
    'AMBIENCE,GENERAL',
    'RESTAURANT,MISCELLANEOUS',
]

# Pipeline 1: Entity - Attribute Extraction

In [66]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Local directory holding the saved checkpoint for pipeline 1
# (weights and tokenizer files are read from the same folder).
model_path = "./results_bert_20241211_134025"

# Restore the tokenizer and the sequence-classification model from disk.
tokenizer_load = AutoTokenizer.from_pretrained(model_path)
model_load = AutoModelForSequenceClassification.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [67]:
import torch
import numpy as np

def predict_categories(text, model, tokenizer, entity_attribute_combinations):
    """
    Predict which entity-attribute categories are expressed in ``text``.

    Every candidate pair is scored on its own: the text and the pair are encoded
    together as a sentence pair, the classifier is run once, and the pair is kept
    when the arg-max label is 1.

    Args:
        text: Sentence to classify.
        model: Sequence-classification model. Must expose ``.eval()``, ``.device``
            and return an object with a ``.logits`` tensor when called.
        tokenizer: Callable used as ``tokenizer(text, pair, ...)`` that returns a
            mapping of tensors.
        entity_attribute_combinations: Iterable of ``"ENTITY,ATTRIBUTE"`` strings.

    Returns:
        List of the kept pairs formatted as ``"ENTITY#ATTRIBUTE"`` (upper-cased,
        comma replaced by ``#``, spaces replaced by ``_``), in candidate order.
    """
    # Make sure dropout etc. is disabled, matching predict_ote / predict_polarity.
    model.eval()

    predicted_pairs = []

    for pair in entity_attribute_combinations:
        # Encode (text, pair) as a sentence pair and move tensors to the model's device.
        encoded = tokenizer(text, pair, return_tensors="pt", truncation=True, padding=True)
        encoded = {k: v.to(model.device) for k, v in encoded.items()}

        with torch.no_grad():
            logits = model(**encoded).logits

        # Single example per forward pass, so take the first row's arg-max.
        predicted_label = int(torch.argmax(logits, dim=-1)[0])

        # Label 1 means "this category is present".
        if predicted_label == 1:
            predicted_pairs.append(pair.upper().replace(",", "#").replace(" ", "_"))

    return predicted_pairs


# Smoke test: classify one sample review sentence with the pipeline-1 model.
text_input = "The pizza crust was the best and the service was a bitbad."
predicted = predict_categories(
    text=text_input,
    model=model_load,
    tokenizer=tokenizer_load,
    entity_attribute_combinations=ENTITY_ATTRIBUTE_PAIRS,
)
print("Predicted categories:", predicted)

Predicted categories: ['FOOD#QUALITY', 'SERVICE#GENERAL']


# Pipeline 2: Opinion Target Extraction

In [68]:
# BIO tag name -> class id used to decode the token-classification output
# (predict_ote inverts this mapping).
label_map = dict(zip(("O", "B-OTE", "I-OTE"), range(3)))

In [69]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Local directory holding the saved checkpoint for pipeline 2
# (weights and tokenizer files are read from the same folder).
model_path_2 = "./results_bert_20241211_205040"

# Restore the tokenizer and the token-classification model from disk.
tokenizer_load_2 = AutoTokenizer.from_pretrained(model_path_2)
model_load_2 = AutoModelForTokenClassification.from_pretrained(model_path_2)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [71]:
def predict_ote(model, tokenizer, text, aspect, label_map):
    """
    Extract opinion-target expressions (OTEs) from ``text`` for one aspect.

    The aspect and the text are encoded as a sentence pair (aspect first), the
    token classifier labels every token, and consecutive ``B-OTE`` / ``I-OTE``
    tokens that belong to the text segment are merged into spans.

    Args:
        model: Token-classification model. Must expose ``.eval()``, ``.device``
            and return an object with ``.logits`` (indexed as batch, token, label).
        tokenizer: Tokenizer supporting ``return_offsets_mapping=True`` whose
            result provides ``sequence_ids()`` (a fast Hugging Face tokenizer),
            plus ``convert_ids_to_tokens``.
        text: Sentence to search for targets.
        aspect: Aspect category string, e.g. ``"SERVICE#GENERAL"``.
        label_map: Mapping from tag name (``"O"``, ``"B-OTE"``, ``"I-OTE"``) to
            class id.

    Returns:
        List of dicts ``{"token": str, "start": int, "end": int}``, one per
        detected span. ``start``/``end`` are the character offsets reported by
        the tokenizer for the text segment. Word pieces (``##``-prefixed) are
        glued to the span; separate words are joined with a single space.

    Raises:
        KeyError: If the model predicts a class id that is not in ``label_map``.
    """
    model.eval()  # Set model to evaluation mode

    encoding = tokenizer(
        aspect,
        text,
        truncation=True,
        padding=True,
        return_tensors="pt",
        return_offsets_mapping=True,  # needed to report character spans
    )
    # Keep inputs on the same device as the model (consistent with predict_categories).
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)
    offset_mapping = encoding["offset_mapping"][0].tolist()
    # Per-token segment id: None for special tokens, 0 for the aspect, 1 for the text.
    sequence_ids = encoding.sequence_ids(0)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    id_to_label = {v: k for k, v in label_map.items()}
    predicted_labels = [id_to_label[class_id] for class_id in predicted_ids]

    tokens_list = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    ote_tokens = []
    ongoing_ote = False  # True while the previous text token was part of a span

    for token, label, seq_id, (start, end) in zip(
        tokens_list, predicted_labels, sequence_ids, offset_mapping
    ):
        # Only tokens of the text segment can be targets. Counting aspect tokens
        # to find the text start is off by the leading [CLS], so rely on the
        # tokenizer's own segment ids instead.
        if seq_id != 1:
            continue

        if label == "B-OTE":  # Start a new OTE span
            ongoing_ote = True
            ote_tokens.append({"token": token, "start": start, "end": end})
        elif label == "I-OTE" and ongoing_ote:  # Continue the current span
            span = ote_tokens[-1]
            if token.startswith("##"):
                # Word-piece continuation: glue directly onto the previous piece.
                span["token"] += token[2:]
            else:
                # New token: keep a space only if the source text had a gap here.
                separator = " " if start > span["end"] else ""
                span["token"] += separator + token
            span["end"] = end
        else:
            ongoing_ote = False  # "O" or an orphan "I-OTE" closes the span

    return ote_tokens


In [72]:
# Smoke test: extract the opinion target for one aspect category of a sample sentence.
predict_ote(
    model=model_load_2,
    tokenizer=tokenizer_load_2,
    text="The pizza crust was the best and the service was a bit bad",
    aspect="SERVICE#GENERAL",
    label_map=label_map,
)

[{'token': 'service', 'start': 37, 'end': 44}]

# Pipeline 3: Sentiment Polarity

In [73]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments, Trainer

# Pipeline 3 keeps its tokenizer and its classifier weights in separate folders.
tokenizer_task3 = AutoTokenizer.from_pretrained("./tokenizer_bert_base")
model_task3 = AutoModelForSequenceClassification.from_pretrained("./model_bert_task3")

def preprocess_function(examples):
    """
    Tokenize ``examples["text"]`` with the module-level ``tokenizer_task3``.

    Inputs are padded to the tokenizer's maximum length, truncated, and returned
    as PyTorch tensors.
    """
    texts = examples["text"]
    encoded = tokenizer_task3(texts, padding="max_length", truncation=True,  return_tensors="pt")
    return encoded

In [74]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch

# Assuming model and tokenizer are already loaded
def predict_polarity(model, sentence, entity, attribute, target, tokenizer):
    """
    Predict the sentiment class of ``sentence`` towards one aspect/target.

    The classifier input is the single string
    ``"{target} {attribute} {entity} [SEP] {sentence}"``, padded/truncated to
    128 tokens.

    Args:
        model: Sequence-classification model. Must expose ``.eval()``, ``.device``
            and return an object with ``.logits`` when called with ``input_ids``
            and ``attention_mask``.
        sentence: Full review sentence.
        entity: Entity part of the aspect category (e.g. ``"FOOD"``).
        attribute: Attribute part of the aspect category (e.g. ``"PRICES"``).
        target: Opinion-target text found in the sentence.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The arg-max class index as a Python int. The ``ABSA`` caller in this
        notebook reads 0 as negative, 1 as neutral and any other value as positive.
    """
    model.eval()

    # Prepend the target, attribute and entity to the sentence for aspect-driven
    # sentiment analysis.
    input_text = f"{target} {attribute} {entity} [SEP] {sentence}"

    encoded_inputs = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Keep inputs on the same device as the model; otherwise a model that is not
    # on the CPU fails with a device mismatch (predict_categories already does this).
    input_ids = encoded_inputs["input_ids"].to(model.device)
    attention_mask = encoded_inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    return torch.argmax(outputs.logits, dim=1).item()


In [75]:
def ABSA(sentence_input):
    """
    Run the three pipelines on one sentence and print the results.

    1. ``predict_categories`` selects the entity#attribute categories.
    2. ``predict_ote`` finds opinion targets for each selected category.
    3. ``predict_polarity`` classifies every (category, target) combination.

    One block of four lines (input, category, target, polarity) followed by a
    blank line is printed per target. Nothing is returned, and nothing is
    printed for categories that yield no target.
    """
    # Class ids with an explicit name; every other id is reported as "positive".
    polarity_names = {0: "negative", 1: "neutral"}

    categories = predict_categories(sentence_input, model_load, tokenizer_load, ENTITY_ATTRIBUTE_PAIRS)

    # Gather all (category, target) combinations before scoring any of them.
    predicted_targets = [
        {"aspect": category, "token": span["token"]}
        for category in categories
        for span in predict_ote(model_load_2, tokenizer_load_2, sentence_input, category, label_map)
    ]

    for item in predicted_targets:
        aspect_parts = item["aspect"].split('#')
        entity = aspect_parts[0]
        attribute = aspect_parts[1]
        target = item['token']

        prediction = predict_polarity(model_task3, sentence_input, entity, attribute, target, tokenizer_task3)
        polarity = polarity_names.get(prediction, "positive")

        print(f"Input: {sentence_input}")
        print(f"Category: {entity}#{attribute}")
        print(f"Target: {target}")
        print(f"Polarity: {polarity}")

        print()

In [84]:
# Read one sentence interactively and run the full pipeline on it.
# ABSA prints its results; the output therefore depends on what was typed.
sentence_input = input("Enter a sentence: ")
ABSA(sentence_input)

Input: The steak was expensive
Category: FOOD#PRICES
Target: steak
Polarity: positive



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1f59986d-c4b4-4654-929d-a1205f783ae5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>