# BERT

In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=41c85b23dc1adc3048692d5d2da266c184ef931be7f7661d98692dcd731fd7aa
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [114]:
!pip install tqdm



In [127]:
from datasets import load_dataset

dataset = load_dataset("alexcadillon/SemEval2016Task5", "restaurants")
dataset

DatasetDict({
    trial: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 10
    })
    train: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 350
    })
    test: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 90
    })
})

In [128]:
train_dataset = dataset['train']
train_dataset

Dataset({
    features: ['reviewId', 'sentences', 'opinions'],
    num_rows: 350
})

In [5]:
test_dataset = dataset["test"]
test_dataset

Dataset({
    features: ['reviewId', 'sentences', 'opinions'],
    num_rows: 90
})

In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=3)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model = model.to(device)

## Concatenate Aspect Category To Input

In [91]:
label_map = {"O": 0, "B-OTE": 1, "I-OTE": 2}

In [130]:
def preprocess_data(data, tokenizer):
  input_ids = []
  attention_masks = []
  all_labels = []
  sentences = data['sentences']

  for sentence in sentences:
    for item in sentence:
      text = item['text']
      opinions = item['opinions']

      for opinion in opinions:
        category = f"{opinion['category']['entity']}#{opinion['category']['attribute']}"
        tokens = tokenizer(
            category,
            text,
            truncation=False,
            padding=False,
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Initizialized label
        labels = ["O"] * len(tokens["input_ids"][0])
        target = opinion["target"]
        start = int(opinion["from"])
        end = int(opinion["to"])

        # print(target)
        # print(start)
        # print(end)
        # print(tokens)

        offsets = tokens["offset_mapping"][0]
        readable_token = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
        for idx, (token_start, token_end) in enumerate(offsets):
          if token_start >= start and token_end <= end:
            if token_start == start:
              labels[idx] = "B-OTE"
            else:
              labels[idx] = "I-OTE"

        # print(labels)
        # print(readable_token)
        # print(target)
        # print(len(labels), len(readable_token))

        label_ids = [label_map[label] for label in labels]

        input_ids.append(tokens["input_ids"][0])
        attention_masks.append(tokens["attention_mask"][0])
        all_labels.append(torch.tensor(label_ids))

  max_length = max(len(ids) for ids in input_ids)
  padded_input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  padded_attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
  padded_labels = torch.nn.utils.rnn.pad_sequence(all_labels, batch_first=True, padding_value=label_map["O"])

  return padded_input_ids, padded_attention_masks, padded_labels

In [131]:
input_ids, padded_attention_masks, padded_labels = preprocess_data(train_dataset, tokenizer)

In [132]:
input_ids, padded_attention_masks, padded_labels

(tensor([[  101,   155,  9919,  ...,     0,     0,     0],
         [  101, 12342,  2069,  ...,     0,     0,     0],
         [  101, 12342,  2069,  ...,     0,     0,     0],
         ...,
         [  101, 12342,  2069,  ...,     0,     0,     0],
         [  101,   155,  9919,  ...,     0,     0,     0],
         [  101, 12342,  2069,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]))

In [133]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(input_ids, padded_attention_masks, padded_labels)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [134]:
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)
for epoch in range (3):
  model.train()
  total_loss = 0
  for batch in train_dataloader:
    batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]

    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch + 1} completed, {total_loss}")



Epoch 1 completed, 43.56141265993938
Epoch 2 completed, 28.496933664311655
Epoch 3 completed, 22.519219536625315


In [187]:
def predict_ote(model, tokenizer, text, aspect, label_map):
    model.eval()  # Set model to evaluation mode

    # Tokenize the input text
    tokens = tokenizer(
        aspect,
        text,
        truncation=True,
        padding=True,
        return_tensors="pt",
        return_offsets_mapping=True  # Include offsets
    )
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)
    offset_mapping = tokens["offset_mapping"][0].tolist()  # Offset mapping for tokens

    # Perform inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Get logits for each token
        predictions = torch.argmax(logits, dim=2)  # Get the predicted class for each token

    # Convert predictions to labels
    label_map_reverse = {v: k for k, v in label_map.items()}  # Reverse the label map
    predicted_labels = [label_map_reverse[label.item()] for label in predictions[0]]

    # Convert tokens to readable format and filter OTEs
    tokens_list = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Determine the index of the first text token (ignoring aspect tokens)
    text_start_index = len(tokenizer(aspect, add_special_tokens=False)["input_ids"])

    ote_tokens = []
    ongoing_ote = False  # Track if we're inside a valid OTE span

    for idx, (token, label, (start, end)) in enumerate(zip(tokens_list, predicted_labels, offset_mapping)):
        if start == 0 and end == 0:  # Skip special tokens like [CLS] or [SEP]
            continue

        if idx < text_start_index:  # Ignore tokens before the main text
            continue

        if label == "B-OTE":  # Start a new OTE span
            ongoing_ote = True
            ote_tokens.append({
                "token": token,
                "start": start,
                "end": end
            })
        elif label == "I-OTE" and ongoing_ote:  # Continue a valid OTE span
            ote_tokens[-1]["token"] += token.replace("##", "")  # Merge sub-token (if any)
            ote_tokens[-1]["end"] = end  # Update the end offset
        else:
            ongoing_ote = False  # Reset if not part of an OTE span

    return ote_tokens, tokens_list, predicted_labels


In [188]:
predict_ote(model, tokenizer, "The vibes were ok but the food was amazing!", "AMBIENCE#GENERAL", label_map)

([{'token': 'vibes', 'start': 4, 'end': 9}],
 ['[CLS]',
  'AM',
  '##BI',
  '##EN',
  '##CE',
  '#',
  'GE',
  '##NE',
  '##RA',
  '##L',
  '[SEP]',
  'The',
  'v',
  '##ibes',
  'were',
  'ok',
  'but',
  'the',
  'food',
  'was',
  'amazing',
  '!',
  '[SEP]'],
 ['O',
  'O',
  'O',
  'B-OTE',
  'I-OTE',
  'I-OTE',
  'I-OTE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-OTE',
  'I-OTE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [138]:
test_input_ids, test_attention_masks, test_labels = preprocess_data(test_dataset, tokenizer)

In [142]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [169]:
from seqeval.metrics import f1_score, classification_report
import torch
import pandas as pd

def test_model(model, tokenizer, test_dataloader, label_map, save_path="mismatched_data.csv"):
    model.eval()

    label_map_reverse = {v: k for k, v in label_map.items()}

    all_preds = []
    all_grounds = []
    mismatched_data = []

    with torch.no_grad():
        for batch in test_dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]

            # Perform inference
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)

            for i in range(batch_input_ids.size(0)):  # Loop over batch
                ground_truth = batch_labels[i].tolist()
                prediction = predictions[i].tolist()
                attention_mask = batch_attention_masks[i].tolist()

                preds_seq = []  # Store predictions for this sequence
                grounds_seq = []  # Store ground truth for this sequence
                original_tokens = []  # Store original tokens for this sequence

                for token_idx, (pred, true, mask) in enumerate(zip(prediction, ground_truth, attention_mask)):
                    if mask == 1:  # Ignore padding
                        preds_seq.append(label_map_reverse[pred])  # Convert to string
                        grounds_seq.append(label_map_reverse[true])  # Convert to string
                        original_tokens.append(tokenizer.convert_ids_to_tokens(batch_input_ids[i][token_idx].item()))

                # Save mismatched data
                if preds_seq != grounds_seq:
                    mismatched_data.append({
                        "sentence": tokenizer.decode(batch_input_ids[i], skip_special_tokens=True),
                        "predicted": " ".join(preds_seq),
                        "ground": " ".join(grounds_seq)
                    })

                all_preds.append(preds_seq)
                all_grounds.append(grounds_seq)

    # Calculate metrics
    print("Classification Report:")
    print(classification_report(all_grounds, all_preds))  # seqeval expects nested lists

    print('F1 score:', f1_score(all_grounds, all_preds))  # seqeval expects nested lists

    # Save mismatched data to CSV
    if mismatched_data:
        mismatched_df = pd.DataFrame(mismatched_data)
        mismatched_df.to_csv(save_path, index=False)
        print(f"Mismatched data saved to {save_path}")

    return mismatched_data



In [170]:
mismatches = test_model(model, tokenizer, test_dataloader, label_map)

Classification Report:
              precision    recall  f1-score   support

         OTE       0.77      0.60      0.67      1600

   micro avg       0.77      0.60      0.67      1600
   macro avg       0.77      0.60      0.67      1600
weighted avg       0.77      0.60      0.67      1600

F1 score: 0.6730091613812544
Mismatched data saved to mismatched_data.csv


In [155]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [156]:
# Save the model and tokenizer
model.save_pretrained("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")

# copy ./results to google drive ./results_bert_date_time
import os
import shutil
from datetime import datetime
# Define source and destination paths
source_path = './fine_tuned_bert'
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
destination_path = f'/content/drive/My Drive/results_bert_{current_time}'

# Ensure the source exists
if os.path.exists(source_path):
    # Copy the directory to Google Drive
    shutil.copytree(source_path, destination_path)
    print(f"Copied '{source_path}' to '{destination_path}'")
else:
    print(f"Source directory '{source_path}' does not exist!")

Copied './fine_tuned_bert' to '/content/drive/My Drive/results_bert_20241211_205040'


In [164]:
mismatches

[{'token': 'sea', 'predicted': 'O', 'ground_truth': 'B-OTE'},
 {'token': '##food', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': '##EN', 'predicted': 'B-OTE', 'ground_truth': 'O'},
 {'token': '##CE', 'predicted': 'I-OTE', 'ground_truth': 'B-OTE'},
 {'token': '##RA', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': '##L', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': '##EN', 'predicted': 'B-OTE', 'ground_truth': 'I-OTE'},
 {'token': '##RA', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': 'am', 'predicted': 'B-OTE', 'ground_truth': 'I-OTE'},
 {'token': 'Raoul', 'predicted': 'O', 'ground_truth': 'B-OTE'},
 {'token': '##s', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': 'chicken', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': 'vegetable', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': 'soup', 'predicted': 'O', 'ground_truth': 'I-OTE'},
 {'token': 'food', 'predicted': 'B-OTE', 'ground_truth': 'O'},
 {'token': 'portions', 'predicted': '

# Baseline

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset

dataset = load_dataset("alexcadillon/SemEval2016Task5", "restaurants")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SemEval2016Task5.py:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/146k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/58.1k [00:00<?, ?B/s]

Generating trial split:   0%|          | 0/10 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/350 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/90 [00:00<?, ? examples/s]

DatasetDict({
    trial: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 10
    })
    train: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 350
    })
    test: Dataset({
        features: ['reviewId', 'sentences', 'opinions'],
        num_rows: 90
    })
})

In [None]:
train_dataset = dataset['train']
train_dataset

Dataset({
    features: ['reviewId', 'sentences', 'opinions'],
    num_rows: 350
})

In [None]:
test_dataset = dataset["test"]
test_dataset

Dataset({
    features: ['reviewId', 'sentences', 'opinions'],
    num_rows: 90
})

In [None]:
test_dataset["sentences"][0]

[{'sentenceId': 'en_BlueRibbonSushi_478218171:0',
  'text': 'Yum!',
  'opinions': [{'target': 'NULL',
    'category': {'entity': 'FOOD', 'attribute': 'QUALITY'},
    'polarity': 'positive',
    'from': '0',
    'to': '0'}]},
 {'sentenceId': 'en_BlueRibbonSushi_478218171:1',
  'text': 'Serves really good sushi.',
  'opinions': [{'target': 'sushi',
    'category': {'entity': 'FOOD', 'attribute': 'QUALITY'},
    'polarity': 'positive',
    'from': '19',
    'to': '24'}]},
 {'sentenceId': 'en_BlueRibbonSushi_478218171:2',
  'text': 'Not the biggest portions but adequate.',
  'opinions': [{'target': 'portions',
    'category': {'entity': 'FOOD', 'attribute': 'STYLE_OPTIONS'},
    'polarity': 'neutral',
    'from': '16',
    'to': '24'}]},
 {'sentenceId': 'en_BlueRibbonSushi_478218171:3',
  'text': 'Green Tea creme brulee is a must!',
  'opinions': [{'target': 'Green Tea creme brulee',
    'category': {'entity': 'FOOD', 'attribute': 'QUALITY'},
    'polarity': 'positive',
    'from': '0',
  

In [None]:
from collections import defaultdict

ote_dict = defaultdict(set)

for item in train_dataset:
  for sentence in item["sentences"]:
    for opinion in sentence["opinions"]:
      target = opinion["target"].lower()
      if target != "null":
        category = f'{opinion["category"]["entity"]}#{opinion["category"]["attribute"]}'
        ote_dict[category].add(opinion["target"])

ote_dict = {category: list(otes) for category, otes in ote_dict.items()}

In [None]:
ote_dict

{'RESTAURANT#GENERAL': ['Emilio',
  'Shabu-Shabu Restaurant',
  'Cypriot restaurant',
  'restaurant',
  "Rao's",
  'YUKA',
  'Saul',
  'modern Japanese brasserie',
  'pizza place',
  'Bukhara Grill',
  'Gnocchi',
  'Pacifico',
  'Jekyll and Hyde',
  'Jeckll and Hydes',
  'Ginger House',
  'Lucky Strike',
  'Casa la Femme',
  "Roth's",
  'joint',
  'Red Eye',
  'Pizza 33',
  'Red Eye Grill',
  'Casimir',
  'Big Wong',
  "Patsy's Pizza",
  'Planet Thailand',
  "Bloom's",
  'Mizu',
  'Teodora',
  'mare',
  'PLACE',
  'Jekyll and Hyde Pub',
  'Myagi',
  'establishment',
  'pink pony',
  'Yamato',
  'pizzeria',
  'Prune',
  'PIZZA 33',
  'Restaurant Saul',
  'restaraunt',
  'Pastis',
  'Planet Thai',
  'Heartland Brewery',
  'spot',
  'Cafe Spice',
  'bar',
  'trattoria',
  'Casa La Femme',
  'Amma',
  'Areo',
  'east village pizza',
  'Suan',
  'Zucchero Pomodori',
  'Al Di La',
  'Bukhara',
  'Fish',
  'fresh restaurant',
  'Leon',
  'Cafe Noir',
  'Chinese restaurant',
  'Zenkichi',
  'M

In [None]:
prediction_sentences = []

for item in test_dataset:
  for sentence in item["sentences"]:
    text = sentence["text"]
    predictions = []
    for opinion in sentence["opinions"]:
      category = f'{opinion["category"]["entity"]}#{opinion["category"]["attribute"]}'
      ote_list = ote_dict.get(category, [])

      found_ote = None
      start_idx = 0
      end_idx = 0
      for ote in ote_list:
        if ote in text.lower():
          found_ote = ote
          start_idx = text.find(ote)
          end_idx = start_idx + len(ote)
          break

      predictions.append({
          "category": category,
          "predicted_ote": {
              "target": found_ote if found_ote else "null",
              "start_idx": start_idx,
              "end_idx": end_idx
          },
          "ground_truth": {
              "target": opinion["target"].lower(),
              "start_idx": int(opinion["from"]),
              "end_idx": int(opinion["to"])
          }
      })
    prediction_sentences.append({
        "sentenceId": sentence["sentenceId"],
        "text": text,
        "predictions": predictions
    })

In [None]:
count = 0

for i, sentence_data in enumerate(prediction_sentences):
    if len(sentence_data['predictions']) > 1:
        print(f"Sentence {i + 1}: {sentence_data['text']}")
        print(f"Predictions: {sentence_data['predictions']}")
        print("-" * 20)
        count += 1

Sentence 8: It has great sushi and even better service.
Predictions: [{'category': 'FOOD#QUALITY', 'predicted_ote': {'target': 'sushi', 'start_idx': 13, 'end_idx': 18}, 'ground_truth': {'target': 'sushi', 'start_idx': 13, 'end_idx': 18}}, {'category': 'SERVICE#GENERAL', 'predicted_ote': {'target': 'service', 'start_idx': 35, 'end_idx': 42}, 'ground_truth': {'target': 'service', 'start_idx': 35, 'end_idx': 42}}]
--------------------
Sentence 23: The lemon chicken tasted like sticky sweet donuts and the honey walnut prawns, the few they actually give you.....were not good.
Predictions: [{'category': 'FOOD#QUALITY', 'predicted_ote': {'target': 'chicken', 'start_idx': 10, 'end_idx': 17}, 'ground_truth': {'target': 'lemon chicken', 'start_idx': 4, 'end_idx': 17}}, {'category': 'FOOD#QUALITY', 'predicted_ote': {'target': 'chicken', 'start_idx': 10, 'end_idx': 17}, 'ground_truth': {'target': 'honey walnut prawns', 'start_idx': 58, 'end_idx': 77}}, {'category': 'FOOD#STYLE_OPTIONS', 'predicted

In [None]:
count

177

In [None]:
len(prediction_sentences)

676

In [None]:
def compare_prediction(sentence):
  predicted = []
  ground_truth = []
  for prediction in sentence['predictions']:
    if prediction['predicted_ote']['target'] != 'null':
      predicted.append(prediction['predicted_ote']['target'])
    if prediction['ground_truth']['target'] != 'null':
      ground_truth.append(prediction['ground_truth']['target'])
  tp = len(set(predicted) & set(ground_truth))
  fp = len(set(predicted) - set(ground_truth))
  fn = len(set(ground_truth) - set(predicted))
  return tp, fp, fn

In [None]:
compare_prediction(prediction_sentences[663])

(1, 0, 1)

In [None]:
def calculate_f1_score(predictions):
    """
    Calculate Precision, Recall, and F1-Score for OTE extraction.

    Args:
        predictions (list): List of dictionaries with predicted and ground truth OTEs.

    Returns:
        dict: Precision, Recall, and F1-Score.
    """
    total_tp = 0  # True Positives
    total_fp = 0  # False Positives
    total_fn = 0  # False Negatives

    for sentence in prediction_sentences:
        (tp, fp, fn) = compare_prediction(sentence)
        total_tp = total_tp + tp
        total_fp = total_fp + fp
        total_fn = total_fn + fn

    # Calculate metrics
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score
    }

# Example usage
f1_metrics = calculate_f1_score(predictions)
print(f"Precision: {f1_metrics['Precision']:.4f}")
print(f"Recall: {f1_metrics['Recall']:.4f}")
print(f"F1-Score: {f1_metrics['F1-Score']:.4f}")


Precision: 0.6473
Recall: 0.4918
F1-Score: 0.5590


# Shallow Model with CRF

In [None]:
from nltk.tokenize import word_tokenize
from nltk.tag import CRFTagger
from nltk import pos_tag
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
tokens = word_tokenize("I don't eat the pizza")

In [None]:
tokens

['I', 'do', "n't", 'eat', 'the', 'pizza']

In [None]:
def extract_features(tokens, index):
    token = tokens[index]
    return {
        'word': token,
        'is_upper': token.isupper(),
        'is_title': token.istitle(),
        'is_digit': token.isdigit(),
        'prefix_3': token[:3],
        'suffix_3': token[-3:],
        'prev_word': '' if index == 0 else tokens[index - 1],
        'next_word': '' if index == len(tokens) - 1 else tokens[index + 1],
    }

In [None]:
import re
from nltk.tokenize import word_tokenize

def preprocess_tokens(tokens):
    """
    Split tokens on punctuation and normalize case.
    """
    new_tokens = []
    for token in tokens:
        # Split tokens on punctuation like "-", "_", etc.
        split_tokens = re.split(r'[-_]', token)
        new_tokens.extend(split_tokens)
    return new_tokens

def find_token_indices(tokens, target_tokens):
    """
    Align target tokens with the tokenized sentence.
    """
    target_indices = []
    for target_token in target_tokens:
        # Look for the target token in the token list
        for i, token in enumerate(tokens):
            if target_token.lower() in token.lower() and i not in target_indices:
                target_indices.append(i)
                break
    return target_indices

def prepare_crd_data(data):
    X, y = [], []
    for sentence_data in data:
        for sentence in sentence_data['sentences']:
            tokens = word_tokenize(sentence['text'])
            tokens = preprocess_tokens(tokens)  # Normalize tokens
            labels = ['O'] * len(tokens)  # Default labels

            for opinion in sentence['opinions']:
                target = opinion['target'].lower()
                if target != 'null':
                    start_idx = int(opinion['from'])
                    stop_idx = int(opinion['to'])
                    target_text = sentence['text'][start_idx:stop_idx]
                    target_tokens = target_text.split()
                    target_indices = find_token_indices(tokens, target_tokens)

                    # Assign labels
                    for j, idx in enumerate(target_indices):
                        labels[idx] = ('B-' if j == 0 else 'I-') + opinion['category']['entity']

            X.append([extract_features(tokens, i) for i in range(len(tokens))])
            y.append(labels)
    return X, y


In [None]:
(X_train, y_train) = prepare_crd_data(train_dataset)

In [None]:
(X_test, y_test) = prepare_crd_data(test_dataset)

In [None]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=crf.classes_))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           O       0.96      0.99      0.97      9244
B-RESTAURANT       0.67      0.37      0.48        75
   B-SERVICE       0.85      0.77      0.80       107
      B-FOOD       0.72      0.47      0.57       312
      I-FOOD       0.60      0.31      0.41       168
    B-DRINKS       0.95      0.56      0.70        36
    I-DRINKS       1.00      0.50      0.67        18
   I-SERVICE       1.00      0.33      0.50        15
I-RESTAURANT       1.00      0.11      0.19        19
  B-AMBIENCE       0.77      0.58      0.66        57
  B-LOCATION       0.80      0.40      0.53        10
  I-AMBIENCE       0.00      0.00      0.00        11
  I-LOCATION       0.00      0.00      0.00         8

    accuracy                           0.94     10080
   macro avg       0.72      0.41      0.50     10080
weighted avg       0.94      0.94      0.94     10080



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

# Before calling f1_score
mlb = MultiLabelBinarizer()
y_test_binarized = mlb.fit_transform(y_test)
y_pred_binarized = mlb.transform(y_pred)

# Now, use the binarized labels for f1_score calculation
print(f1_score(y_test_binarized, y_pred_binarized, average='weighted'))

0.8489856475334688


In [None]:
import numpy as np

# Compute Macro F1 and Micro F1
labels = crf.classes_  # List of all classes
f1_scores = []
supports = []

report_dict = metrics.flat_classification_report(
    y_test, y_pred, labels=labels, output_dict=True
)

for label, metrics_data in report_dict.items():
    if label in labels:
        f1_scores.append(metrics_data['f1-score'])
        supports.append(metrics_data['support'])

macro_f1 = np.mean(f1_scores)

total_support = sum(supports)
micro_f1 = sum(f * s for f, s in zip(f1_scores, supports)) / total_support

print("\nOverall Metrics:")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Overall Metrics:
Macro F1 Score: 0.4987
Micro F1 Score: 0.9367


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
