<a href="https://colab.research.google.com/github/lwachowiak/Multilingual-Metaphor-Detection/blob/main/Metaphor_Detection_(Tokenlevel).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
# !pip install pynvml
# !pip install transformers[torch]

In [2]:
#torch and tranformers for model and training
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


#utilities
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle         # for saving data structures
from pynvml import *  # for checking gpu memory
import re




In [3]:
# !pip uninstall torch -y

In [4]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [5]:
# connect to GPU
device = torch.device('cuda')

print('Connected to GPU:', torch.cuda.get_device_name(0))

Connected to GPU: NVIDIA GeForce RTX 4070 Laptop GPU


# Prepare Data

**Functions for preprocessing and creating of Training Data**

Originally I used *xlm-roberta-base* as model. Now, there are slightly stronger models available in the same parameter range, for example *microsoft/mdeberta-v3-base*


You can try:


*   model_name="xlm-roberta-base"
*   model_name="xlm-roberta-large"
*   model_name="microsoft/mdeberta-v3-base"



In [12]:
model_name="allenai/biomed_roberta_base"
random_validation=True
random_state=1
val_percentage=0.1

Upload files to Google Drive or link to your computer's folder if running locally.

In [15]:
# load datasets
test_data=pd.read_csv("VUA_test_all.csv", engine="python")
train_data=pd.read_csv("VUA_train.csv", engine="python")

In [16]:
def format_for_TokenClf(df) -> list:
  data_list = []
  sentence=[]
  labels=[]
  for index, row in df.iterrows():
    if row["id"][-2:]=="_1" and index!=0:
      data_list.append((sentence, labels))
      sentence=[]
      labels=[]
    if row["label"]==1:
      label="m"
    else:
      label="l"
    sentence.append(str(row["word"]))
    labels.append(label)
    if index==len(df)-1:
      data_list.append((sentence, labels))

  return data_list

In [17]:
test_data=format_for_TokenClf(test_data)
train_data=format_for_TokenClf(train_data)
print(test_data[0])

(['The', 'Labour', 'Party', 'Conference', ':', 'Policy', 'review', 'throws', 'a', 'spanner', 'in', 'the', 'Whitehall', 'machinery'], ['l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'l', 'm', 'm', 'l', 'l', 'm'])


In [20]:
train_data, val_data= train_test_split(train_data, shuffle=random_validation, test_size=val_percentage, random_state=random_state)

In [21]:
print("Sentences Train: ", len(train_data))
print("Sentences Val: ", len(val_data))
print("Sentences Test: ", len(test_data))

Sentences Train:  10898
Sentences Val:  1211
Sentences Test:  4080


In [22]:
#train
train_tags=[tup[1] for tup in train_data]
train_texts=[tup[0] for tup in train_data]

#val
val_tags=[tup[1] for tup in val_data]
val_texts=[tup[0] for tup in val_data]

#test
test_tags=[tup[1] for tup in test_data]
test_texts=[tup[0] for tup in test_data]

In [23]:
for text in train_tags:
  if not isinstance(text, list):
    print(text)

In [24]:
print(test_texts[0])
print(test_tags[0])

['The', 'Labour', 'Party', 'Conference', ':', 'Policy', 'review', 'throws', 'a', 'spanner', 'in', 'the', 'Whitehall', 'machinery']
['l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'l', 'm', 'm', 'l', 'l', 'm']


# Tokenize

In [43]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

In [45]:
label_list=["l", "m"]
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels=len(label_list)

def tokenize_and_align_labels(texts, tags):
  tokenized_inputs = tokenizer(
      texts,
      padding=True,
      truncation=True,
      # We use this argument because the texts in our dataset are lists of words (with a label for each word).
      is_split_into_words=True,
  )
  labels = []
  for i, label in enumerate(tags):
      word_ids = tokenized_inputs.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
          # Special tokens have a word id that is None. We set the label to -100 so they are automatically
          # ignored in the loss function.
          if word_idx is None:
              label_ids.append(-100)
          # We set the label for the first token of each word.
          elif word_idx != previous_word_idx:
              label_ids.append(label_to_id[label[word_idx]])
          # For the other tokens in a word, we set the label to either the current label or -100, depending on
          # the label_all_tokens flag.
          else:
              label_ids.append(-100)
          previous_word_idx = word_idx

      labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs

test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)

train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

In [47]:
# create dataset
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])

train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])


In [49]:
test_dataset.__getitem__(0)

{'input_ids': tensor([    0,    20,  4165,  1643,  2815,  4832,  6275,  1551,  6989,    10,
          8968,  1396,    11,     5,   735, 12023, 13922,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

# Training

In [54]:
# how the validation and test scores are computed

def compute_metrics(eval_preds) -> dict:
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [val for sublist in true_labels for val in sublist]
    true_predictions = [val for sublist in true_predictions for val in sublist]

    print(classification_report(true_labels, true_predictions))#, target_names=target_names))

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average="weighted")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [56]:
# training arguments

training_args = TrainingArguments(
    output_dir='./MetaphorExtraction/results',          # output directory
    num_train_epochs=8,                                 # total # of training epochs
    per_device_train_batch_size=8,                      # batch size per device during training
    per_device_eval_batch_size=16,                      # batch size for evaluation
    warmup_steps=0,                                     # number of warmup steps for learning rate scheduler
    weight_decay=0,                                     # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./MetaphorExtraction/logs',            # directory for storing logs
    evaluation_strategy= "epoch",                       # steps or epochs
    save_strategy = "epoch",
    # eval_steps=500,
    # save_total_limit=0,
    load_best_model_at_end=True,                        #loads the model with the best evaluation score
    metric_for_best_model="f1",
    greater_is_better=True
)



In [58]:
# initialize model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


pytorch_model.bin:   0%|          | 0.00/656M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

model.safetensors:   0%|          | 0.00/656M [00:00<?, ?B/s]

total    : 8585740288
free     : 8334082048
used     : 251658240


In [62]:
# initialize huggingface trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset = train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

  trainer = Trainer(


In [64]:
# train
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1486,0.129684,0.947333,0.950027,0.947733
2,0.1041,0.137604,0.949646,0.952062,0.950043
3,0.0718,0.14246,0.954476,0.95602,0.954979
4,0.0512,0.175513,0.953391,0.955525,0.953317
5,0.0309,0.196196,0.955333,0.956075,0.95566
6,0.024,0.237231,0.954808,0.956075,0.955285
7,0.0151,0.262172,0.955082,0.95624,0.955534
8,0.0116,0.284935,0.955016,0.95646,0.955507


              precision    recall  f1-score   support

           l       0.96      0.98      0.97     16254
           m       0.83      0.67      0.74      1936

    accuracy                           0.95     18190
   macro avg       0.89      0.83      0.86     18190
weighted avg       0.95      0.95      0.95     18190

              precision    recall  f1-score   support

           l       0.96      0.98      0.97     16254
           m       0.83      0.69      0.75      1936

    accuracy                           0.95     18190
   macro avg       0.90      0.84      0.86     18190
weighted avg       0.95      0.95      0.95     18190

              precision    recall  f1-score   support

           l       0.97      0.98      0.98     16254
           m       0.83      0.74      0.78      1936

    accuracy                           0.96     18190
   macro avg       0.90      0.86      0.88     18190
weighted avg       0.95      0.96      0.95     18190

              preci

TrainOutput(global_step=10904, training_loss=0.05897337867333796, metrics={'train_runtime': 1567.1173, 'train_samples_per_second': 55.633, 'train_steps_per_second': 6.958, 'total_flos': 5517248356386048.0, 'train_loss': 0.05897337867333796, 'epoch': 8.0})

In [68]:
# Score on the test set
trainer.evaluate(test_dataset)

              precision    recall  f1-score   support

           l       0.97      0.98      0.97     51540
           m       0.80      0.75      0.78      6819

    accuracy                           0.95     58359
   macro avg       0.89      0.87      0.87     58359
weighted avg       0.95      0.95      0.95     58359



{'eval_loss': 0.20723897218704224,
 'eval_precision': 0.948598086741333,
 'eval_recall': 0.9497592487876763,
 'eval_f1': 0.9490767337187377,
 'eval_runtime': 16.8959,
 'eval_samples_per_second': 241.479,
 'eval_steps_per_second': 15.092,
 'epoch': 8.0}

In [70]:
from datetime import date
trainer.save_model("./saved-models/metaphor_extraction_"+str(date.today())+"_randVal-"+str(random_validation)+"_"+model_name)

# Using the Model for Inference

In [73]:
from transformers import pipeline

In [75]:
label_list= ['literal',"metaphoric"]
label_dict_relations={ i : l for i, l in enumerate(label_list) }

In [83]:
PATH = "./saved-models/metaphor_extraction_2025-04-15_randVal-True_allenai/biomed_roberta_base"
model_metaphor_detection = AutoModelForTokenClassification.from_pretrained(PATH, id2label=label_dict_relations)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [85]:
pipeline_metaphors=pipeline("ner", model=model_metaphor_detection, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [87]:
pipeline_metaphors("Our love is at crossroads and the company is going into hibernation. The bear is sleeping well.")

[{'entity_group': 'literal',
  'score': 0.9976692,
  'word': 'Our love is',
  'start': 0,
  'end': 11},
 {'entity_group': 'metaphoric',
  'score': 0.99968034,
  'word': ' at crossroads',
  'start': 12,
  'end': 25},
 {'entity_group': 'literal',
  'score': 0.99888617,
  'word': ' and the company is',
  'start': 26,
  'end': 44},
 {'entity_group': 'metaphoric',
  'score': 0.9857975,
  'word': ' going into hibernation',
  'start': 45,
  'end': 67},
 {'entity_group': 'literal',
  'score': 0.873021,
  'word': '. The bear is sleeping well.',
  'start': 67,
  'end': 95}]