<a href="https://colab.research.google.com/github/lwachowiak/Multilingual-Metaphor-Detection/blob/main/Metaphor_Detection_(Tokenlevel).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
!pip install pynvml
!pip install transformers[torch]

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl 

In [2]:
# torch and transformers for model and training
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


#utilities
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle         # for saving data structures
from pynvml import *  # for checking gpu memory
import re

In [3]:
# connect to GPU; fall back to the CPU instead of crashing when no CUDA device is present
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Connected to GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU available, running on CPU.')

Connected to GPU: Tesla T4


# Prepare Data

**Functions for preprocessing and creating the training data**

Originally I used *xlm-roberta-base* as the model. Slightly stronger models are now available in the same parameter range, for example *microsoft/mdeberta-v3-base*.


You can try:


*   model_name="xlm-roberta-base"
*   model_name="xlm-roberta-large"
*   model_name="microsoft/mdeberta-v3-base"



In [4]:
# Experiment configuration
model_name = "xlm-roberta-base"  # checkpoint name passed to the tokenizer and model loaders
val_percentage = 0.1             # share of the training sentences held out for validation
random_validation = True         # shuffle the sentences before the train/validation split
random_state = 1                 # seed for the train/validation split

Upload files to Google Drive or link to your computer's folder if running locally.

In [7]:
# Read the train and test tables from CSV files in the working directory.
train_data = pd.read_csv("VUA_train.csv", engine="python")
test_data = pd.read_csv("VUA_test_all.csv", engine="python")

In [8]:
def format_for_TokenClf(df) -> list:
  """Group a word-level table into per-sentence (words, labels) tuples.

  Args:
    df: DataFrame with one row per word and the columns "id", "word" and "label".
      A row whose "id" ends in "_1" starts a new sentence; a "label" equal to 1
      is mapped to "m", anything else to "l".

  Returns:
    A list with one (words, labels) tuple per sentence, in row order. `words` holds
    the stringified "word" values and `labels` the matching "l"/"m" tags.
    An empty frame yields an empty list.
  """
  data_list = []
  sentence = []
  labels = []
  # Walk the rows positionally: sentence boundaries must not depend on the frame's
  # index labels, which are not 0..n-1 after filtering, shuffling or concatenation.
  for row_id, word, raw_label in zip(df["id"], df["word"], df["label"]):
    # A new sentence begins; flush the previous one (nothing to flush on the first row).
    if row_id[-2:] == "_1" and sentence:
      data_list.append((sentence, labels))
      sentence = []
      labels = []
    sentence.append(str(word))
    labels.append("m" if raw_label == 1 else "l")
  # Flush the final sentence once the rows are exhausted.
  if sentence:
    data_list.append((sentence, labels))

  return data_list

In [9]:
# Collapse both word-level tables into per-sentence (words, labels) tuples
# and show the first test sentence as a sanity check.
train_data = format_for_TokenClf(train_data)
test_data = format_for_TokenClf(test_data)
print(test_data[0])

(['The', 'Labour', 'Party', 'Conference', ':', 'Policy', 'review', 'throws', 'a', 'spanner', 'in', 'the', 'Whitehall', 'machinery'], ['l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'l', 'm', 'm', 'l', 'l', 'm'])


In [10]:
# Hold out a validation share of the training sentences, using the settings from the config cell.
train_data, val_data = train_test_split(
    train_data,
    test_size=val_percentage,
    shuffle=random_validation,
    random_state=random_state,
)

In [11]:
# Report the number of sentences in each split after the train/validation split.
print("Sentences Train: ", len(train_data))
print("Sentences Val: ", len(val_data))
print("Sentences Test: ", len(test_data))

Sentences Train:  10898
Sentences Val:  1211
Sentences Test:  4080


In [12]:
# Separate every (words, labels) tuple into parallel text / tag lists, one pair per split.

# train
train_texts = [words for words, _ in train_data]
train_tags = [labels for _, labels in train_data]

# val
val_texts = [words for words, _ in val_data]
val_tags = [labels for _, labels in val_data]

# test
test_texts = [words for words, _ in test_data]
test_tags = [labels for _, labels in test_data]

In [13]:
# Sanity check: print any training tag entry that is not a list (no output means all are lists).
for text in train_tags:
  if not isinstance(text, list):
    print(text)

In [14]:
# Show the words and the aligned tags of the first test sentence.
print(test_texts[0])
print(test_tags[0])

['The', 'Labour', 'Party', 'Conference', ':', 'Policy', 'review', 'throws', 'a', 'spanner', 'in', 'the', 'Whitehall', 'machinery']
['l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'l', 'm', 'm', 'l', 'l', 'm']


# Tokenize

In [15]:
# Load the tokenizer that belongs to the checkpoint chosen in the config cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [16]:
# Tag inventory produced by format_for_TokenClf, plus the tag -> class-id lookup.
label_list = ["l", "m"]
num_labels = len(label_list)
label_to_id = dict(zip(label_list, range(num_labels)))

def tokenize_and_align_labels(texts, tags):
  """Tokenize pre-split sentences and attach sub-token level label ids.

  Uses the notebook-level `tokenizer` and `label_to_id`.

  Args:
    texts: list of sentences, each a list of words.
    tags: list of tag lists, one tag per word of the matching sentence.

  Returns:
    The tokenizer output with an added "labels" entry: one list of ids per sentence,
    aligned with the sub-tokens. Only the first sub-token of each word carries the
    word's label id; special tokens and the remaining sub-tokens of a word get -100
    so that the loss function ignores them.
  """
  encoded = tokenizer(
      texts,
      padding=True,
      truncation=True,
      # The inputs are lists of words (with one label per word), not raw strings.
      is_split_into_words=True,
  )

  def align(sentence_idx, word_tags):
    # Build the label ids for one sentence from its sub-token -> word mapping.
    aligned = []
    last_word = None
    for word_idx in encoded.word_ids(batch_index=sentence_idx):
      # A position is labelled only if it belongs to a word and opens that word.
      starts_word = word_idx is not None and word_idx != last_word
      aligned.append(label_to_id[word_tags[word_idx]] if starts_word else -100)
      last_word = word_idx
    return aligned

  encoded["labels"] = [align(i, word_tags) for i, word_tags in enumerate(tags)]
  return encoded

# Tokenize each split and attach the aligned label ids (stored under the "labels" key).
test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)

train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

In [17]:
# create dataset
class OurDataset(torch.utils.data.Dataset):
    """Dataset that serves one example at a time as a dict of tensors.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is the per-example sequence of label ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor ...
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        # ... and set "labels" from the separately supplied labels (replacing any
        # "labels" field that came from the encodings).
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each tokenized split in a dataset. The labels are passed separately even though
# the encodings already carry them under the "labels" key.
test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])

train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])


In [18]:
# Inspect the first test example (padded ids plus labels) as served by the dataset.
test_dataset[0]

{'input_ids': tensor([     0,    581,    239,  38648,  31016, 114732,    152,  80042,   8347,
         104250,      7,     10,  27734,   1679,     23,     70,  22392,  29907,
          36279,   1294,      2,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1

# Training

In [19]:
# how the validation and test scores are computed

def compute_metrics(eval_preds) -> dict:
    """Score token-level predictions against the gold labels.

    Args:
        eval_preds: pair `(logits, labels)`; the predicted class of a position is the
            argmax over the last axis of `logits`, and positions whose label is -100
            are excluded from scoring.

    Returns:
        Dict with the weighted-average "precision", "recall" and "f1" over all scored
        positions. A per-class classification report is printed as a side effect.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Collect gold and predicted tag names for every position that is not ignored.
    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predictions, labels):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            true_labels.append(label_list[gold_id])
            true_predictions.append(label_list[predicted_id])

    print(classification_report(true_labels, true_predictions))

    scores = precision_recall_fscore_support(true_labels, true_predictions, average="weighted")
    precision, recall, f1 = scores[0], scores[1], scores[2]

    return {"precision": precision, "recall": recall, "f1": f1}

In [20]:
# training arguments
# NOTE(review): newer transformers releases name the `evaluation_strategy` argument
# `eval_strategy`, and the install cell does not pin a transformers version — confirm the
# keyword against the installed release.

training_args = TrainingArguments(
    output_dir='./MetaphorExtraction/results',          # output directory
    num_train_epochs=8,                                 # total # of training epochs
    per_device_train_batch_size=8,                      # batch size per device during training
    per_device_eval_batch_size=16,                      # batch size for evaluation
    warmup_steps=0,                                     # number of warmup steps for learning rate scheduler
    weight_decay=0,                                     # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./MetaphorExtraction/logs',            # directory for storing logs
    evaluation_strategy= "epoch",                       # steps or epochs
    save_strategy = "epoch",
    # eval_steps=500,
    # save_total_limit=0,
    load_best_model_at_end=True,                        #loads the model with the best evaluation score
    metric_for_best_model="f1",                         # the "f1" key returned by compute_metrics (weighted average)
    greater_is_better=True
)



In [21]:
# initialize model: pretrained checkpoint with a token-classification head sized to num_labels
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Query device 0 through NVML (the nvml* names come from the pynvml star-import)
# and print its total / free / used memory.
# NOTE(review): the values are presumably in bytes — confirm against the pynvml docs.
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

total    : 16106127360
free     : 15832514560
used     : 273612800


In [23]:
# initialize huggingface trainer: trains on the train split, evaluates on the validation split
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset = train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # supplies the "f1" used for best-model selection
    )

In [None]:
# train (evaluation and checkpointing run once per epoch, as set in training_args)
trainer.train()

***** Running training *****
  Num examples = 10898
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10904


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.158,0.138226,0.938776,0.942913,0.938748
2,0.1195,0.130923,0.948559,0.950609,0.949253
3,0.0926,0.134616,0.950395,0.952875,0.95075
4,0.0669,0.148513,0.951833,0.952875,0.952282
5,0.0513,0.165712,0.952289,0.95393,0.952871
6,0.0393,0.203211,0.952352,0.953772,0.952901
7,0.028,0.244468,0.950951,0.952981,0.951561
8,0.0206,0.253753,0.952496,0.954193,0.953077


***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.95      0.98      0.97     17013
           m       0.81      0.58      0.68      1958

    accuracy                           0.94     18971
   macro avg       0.88      0.78      0.82     18971
weighted avg       0.94      0.94      0.94     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-1363
Configuration saved in ./MetaphorExtraction/results/checkpoint-1363/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-1363/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-1363/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-1363/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.79      0.70      0.75      1958

    accuracy                           0.95     18971
   macro avg       0.88      0.84      0.86     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-2726
Configuration saved in ./MetaphorExtraction/results/checkpoint-2726/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-2726/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-2726/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-2726/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.96      0.98      0.97     17013
           m       0.83      0.68      0.75      1958

    accuracy                           0.95     18971
   macro avg       0.90      0.83      0.86     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-4089
Configuration saved in ./MetaphorExtraction/results/checkpoint-4089/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-4089/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-4089/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-4089/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.79      0.74      0.77      1958

    accuracy                           0.95     18971
   macro avg       0.88      0.86      0.87     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-5452
Configuration saved in ./MetaphorExtraction/results/checkpoint-5452/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-5452/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-5452/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-5452/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.81      0.73      0.77      1958

    accuracy                           0.95     18971
   macro avg       0.89      0.85      0.87     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-6815
Configuration saved in ./MetaphorExtraction/results/checkpoint-6815/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-6815/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-6815/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-6815/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.80      0.74      0.77      1958

    accuracy                           0.95     18971
   macro avg       0.89      0.86      0.87     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-8178
Configuration saved in ./MetaphorExtraction/results/checkpoint-8178/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-8178/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-8178/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-8178/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.81      0.71      0.76      1958

    accuracy                           0.95     18971
   macro avg       0.89      0.84      0.87     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-9541
Configuration saved in ./MetaphorExtraction/results/checkpoint-9541/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-9541/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-9541/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-9541/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1211
  Batch size = 16


              precision    recall  f1-score   support

           l       0.97      0.98      0.97     17013
           m       0.81      0.73      0.77      1958

    accuracy                           0.95     18971
   macro avg       0.89      0.85      0.87     18971
weighted avg       0.95      0.95      0.95     18971



Saving model checkpoint to ./MetaphorExtraction/results/checkpoint-10904
Configuration saved in ./MetaphorExtraction/results/checkpoint-10904/config.json
Model weights saved in ./MetaphorExtraction/results/checkpoint-10904/pytorch_model.bin
tokenizer config file saved in ./MetaphorExtraction/results/checkpoint-10904/tokenizer_config.json
Special tokens file saved in ./MetaphorExtraction/results/checkpoint-10904/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./MetaphorExtraction/results/checkpoint-10904 (score: 0.9530768829001109).


TrainOutput(global_step=10904, training_loss=0.0736900699199864, metrics={'train_runtime': 7069.4653, 'train_samples_per_second': 12.332, 'train_steps_per_second': 1.542, 'total_flos': 7208018013988224.0, 'train_loss': 0.0736900699199864, 'epoch': 8.0})

In [None]:
# Score on the test set (prints the classification report and returns the metric dict)
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 4080
  Batch size = 16


              precision    recall  f1-score   support

           l       0.96      0.98      0.97     51540
           m       0.82      0.71      0.76      6819

    accuracy                           0.95     58359
   macro avg       0.89      0.84      0.87     58359
weighted avg       0.95      0.95      0.95     58359



{'epoch': 8.0,
 'eval_f1': 0.9466133325847397,
 'eval_loss': 0.26670947670936584,
 'eval_precision': 0.946064396352123,
 'eval_recall': 0.9483198821090149,
 'eval_runtime': 85.1608,
 'eval_samples_per_second': 47.909,
 'eval_steps_per_second': 2.994}

In [None]:
from datetime import date

# Checkpoint names such as "microsoft/mdeberta-v3-base" contain "/", which would otherwise
# turn the end of the run name into an unintended nested directory.
safe_model_name = model_name.replace("/", "_")
# Run name: save date, validation-split mode and base checkpoint.
save_path = f"./saved-models/metaphor_extraction_{date.today()}_randVal-{random_validation}_{safe_model_name}"
trainer.save_model(save_path)

Saving model checkpoint to ./saved-models/metaphor_extraction_2022-02-24_randVal-True_xlm-roberta-base
Configuration saved in ./saved-models/metaphor_extraction_2022-02-24_randVal-True_xlm-roberta-base/config.json
Model weights saved in ./saved-models/metaphor_extraction_2022-02-24_randVal-True_xlm-roberta-base/pytorch_model.bin
tokenizer config file saved in ./saved-models/metaphor_extraction_2022-02-24_randVal-True_xlm-roberta-base/tokenizer_config.json
Special tokens file saved in ./saved-models/metaphor_extraction_2022-02-24_randVal-True_xlm-roberta-base/special_tokens_map.json


# Using the Model for Inference

In [None]:
from transformers import pipeline

In [None]:
# Human-readable class names for inference, keyed by class id.
label_list = ["literal", "metaphoric"]
label_dict_relations = dict(enumerate(label_list))

In [None]:
# NOTE(review): PATH is a placeholder — the save cell writes to a directory named
# "./saved-models/metaphor_extraction_<date>_randVal-<random_validation>_<model_name>",
# so point PATH at an existing saved-model directory before running this cell.
PATH = "./saved-models/my_model"
# id2label is given the readable class names; presumably this is what the pipeline reports.
model_metaphor_detection = AutoModelForTokenClassification.from_pretrained(PATH, id2label=label_dict_relations)
# The tokenizer is loaded from the base checkpoint name, not from PATH.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Token-classification ("ner") pipeline over the fine-tuned model.
# NOTE(review): aggregation_strategy="simple" presumably merges adjacent tokens with the same
# predicted class into one span — confirm against the transformers pipeline docs.
pipeline_metaphors=pipeline("ner", model=model_metaphor_detection, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
# Example: run the pipeline on raw text; the result lists labelled spans with character offsets and scores.
pipeline_metaphors("Our love is at crossroads and the company is going into hibernation. The bear is sleeping well.")

[{'end': 11,
  'entity_group': 'l',
  'score': 0.99962264,
  'start': 0,
  'word': 'Our love is'},
 {'end': 25,
  'entity_group': 'metaphoric',
  'score': 0.9950348,
  'start': 12,
  'word': 'at crossroads'},
 {'end': 44,
  'entity_group': 'l',
  'score': 0.99978614,
  'start': 26,
  'word': 'and the company is'},
 {'end': 67,
  'entity_group': 'metaphoric',
  'score': 0.86717546,
  'start': 45,
  'word': 'going into hibernation'},
 {'end': 95,
  'entity_group': 'l',
  'score': 0.9984967,
  'start': 67,
  'word': '. The bear is sleeping well.'}]