<a href="https://colab.research.google.com/github/magantianirudh/Metaphor-Detection-using-NLP/blob/main/Metaphor_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
def identify_metaphor_sentence(txt, metaphor):
  for sentence in nltk.sent_tokenize(txt):
    if metaphor in sentence:
      return sentence

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

# Load the dataset
df = pd.read_csv("train.csv")

# Drop rows with missing values
df = df.dropna(subset=['text', 'label_boolean'])

#replace metophorids with words
metaphor = {0:'road', 1:'candle', 2:'light', 3:'spice', 4:'ride', 5:'train', 6:'boat'}
df.replace({"metaphorID": metaphor},inplace=True)
# replace the text with the first sentence which contains the metaphor word
df['text'] = df.apply(lambda x: identify_metaphor_sentence(x['text'], x['metaphorID']),axis=1)
df = df.rename(columns={'metaphorID': 'metaphor_word'})

#Drop rows with missing values
df = df.dropna(subset=['text', 'label_boolean'])

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# (4)Tokenize and encode the text data, including metaphor word embeddings
train_tokens = tokenizer(list(train_data["text"]), list(train_data["metaphor_word"]), padding=True, truncation=True, return_tensors="pt")
test_tokens = tokenizer(list(test_data["text"]), list(test_data["metaphor_word"]), padding=True, truncation=True, return_tensors="pt")


# Create PyTorch DataLoader
train_dataset = TensorDataset(
    train_tokens["input_ids"],
    train_tokens["attention_mask"],
    torch.tensor(list(train_data["label_boolean"].astype(int))),
    torch.tensor(list(train_tokens["input_ids"][:, 1]))  # Use the second token as the metaphor word embedding
)

test_dataset = TensorDataset(
    test_tokens["input_ids"],
    test_tokens["attention_mask"],
    torch.tensor(list(test_data["label_boolean"].astype(int))),
    torch.tensor(list(test_tokens["input_ids"][:, 1]))  # Use the second token as the metaphor word embedding
)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Fine-tune the BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

losses = []
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels, metaphor_ids = batch
        input_ids, attention_mask, labels, metaphor_ids = (
            input_ids.to(device),
            attention_mask.to(device),
            labels.to(device),
            metaphor_ids.to(device),
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    losses.append(loss)
    print(f"epoch {epoch} loss {loss}")

# Evaluate the fine-tuned model
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels, metaphor_ids = batch
        input_ids, attention_mask, labels, metaphor_ids = (
            input_ids.to(device),
            attention_mask.to(device),
            labels.to(device),
            metaphor_ids.to(device),
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Evaluate the accuracy
test_accuracy = accuracy_score(true_labels, predictions)
test_precision = precision_score(true_labels, predictions)
test_recall = recall_score(true_labels, predictions)
test_f1 = f1_score(true_labels, predictions)

print(f"Final Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


epoch 0 loss 0.01746472343802452
epoch 1 loss 0.01034976914525032
epoch 2 loss 0.007323809899389744
epoch 3 loss 0.0022718862164765596
epoch 4 loss 0.0008174782851710916
Final Test Accuracy: 0.9155313351498637
Test Precision: 0.9276315789473685, Recall: 0.9690721649484536, F1 Score: 0.9478991596638656


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

# Load the dataset
df = pd.read_csv("train.csv")

# Drop rows with missing values
df = df.dropna(subset=['text', 'label_boolean'])

#replace metophorids with words
metaphor = {0:'road', 1:'candle', 2:'light', 3:'spice', 4:'ride', 5:'train', 6:'boat'}
df.replace({"metaphorID": metaphor},inplace=True)
# replace the text with the first sentence which contains the metaphor word
df['text'] = df.apply(lambda x: identify_metaphor_sentence(x['text'], x['metaphorID']),axis=1)
df = df.rename(columns={'metaphorID': 'metaphor_word'})

#Drop rows with missing values
df = df.dropna(subset=['text', 'label_boolean'])

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode the text data, including metaphor word embeddings
train_tokens = tokenizer(list(train_data["text"]), list(train_data["metaphor_word"]), padding=True, truncation=True, return_tensors="pt")
test_tokens = tokenizer(list(test_data["text"]), list(test_data["metaphor_word"]), padding=True, truncation=True, return_tensors="pt")


# Create PyTorch DataLoader
train_dataset = TensorDataset(
    train_tokens["input_ids"],
    train_tokens["attention_mask"],
    torch.tensor(list(train_data["label_boolean"].astype(int))),
    torch.tensor(list(train_tokens["input_ids"][:, 1]))  # Use the second token as the metaphor word embedding
)

test_dataset = TensorDataset(
    test_tokens["input_ids"],
    test_tokens["attention_mask"],
    torch.tensor(list(test_data["label_boolean"].astype(int))),
    torch.tensor(list(test_tokens["input_ids"][:, 1]))  # Use the second token as the metaphor word embedding
)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Fine-tune the BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

losses = []
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels, metaphor_ids = batch
        input_ids, attention_mask, labels, metaphor_ids = (
            input_ids.to(device),
            attention_mask.to(device),
            labels.to(device),
            metaphor_ids.to(device),
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        losses.append(loss)
    print(f"epoch {epoch} loss {loss}")

# Evaluate the fine-tuned model
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels, metaphor_ids = batch
        input_ids, attention_mask, labels, metaphor_ids = (
            input_ids.to(device),
            attention_mask.to(device),
            labels.to(device),
            metaphor_ids.to(device),
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Evaluate the accuracy
test_accuracy = accuracy_score(true_labels, predictions)
test_precision = precision_score(true_labels, predictions)
test_recall = recall_score(true_labels, predictions)
test_f1 = f1_score(true_labels, predictions)

print(f"Final Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


epoch 0 loss 0.15183261036872864
epoch 1 loss 0.00574009632691741
epoch 2 loss 0.04795824736356735
epoch 3 loss 0.004408561624586582
epoch 4 loss 0.024183766916394234
Final Test Accuracy: 0.9155313351498637
Test Precision: 0.9304635761589404, Recall: 0.9656357388316151, F1 Score: 0.9477234401349073


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
test_report = classification_report(true_labels, predictions)
print(test_report)

              precision    recall  f1-score   support

           0       0.85      0.72      0.78        76
           1       0.93      0.97      0.95       291

    accuracy                           0.92       367
   macro avg       0.89      0.84      0.86       367
weighted avg       0.91      0.92      0.91       367



In [7]:
train_tokens

{'input_ids': tensor([[  101,  1045,  2031,  ...,     0,     0,     0],
        [  101, 11504,  2027,  ...,     0,     0,     0],
        [  101,  4937,  5400,  ...,     0,     0,     0],
        ...,
        [  101,  8507,  1011,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0],
        [  101,  1045,  2079,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}