# **Group Members**

In [None]:
# Ahmad Arif - 444002984

# Musab Iskandar - 444003841

# Yousef Koshak - 444000774

# This project focuses on advancing idiomatic representation in NLP models by implementing and improving upon methodologies discussed in selected scientific papers. The primary objective is to create a modular and organized codebase for idiomaticity representation, leveraging contrastive triplet loss and adaptive contrastive learning techniques. The dataset involves idioms in both literal and figurative contexts, aiming for tasks such as ranking and idiomaticity detection.

# **Importing Libraries**

In [None]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

# **Check for GPU/CPU availability**


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


# **Dataset Preprocessing**

In [None]:
class IdiomDataset(Dataset):
    """Tab-separated idiom dataset exposed as a list of per-row dictionaries.

    Each item holds the compound, its sentence, the sentence type (as
    ``label``), the five image name/caption pairs with their 1-based slot,
    and the expected image ranking expressed as those 1-based slots.
    """

    def __init__(self, data_file):
        """Read ``data_file`` (TSV) and pre-build every sample dictionary."""
        raw_frame = pd.read_csv(data_file, delimiter='\t')

        # The ranking column must be converted before the records are built,
        # because the records store the numeric positions.
        self.data = self.convert_expected_order(raw_frame)

        records = []
        for _, row in self.data.iterrows():
            images = []
            for slot in range(1, 6):
                images.append({
                    'name': row[f'image{slot}_name'],
                    'caption': row[f'image{slot}_caption'],
                    'position': slot,
                })
            records.append({
                'compound': row['compound'],
                'sentence': row['sentence'],
                'label': row['sentence_type'],
                'images': images,
                'expected_order': row['expected_order'],
            })
        self.processed_data = records

    def convert_expected_order(self, df):
        """Replace ``expected_order`` image names by their 1-based image slots.

        The column is first parsed from its string form into a Python list
        (missing values become an empty list), then every image name is mapped
        to the slot ``i`` whose ``image{i}_name`` column holds that name.
        ``df`` is modified in place and also returned.
        """
        def parse_cell(cell):
            # Cells hold the textual repr of a list; NaN means "no ranking".
            if pd.notna(cell):
                return ast.literal_eval(cell)
            return []

        df['expected_order'] = df['expected_order'].apply(parse_cell)

        def names_to_slots(row):
            slot_of_name = {}
            for slot in range(1, 6):
                slot_of_name[row[f'image{slot}_name']] = slot
            return [slot_of_name[name] for name in row['expected_order']]

        df['expected_order'] = df.apply(names_to_slots, axis=1)
        return df

    def __len__(self):
        """Number of pre-built samples."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the pre-built sample dictionary at ``idx``."""
        return self.processed_data[idx]


def _identity_collate(batch):
    """Return the list of samples unchanged (samples are dicts, not tensors).

    A named module-level function is used instead of a lambda so the loader
    can be pickled when worker processes are enabled.
    """
    return batch


def create_dataloader(dataset, batch_size, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that yields plain lists of samples.

    Args:
        dataset: Any map-style dataset.
        batch_size: Number of samples per yielded list.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``False`` so existing callers keep their deterministic order;
            pass ``True`` for training data.

    Returns:
        A ``DataLoader`` whose batches are lists of the dataset's items.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_identity_collate,
    )

In [None]:
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Never hard-code access tokens in a notebook: read the token from the
# environment instead. When HF_TOKEN is unset this is None, which is fine
# for publicly hosted checkpoints.
# NOTE(review): the token that used to be embedded here was exposed in the
# notebook and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModel.from_pretrained(model_name, token=hf_token).to(device)

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

# **Model Architecture**

In [None]:
class IdiomModel(nn.Module):
    """Sentence/caption scorer with a ranking head and an idiomaticity classifier.

    The wrapped encoder embeds the sentence and each caption; the cosine
    similarities between them feed a ranking head (one score per caption) and,
    concatenated with the sentence embedding, a two-way classifier.
    """

    def __init__(self, base_model, tokenizer):
        """Store the encoder/tokenizer and build the two MLP heads."""
        super().__init__()
        self.encoder = base_model
        self.tokenizer = tokenizer
        self.similarity = nn.CosineSimilarity(dim=1)

        # Ranking head: consumes the 5 similarity scores jointly.
        self.ranking_head = self._mlp_head([5, 128, 64, 5])

        # Classifier: sentence embedding (768) plus the 5 similarity scores.
        self.idiom_classifier = self._mlp_head([768 + 5, 512, 128, 2])

    @staticmethod
    def _mlp_head(widths, dropout=0.3):
        """Build Linear -> LayerNorm -> ReLU -> Dropout blocks ending in a bare Linear."""
        layers = []
        for fan_in, fan_out in zip(widths[:-2], widths[1:-1]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-2], widths[-1]))
        return nn.Sequential(*layers)

    def tokenize_sentence(self, sentence):
        """Tokenize one sentence, truncated and padded to 512 tokens, as PyTorch tensors."""
        return self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt',
        )

    def forward(self, text_input, caption_inputs):
        """Score the captions against the sentence and classify the sentence.

        Args:
            text_input: A raw sentence string, or an already tokenized input
                that can be unpacked into the encoder.
            caption_inputs: Sequence of captions, each a raw string or an
                already tokenized input.

        Returns:
            ``(scores, logits)`` where ``scores`` has one row with one ranking
            score per caption and ``logits`` has one row with two class logits.
        """
        encoder_device = next(self.encoder.parameters()).device

        # Sentence embedding
        if isinstance(text_input, str):
            text_input = self.tokenize_sentence(text_input).to(encoder_device)
        text_features = self.encoder(**text_input).pooler_output

        # One cosine similarity per caption, written into a (1, n_captions) row
        similarity_scores = torch.zeros(1, len(caption_inputs), device=encoder_device)
        for slot, caption in enumerate(caption_inputs):
            if isinstance(caption, str):
                encoded_caption = self.tokenize_sentence(caption).to(encoder_device)
            else:
                encoded_caption = caption
            caption_features = self.encoder(**encoded_caption).pooler_output
            similarity_scores[0, slot] = self.similarity(text_features, caption_features)

        # The ranking head works on the flat similarity vector; restore the
        # leading batch dimension afterwards.
        flat_similarities = similarity_scores.squeeze(0)
        scores = self.ranking_head(flat_similarities).unsqueeze(0)

        # The classifier sees the sentence embedding plus the similarity pattern
        classifier_input = torch.cat([text_features, similarity_scores], dim=1)
        logits = self.idiom_classifier(classifier_input)

        return scores, logits

In [None]:
# Initialize model
# Initialize model.
# `model` is rebound from the raw encoder to the IdiomModel wrapper here, so a
# second execution of this cell would otherwise wrap an IdiomModel inside
# another IdiomModel. Reuse the underlying encoder in that case to keep the
# cell safe to re-run.
base_encoder = model.encoder if isinstance(model, IdiomModel) else model
model = IdiomModel(base_encoder, tokenizer).to(device)

# Freeze encoder
for param in model.encoder.parameters():
    param.requires_grad = False

# Train ranking head and classifier only
for param in model.ranking_head.parameters():
    param.requires_grad = True
for param in model.idiom_classifier.parameters():
    param.requires_grad = True

# Only the parameters left trainable above are handed to the optimizer.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

In [None]:
def count_parameters(model):
    """Return a two-line summary of trainable vs. total parameter counts.

    Counts are formatted with thousands separators.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return f"Trainable parameters: {trainable:,}\nTotal parameters: {total:,}"

# Show how many parameters are trainable compared with the model's total.
print(count_parameters(model))

Trainable parameters: 473,223
Total parameters: 278,516,871


# **Testing**

In [None]:
test_sentence = "The eager beaver finished work early."
test_captions = [
    "A hardworking person at desk",
    "A beaver building dam",
    "A lazy person sleeping",
    "An office worker",
    "A construction site"
]

# Test forward pass.
# Switch to eval mode first: in train mode the dropout layers in both heads
# are active, so the printed scores would differ from run to run.
model.eval()
with torch.no_grad():
    scores, logits = model(test_sentence, test_captions)

print("Ranking scores shape:", scores.shape)
print("Classification logits shape:", logits.shape)
print("\nRanking scores:", scores.squeeze().tolist())
print("Classification probabilities:", F.softmax(logits, dim=1).squeeze().tolist())

Ranking scores shape: torch.Size([1, 5])
Classification logits shape: torch.Size([1, 2])

Ranking scores: [-0.11701613664627075, 0.016937240958213806, -0.7893982529640198, -0.10941161215305328, -0.33258140087127686]
Classification probabilities: [0.4085644483566284, 0.5914355516433716]


In [None]:
# Print the module tree (encoder plus the two heads) for inspection.
print(model)

IdiomModel(
  (encoder): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

# **Training**

In [None]:
def calculate_order_accuracy(expected_order, pred_order):
    """Score how close a predicted ranking is to the expected ranking.

    Every predicted item is penalized by how many positions it sits away from
    its place in ``expected_order``, normalized by the largest possible shift
    (``len(expected_order) - 1``). The result is one minus the mean penalty,
    so an identical ranking scores 1.0.

    Args:
        expected_order: Items in their correct order.
        pred_order: The same items in predicted order.

    Returns:
        The weighted accuracy as a float. An empty prediction has nothing
        misplaced and scores 1.0; a single-item ranking also scores 1.0.

    Raises:
        KeyError: If ``pred_order`` contains an item missing from
            ``expected_order``.
    """
    if not pred_order:
        # Nothing to penalize; also avoids dividing by a zero item count.
        return 1.0

    # Largest displacement an item can have. Clamped to 1 so that a
    # single-item ranking does not divide by zero.
    max_shift = max(len(expected_order) - 1, 1)

    # Map each item to its index in the expected ranking.
    expected_index = {item: idx for idx, item in enumerate(expected_order)}

    total_penalty = sum(
        abs(idx - expected_index[item]) / max_shift
        for idx, item in enumerate(pred_order)
    )
    return 1 - (total_penalty / len(pred_order))

In [None]:
def _pairwise_ranking_loss(scores, expected_order, criterion):
    """Margin loss over every (better, worse) image pair of one sample.

    ``scores`` is the ``(1, n_images)`` score row produced by the model and
    ``expected_order`` lists 1-based image positions, best first. For each
    pair where one image is expected to rank above another, the criterion
    asks the higher-ranked image's score to beat the other's by the margin.
    """
    n_ranked = len(expected_order)
    if n_ranked < 2:
        # No pair to compare: contribute zero while staying on the graph.
        return scores.sum() * 0.0

    # Scores reordered so that index 0 is the image expected to rank first.
    ranked_scores = scores[0, [pos - 1 for pos in expected_order]]
    better = [i for i in range(n_ranked) for _ in range(i + 1, n_ranked)]
    worse = [j for i in range(n_ranked) for j in range(i + 1, n_ranked)]
    higher, lower = ranked_scores[better], ranked_scores[worse]
    return criterion(higher, lower, torch.ones_like(higher))


def train_epoch(model, train_dataloader, optimizer, device, verbose=True):
    """Run one training epoch over ``train_dataloader``.

    Each batch is a list of sample dictionaries. Per sample the model yields
    ranking scores and class logits; the loss is the sum of a pairwise margin
    ranking loss and a cross-entropy classification loss, accumulated over
    the batch before a single optimizer step.

    The ranking loss compares the scores of image pairs with each other.
    Feeding the rank targets themselves into ``MarginRankingLoss`` as the
    second input would only push every score above a fixed value and could
    be satisfied without learning any ordering.

    Args:
        model: Model called as ``model(sentence, captions)`` returning
            ``(scores, logits)``.
        train_dataloader: Iterable of batches (lists of sample dicts).
        optimizer: Optimizer stepping the trainable parameters.
        device: Device on which the label tensors are created.
        verbose: Print a per-sample report when ``True`` (the default).

    Returns:
        ``(total_loss, true_labels, pred_labels)`` where the label lists hold
        the strings ``'idiomatic'`` / ``'literal'`` for every sample seen.
    """
    model.train()
    total_loss = 0
    criterion_ranking = nn.MarginRankingLoss(margin=0.2)
    criterion_classifier = nn.CrossEntropyLoss()

    true_labels = []
    pred_labels = []
    all_order_accuracies = []

    for batch in train_dataloader:
        if not batch:
            continue
        optimizer.zero_grad()

        batch_loss = 0
        for item in batch:
            captions = [img['caption'] for img in item['images']]
            scores, logits = model(item['sentence'], captions)

            pred_label = 'idiomatic' if F.softmax(logits, dim=1)[0][1] > 0.5 else 'literal'
            true_labels.append(item['label'])
            pred_labels.append(pred_label)

            label = 1 if item['label'] == 'idiomatic' else 0
            classification_loss = criterion_classifier(logits, torch.tensor([label], device=device))

            expected_order = item['expected_order']
            if not expected_order:
                # Rows without a gold ranking can only train the classifier.
                batch_loss += classification_loss
                continue

            # Get predicted order (1-based positions) based on scores
            score_values = scores[0].tolist()
            pred_order = sorted(
                range(1, len(score_values) + 1),
                key=lambda pos: score_values[pos - 1],
                reverse=True,
            )

            # Calculate weighted order accuracy
            order_accuracy = calculate_order_accuracy(expected_order, pred_order)
            all_order_accuracies.append(order_accuracy)

            ranking_loss = _pairwise_ranking_loss(scores, expected_order, criterion_ranking)
            batch_loss += (ranking_loss + classification_loss)

            if verbose:
                print("\n" + "="*50)
                print(f"Compound: {item['compound']}")
                print(f"Sentence: {item['sentence']}")
                print(f"Expected: {' -> '.join(map(str, expected_order))}")
                print(f"Predicted: {' -> '.join(map(str, pred_order))}")
                print(f"Weighted order accuracy: {order_accuracy:.3f}")
                print("="*50)

        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()

    # Calculate final metrics
    if all_order_accuracies:
        avg_order_accuracy = sum(all_order_accuracies) / len(all_order_accuracies)
    else:
        # No sample carried a gold ranking, so the average is undefined.
        avg_order_accuracy = float('nan')
    macro_f1 = f1_score([1 if label == 'idiomatic' else 0 for label in true_labels],
                       [1 if label == 'idiomatic' else 0 for label in pred_labels],
                       average='macro')

    print("\nOverall Training Metrics:")
    print("="*50)
    print(f"Average Weighted Order Accuracy: {avg_order_accuracy:.3f}")
    print(f"Macro F1 Score: {macro_f1:.3f}")
    print("="*50)

    return total_loss, true_labels, pred_labels

# **Evaluation**

In [None]:
def evaluate_model(model, test_loader, device, threshold=0.5):
    """Predict an image ranking and a sentence type for every sample.

    Args:
        model: Model called as ``model(sentence, captions)`` returning
            ``(scores, logits)``.
        test_loader: Iterable of batches, each a list of sample dicts.
        device: Accepted for signature symmetry; not read by this function.
        threshold: A sample is labelled ``'idiomatic'`` when the softmax
            probability of class 1 is strictly greater than this value.

    Returns:
        A list with one dict per sample holding the compound, the sentence,
        the image names ordered by descending score, the predicted type and
        the raw scores.
    """
    model.eval()
    results = []

    with torch.no_grad():
        samples = (sample for batch in test_loader for sample in batch)
        for sample in samples:
            captions = [image['caption'] for image in sample['images']]
            scores, logits = model(sample['sentence'], captions)

            # Rank image indices from highest to lowest score
            score_array = scores.squeeze().cpu().numpy()
            ranking = np.argsort(-score_array)
            ranked_names = [sample['images'][idx]['name'] for idx in ranking]

            # Probability of the idiomatic class compared with the threshold
            idiomatic_prob = F.softmax(logits.squeeze(), dim=0)[1].item()
            if idiomatic_prob > threshold:
                predicted_type = 'idiomatic'
            else:
                predicted_type = 'literal'

            results.append({
                'compound': sample['compound'],
                'sentence': sample['sentence'],
                'predicted_order': ranked_names,
                'predicted_type': predicted_type,
                'confidence_scores': score_array.tolist(),
            })

    return results

def save_predictions(predictions, output_file):
    """Write the prediction records to ``output_file`` as CSV without the index column."""
    pd.DataFrame(predictions).to_csv(output_file, index=False)
    print(f"\nPredictions saved to {output_file}")

# **Data Loading and Training Loop**

In [None]:
# Create datasets
# Create datasets
train_dataset = IdiomDataset("subtask_a_train.tsv")  # Your input file
test_dataset = IdiomDataset("subtask_a_dev.tsv")  # Your test file

# 80/20 train/validation split.
# A seeded generator keeps the split identical across kernel restarts, so
# results from different runs are comparable.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

train_dataloader = create_dataloader(train_subset, batch_size=2)
# NOTE(review): val_dataloader is built but not consumed by the loop below.
val_dataloader = create_dataloader(val_subset, batch_size=2)
test_dataloader = create_dataloader(test_dataset, batch_size=2)


# Training loop
epochs = 10
best_threshold = 0.5  # Default

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    _, true_labels, pred_labels = train_epoch(model, train_dataloader, optimizer, device)

    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, pred_labels))


Epoch 1/10

Compound: chicken feed
Sentence: Feed pets inside and do not leave pet, livestock or chicken feed outside.
Expected: 2 -> 4 -> 1 -> 5 -> 3
Predicted: 4 -> 5 -> 1 -> 2 -> 3
Weighted order accuracy: 0.700

Compound: two-way street
Sentence: In many cases, two-way streets result in twice as many pedestrian accidents as one way.
Expected: 1 -> 4 -> 3 -> 5 -> 2
Predicted: 4 -> 2 -> 1 -> 3 -> 5
Weighted order accuracy: 0.600

Compound: nest egg
Sentence: Some would be better off with no insurance and using the money saved to build a nest egg to see them through lean times or pay off their debt.
Expected: 1 -> 2 -> 3 -> 4 -> 5
Predicted: 1 -> 3 -> 4 -> 2 -> 5
Weighted order accuracy: 0.800

Compound: inner circle
Sentence: Formal or informal contacts with the prime minister or his inner circle are now greatly prized by the persuaders.
Expected: 1 -> 3 -> 2 -> 4 -> 5
Predicted: 4 -> 1 -> 2 -> 3 -> 5
Weighted order accuracy: 0.700

Compound: pipe dream
Sentence: The result is that 

In [None]:
# Generate predictions using threshold
# Generate predictions using threshold:
# score the dev loader with the trained model, then write the records to CSV.
print("\nGenerating predictions for dev set...")
predictions = evaluate_model(model, test_dataloader, device, threshold=best_threshold)
save_predictions(predictions, 'dev_predictions.csv')


Generating predictions for dev set...

Predictions saved to dev_predictions.csv


In [None]:
# Read the predictions back using the same relative path they were written
# to, so the cell does not depend on a Colab-specific absolute directory.
results = pd.read_csv('dev_predictions.csv')
display(results)

Unnamed: 0,compound,sentence,predicted_order,predicted_type,confidence_scores
0,monkey business,"Architecturally magnificent, it is as clean as...","['61570020623.png', '04129294826.png', '568752...",literal,"[1.9956674575805664, 1.4788403511047363, 1.654..."
1,grass roots,"From time to time, however, grass roots may tu...","['72263394122.png', '24221318591.png', '502789...",literal,"[2.007617950439453, 1.4447171688079834, 1.6336..."
2,marching orders,"The soldiers, their faces etched with determin...","['47713768923.png', '06066676443.png', '151833...",literal,"[2.0180211067199707, 1.4212193489074707, 1.565..."
3,panda car,We'd been waiting for ages before the panda ca...,"['52866539701.png', '29615068658.png', '422837...",literal,"[1.9957070350646973, 1.5021121501922607, 1.511..."
4,bread and butter,"It's Steve's bread and butter, and has been fo...","['96897123911.png', '29490290047.png', '780763...",idiomatic,"[1.970732569694519, 1.5331761837005615, 1.6334..."
5,chocolate teapot,Lindt has made a classy dark chocolate teapot ...,"['26164380699.png', '10077984673.png', '242493...",literal,"[2.0239853858947754, 1.4339900016784668, 1.657..."
6,pig's ear,"Thus, when it wrote the new partition table it...","['61623278645.png', '28970148854.png', '521676...",literal,"[1.9895310401916504, 1.4434893131256104, 1.629..."
7,best man,"All stories aside, Neil is a great mate and it...","['94431397208.png', '06108380384.png', '575043...",idiomatic,"[2.023242235183716, 1.4534589052200317, 1.6393..."
8,big cheese,I send Ticknor a big cheese which I long ago p...,"['67845423017.png', '11983691096.png', '227840...",literal,"[2.0410099029541016, 1.4384148120880127, 1.644..."
9,eager beaver,My staff and I were hungry to get young eager ...,"['55745563240.png', '25688411798.png', '518485...",literal,"[1.9515639543533325, 1.5524277687072754, 1.633..."
