# **Group Members**

In [None]:
# Ahmad Arif - 444002984

# Musab Iskandar - 444003841

# Yousef Koshak - 444000774

# This project focuses on advancing idiomatic representation in NLP models by implementing and improving upon methodologies discussed in selected scientific papers. The primary objective is to create a modular and organized codebase for idiomaticity representation, leveraging contrastive triplet loss and adaptive contrastive learning techniques. The dataset involves idioms in both literal and figurative contexts, aiming for tasks such as ranking and idiomaticity detection.

# **Importing Libraries**

In [None]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

# **Check for GPU/CPU availability**


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# **Dataset Preprocessing**

In [None]:
class IdiomDataset(Dataset):
    """Dataset of idiom sentences, each paired with five captioned images.

    The tab-separated input file is read with pandas. Each row must provide
    the columns ``compound``, ``sentence``, ``sentence_type``,
    ``expected_order`` and ``image{1..5}_name`` / ``image{1..5}_caption``.

    Attributes:
        data: the loaded DataFrame, with ``expected_order`` converted from a
            stringified list of image names to a list of 1-based positions.
        processed_data: one plain dict per row, as returned by ``__getitem__``.
    """

    def __init__(self, data_file):
        """Read ``data_file`` (TSV) and build the per-row sample dicts."""
        raw_frame = pd.read_csv(data_file, sep='\t')

        # The expected order has to be converted before the rows are unpacked.
        self.data = self.convert_expected_order(raw_frame)

        samples = []
        for _, row in self.data.iterrows():
            images = []
            for slot in range(1, 6):
                images.append({
                    'name': row[f'image{slot}_name'],
                    'caption': row[f'image{slot}_caption'],
                    'position': slot,
                })
            samples.append({
                'compound': row['compound'],
                'sentence': row['sentence'],
                'label': row['sentence_type'],
                'images': images,
                # Numeric positions (1-5), not image names.
                'expected_order': row['expected_order'],
            })
        self.processed_data = samples

    def convert_expected_order(self, df):
        """Replace ``expected_order`` image names with their 1-based positions.

        The column is first parsed from its string form with
        ``ast.literal_eval`` (missing values become an empty list). Each name
        is then mapped to the index ``i`` of the ``image{i}_name`` column
        holding it. ``df`` is modified in place and also returned.
        """
        def parse_order(raw_value):
            if pd.notna(raw_value):
                return ast.literal_eval(raw_value)
            return []

        def names_to_positions(row):
            slot_by_name = {}
            for slot in range(1, 6):
                slot_by_name[row[f'image{slot}_name']] = slot
            return [slot_by_name[name] for name in row['expected_order']]

        df['expected_order'] = df['expected_order'].apply(parse_order)
        df['expected_order'] = df.apply(names_to_positions, axis=1)
        return df

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the sample dict stored at index ``idx``."""
        return self.processed_data[idx]


def _identity_collate(batch):
    """Return the batch unchanged, i.e. as a plain list of dataset items."""
    return batch


def create_dataloader(dataset, batch_size, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that yields lists of raw items.

    Args:
        dataset: any map-style dataset (``__len__`` / ``__getitem__``).
        batch_size: number of items per yielded list.
        shuffle: reshuffle the data every epoch. Defaults to ``False``, which
            keeps the original fixed-order behaviour.

    Returns:
        A ``DataLoader`` whose batches are lists of the dataset's items; no
        tensor collation is applied.
    """
    # A named module-level function is used instead of a lambda so that the
    # collate function can be pickled if worker processes are ever enabled.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_identity_collate,
    )

In [None]:
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Credentials must never be hardcoded in a notebook. The Hugging Face token is
# read from the HF_TOKEN environment variable; when the variable is unset,
# None is passed (presumably sufficient for a public checkpoint -- confirm).
# NOTE(review): the access token that was previously committed in this cell
# should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModel.from_pretrained(model_name, token=hf_token).to(device)

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

# **Model Architecture**

In [None]:
class IdiomModel(nn.Module):
    """Sentence/caption encoder with an image-ranking head and an idiomaticity classifier.

    The encoder's ``pooler_output`` is taken for the sentence and for every
    caption. The negative Euclidean distance between the sentence and each
    caption forms a similarity vector, which feeds the ranking head. The same
    vector, concatenated with the sentence features, feeds the two-class
    idiom classifier.

    Args:
        base_model: encoder module whose output exposes ``pooler_output``.
        tokenizer: callable tokenizer matching ``base_model``.
        num_images: number of captions ranked per sentence (default 5).
        hidden_size: width of ``pooler_output``. When ``None`` it is read
            from ``base_model.config.hidden_size`` if available, else 768.
    """

    def __init__(self, base_model, tokenizer, num_images=5, hidden_size=None):
        super().__init__()
        self.encoder = base_model
        self.tokenizer = tokenizer
        self.triplet_loss = nn.TripletMarginLoss(margin=0.5)
        self.num_images = num_images

        if hidden_size is None:
            encoder_config = getattr(base_model, "config", None)
            hidden_size = getattr(encoder_config, "hidden_size", 768)

        # Process similarity scores together
        self.ranking_head = nn.Sequential(
            nn.Linear(num_images, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_images)
        )

        # Combine text features with similarity patterns
        self.idiom_classifier = nn.Sequential(
            nn.Linear(hidden_size + num_images, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )

    def tokenize_sentence(self, sentence):
        """Tokenize ``sentence`` into PyTorch tensors, truncated to 512 tokens.

        Dynamic padding is used instead of padding every input to 512 tokens,
        so the encoder no longer processes hundreds of pad tokens per text.
        Pad tokens are expected to be masked by the attention mask, so this
        should not change the pooled output -- confirm against the encoder.
        """
        return self.tokenizer(sentence, truncation=True, padding=True, max_length=512, return_tensors='pt')

    def _encode(self, text, device):
        """Return the encoder's ``pooler_output`` for a raw string or pre-tokenized input."""
        if isinstance(text, str):
            text = self.tokenize_sentence(text).to(device)
        return self.encoder(**text).pooler_output

    def forward(self, text_input, caption_inputs):
        """Score the captions against the sentence and classify idiomaticity.

        Args:
            text_input: the sentence, as a string or as already-tokenized
                encoder inputs.
            caption_inputs: ``num_images`` captions, each a string or
                already-tokenized encoder inputs.

        Returns:
            ``(scores, logits)``: ``scores`` has shape ``(1, num_images)``
            (ranking-head output, one score per caption) and ``logits`` has
            shape ``(1, 2)`` (idiom-classifier output).

        Raises:
            ValueError: if the number of captions differs from ``num_images``.
        """
        if len(caption_inputs) != self.num_images:
            raise ValueError(
                f"Expected {self.num_images} captions, got {len(caption_inputs)}"
            )

        # Look the device up once instead of once per encoded text.
        device = next(self.encoder.parameters()).device

        # Process text
        text_features = self._encode(text_input, device)

        # Process captions: one pooled row per caption.
        caption_features = torch.cat(
            [self._encode(caption, device) for caption in caption_inputs], dim=0
        )

        # Use negative Euclidean distance as similarity score, shape (1, num_images).
        similarity_scores = -torch.norm(text_features - caption_features, dim=1).unsqueeze(0)

        # Get final scores using ranking head
        scores = self.ranking_head(similarity_scores)

        # Combine text features with similarity pattern for classification
        combined_features = torch.cat([text_features, similarity_scores], dim=1)
        logits = self.idiom_classifier(combined_features)

        return scores, logits

In [None]:
# Wrap the pretrained encoder with the task heads. The isinstance guard keeps
# this cell idempotent: re-running it no longer wraps an IdiomModel inside
# another IdiomModel.
if not isinstance(model, IdiomModel):
    model = IdiomModel(model, tokenizer).to(device)

# Freeze encoder
for param in model.encoder.parameters():
    param.requires_grad = False

# Only parameters that still require gradients are handed to the optimizer.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

# Count trainable and total parameters
def count_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Returns:
        A two-line string with the trainable and total element counts, both
        formatted with thousands separators.
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return f"Trainable parameters: {trainable:,}\nTotal parameters: {total:,}"

# Show how many parameters remain trainable after freezing the encoder.
print(count_parameters(model))

Trainable parameters: 473,223
Total parameters: 278,516,871


In [None]:
# `count_parameters` is already defined in an earlier cell. It is reused here
# rather than redefined, because an identical second definition would silently
# shadow the first and the two copies could drift apart.
print(count_parameters(model))

Trainable parameters: 473,223
Total parameters: 278,516,871


In [None]:
# Print the module tree of the wrapped model (encoder plus the task heads).
print(model)

IdiomModel(
  (encoder): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

# **Training**

In [None]:
def calculate_order_accuracy(expected_order, pred_order):
    """Position-weighted agreement between a predicted and an expected ordering.

    Each item in ``pred_order`` is penalised by how far its predicted position
    is from its position in ``expected_order``, normalised by the largest
    possible shift (``len(expected_order) - 1``). The result is one minus the
    mean penalty, so an identical ordering scores 1.0.

    Args:
        expected_order: items in their ground-truth order.
        pred_order: the same items in predicted order.

    Returns:
        The weighted accuracy as a float. Returns 0.0 when either ordering is
        empty (nothing can be compared).

    Raises:
        KeyError: if ``pred_order`` contains an item absent from
            ``expected_order``.
    """
    n = len(expected_order)
    if n == 0 or len(pred_order) == 0:
        # Previously this path ended in a ZeroDivisionError / KeyError.
        return 0.0

    # Position of every item in the ground-truth ordering.
    expected_index = {item: idx for idx, item in enumerate(expected_order)}

    # With a single item the maximum shift is 0; clamp to 1 so the
    # normalisation cannot divide by zero (the penalty is 0 in that case).
    max_shift = max(n - 1, 1)

    total_penalty = 0
    for predicted_idx, item in enumerate(pred_order):
        position_diff = abs(predicted_idx - expected_index[item])
        total_penalty += position_diff / max_shift

    return 1 - (total_penalty / len(pred_order))

In [None]:
def _pairwise_ranking_loss(scores, expected_order, margin=0.5):
    """Margin ranking loss over every pair of images in the expected order.

    Args:
        scores: 1-D tensor of per-image scores; entry ``p - 1`` belongs to the
            image at 1-based position ``p``.
        expected_order: 1-based image positions, best match first.
        margin: score gap required between a better- and a worse-ranked image.

    Returns:
        Scalar tensor with the mean loss over all (better, worse) pairs, or a
        zero scalar when fewer than two images are ranked.
    """
    n = len(expected_order)
    if n < 2:
        return scores.new_zeros(())

    # For every i < j the image at expected_order[i] should outscore the one
    # at expected_order[j].
    better_idx = [expected_order[i] - 1 for i in range(n) for j in range(i + 1, n)]
    worse_idx = [expected_order[j] - 1 for i in range(n) for j in range(i + 1, n)]
    better = scores[torch.tensor(better_idx, device=scores.device)]
    worse = scores[torch.tensor(worse_idx, device=scores.device)]
    return F.margin_ranking_loss(better, worse, torch.ones_like(better), margin=margin)


def train_epoch(model, train_dataloader, optimizer, device, verbose=True):
    """Run one training epoch over ``train_dataloader``.

    For every item the model scores the five captions and classifies the
    sentence. The loss is the sum over the batch of a pairwise margin ranking
    loss on the caption scores (against ``expected_order``) and a
    cross-entropy loss on the idiomaticity logits.

    The earlier implementation applied a triplet loss to the first three
    caption embeddings in file order. That loss never involved the ranking
    head's scores, so the ranking head received no gradient. It also required
    a second encoder pass over every caption. Both problems are removed here.

    Args:
        model: model exposing ``encoder`` and returning ``(scores, logits)``
            when called with ``(sentence, captions)``.
        train_dataloader: iterable of batches, each a list of item dicts with
            the keys ``compound``, ``sentence``, ``label``, ``images`` and
            ``expected_order``.
        optimizer: optimizer over the trainable parameters.
        device: device on which the classification targets are created.
        verbose: print the per-item ranking details (default ``True``,
            matching the previous behaviour).

    Returns:
        ``(total_loss, true_labels, pred_labels)``: the summed batch losses
        as a float, and the true / predicted label strings for every item.
    """
    model.train()
    # A fully frozen encoder is a fixed feature extractor; keep it in eval
    # mode so its dropout layers do not add noise to the features.
    if not any(p.requires_grad for p in model.encoder.parameters()):
        model.encoder.eval()

    criterion_classifier = nn.CrossEntropyLoss()
    ranking_margin = 0.5

    total_loss = 0
    true_labels = []
    pred_labels = []
    all_order_accuracies = []

    for batch in train_dataloader:
        optimizer.zero_grad()
        batch_loss = 0

        for item in batch:
            captions = [img['caption'] for img in item['images']]
            scores, logits = model(item['sentence'], captions)
            item_scores = scores.squeeze(0)

            pred_label = 'idiomatic' if F.softmax(logits, dim=1)[0][1] > 0.5 else 'literal'
            true_labels.append(item['label'])
            pred_labels.append(pred_label)

            expected_order = item['expected_order']
            if expected_order:
                # Get predicted order based on scores (highest score first).
                score_values = item_scores.tolist()
                pred_order = sorted(
                    range(1, len(score_values) + 1),
                    key=lambda pos: score_values[pos - 1],
                    reverse=True,
                )

                # Calculate weighted order accuracy
                order_accuracy = calculate_order_accuracy(expected_order, pred_order)
                all_order_accuracies.append(order_accuracy)

                if verbose:
                    print("\n" + "="*50)
                    print(f"Compound: {item['compound']}")
                    print(f"Sentence: {item['sentence']}")
                    print(f"Expected: {' -> '.join(map(str, expected_order))}")
                    print(f"Predicted: {' -> '.join(map(str, pred_order))}")
                    print(f"Weighted order accuracy: {order_accuracy:.3f}")
                    print("="*50)

                ranking_loss = _pairwise_ranking_loss(item_scores, expected_order, ranking_margin)
            else:
                # No ground-truth ranking for this row: train the classifier only.
                ranking_loss = item_scores.new_zeros(())

            label = 1 if item['label'] == 'idiomatic' else 0
            target = torch.tensor([label]).to(device)
            classification_loss = criterion_classifier(logits, target)
            batch_loss = batch_loss + ranking_loss + classification_loss

        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()

    # Calculate final metrics (guarded so an empty loader does not crash).
    if all_order_accuracies:
        avg_order_accuracy = sum(all_order_accuracies) / len(all_order_accuracies)
    else:
        avg_order_accuracy = 0.0

    if true_labels:
        macro_f1 = f1_score([1 if label == 'idiomatic' else 0 for label in true_labels],
                            [1 if label == 'idiomatic' else 0 for label in pred_labels],
                            average='macro')
    else:
        macro_f1 = 0.0

    print("\nOverall Training Metrics:")
    print("="*50)
    print(f"Average Weighted Order Accuracy: {avg_order_accuracy:.3f}")
    print(f"Macro F1 Score: {macro_f1:.3f}")
    print("="*50)

    return total_loss, true_labels, pred_labels

# **Evaluation**

In [None]:
def evaluate_model(model, test_loader, device, threshold=0.5):
    """Predict the image ranking and the idiomaticity label for every item.

    Args:
        model: callable returning ``(scores, logits)`` for
            ``(sentence, captions)``; it is switched to eval mode first.
        test_loader: iterable of batches, each a list of item dicts with the
            keys ``compound``, ``sentence`` and ``images``.
        device: not referenced in the body; kept for the call signature.
        threshold: an item is labelled ``'idiomatic'`` when the softmax
            probability of class 1 is strictly greater than this value.

    Returns:
        A list with one dict per item: ``compound``, ``sentence``,
        ``predicted_order`` (image names, highest score first),
        ``predicted_type`` and ``confidence_scores`` (the raw scores).
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in test_loader:
            for sample in batch:
                images = sample['images']
                captions = [image['caption'] for image in images]
                scores, logits = model(sample['sentence'], captions)

                # Rank image indices by descending score.
                score_array = scores.squeeze().cpu().numpy()
                ranking = np.argsort(-score_array)
                ranked_names = [images[idx]['name'] for idx in ranking]

                # Probability of class 1 decides the label.
                idiomatic_prob = F.softmax(logits.squeeze(), dim=0)[1].item()
                if idiomatic_prob > threshold:
                    predicted_type = 'idiomatic'
                else:
                    predicted_type = 'literal'

                collected.append({
                    'compound': sample['compound'],
                    'sentence': sample['sentence'],
                    'predicted_order': ranked_names,
                    'predicted_type': predicted_type,
                    'confidence_scores': score_array.tolist(),
                })

    return collected

def save_predictions(predictions, output_file):
    """Write the prediction records to ``output_file`` as CSV.

    Args:
        predictions: records accepted by ``pd.DataFrame`` (e.g. a list of
            dicts); each key becomes a column.
        output_file: destination path. The row index is not written.
    """
    pd.DataFrame(predictions).to_csv(output_file, index=False)
    print(f"\nPredictions saved to {output_file}")

# **Data Loading and Training Loop**

In [None]:
# Create datasets
train_dataset = IdiomDataset("subtask_a_train.tsv")  # Your input file
test_dataset = IdiomDataset("subtask_a_dev.tsv")  # Your test file

# split
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

train_dataloader = create_dataloader(train_subset, batch_size=2)
val_dataloader = create_dataloader(val_subset, batch_size=2)
test_dataloader = create_dataloader(test_dataset, batch_size=2)


# Training loop
epochs = 10
best_threshold = 0.5  # Default

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    _, true_labels, pred_labels = train_epoch(model, train_dataloader, optimizer, device)

    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, pred_labels))


Epoch 1/10

Compound: baby blues
Sentence: However, the symptoms of baby blues are mild and usually disappear after a couple of weeks.
Expected: 1 -> 3 -> 2 -> 4 -> 5
Predicted: 4 -> 5 -> 2 -> 3 -> 1
Weighted order accuracy: 0.400

Compound: piece of cake
Sentence: He picked up the piece of cake and took a large bite, chewing noisily.
Expected: 3 -> 1 -> 2 -> 5 -> 4
Predicted: 5 -> 1 -> 4 -> 2 -> 3
Weighted order accuracy: 0.500

Compound: cat's eyes
Sentence: American roads in the South do have cat's eyes but we don't have them in regions with snow because the plows would scrape them up every winter.
Expected: 3 -> 1 -> 2 -> 5 -> 4
Predicted: 5 -> 3 -> 4 -> 2 -> 1
Weighted order accuracy: 0.500

Compound: rocket science
Sentence: This isn't rocket science: he talks about basic business skills like negotiation, time management and creativity.
Expected: 5 -> 3 -> 4 -> 1 -> 2
Predicted: 4 -> 3 -> 5 -> 1 -> 2
Weighted order accuracy: 0.800

Compound: white elephant
Sentence: Miami's orig

In [None]:
# Generate predictions using threshold
print("\nGenerating predictions for dev set...")
predictions = evaluate_model(model, test_dataloader, device, threshold=best_threshold)
save_predictions(predictions, 'dev_predictions.csv')


Generating predictions for dev set...

Predictions saved to dev_predictions.csv


In [None]:
# Read the predictions back from the same relative path they were saved to.
# The previous absolute '/content/...' path only resolved on Colab, where the
# working directory is presumably /content.
results = pd.read_csv('dev_predictions.csv')
display(results)

Unnamed: 0,compound,sentence,predicted_order,predicted_type,confidence_scores
0,monkey business,"Architecturally magnificent, it is as clean as...","['94990180734.png', '61570020623.png', '041292...",idiomatic,"[0.023638322949409485, -0.398685097694397, -0...."
1,grass roots,"From time to time, however, grass roots may tu...","['88610497135.png', '24221318591.png', '722633...",idiomatic,"[0.05026097595691681, -0.40419119596481323, -0..."
2,marching orders,"The soldiers, their faces etched with determin...","['65125915005.png', '47713768923.png', '060666...",idiomatic,"[0.0336979478597641, -0.41178202629089355, -0...."
3,panda car,We'd been waiting for ages before the panda ca...,"['63963578494.png', '52866539701.png', '296150...",idiomatic,"[0.014060094952583313, -0.3933364152908325, -0..."
4,bread and butter,"It's Steve's bread and butter, and has been fo...","['98148844665.png', '96897123911.png', '294902...",idiomatic,"[0.004512980580329895, -0.3675770163536072, -0..."
5,chocolate teapot,Lindt has made a classy dark chocolate teapot ...,"['69053841503.png', '10077984673.png', '261643...",idiomatic,"[0.14190296828746796, -0.4403229355812073, -0...."
6,pig's ear,"Thus, when it wrote the new partition table it...","['76297721311.png', '61623278645.png', '289701...",idiomatic,"[0.03372420370578766, -0.48626816272735596, -0..."
7,best man,"All stories aside, Neil is a great mate and it...","['95555063454.png', '94431397208.png', '061083...",idiomatic,"[0.041716501116752625, -0.4505104422569275, -0..."
8,big cheese,I send Ticknor a big cheese which I long ago p...,"['83903589871.png', '11983691096.png', '678454...",idiomatic,"[0.12604857981204987, -0.47187817096710205, -0..."
9,eager beaver,My staff and I were hungry to get young eager ...,"['97713215186.png', '25688411798.png', '557455...",idiomatic,"[0.04884962737560272, -0.4033234119415283, -0...."
