In [16]:
# ================================
# Imports
# ================================

import numpy as np
import pandas as pd
import time
import os
import nltk
import yaml
import wandb
import matplotlib.pyplot as plt
import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr


In [17]:
# ================================
# Setup
# ================================


nltk.download('punkt')


nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# ================================
# Preprocessing Functions
# ================================

def preprocess_sentence_spacy(sentence, tokenize=True):
    """
    Preprocesează o propoziție folosind spaCy.

    Args:
        sentence (str): Propoziția de preprocesat.
        tokenize (bool): Dacă este True, se tokenizează propoziția.

    Returns:
        list: Lista de tokeni.
    """
    if tokenize:
        doc = nlp(sentence.lower())
        return [token.text for token in doc]
    else:
        return sentence.lower().split()


In [None]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

"""
This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf
"""

import numpy as np
import time

import torch
import torch.nn as nn


class InferSent(nn.Module):

    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        # either all weights are on cpu or they are on gpu
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        # sent_len: [max_len, ..., min_len] (bsize)
        # sent: (seqlen x bsize x worddim)
        sent, sent_len = sent_tuple

        # Sort by length (keep idx)
        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
        sent_len_sorted = sent_len_sorted.copy()
        idx_unsort = np.argsort(idx_sort)

        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_sort)
        sent = sent.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_unsort)
        sent_output = sent_output.index_select(1, idx_unsort)

        # Pooling
        if self.pool_type == "mean":
            sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda()
            emb = torch.sum(sent_output, 0).squeeze(0)
            emb = emb / sent_len.expand_as(emb)
        elif self.pool_type == "max":
            if not self.max_pad:
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
            if emb.ndimension() == 3:
                emb = emb.squeeze(0)
                assert emb.ndimension() == 2

        return emb

    def set_w2v_path(self, w2v_path):
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        k = 0
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if k <= K:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                    k += 1
                if k > K:
                    if word in [self.bos, self.eos]:
                        word_vec[word] = np.fromstring(vec, sep=' ')

                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (K))

    def update_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # keep only new words
        for word in self.word_vec:
            if word in word_dict:
                del word_dict[word]

        # udpate vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = []
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))

        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        return [token.text for token in nlp(s)]

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        if not tokenize:
            # sentences este o listă de liste de tokeni
            sentences = [[self.bos] + s + [self.eos] for s in sentences]
        else:
            # sentences este o listă de șiruri de caractere
            sentences = [[self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
                           Replacing by "%s"..' % (sentences[i], i, self.eos))
                sf = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                    n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
        sentences = [sentences[i] for i in idx_sort]

        return sentences, lengths, idx_sort


    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        tic = time.time()

        # Pregătirea eșantioanelor
        sentences, lengths, idx_sort = self.prepare_samples(
            sentences, bsize, tokenize, verbose)

        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            batch_sentences = sentences[stidx:stidx + bsize]
            batch_lengths = lengths[stidx:stidx + bsize]

            batch = self.get_batch(batch_sentences)
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch_embeddings = self.forward((batch, batch_lengths)).data.cpu().numpy()
            embeddings.append(batch_embeddings)
        embeddings = np.vstack(embeddings)

        # Un-sort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):

        sent = sent.split() if not tokenize else self.tokenize(sent)
        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]

        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing \
                           by "%s %s"..' % (sent, self.bos, self.eos))
        batch = self.get_batch(sent)

        if self.is_cuda():
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs

In [2]:
import pandas as pd
import spacy
import os

# Încarcă modelul de limbă în spaCy
nlp = spacy.load('en_core_web_sm')

def preprocess_sentence_spacy(sentence, tokenize=True):
    if tokenize:
        doc = nlp(sentence.lower())
        return [token.text for token in doc]
    else:
        return sentence.lower().split()

# Încarcă setul de date
data = pd.read_csv(r'C:\facultate an 3\projects-simquery\data\sts_train.csv', delimiter='\t')

# Filtrează datele pentru a elimina rândurile cu valori lipsă
data = data.dropna(subset=['sent_1', 'sent_2', 'sim'])

# Extrage propozițiile și scorurile
sentences_1 = data['sent_1'].tolist()
sentences_2 = data['sent_2'].tolist()
similarities = data['sim'].tolist()

# Tokenizează propozițiile folosind spaCy
sentences_1 = [preprocess_sentence_spacy(s) for s in sentences_1]
sentences_2 = [preprocess_sentence_spacy(s) for s in sentences_2]

print("Tokenization successful using spaCy!")


Tokenization successful using spaCy!


In [3]:
import torch
import nltk
nltk.download('punkt')

# Configurarea parametrilor modelului
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1,
}

model = InferSent(params_model)

model.set_w2v_path(r'C:\facultate an 3\projects-simquery\GloVe\glove.840B.300d.txt')

# Construirea vocabularului pe baza propozițiilor din setul de date
sentences = [' '.join(s) for s in sentences_1 + sentences_2]  # combină tokenii înapoi în propoziții
model.build_vocab(sentences, tokenize=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Found 11220(/12518) words with w2v vectors
Vocab size : 11220


In [4]:
# Continuă să combini tokenii înapoi în propoziții
sentences_1_str = [' '.join(s) for s in sentences_1]
sentences_2_str = [' '.join(s) for s in sentences_2]

embeddings_1 = model.encode(sentences_1_str, tokenize=True)
embeddings_2 = model.encode(sentences_2_str, tokenize=True)

print("Embeddings generated successfully!")


Embeddings generated successfully!


In [5]:
import numpy as np

# Convertește embedding-urile și scorurile în numpy arrays
X1 = np.array(embeddings_1)
X2 = np.array(embeddings_2)
y = np.array(similarities).astype(float)

# Normalizarea scorurilor
y = y / 5.0  # Dacă scorurile sunt între 0 și 5


In [6]:
from sklearn.model_selection import train_test_split

X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)


In [7]:
class SimilarityModel(nn.Module):
    def __init__(self, input_size):
        super(SimilarityModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size * 4, 1024),  # Increased input size for more features
            nn.ReLU(),
            nn.Dropout(0.5),  # Added dropout for regularization
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 1)  # Output layer
        )
        
    def forward(self, x1, x2):
        # Compute interaction features
        diff = torch.abs(x1 - x2)
        prod = x1 * x2
        features = torch.cat([x1, x2, diff, prod], dim=1)
        output = self.fc(features)
        return output.squeeze()

In [9]:
from torch.utils.data import Dataset, DataLoader

class STSDataset(Dataset):
    def __init__(self, embeddings1, embeddings2, similarities):
        self.embeddings1 = embeddings1
        self.embeddings2 = embeddings2
        self.similarities = similarities
        
    def __len__(self):
        return len(self.similarities)
    
    def __getitem__(self, idx):
        x1 = self.embeddings1[idx]
        x2 = self.embeddings2[idx]
        y = self.similarities[idx]
        return x1, x2, y

# Create dataset instances
train_dataset = STSDataset(X1_train, X2_train, y_train)
val_dataset = STSDataset(X1_val, X2_val, y_val)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [15]:
# ================================
# Training Setup with wandb
# ================================

# Ensure that all necessary modules are imported
import yaml
import wandb
import torch.optim as optim  # Import optim module

# Load YAML configuration
with open(r"C:\facultate an 3\projects-simquery\config.yaml", "r") as file:
    config_data = yaml.safe_load(file)

# Initialize wandb
wandb.init(
    project="inferSent-project",
    config={
        "learning_rate": 1e-4,
        "epochs": config_data["model"]["parameters"]["epochs"],
        "batch_size": batch_size,
        "model_version": 1,
        "word_emb_dim": 300,
        "enc_lstm_dim": 2048,
        "pool_type": 'max',
        "dropout": 0.5,  # Updated dropout rate
        "accuracy_threshold": 0.5
    },
    name=config_data["model"]["name"],
    reinit=True
)
config = wandb.config

num_epochs = config.epochs
threshold = config.accuracy_threshold
best_val_loss = float('inf')
patience = 5  # Increased patience for early stopping
trigger_times = 0

# Instantiate the similarity model
input_size = X1_train.shape[1]
model_sim = SimilarityModel(input_size)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Changed loss function for binary classification
optimizer = optim.Adam(model_sim.parameters(), lr=config.learning_rate, weight_decay=1e-5)  # Added weight decay

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                 factor=0.1, patience=2, verbose=True)

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_sim = model_sim.to(device)

# Initialize lists to store metrics
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

# ================================
# Training and Validation Functions
# ================================

def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    total_correct = 0
    all_preds = []
    all_labels = []
    
    for x1_batch, x2_batch, y_batch in loader:
        x1_batch = torch.tensor(x1_batch).to(device).float()
        x2_batch = torch.tensor(x2_batch).to(device).float()
        y_batch = torch.tensor(y_batch).to(device).float()

        optimizer.zero_grad()
        outputs = model(x1_batch, x2_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x1_batch.size(0)

        # Calculate accuracy
        preds = torch.sigmoid(outputs).detach().cpu().numpy()
        labels = y_batch.detach().cpu().numpy()
        preds_binary = (preds >= threshold).astype(int)
        labels_binary = (labels >= threshold).astype(int)
        total_correct += (preds_binary == labels_binary).sum()

        # Store predictions and labels
        all_preds.extend(preds_binary)
        all_labels.extend(labels_binary)
    
    avg_loss = total_loss / len(loader.dataset)
    avg_accuracy = total_correct / len(loader.dataset)
    
    return avg_loss, avg_accuracy, all_preds, all_labels

def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x1_batch, x2_batch, y_batch in loader:
            x1_batch = torch.tensor(x1_batch).to(device).float()
            x2_batch = torch.tensor(x2_batch).to(device).float()
            y_batch = torch.tensor(y_batch).to(device).float()

            outputs = model(x1_batch, x2_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item() * x1_batch.size(0)

            preds = torch.sigmoid(outputs).detach().cpu().numpy()
            labels = y_batch.detach().cpu().numpy()
            preds_binary = (preds >= threshold).astype(int)
            labels_binary = (labels >= threshold).astype(int)
            total_correct += (preds_binary == labels_binary).sum()

            # Store predictions and labels
            all_preds.extend(preds_binary)
            all_labels.extend(labels_binary)
    
    avg_loss = total_loss / len(loader.dataset)
    avg_accuracy = total_correct / len(loader.dataset)
    
    return avg_loss, avg_accuracy, all_preds, all_labels

# ================================
# Training Loop
# ================================

for epoch in range(num_epochs):
    train_loss, train_acc, train_preds, train_labels = train(model_sim, train_loader, optimizer, criterion, device)
    val_loss, val_acc, val_preds, val_labels = validate(model_sim, val_loader, criterion, device)

    # Save metrics
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}, "
          f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
    
    # Log metrics to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "val_loss": val_loss,
        "val_accuracy": val_acc
    })
    
    # Learning rate scheduler step
    scheduler.step(val_loss)
    
    # Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        model_path = f"best_model_epoch_{epoch+1}.pth"
        torch.save(model_sim.state_dict(), model_path)
        print(f"Validation loss decreased. Model saved to {model_path}")
    else:
        trigger_times += 1
        print(f"No improvement in validation loss for {trigger_times} epochs.")
        if trigger_times >= patience:
            print("Early stopping triggered!")
            break


  x1_batch = torch.tensor(x1_batch).to(device).float()
  x2_batch = torch.tensor(x2_batch).to(device).float()
  y_batch = torch.tensor(y_batch).to(device).float()
  x1_batch = torch.tensor(x1_batch).to(device).float()
  x2_batch = torch.tensor(x2_batch).to(device).float()
  y_batch = torch.tensor(y_batch).to(device).float()


Epoch 1/40, Training Loss: 0.6775, Training Accuracy: 0.6387, Validation Loss: 0.6514, Validation Accuracy: 0.7493
Validation loss decreased. Model saved to best_model_epoch_1.pth
Epoch 2/40, Training Loss: 0.6330, Training Accuracy: 0.7423, Validation Loss: 0.6078, Validation Accuracy: 0.7711
Validation loss decreased. Model saved to best_model_epoch_2.pth
Epoch 3/40, Training Loss: 0.6121, Training Accuracy: 0.7516, Validation Loss: 0.6079, Validation Accuracy: 0.7798
No improvement in validation loss for 1 epochs.
Epoch 4/40, Training Loss: 0.6013, Training Accuracy: 0.7651, Validation Loss: 0.5954, Validation Accuracy: 0.7859
Validation loss decreased. Model saved to best_model_epoch_4.pth
Epoch 5/40, Training Loss: 0.5946, Training Accuracy: 0.7725, Validation Loss: 0.5943, Validation Accuracy: 0.7807
Validation loss decreased. Model saved to best_model_epoch_5.pth
Epoch 6/40, Training Loss: 0.5888, Training Accuracy: 0.7869, Validation Loss: 0.5854, Validation Accuracy: 0.7868
Va

In [None]:
conf_matrix = confusion_matrix(val_labels_binned, val_preds_binned)
accuracy = accuracy_score(val_labels_binned, val_preds_binned)
precision = precision_score(val_labels_binned, val_preds_binned, average='binary')
recall = recall_score(val_labels_binned, val_preds_binned, average='binary')
f1 = f1_score(val_labels_binned, val_preds_binned, average='binary')

print(f"Confusion Matrix Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

NameError: name 'precision_score' is not defined

In [None]:
test_data = [
    {"premise": "The cat is on the mat.", "hypothesis": "A cat is sitting on a rug.", "true_score": 4.5},
    {"premise": "The sun is shining brightly.", "hypothesis": "It is raining heavily.", "true_score": 0.0},
    {"premise": "A man is playing the guitar.", "hypothesis": "A person is strumming a musical instrument.", "true_score": 4.8},
    {"premise": "She is reading a book.", "hypothesis": "She is watching a movie.", "true_score": 1.0},
    {"premise": "The car is red.", "hypothesis": "The vehicle is blue.", "true_score": 0.5}
]

# Convert test data into tensors (simulate preprocessing real test data)
test_premises = [example["premise"] for example in test_data]
test_hypotheses = [example["hypothesis"] for example in test_data]
true_scores = torch.tensor([example["true_score"] for example in test_data]).float()

# Tokenize and embed the test premises and hypotheses (replace with your embedding method)
# For demonstration, random embeddings are used (replace with actual embeddings)
test_premises_embedded = torch.tensor([embeddings_1[i % len(embeddings_1)] for i in range(len(test_premises))]).float()
test_hypotheses_embedded = torch.tensor([embeddings_1[i % len(embeddings_1)] for i in range(len(test_hypotheses))]).float()

# Move to the appropriate device
test_premises_embedded = test_premises_embedded.to(device)
test_hypotheses_embedded = test_hypotheses_embedded.to(device)
true_scores = true_scores.to(device)

# Evaluate the model on test data
model_sim.eval()
with torch.no_grad():
    predicted_scores = model_sim(test_premises_embedded, test_hypotheses_embedded).squeeze()

# Compare predictions with true scores
print("Test Results:")
for i, (premise, hypothesis, true_score, predicted_score) in enumerate(zip(test_premises, test_hypotheses, true_scores, predicted_scores)):
    print(f"Example {i+1}:")
    print(f"  Premise: {premise}")
    print(f"  Hypothesis: {hypothesis}")
    print(f"  True Score: {true_score.item():.2f}")
    print(f"  Predicted Score: {predicted_score.item():.2f}")
    print(f"  Error: {abs(true_score.item() - predicted_score.item()):.2f}")
    print("-" * 40)


Test Results:
Example 1:
  Premise: The cat is on the mat.
  Hypothesis: A cat is sitting on a rug.
  True Score: 4.50
  Predicted Score: 0.92
  Error: 3.58
----------------------------------------
Example 2:
  Premise: The sun is shining brightly.
  Hypothesis: It is raining heavily.
  True Score: 0.00
  Predicted Score: 0.89
  Error: 0.89
----------------------------------------
Example 3:
  Premise: A man is playing the guitar.
  Hypothesis: A person is strumming a musical instrument.
  True Score: 4.80
  Predicted Score: 1.08
  Error: 3.72
----------------------------------------
Example 4:
  Premise: She is reading a book.
  Hypothesis: She is watching a movie.
  True Score: 1.00
  Predicted Score: 0.93
  Error: 0.07
----------------------------------------
Example 5:
  Premise: The car is red.
  Hypothesis: The vehicle is blue.
  True Score: 0.50
  Predicted Score: 0.82
  Error: 0.32
----------------------------------------


  test_premises_embedded = torch.tensor([embeddings_1[i % len(embeddings_1)] for i in range(len(test_premises))]).float()


In [None]:
test_data = [
    {"premise": "The cat is on the mat.", "hypothesis": "A cat is sitting on a rug.", "true_label": 1},
    {"premise": "The sun is shining brightly.", "hypothesis": "It is raining heavily.", "true_label": 0},
    {"premise": "A man is playing the guitar.", "hypothesis": "A person is strumming a musical instrument.", "true_label": 1},
    {"premise": "She is reading a book.", "hypothesis": "She is watching a movie.", "true_label": 0},
    {"premise": "The car is red.", "hypothesis": "The vehicle is blue.", "true_label": 0}
]

test_premises = [example["premise"] for example in test_data]
test_hypotheses = [example["hypothesis"] for example in test_data]
true_labels = torch.tensor([example["true_label"] for example in test_data]).float()


test_premises_embedded = torch.tensor([embeddings_1[i % len(embeddings_1)] for i in range(len(test_premises))]).float()
test_hypotheses_embedded = torch.tensor([embeddings_1[i % len(embeddings_1)] for i in range(len(test_hypotheses))]).float()


test_premises_embedded = test_premises_embedded.to(device)
test_hypotheses_embedded = test_hypotheses_embedded.to(device)
true_labels = true_labels.to(device)

model_sim.eval()
with torch.no_grad():
    predictions = model_sim(test_premises_embedded, test_hypotheses_embedded)
    predicted_labels = (predictions >= threshold).float()

for i, (premise, hypothesis, true_label, predicted_label) in enumerate(zip(test_premises, test_hypotheses, true_labels, predicted_labels)):
    print(f"Example {i+1}:")
    print(f"  Premise: {premise}")
    print(f"  Hypothesis: {hypothesis}")
    print(f"  True Label: {true_label.item()}")
    print(f"  Predicted Label: {predicted_label.item()}")
    print(f"  {'Correct' if true_label.item() == predicted_label.item() else 'Incorrect'}")
    print("-" * 40)


Example 1:
  Premise: The cat is on the mat.
  Hypothesis: A cat is sitting on a rug.
  True Label: 1.0
  Predicted Label: 1.0
  Correct
----------------------------------------
Example 2:
  Premise: The sun is shining brightly.
  Hypothesis: It is raining heavily.
  True Label: 0.0
  Predicted Label: 1.0
  Incorrect
----------------------------------------
Example 3:
  Premise: A man is playing the guitar.
  Hypothesis: A person is strumming a musical instrument.
  True Label: 1.0
  Predicted Label: 1.0
  Correct
----------------------------------------
Example 4:
  Premise: She is reading a book.
  Hypothesis: She is watching a movie.
  True Label: 0.0
  Predicted Label: 1.0
  Incorrect
----------------------------------------
Example 5:
  Premise: The car is red.
  Hypothesis: The vehicle is blue.
  True Label: 0.0
  Predicted Label: 1.0
  Incorrect
----------------------------------------
