In [1]:
import sys
print(sys.executable)

D:\Projects\TextSimilarity\venv\Scripts\python.exe


In [3]:
from transformers import BertModel, BertTokenizer, AdamW
import torch
import torch.nn as nn
from datasets import load_dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
data_train = load_dataset("stsb_multi_mt", name="en", split="train")
data_test = load_dataset("stsb_multi_mt", name="en", split="test")
data_dev = load_dataset("stsb_multi_mt", name="en", split="dev")

# data_train = data_train.select(range(10))

In [6]:
print(len(data_train), len(data_test), len(data_dev))
print(data_train['sentence1'][0])
print(data_train['sentence2'][0])
print(data_train['similarity_score'][0])

5749 1379 1500
A plane is taking off.
An air plane is taking off.
5.0


In [7]:
class SentenceSimilarityModel(nn.Module):
    def __init__(self, model_name):
        super(SentenceSimilarityModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regression_head = nn.Linear(self.bert.config.hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        # TODO: After BERT output the encoding, need to add more layers to get the sim_score?
        sim_score = self.regression_head(cls_output)
        return sim_score

In [68]:
# Load model and tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = SentenceSimilarityModel(model_name)
model.to(device)
# Assume 'sentences1' and 'sentences2' are your paired sentences
# and 'similarity_scores' are your human-annotated similarity scores
inputs = tokenizer(data_train['sentence1'], data_train['sentence2'], padding=True, truncation=True, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
similarity_scores = torch.tensor(data_train['similarity_score'])

batch_size = 64  # You can change the batch size as needed
train_data = torch.utils.data.TensorDataset(input_ids, attention_mask, similarity_scores)
train_loader = DataLoader(train_data, batch_size=batch_size)

# Loss and Optimizer
loss_fn = nn.MSELoss()  # Mean Squared Error Loss
optimizer = AdamW(model.parameters(), lr=5e-5)
# scheduler = StepLR(optimizer, step_size=3, gamma=0.1)

In [69]:
num_epochs = 5
for epoch in range(num_epochs):
    # Inner loop for mini-batches with tqdm progress bar
    loop = tqdm(train_loader, leave=True)  # leave=True ensures that the progress bars are replaced with new ones at each epoch
    for batch in loop:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        model.train()
        
        # Forward pass
        predicted_similarity_scores = model(input_ids, attention_mask)
        
        # Compute loss
        loss = loss_fn(predicted_similarity_scores.squeeze(), labels.float())
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update the progress bar description
        loop.set_description(f'Epoch {epoch + 1}/{num_epochs}')
        loop.set_postfix(loss=loss.item())
    # scheduler.step()

Epoch 1/5: 100%|██████████| 90/90 [01:45<00:00,  1.17s/it, loss=1.75] 
Epoch 2/5: 100%|██████████| 90/90 [02:04<00:00,  1.38s/it, loss=0.577]
Epoch 3/5: 100%|██████████| 90/90 [01:53<00:00,  1.26s/it, loss=0.415]
Epoch 4/5: 100%|██████████| 90/90 [01:53<00:00,  1.26s/it, loss=0.228]
Epoch 5/5: 100%|██████████| 90/90 [01:49<00:00,  1.22s/it, loss=0.26] 


In [70]:
# Assuming you've already loaded the validation data into `data_dev`
inputs_dev = tokenizer(data_dev['sentence1'], data_dev['sentence2'], padding=True, truncation=True, return_tensors='pt')
input_ids_dev = inputs_dev['input_ids']
attention_mask_dev = inputs_dev['attention_mask']
similarity_scores_dev = torch.tensor(data_dev['similarity_score'])

dev_data = torch.utils.data.TensorDataset(input_ids_dev, attention_mask_dev, similarity_scores_dev)
dev_loader = DataLoader(dev_data, batch_size=batch_size)

def compute_validation_loss(model, dev_loader, loss_fn):
    model.eval()  # set the model to evaluation mode
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():  # deactivate autograd engine to reduce memory usage and speed up computations
        for batch in dev_loader:
            input_ids, attention_mask, labels = batch

            # If using GPU
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass
            predicted_similarity_scores = model(input_ids, attention_mask)

            # Compute loss
            loss = loss_fn(predicted_similarity_scores.squeeze(), labels.float())
            total_loss += loss.item()
            num_batches += 1

    return total_loss / num_batches

# Compute the validation loss
val_loss = compute_validation_loss(model, dev_loader, loss_fn)
print(f'Validation Loss: {val_loss}')


Validation Loss: 0.7153321790198485


In [73]:
def get_similarity_score(model, sentence1, sentence2, tokenizer):
    """
    Compute the similarity score for two sentences using a trained model.
    
    Args:
    model (nn.Module): The trained model.
    sentence1 (str): The first sentence.
    sentence2 (str): The second sentence.
    tokenizer (BertTokenizer): The tokenizer.
    
    Returns:
    float: The similarity score between the two sentences.
    """
    # Tokenize the sentences
    inputs = tokenizer(sentence1, sentence2, padding=True, truncation=True, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    print(input_ids, attention_mask)
    
    # Compute similarity score
    with torch.no_grad():  # Deactivates autograd, reduces memory usage and speeds up computations
        sim_score = model(input_ids, attention_mask)
    
    # Convert the similarity score to a single float and return it
    return sim_score.item()

In [88]:
model.eval()  # Set the model to evaluation mode

sentence1 = "The black dog is running through the snow."
sentence2 = "A race car driver is driving his car through the mud."
similarity_score = get_similarity_score(model, sentence1, sentence2, tokenizer)

print("Similarity Score:", similarity_score)

tensor([[  101, 10117, 15045, 17835, 10124, 18020, 11222, 10105, 64321,   119,
           102,   138, 14025, 13000, 25926, 10124, 37897, 10226, 13000, 11222,
         10105, 12361, 10162,   119,   102]], device='cuda:0') tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')
Similarity Score: 0.4855020046234131


In [43]:
for name, param in model.regression_head.named_parameters():
    print(name, param.data)

weight tensor([[-9.8576e-03, -8.6186e-02, -1.9419e-02, -5.8699e-02, -6.8748e-02,
          1.2147e-03,  7.1022e-02, -7.2849e-02, -1.3442e-02, -8.5468e-02,
         -8.9295e-02,  8.7370e-02,  9.4590e-02,  2.0929e-02,  4.2192e-02,
          8.0297e-02, -3.4667e-03, -7.2941e-02, -4.7128e-03, -2.2853e-02,
         -1.3593e-01, -1.3975e-02,  1.0986e-01,  8.2436e-02,  9.2455e-02,
          1.4437e-02, -9.6229e-02,  7.1659e-04, -2.0385e-03,  8.1047e-02,
         -6.9119e-02,  6.8833e-03, -5.5303e-02,  7.0761e-02, -8.9178e-03,
         -4.5705e-02,  1.3978e-02, -2.7497e-02,  1.3547e-01,  8.3929e-02,
          1.0169e-01,  2.1116e-02, -1.4193e-02, -1.0319e-02, -4.5860e-02,
         -1.1053e-02,  9.9705e-02,  2.9289e-02, -8.8129e-02,  1.2879e-01,
          1.1823e-01, -1.0034e-02, -1.4605e-01, -1.5781e-03,  4.6638e-02,
          5.7965e-03, -7.6121e-02,  9.6768e-04, -5.2582e-02, -1.6819e-03,
         -5.5089e-02,  1.1726e-01, -1.1702e-02,  1.1708e-02, -7.1751e-03,
          9.1619e-02, -5.5705e-

In [50]:
sample_tensor = torch.randn(size=(1, model.bert.config.hidden_size)).to(device)  # Create a random tensor with the appropriate size
with torch.no_grad():
    print(model.regression_head(sample_tensor))

tensor([[2.3802]], device='cuda:0')


In [71]:
input_ids_tensor = torch.tensor([[  101, 14535, 10105, 13192, 10393, 10115,   112,   188, 10590, 33626,
           119,   102, 10117, 12229,   118, 33626, 46291, 46935, 13663,   102]])
attention_mask_tensor = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

# Check if CUDA is available and move tensors to GPU if available, else keep them on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids_tensor = input_ids_tensor.to(device)
attention_mask_tensor = attention_mask_tensor.to(device)

# Move your model to the same device
model = model.to(device)

# Pass tensors through model.bert
with torch.no_grad():
    outputs = model.bert(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

# The outputs are a tuple. The first entry is the hidden states and the second (optional) is the pooled CLS token
hidden_states = outputs.last_hidden_state

In [72]:
print(hidden_states)

tensor([[[-0.0487,  0.0999, -0.0588,  ...,  0.0827, -0.0242, -0.2865],
         [ 0.0220,  0.0951, -0.0224,  ...,  0.1847, -0.0065, -0.3365],
         [ 0.0509,  0.0744, -0.0815,  ...,  0.1365, -0.0614, -0.3466],
         ...,
         [ 0.1415,  0.1406, -0.1518,  ...,  0.2477, -0.0445, -0.3622],
         [ 0.0645,  0.1314, -0.1587,  ...,  0.1056, -0.0793, -0.3208],
         [ 0.0803,  0.1288, -0.1300,  ...,  0.0326, -0.1209, -0.3192]]],
       device='cuda:0')


In [None]:
# TODO: Expand the scale from sentence similarity to paragraph/article similartity using trained sentence model