In [2]:
import sys
print(sys.executable)

/Users/boyangwan/Desktop/ML_Projects/TextSimilarity/venv/bin/python3


In [7]:
from transformers import BertModel, BertTokenizer, AdamW
import torch
import torch.nn as nn
from datasets import load_dataset
from tqdm import tqdm
from torch.utils.data import DataLoader

In [15]:
data_train = load_dataset("stsb_multi_mt", name="en", split="train")
data_test = load_dataset("stsb_multi_mt", name="en", split="test")
data_dev = load_dataset("stsb_multi_mt", name="en", split="dev")

data_train = data_train.select(range(10))

In [16]:
print(len(data_train), len(data_test), len(data_dev))
print(data_train['sentence1'][0])
print(data_train['sentence2'][0])
print(data_train['similarity_score'][0])

10 1379 1500
A plane is taking off.
An air plane is taking off.
5.0


In [17]:
class SentenceSimilarityModel(nn.Module):
    def __init__(self, model_name):
        super(SentenceSimilarityModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regression_head = nn.Linear(self.bert.config.hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        sim_score = self.regression_head(cls_output)
        return sim_score

In [18]:

# Load model and tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = SentenceSimilarityModel(model_name)

# Assume 'sentences1' and 'sentences2' are your paired sentences
# and 'similarity_scores' are your human-annotated similarity scores
inputs = tokenizer(data_train['sentence1'], data_train['sentence2'], padding=True, truncation=True, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
similarity_scores = torch.tensor(data_train['similarity_score'])

batch_size = 32  # You can change the batch size as needed
train_data = torch.utils.data.TensorDataset(input_ids, attention_mask, similarity_scores)
train_loader = DataLoader(train_data, batch_size=batch_size)

# Loss and Optimizer
loss_fn = nn.MSELoss()  # Mean Squared Error Loss
optimizer = AdamW(model.parameters(), lr=1e-5)

In [19]:
num_epochs = 3
for epoch in range(num_epochs):
    # Inner loop for mini-batches with tqdm progress bar
    loop = tqdm(train_loader, leave=True)  # leave=True ensures that the progress bars are replaced with new ones at each epoch
    for batch in loop:
        input_ids, attention_mask, labels = batch
        model.train()
        
        # Forward pass
        predicted_similarity_scores = model(input_ids, attention_mask)
        
        # Compute loss
        loss = loss_fn(predicted_similarity_scores.squeeze(), labels.float())
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Update the progress bar description
        loop.set_description(f'Epoch {epoch + 1}/{num_epochs}')
        loop.set_postfix(loss=loss.item())


Epoch 1/3: 100%|██████████| 1/1 [00:00<00:00,  1.60it/s, loss=12.4]
Epoch 2/3: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s, loss=9.82]
Epoch 3/3: 100%|██████████| 1/1 [00:00<00:00,  2.16it/s, loss=5.98]


In [20]:
def get_similarity_score(model, sentence1, sentence2, tokenizer):
    """
    Compute the similarity score for two sentences using a trained model.
    
    Args:
    model (nn.Module): The trained model.
    sentence1 (str): The first sentence.
    sentence2 (str): The second sentence.
    tokenizer (BertTokenizer): The tokenizer.
    
    Returns:
    float: The similarity score between the two sentences.
    """
    # Tokenize the sentences
    inputs = tokenizer(sentence1, sentence2, padding=True, truncation=True, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    # Compute similarity score
    with torch.no_grad():  # Deactivates autograd, reduces memory usage and speeds up computations
        sim_score = model(input_ids, attention_mask)
    
    # Convert the similarity score to a single float and return it
    return sim_score.item()

In [24]:
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = SentenceSimilarityModel(model_name)
model.eval()  # Set the model to evaluation mode

sentence1 = "This is a test sentence."
sentence2 = "Hello how are you? get_similarity_score"
similarity_score = get_similarity_score(model, sentence1, sentence2, tokenizer)

print("Similarity Score:", similarity_score)

Similarity Score: 0.010244056582450867


In [None]:
# TODO: Expand the scale from sentence similarity to paragraph/article similartity using trained sentence model