In [1]:
!pip install datasets



In [2]:
!pip install --upgrade sentence-transformers



In [3]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import load_dataset

# Pretrained sentence encoder; this is the object that the training cell later calls fit() on
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

# 'en' configuration, 'train' split of the stsb_multi_mt dataset
dataset = load_dataset(path='stsb_multi_mt', name='en', split='train')
# Bare expression so the notebook displays the dataset summary (features and row count)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 5749
})

In [4]:
# Display the first row to check the field names and see an example similarity score
dataset[0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'similarity_score': 5.0}

In [5]:
def _as_input_example(row):
    """Wrap one dataset row as an InputExample whose label is the row's score divided by 5.0."""
    sentence_pair = [row['sentence1'], row['sentence2']]
    # Same scaling as before: the original cell describes this as normalizing to [0, 1]
    scaled_score = float(row['similarity_score']) / 5.0
    return InputExample(texts=sentence_pair, label=scaled_score)


# One InputExample per dataset row, in dataset order
train_examples = [_as_input_example(row) for row in dataset]

In [6]:
# Serve the training examples in shuffled batches of 16
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

In [7]:
# Training objective: CosineSimilarityLoss built around the model being fine-tuned.
# `losses` is already imported in the notebook's import cell, so it is not re-imported here.
train_loss = losses.CosineSimilarityLoss(model)

In [8]:
# Training schedule: 4 epochs, with warmup_steps set to 10% of the total number of batches
num_epochs = 4
total_training_steps = len(train_dataloader) * num_epochs
warmup_step_count = int(total_training_steps * 0.1)

# Only keyword arguments that `fit` accepts are passed (no extras such as `num_items_in_batch`)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_step_count,
    output_path='./fine_tuned_sentence_similarity_model',
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgusmang525[0m ([33mgusmang525-stec[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0219
1000,0.014


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [9]:
# Write the fine-tuned model to disk, then rebind `model` to a copy loaded back from that directory
saved_model_dir = './fine_tuned_sentence_similarity_model'
model.save(saved_model_dir)
model = SentenceTransformer(saved_model_dir)

In [10]:
# Two example sentences to compare
sentence1, sentence2 = "This is a test sentence.", "This is a sample sentence."

# Embed both sentences in a single encode() call, then score the pair with model.similarity
embeddings = model.encode([sentence1, sentence2])
first_embedding, second_embedding = embeddings[0], embeddings[1]
similarity = model.similarity(first_embedding, second_embedding)
print(f"Similarity: {similarity}")


Similarity: tensor([[0.7405]])
