#### Fine Tuning Sentence Tranformer
- https://huggingface.co/blog/how-to-train-sentence-transformers

In [1]:
%%capture
%pip install sentence-transformers
#  sentence-transformers is a python framework for state-of-the-art sentence, text and image embeddings. It is backed by the popular HuggingFace transformers library. It provides a simple interface for computing embeddings while hiding the complex machinery behind it. It also supports fine-tuning of embeddings models on custom datasets.

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer
import scipy

In [25]:
# Prepare your training data
train_examples = []

# Generate train examples with keywords and labels
train_examples = []

# premise,hypothesis, label

# Example 1
sentence1 = "The importance of renewable energy"
sentence2 = "Renewable energy sources for a sustainable future"
label = 1  # Similar sentences

train_examples.append(InputExample(texts=[sentence1, sentence2], label=label))

# Example 2
sentence1 = "Applications of artificial intelligence in healthcare"
sentence2 = "Artificial intelligence advancements in medical diagnosis"
label = 1  # Similar sentences

train_examples.append(InputExample(texts=[sentence1, sentence2], label=label))

# Example 3
sentence1 = "Climate change effects on biodiversity"
sentence2 = "The impact of climate change on ecosystems"
label = 0  # Dissimilar sentences

train_examples.append(InputExample(texts=[sentence1, sentence2], label=label))

In [26]:
# Load pre-trained Sentence Transformer model
model_name = 'sentence-transformers/distilbert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/distilbert-base-nli-mean-tokens and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Prepare your training data
train_examples = train_examples

In [28]:
# Tokenize and convert train examples to features
train_features = tokenizer.batch_encode_plus(
    [(example.texts[0], example.texts[1]) for example in train_examples],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)
train_labels = torch.tensor([example.label for example in train_examples])

In [29]:
# Fine-tuning setup
train_dataset = torch.utils.data.TensorDataset(train_features['input_ids'],
                                               train_features['attention_mask'],
                                               train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [30]:
# Fine-tuning loop
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}')


Epoch 1/3 - Average Loss: 0.6316
Epoch 2/3 - Average Loss: 0.5565
Epoch 3/3 - Average Loss: 0.4972


In [31]:
# Save the fine-tuned model
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

In [32]:
# Load the fine-tuned model
model_name = 'fine_tuned_model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [40]:
# Create SentenceTransformer for encoding
sentence_transformer = SentenceTransformer(model_name)

Some weights of the model checkpoint at fine_tuned_model were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
# Example inference
queries = ['The importance of renewable energy',]
answers = ['The impact of climate change on ecosystems','necessity of renewable energy',]

In [66]:
# Encode queries and answers into embeddings
query_embeddings = sentence_transformer.encode(queries, convert_to_tensor=True)
answer_embeddings = sentence_transformer.encode(answers, convert_to_tensor=True)

In [67]:
# Calculate cosine similarity between queries and answers
cosine_scores = 1 - scipy.spatial.distance.cdist(query_embeddings.cpu(), answer_embeddings.cpu(), 'cosine')

In [68]:
# Print results
for i, query in enumerate(queries):
    print(f'Query: {query}')
    print('Top 2 Answers:')
    for j in range(len(answers)):
        answer = answers[j]
        score = cosine_scores[i][j]
        print(f'Answer: {answer}  Score: {score:.4f}')
    print()

Query: The importance of renewable energy
Top 2 Answers:
Answer: The impact of climate change on ecosystems  Score: 0.5618
Answer: necessity of renewable energy  Score: 0.9523

