# 1. Pair-based fine-tuning
- Minh Nguyen
- 11/13/2024

Fine-tuning is about adapting a pre-trained model to a specific task by training it further on task-specific data.
- Dataset: AllNLI (Natural Language Inference) provides pairs of sentences labeld as entailments (similar), neutral, or contradiction (dissimilar).
- Task: Sentence similarity, where the goal is to predict how semantically similar 2 setences are.
- Loss Function: Pair-based loss like CosineSimilarityLoss, which optimizes the model to produce embeddings closer for similar pairs and farther apart for dissimilar pairs.

### Load the dataset
- The dataset contains pairs of sentences (premise, hypothesis) and a label:
    - label = 2: entailment (similar)
    - label = 1: neutral (partially related)
    - label = 0: contradict (dissimilar)

In [1]:
from datasets import load_dataset

# Load the AllNLI dataset
dataset = load_dataset("sentence-transformers/all-nli", "pair-class")

# Shuffle the dataset and select a random subset
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))  # Random 10,000 examples
eval_dataset = dataset["test"].shuffle(seed=42).select(range(200))     # Random 2,000 examples

# Check the sizes
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


Training examples: 1000
Validation examples: 200


In [2]:
# Preprocess the data: Convert entailment to 1, contradict to 0, and neutral to 0.5

# Map labels to similarity sores
def map_labels_to_scores(example):
    if example['label'] == 2: example['score'] = 1.0
    elif example['label'] == 0: example['score'] = 0.0
    else: example['score'] = 0.5
    return example

# Apply mapping
train_dataset = train_dataset.map(map_labels_to_scores)
eval_dataset = eval_dataset.map(map_labels_to_scores)

# Keep only relevant columns
train_dataset = train_dataset.select_columns(['premise', 'hypothesis', 'score'])
eval_dataset = eval_dataset.select_columns(['premise', 'hypothesis', 'score'])

print(len(train_dataset))
print(len(eval_dataset))

1000
200


In [3]:
# Convert data to InputExample objects required by Sentence Transformers
from sentence_transformers import InputExample

# Convert to InputExample format
train_examples = [
    InputExample(
        texts=[row['premise'], row['hypothesis']],
        label=float(row['score'])
    )
    for row in train_dataset    
]

# Convert to InputExample format
eval_examples = [
    InputExample(
        texts=[row['premise'], row['hypothesis']],
        label=float(row['score'])
    )
    for row in eval_dataset    
]


In [4]:
# Create PyTorch data loader
from torch.utils.data import DataLoader

# Create DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(eval_examples, batch_size=16)

In [5]:
# Load Pre-trained Model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [6]:
from sentence_transformers.losses import CosineSimilarityLoss

# Define the loss function Cosine Similarity
loss = CosineSimilarityLoss(model=model)

In [7]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an evaluator
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=[example.texts[0] for example in eval_examples],
    sentences2=[example.texts[1] for example in eval_examples],
    scores=[example.label for example in eval_examples]
)

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, loss)],
    evaluator=evaluator,
    evaluation_steps=100,  # Evaluate every 100 steps
    epochs=1,  # Increase epochs for better performance
    warmup_steps=100,  # Use warmup for stability
    output_path="output/fine_tuned_allnli_model_1",  # Save path
    save_best_model=True  # Save only the best-performing model
)


100%|██████████| 63/63 [01:12<00:00,  1.15s/it]

{'train_runtime': 72.6376, 'train_samples_per_second': 13.767, 'train_steps_per_second': 0.867, 'train_loss': 0.2825991009909009, 'epoch': 1.0}





In [9]:
# Evaluate fine-tune model
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_model(model, dataset):
    embeddings1 = model.encode(dataset['premise'], batch_size=16, convert_to_tensor=True)
    embeddings2 = model.encode(dataset['hypothesis'], batch_size=16, convert_to_tensor=True)
    
    # Compute pairwise cosine similarity
    similarities = cosine_similarity(embeddings1.cpu(), embeddings2.cpu()).diagonal()
    return similarities

# Evaluate the model
predicted_scores = evaluate_model(model, eval_dataset)
ground_truth_scores = eval_dataset['score']

pearson_corr = pearsonr(predicted_scores, ground_truth_scores)
spearman_corr = spearmanr(predicted_scores, ground_truth_scores)

print(f"Pearson Correlation: {pearson_corr[0]:.4f}")
print(f"Spearman Correlation: {spearman_corr[0]:.4f}")

Pearson Correlation: -0.5711
Spearman Correlation: -0.5625


In [10]:
# Load the fine-tuned model
fine_tuned_model = SentenceTransformer("output/fine_tuned_allnli_model_1")

# Query and corpus
query = "What is the capital of France?"
corpus = [
    "Paris is the capital of France.",
    "France is a country in Europe.",
    "The Eiffel Tower is located in Paris.",
    "London is the capital of the UK."
]

# Encode query and corpus
query_embedding = fine_tuned_model.encode([query])
corpus_embeddings = fine_tuned_model.encode(corpus)

# Compute similarities
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(query_embedding, corpus_embeddings).flatten()
most_relevant_idx = similarities.argmax()

print(f"Query: {query}")
print(f"Most Relevant Sentence: {corpus[most_relevant_idx]}")


Query: What is the capital of France?
Most Relevant Sentence: Paris is the capital of France.
