# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

******************************
ST125214 _ Maung Maung Kyi Tha
Training BERT from the scratch
******************************

In [None]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# setting device to GPU cuda if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seet my seed
SEED = 75
torch.manual_seed(SEED)

# Making sure we get the same results on each run
torch.backends.cudnn.deterministic = True

# Disable user warnings for neater output
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

cuda


## 1. Data

### Train, Test, Validation 

In [None]:
# import and use only snli dataset due to memory constraints
import datasets
snli = datasets.load_dataset('snli')
snli['train'].features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}

In [3]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [None]:
np.unique(snli['train']['label'])
#snli also have -1

array([0, 1, 2])

In [None]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': snli['train'].shuffle(seed=75).select(list(range(1500))),
    'test': snli['test'].shuffle(seed=75).select(list(range(200))),
    'validation': snli['validation'].shuffle(seed=75).select(list(range(500))),
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1500
    })
})

## 2. Preprocessing

In [None]:
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Create Custom Tokenizer for this project

def tokenizer(sentences, max_length, padding='max_length', truncation=True):
    tokenized_outputs = {"input_ids": [], "attention_mask": []}
    for sentence in sentences:
        tokens = sentence.lower().split()
        token_ids = [word2id.get(token, word2id['[UNK]']) for token in tokens]
        if truncation and len(token_ids) > max_length:
            token_ids = token_ids[:max_length]
        attention_mask = [1] * len(token_ids)
        if padding == 'max_length':
            padding_length = max_length - len(token_ids)
            token_ids += [word2id['[PAD]']] * padding_length
            attention_mask += [0] * padding_length
        tokenized_outputs["input_ids"].append(token_ids)
        tokenized_outputs["attention_mask"].append(attention_mask)
    return tokenized_outputs

In [None]:
import pickle

# loading previously saved parameters
state_dict = torch.load('./model/bert_model.pt.1')
with open('./model/bert.param', 'rb') as f:
    bert_param = pickle.load(f)
    word2id = bert_param['word2id']

In [8]:
print(len(word2id))
vocab_size = len(word2id)

105358


In [9]:
def preprocess_function(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1500
    })
})

## 3. Data loader

In [11]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [12]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


## 4. Model

In [None]:
# I will use BERT Model Definition from previous notebook
class Embedding(nn.Module):
    def __init__(self, vocab_size, max_len, n_segments, d_model, device):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.seg_embed = nn.Embedding(n_segments, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.device = device

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long).to(self.device).unsqueeze(0).expand_as(x)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

class BERT(nn.Module):
    def __init__(self, n_layers, n_heads, d_model, d_ff, d_k, 
                 n_segments, vocab_size, max_len, device):
        super(BERT, self).__init__()
        self.embedding = Embedding(vocab_size, max_len, n_segments, d_model, device)
        self.layers = nn.ModuleList([EncoderLayer(n_heads, d_model, d_ff, d_k, device) 
                                     for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        self.decoder = nn.Linear(d_model, vocab_size, bias=False)
        self.decoder_bias = nn.Parameter(torch.zeros(vocab_size))
        self.device = device

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        return output

    # New helper for sentence encoding (for classification, similar to S-BERT)
    def get_last_hidden_state(self, input_ids, attention_mask):
        # Create dummy segment_ids (all zeros)
        segment_ids = torch.zeros_like(input_ids).to(self.device)
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, _ = layer(output, enc_self_attn_mask)
        return output

        return output

def get_attn_pad_mask(seq_q, seq_k, device):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1).to(device)
    return pad_attn_mask.expand(batch_size, len_q, len_k)

class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, device):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(n_heads, d_model, d_k, device)
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, device):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = d_k
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_k * n_heads)
        self.device = device

    def forward(self, Q, K, V, attn_mask):
        residual, batch_size = Q, Q.size(0)
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)
        context, attn = ScaledDotProductAttention(self.d_k, self.device)(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_k)
        output = nn.Linear(self.n_heads * self.d_k, self.d_model).to(self.device)(context)
        return nn.LayerNorm(self.d_model).to(self.device)(output + residual), attn

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, device):
        super(ScaledDotProductAttention, self).__init__()
        self.scale = torch.sqrt(torch.FloatTensor([d_k])).to(device)

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / self.scale
        scores.masked_fill_(attn_mask, -1e9)
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.fc2(F.gelu(self.fc1(x)))

In [None]:
# define the parameters as in the previous notebook
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id)
max_len = 1000
n_layers = 6
n_heads = 8
d_model = 768
d_ff = d_model * 4
d_k = d_v = 64
n_segments = 2

In [None]:
# Initialize and load BERT model
model = BERT(
    n_layers=n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_ff=d_ff,
    d_k=d_k,
    n_segments=n_segments,
    vocab_size=vocab_size,
    max_len=max_len,
    device=device
).to(device)

model.load_state_dict(torch.load('./model/bert_model.pt'))
model.eval()

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(105358, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dec

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [17]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

<img src="./figures/sbert-architecture.png" >

In [None]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

# cosine similarity function
def cosine_similarity(u, v):
    u = u.flatten()  # Convert (1, 768) → (768,)
    v = v.flatten()  # Convert (1, 768) → (768,)

    dot_product = np.dot(u, v) 
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)

    similarity = dot_product / (norm_u * norm_v)
    return similarity


<img src="./figures/sbert-ablation.png" width="350" height="300">

In [None]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

# define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

# define the loss function
criterion = nn.CrossEntropyLoss()

In [None]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps )

# then during the training loop we update the scheduler per step
scheduler.step()
scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps )

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [None]:
# training loop for the model
from tqdm.auto import tqdm

num_epoch = 3
# 1 epoch should be enough, increase if wanted
# I choose 3 epoch after experimentation
for epoch in range(num_epoch):
    # set model to training mode
    model.train()  
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # move input data to GPU
        input_ids_a = batch['premise_input_ids'].to(device)
        input_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model.get_last_hidden_state(input_ids_a, attention_a)
        v = model.get_last_hidden_state(input_ids_b, attention_b)

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, labels)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

  0%|          | 0/47 [00:00<?, ?it/s]

Epoch: 1 | loss = 3.842142


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch: 2 | loss = 2.044975


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch: 3 | loss = 3.300965


In [None]:
# evaluate the model for similarity
model.eval()
classifier_head.eval()

# initialize the total similarity
total_similarity = 0.0

# iterate over the evaluation dataloader
with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):

        # prepare batches and move to device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings using the helper method
        u = model.get_last_hidden_state(inputs_ids_a, attention_a)
        v = model.get_last_hidden_state(inputs_ids_b, attention_b)
        
        # get the mean pooled vectors (shape: [batch_size, hidden_dim])
        u_mean_pool = mean_pool(u, attention_a)  # [B, H]
        v_mean_pool = mean_pool(v, attention_b)  # [B, H]
        
        # Compute cosine similarity for each sample in the batch using PyTorch
        cos_sim = (u_mean_pool * v_mean_pool).sum(dim=1) / ( torch.norm(u_mean_pool, dim=1) * torch.norm(v_mean_pool, dim=1) + 1e-8 )
        
        # Average the similarity over the batch (a scalar)
        similarity_score = cos_sim.mean().item()
        total_similarity += similarity_score

# Calculate the average similarity over the entire evaluation dataset    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")


Average Cosine Similarity: 0.9926


## 7. Inference

In [None]:
def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device, max_length=128):
    # Use the custom tokenizer which expects a list of sentences.
    inputs_a = tokenizer([sentence_a], max_length=max_length, padding='max_length', truncation=True)
    inputs_b = tokenizer([sentence_b], max_length=max_length, padding='max_length', truncation=True)
    
    # Convert lists to torch tensors and move to device.
    input_ids_a = torch.tensor(inputs_a['input_ids']).to(device)
    attention_a = torch.tensor(inputs_a['attention_mask']).to(device)   
    input_ids_b = torch.tensor(inputs_b['input_ids']).to(device)
    attention_b = torch.tensor(inputs_b['attention_mask']).to(device)
    
    # Use the model's helper method to get token embeddings.
    u = model.get_last_hidden_state(input_ids_a, attention_a)  # shape: [1, seq_len, hidden_dim]
    v = model.get_last_hidden_state(input_ids_b, attention_b)  # shape: [1, seq_len, hidden_dim]
    
    # Get mean pooled sentence embeddings (shape: [1, hidden_dim])
    u_mean = mean_pool(u, attention_a).detach().cpu().numpy()
    v_mean = mean_pool(v, attention_b).detach().cpu().numpy()
    
    # Calculate cosine similarity (result is a scalar)
    similarity_score = cosine_similarity(u_mean, v_mean)
    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")


Cosine Similarity: 0.9950


In [None]:
# savve the final model
torch.save({
    'model_state_dict': model.state_dict(),
    'classifier_head_state_dict': classifier_head.state_dict(),
    'word2id': word2id,
    'id2word': id2word,
    'vocab_size': vocab_size
}, './model/final_model.pth')

# Evaluation and Analysis

In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the saved model and classifier head from previous task
checkpoint = torch.load('./model/final_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
classifier_head.load_state_dict(checkpoint['classifier_head_state_dict'])

# Set the model and classifier head to evaluation mode
model.eval()
classifier_head.eval()

# Initialize lists to store true labels and predictions
true_labels = []
predictions = []

# Run inference on the test set
with torch.no_grad():
    for batch in test_dataloader:
        input_ids_a = batch['premise_input_ids'].to(device)
        input_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)

        # Extract true labels
        labels = batch['labels'].to(device)
        
        # Extract token embeddings from BERT at last_hidden_state
        u = model.get_last_hidden_state(input_ids_a, attention_a)
        v = model.get_last_hidden_state(input_ids_b, attention_b)
        
        # Get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a)
        v_mean_pool = mean_pool(v, attention_b)
        
        # Build the |u-v| tensor
        uv_abs = torch.abs(u_mean_pool - v_mean_pool)
        
        # Concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1)
        
        # Process concatenated tensor through classifier_head
        logits = classifier_head(x)
        
        # Get the predicted labels
        preds = torch.argmax(logits, dim=1)
        
        # Store true labels and predictions
        true_labels.extend(labels.cpu().numpy())
        predictions.extend(preds.cpu().numpy())

# Calculate performance metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

# Print the performance metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.3400
Precision: 0.1156
Recall: 0.3400
F1-score: 0.1725


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
