In [1]:
import logging
import os
import csv
import numpy as np
from typing import List, Union
import math
from tqdm.autonotebook import trange

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers import InputExample

logger = logging.getLogger(__name__)


class LossEvaluator(SentenceEvaluator):
    """
    Evaluator that scores a model by the mean loss over a data loader.

    Each call runs ``loss_model`` over every batch of ``loader`` with gradients
    disabled and returns the per-batch average. The value is logged, written to
    TensorBoard under the ``val_loss`` tag and, optionally, appended to a CSV file.

    NOTE(review): the returned score is a raw loss, i.e. lower is better. A
    caller that keeps the checkpoint with the *highest* evaluator score (as
    ``SentenceTransformer.fit(save_best_model=True)`` is understood to do)
    will need to negate it -- confirm against the installed library version.
    """

    def __init__(self, loader, device: str = "cpu", loss_model: nn.Module = None, name: str = '', log_dir: str = None,
                 show_progress_bar: Union[bool, None] = False, write_csv: bool = True):
        """
        :param loader: Data loader object yielding the evaluation examples
        :param device: Device the loss model and every batch are moved to
        :param loss_model: Loss module object (required despite the ``None`` default)
        :param name: Name for the output; used in the log message and the CSV file name
        :param log_dir: Path for tensorboard logs
        :param show_progress_bar: If true, prints a progress bar. ``None`` enables it
            only when the logger level is INFO or DEBUG
        :param write_csv: Write results to a CSV file
        :raises ValueError: if ``loss_model`` is ``None``
        """
        if loss_model is None:
            # Fail with a clear message instead of an AttributeError on `None.to(...)`.
            raise ValueError("LossEvaluator requires a loss_model, but got None.")

        super().__init__()

        self.loader = loader
        self.write_csv = write_csv
        self.logs_writer = SummaryWriter(log_dir=log_dir)
        self.name = name
        self.loss_model = loss_model

        # The loss model must live on the same device the batches are moved to in __call__.
        self.device = device
        self.loss_model.to(self.device)

        if show_progress_bar is None:
            show_progress_bar = logger.getEffectiveLevel() in (logging.INFO, logging.DEBUG)
        self.show_progress_bar = show_progress_bar

        self.csv_file: str = "loss_evaluation" + ("_" + name if name else '') + "_results.csv"
        self.csv_headers = ["epoch", "steps", "loss"]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """
        Compute the mean loss of ``self.loss_model`` over ``self.loader``.

        :param model: Model being evaluated; only its ``smart_batching_collate`` is
            used here (installed as the loader's ``collate_fn``)
        :param output_path: Directory for the CSV file; no CSV is written when ``None``
        :param epoch: Epoch number for the log message and CSV row (-1 = not given)
        :param steps: Step number for the log message, the CSV row and the
            TensorBoard x-axis (-1 = not given)
        :return: Sum of the per-batch loss values divided by the number of batches
        :raises ValueError: if the data loader has no batches
        """
        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logger.info("LossEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)

        num_batches = len(self.loader)
        if num_batches == 0:
            # Previously surfaced as a bare ZeroDivisionError when averaging.
            raise ValueError("LossEvaluator: the data loader has no batches, cannot compute a mean loss.")

        # Batches must be collated by the model so they arrive as (sentence_features, labels).
        self.loader.collate_fn = model.smart_batching_collate
        data_iterator = iter(self.loader)

        total_loss = 0.0
        self.loss_model.eval()
        try:
            with torch.no_grad():
                for _ in trange(num_batches, desc="Iteration", smoothing=0.05, disable=not self.show_progress_bar):
                    sentence_features, labels = next(data_iterator)
                    # Move every tensor of the batch to the evaluation device;
                    # non-tensor entries (if any) are left as they are.
                    for features in sentence_features:
                        for key, value in features.items():
                            if isinstance(value, torch.Tensor):
                                features[key] = value.to(self.device)
                    labels = labels.to(self.device)
                    total_loss += self.loss_model(sentence_features, labels).item()
        finally:
            # Always hand the module back in training mode, even if evaluation failed.
            self.loss_model.zero_grad()
            self.loss_model.train()

        final_loss = total_loss / num_batches
        logger.info("Validation loss:\t{:.2f}".format(final_loss))

        if output_path is not None and self.write_csv:
            if output_path:
                os.makedirs(output_path, exist_ok=True)
            csv_path = os.path.join(output_path, self.csv_file)
            # Only a freshly created file gets the header row.
            write_header = not os.path.isfile(csv_path)
            with open(csv_path, newline='', mode="a", encoding="utf-8") as f:
                writer = csv.writer(f)
                if write_header:
                    writer.writerow(self.csv_headers)
                writer.writerow([epoch, steps, final_loss])

        # TensorBoard logging no longer depends on the CSV settings.
        # NOTE(review): `steps` is used as-is for the x-axis; if the caller passes -1
        # or restarts the count every epoch, points will share x-values -- confirm.
        self.logs_writer.add_scalar('val_loss', final_loss, steps)
        self.logs_writer.flush()

        return final_loss

  from tqdm.autonotebook import trange
2023-05-28 18:41:26.660379: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
import simi

import pandas as pd
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Sentence encoder: 'bert-base-uncased' token embeddings followed by a pooling layer.
EMBEDDING = models.Transformer('bert-base-uncased')
POOLING = models.Pooling(EMBEDDING.get_word_embedding_dimension())  # MEAN-pooling
# Alternative, CLS-pooling:
# POOLING = models.Pooling(EMBEDDING.get_word_embedding_dimension(), pooling_mode="cls")

MODEL = SentenceTransformer(modules=[EMBEDDING, POOLING])

# Seed passed to train_test_split so the split is reproducible.
RANDOM_STATE = 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the abstracts and turn the whitespace-separated category string into a tuple.
df = pd.read_csv("abstracts-arxiv-dataset.csv", index_col=0)
df["categories"] = df["categories"].map(lambda raw: tuple(raw.split()))

# Keep only rows with at least one category starting with "math" or "stat".
df = df[
    df["categories"].map(lambda cats: any(tag.startswith(("math", "stat")) for tag in cats))
].reset_index(drop=True)
df.sample(5)

Unnamed: 0,categories,doi,text
431987,"(cs.LG, eess.SP, stat.ML)",,New Perspectives on the Use of Online Learning...
382432,"(math.NT,)",,Some Results on Linearized Trinomials that Spl...
598035,"(math.AG, math.RA)",,Differential Operators on Azumaya algebra and ...
254524,"(stat.ME,)",,"Some comments about ""Penalising model componen..."
55515,"(math.NT,)",10.1142/S1793042110003654,Construction of Self-Dual Integral Normal Base...


In [4]:
# 80% of the rows go to training; the split is seeded with RANDOM_STATE.
X_train, X_test = train_test_split(df, train_size=0.8, random_state=RANDOM_STATE)
print(f"train: {len(X_train)} test: {len(X_test)}")

train: 520388 test: 130097


In [5]:
# Training data for the denoising auto-encoder objective: the "text" column of the training split.
train_sentences = X_train["text"].reset_index(drop=True)
retrain_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
retrain_dataloader = DataLoader(retrain_dataset, batch_size=8, shuffle=True)

# With tie_encoder_decoder=True the library warns that decoder_name_or_path
# "will be invalid" (see the cell output); it is passed unchanged here.
retrain_loss = losses.DenoisingAutoEncoderLoss(
    MODEL,
    decoder_name_or_path='bert-base-uncased',
    tie_encoder_decoder=True,
)

When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.8.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.query.bias', 'be

In [6]:
# Validation data: the "text" column of the held-out split, wrapped the same way as the training data.
eval_dataset = datasets.DenoisingAutoEncoderDataset(X_test["text"].reset_index(drop=True))
eval_dataloader = DataLoader(eval_dataset, shuffle=True, batch_size=8)

# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing on a hard-coded "cuda:0".
EVAL_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
evaluator = LossEvaluator(eval_dataloader, loss_model=retrain_loss, device=EVAL_DEVICE, show_progress_bar=True)

In [None]:
class NegatedLossEvaluator(SentenceEvaluator):
    """
    Adapter that flips the sign of a loss-based evaluator's score.

    `fit(save_best_model=True)` keeps the checkpoint with the highest evaluator
    score, while `evaluator` returns a loss (lower is better). Passing the
    negated loss makes "highest score" mean "lowest validation loss". The
    wrapped evaluator still logs and stores the unmodified loss itself.
    """

    def __init__(self, loss_evaluator):
        super().__init__()
        self.loss_evaluator = loss_evaluator

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        return -self.loss_evaluator(model, output_path=output_path, epoch=epoch, steps=steps)


# Train with the denoising objective; validate every 10000 steps and keep the
# checkpoint with the lowest validation loss in `output_path`.
MODEL.fit(
    train_objectives=[(retrain_dataloader, retrain_loss)],
    evaluator=NegatedLossEvaluator(evaluator),
    evaluation_steps=10000,
    save_best_model=True,
    output_path='bert+mean-pooling+re-train_tsdae_abstracts_arxiv',
    epochs=3,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65049 [00:00<?, ?it/s]

In [None]:
# Persist MODEL in its current in-memory state.
# NOTE(review): this is the same directory that the fit() call uses as output_path
# with save_best_model=True, so anything fit() stored there is overwritten -- confirm
# that this is intended.
MODEL.save(path='bert+mean-pooling+re-train_tsdae_abstracts_arxiv')

In [None]:
# Run the loss evaluator once on the current MODEL; the score is shown as the cell output.
final_validation_loss = MODEL.evaluate(evaluator)
final_validation_loss