In [1]:
!pip install sentence_transformers
!pip install ipywidgets --user
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, InputExample
from sentence_transformers import util
import logging
from datetime import datetime
import gzip
from tqdm import tqdm
import os

In [2]:
def get_train_samples(sentences_path):
    """Load text pairs from a CSV file as a list of training examples.

    The file is read with ``pandas.read_csv`` using a comma separator, so its
    first line is consumed as the header. Only the first two columns are used:
    every data row becomes one ``InputExample`` whose ``texts`` list holds the
    value of column 0 followed by the value of column 1.

    Args:
        sentences_path: Location of the comma-separated file (anything
            ``pandas.read_csv`` accepts).

    Returns:
        list: One ``InputExample`` per data row, in file order. Empty when the
        file contains only a header.

    Raises:
        IndexError: If the file has fewer than two columns.
    """
    df = pd.read_csv(sentences_path, sep=",")

    # Extract each column once instead of doing two scalar `.iloc[i, j]`
    # lookups per row; the per-row lookups dominate the runtime on large files.
    first_texts = df.iloc[:, 0].tolist()
    second_texts = df.iloc[:, 1].tolist()

    # `total` keeps the progress bar informative, since zip() has no length.
    return [
        InputExample(texts=[first, second])
        for first, second in tqdm(zip(first_texts, second_texts), total=len(df))
    ]

def fine_tune_bertlike_model(sentences_path,
                             model_name: str,
                             train_batch_size: int,
                             max_seq_length: int,
                             learning_rate: str,
                             num_epochs: int,
                             train_samples: list):
    """Fine-tune a transformer encoder on text pairs with MultipleNegativesRankingLoss.

    Builds a ``SentenceTransformer`` from ``model_name`` plus a pooling layer,
    wraps ``train_samples`` in a shuffled ``DataLoader`` and calls ``fit``.
    Checkpoints are written under
    ``./models/train_simcse-{max_seq_length}_{train_batch_size}_{learning_rate}``.

    Args:
        sentences_path: Not used by this function; kept so existing callers
            that pass it keep working.
        model_name: Model identifier handed to ``models.Transformer``.
        train_batch_size: Batch size of the training ``DataLoader``. A trailing
            incomplete batch is dropped (``drop_last=True``).
        max_seq_length: Maximum sequence length handed to ``models.Transformer``.
        learning_rate: Optimizer learning rate, e.g. ``"3e-5"``. It is parsed
            with ``float()`` for the optimizer and interpolated exactly as
            passed into the checkpoint directory name.
        num_epochs: Number of training epochs.
        train_samples: Training examples fed to the ``DataLoader``.

    Raises:
        ValueError: If ``learning_rate`` cannot be parsed as a float.
    """
    # Parse up front so a malformed value fails before the model is loaded.
    # Previously the optimizer always ran with a hard-coded 3e-5 and this
    # argument only influenced the directory name.
    lr = float(learning_rate)

    print(f"FINE TUNE BERT LIKE MODEL max_seq_length:{max_seq_length}, train_batch_size:{train_batch_size}, learning_rate:{learning_rate}")
    cuda_available = torch.cuda.is_available()
    print(f"Cuda is available:{cuda_available}")
    if cuda_available:
        # Querying device 0 raises when no CUDA device is present, so only
        # report GPU memory when CUDA is actually usable.
        print(f"GPU total available memory {round(torch.cuda.get_device_properties(0).total_memory / 1000000000, 2)} GB")

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()])

    # Directory that receives the training checkpoints. The learning rate is
    # embedded as passed (e.g. "3e-5") so the directory name stays stable.
    model_output_path = f"./models/train_simcse-{max_seq_length}_{train_batch_size}_{learning_rate}"
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # We train our model using the MultipleNegativesRankingLoss
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
    logging.info("Train sentences: {}".format(len(train_samples)))
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Warm up over the first 10% of all optimisation steps.
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              optimizer_params={'lr': lr},
              checkpoint_path=model_output_path,
              show_progress_bar=True,
              use_amp=False)  # Set to True, if your GPU supports FP16 cores

In [None]:
# Load the training pairs. This cell only reads the data; the fine-tuning call
# lives in a later cell, so training can be re-run without re-reading the CSV.
# Despite its name, `output_name` is the path of the *input* CSV that
# get_train_samples reads.
output_name = "./data/dataset_titles_abstract_large.csv"

train_samples = get_train_samples(output_name)

In [None]:
# Left disabled: installs the CUDA 11.1 builds of PyTorch 1.9.0 / torchvision 0.10.0.
# NOTE(review): presumably only needed when the preinstalled torch does not match
# the machine's CUDA setup — confirm before enabling.
# !pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

# Hyperparameters for this run.
model_name = 'allenai/specter'
train_batch_size = 40
max_seq_length = 300
# Passed as a string: fine_tune_bertlike_model interpolates it verbatim into the
# checkpoint directory name (./models/train_simcse-300_40_3e-5 for these values).
learning_rate = "3e-5"
num_epochs = 1

# `output_name` and `train_samples` come from the data-loading cell, which must
# have been run before this one.
fine_tune_bertlike_model(sentences_path=output_name,
                         model_name=model_name,
                         train_batch_size=train_batch_size,
                         max_seq_length=max_seq_length,
                         learning_rate=learning_rate,
                         num_epochs=num_epochs,
                         train_samples=train_samples)

In [None]:
# Bundle the checkpoint directory of the run above into a single archive.
# The directory name is hard-coded for max_seq_length=300, train_batch_size=40,
# learning_rate="3e-5"; update it by hand if those values change.
# NOTE(review): training writes to the relative path ./models, so these absolute
# paths only match when the notebook's working directory is /home/jupyter — confirm.
!zip -r /home/jupyter/models/trained_models.zip /home/jupyter/models/train_simcse-300_40_3e-5