In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
DATASET_PUBMED_RCT = './gdrive/Shareddrives/DATASETS/PUBMED_RCT/'
OUTPUT_MODEL_DIR = './gdrive/Shareddrives/MODELS/'

In [3]:
!pip install transformers -q
!pip install sentence_transformers -q

[K     |████████████████████████████████| 4.7 MB 13.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 28.7 MB/s 
[K     |████████████████████████████████| 101 kB 9.8 MB/s 
[K     |████████████████████████████████| 596 kB 48.4 MB/s 
[K     |████████████████████████████████| 85 kB 3.6 MB/s 
[K     |████████████████████████████████| 1.3 MB 30.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [4]:
import pandas as pd

In [5]:
df_train = pd.read_parquet(DATASET_PUBMED_RCT + 'train.parquet')
df_dev = pd.read_parquet(DATASET_PUBMED_RCT + 'df_dev_triplets.parquet')
df_test = pd.read_parquet(DATASET_PUBMED_RCT + 'df_test_triplets.parquet')

In [6]:
df_train.columns

Index(['pmid', 'label', 'sentence', 'label_id'], dtype='object')

In [7]:
df_dev.columns

Index(['anchor', 'positive', 'negative'], dtype='object')

In [8]:
# same text but with different labels
print(f'{len(df_train.index)}')
mask = df_train.groupby('sentence')['label'].transform('nunique') > 1
df_train = df_train[~mask].copy()
print(f'{len(df_train.index)}')

180040
179892


In [9]:
from sentence_transformers import InputExample
from tqdm import tqdm

train_set = []
guid = 1
for idx, row in tqdm(df_train.iterrows(), total=len(df_train.index)):
    train_set.append(InputExample(
        guid=guid,
        texts=[row['sentence']],
        label=row['label_id']
    ))
    guid += 1
len(train_set)

100%|██████████| 179892/179892 [00:23<00:00, 7705.26it/s] 


179892

In [10]:
dev_set = []

guid = 1
for idx, row in tqdm(df_dev.iterrows(), total=len(df_dev.index)):
    dev_set.append(InputExample(
        guid=guid,
        texts=[row['anchor'], row['positive'], row['negative']],
    ))
    guid += 1
len(dev_set)

100%|██████████| 30212/30212 [00:02<00:00, 11405.99it/s]


30212

In [11]:
test_set = []

guid = 1
for idx, row in tqdm(df_test.iterrows(), total=len(df_test.index)):
    test_set.append(InputExample(
        guid=guid,
        texts=[row['anchor'], row['positive'], row['negative']],
    ))
    guid += 1
len(test_set)

100%|██████████| 30122/30122 [00:02<00:00, 10463.30it/s]


30122

In [12]:
import logging
from sentence_transformers import LoggingHandler

logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [13]:
from datetime import datetime
model_name = 'allenai/scibert_scivocab_uncased'
model_file_name = model_name.split('/')[-1] + '_PubMedRCT_TripletAll'

train_batch_size = 16
output_path = (
    OUTPUT_MODEL_DIR
    + model_file_name
    + "-"
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
output_path

'./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16'

In [14]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer(model_name)
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

2022-08-08 15:45:36 - Use pytorch device: cuda


In [15]:
from sentence_transformers import datasets
from torch.utils.data import DataLoader

loader = DataLoader(train_set, shuffle=True, batch_size=train_batch_size)

In [16]:
from sentence_transformers import losses

train_loss = losses.BatchAllTripletLoss(model=model)

In [17]:
from sentence_transformers.evaluation import TripletEvaluator

dev_evaluator = TripletEvaluator.from_input_examples(
    dev_set, write_csv=True, show_progress_bar=True, name='pubmed-rct-dev'
)

In [18]:
logging.info("Performance before fine-tuning:")
dev_evaluator(model)

2022-08-08 15:45:36 - Performance before fine-tuning:
2022-08-08 15:45:36 - TripletEvaluator: Evaluating the model on pubmed-rct-dev dataset:


Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 15:48:44 - Accuracy Cosine Distance:   	63.08
2022-08-08 15:48:44 - Accuracy Manhattan Distance:	62.20
2022-08-08 15:48:44 - Accuracy Euclidean Distance:	62.13



0.6308420495167483

In [19]:
logging.info("Evaluating model on test set")
test_evaluator = TripletEvaluator.from_input_examples(
    test_set, write_csv=True, show_progress_bar=True, name='pubmed-rct-test'
)
test_evaluator(model)

2022-08-08 15:48:44 - Evaluating model on test set
2022-08-08 15:48:44 - TripletEvaluator: Evaluating the model on pubmed-rct-test dataset:


Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

2022-08-08 15:51:40 - Accuracy Cosine Distance:   	61.99
2022-08-08 15:51:40 - Accuracy Manhattan Distance:	61.76
2022-08-08 15:51:40 - Accuracy Euclidean Distance:	61.37



0.6198791580904323

In [20]:
num_epochs = 20

warmup_steps = int(len(loader) * num_epochs  * 0.1)  # 10% of train data

In [21]:
model_output_path = output_path + '/model'
checkpoint_output_path = output_path + '/checkpoint'

print(model_output_path)
print(checkpoint_output_path)

./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/model
./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint


In [22]:
%%time
model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_output_path,
    show_progress_bar=True,
    evaluator=dev_evaluator,
    save_best_model=True,
    checkpoint_save_total_limit=1,
    checkpoint_path=checkpoint_output_path    
)  

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 15:53:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/500
2022-08-08 15:54:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/1000
2022-08-08 15:55:56 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/1500
2022-08-08 15:57:21 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/2000
2022-08-08 15:58:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/2500
2022-08-08 16:00:08 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/3000
2022-08-08 16:01:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 16:26:11 - Accuracy Cosine Distance:   	94.03
2022-08-08 16:26:11 - Accuracy Manhattan Distance:	94.04
2022-08-08 16:26:11 - Accuracy Euclidean Distance:	94.02

2022-08-08 16:26:11 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/model


Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 16:26:55 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/11500
2022-08-08 16:28:19 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/12000
2022-08-08 16:29:42 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/12500
2022-08-08 16:31:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/13000
2022-08-08 16:32:30 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/13500
2022-08-08 16:33:53 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/14000
2022-08-08 16:35:18 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 17:00:28 - Accuracy Cosine Distance:   	94.24
2022-08-08 17:00:28 - Accuracy Manhattan Distance:	94.28
2022-08-08 17:00:28 - Accuracy Euclidean Distance:	94.28

2022-08-08 17:00:28 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/model


Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 17:00:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/22500
2022-08-08 17:01:57 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/23000
2022-08-08 17:03:20 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/23500
2022-08-08 17:04:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/24000
2022-08-08 17:06:09 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/24500
2022-08-08 17:07:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/25000
2022-08-08 17:08:55 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 17:35:02 - Accuracy Cosine Distance:   	94.14
2022-08-08 17:35:02 - Accuracy Manhattan Distance:	94.17
2022-08-08 17:35:02 - Accuracy Euclidean Distance:	94.17



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 17:35:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/34000
2022-08-08 17:37:08 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/34500
2022-08-08 17:38:33 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/35000
2022-08-08 17:39:57 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/35500
2022-08-08 17:41:21 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/36000
2022-08-08 17:42:45 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/36500
2022-08-08 17:44:10 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 18:09:24 - Accuracy Cosine Distance:   	94.04
2022-08-08 18:09:24 - Accuracy Manhattan Distance:	94.10
2022-08-08 18:09:24 - Accuracy Euclidean Distance:	94.07



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 18:09:29 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/45000
2022-08-08 18:10:52 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/45500
2022-08-08 18:12:17 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/46000
2022-08-08 18:13:39 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/46500
2022-08-08 18:15:04 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/47000
2022-08-08 18:16:27 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/47500
2022-08-08 18:17:50 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 18:43:48 - Accuracy Cosine Distance:   	93.91
2022-08-08 18:43:48 - Accuracy Manhattan Distance:	93.81
2022-08-08 18:43:48 - Accuracy Euclidean Distance:	93.88



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 18:44:34 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/56500
2022-08-08 18:45:57 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/57000
2022-08-08 18:47:21 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/57500
2022-08-08 18:48:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/58000
2022-08-08 18:50:07 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/58500
2022-08-08 18:51:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/59000
2022-08-08 18:52:56 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 19:18:20 - Accuracy Cosine Distance:   	93.63
2022-08-08 19:18:20 - Accuracy Manhattan Distance:	93.65
2022-08-08 19:18:20 - Accuracy Euclidean Distance:	93.64



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 19:18:27 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/67500
2022-08-08 19:19:50 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/68000
2022-08-08 19:21:14 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/68500
2022-08-08 19:22:36 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/69000
2022-08-08 19:24:00 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/69500
2022-08-08 19:25:25 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/70000
2022-08-08 19:26:49 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 19:52:51 - Accuracy Cosine Distance:   	93.68
2022-08-08 19:52:51 - Accuracy Manhattan Distance:	93.66
2022-08-08 19:52:51 - Accuracy Euclidean Distance:	93.70



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 19:53:40 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/79000
2022-08-08 19:55:03 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/79500
2022-08-08 19:56:26 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/80000
2022-08-08 19:57:49 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/80500
2022-08-08 19:59:14 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/81000
2022-08-08 20:00:37 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/81500
2022-08-08 20:02:00 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 20:27:11 - Accuracy Cosine Distance:   	93.40
2022-08-08 20:27:11 - Accuracy Manhattan Distance:	93.41
2022-08-08 20:27:11 - Accuracy Euclidean Distance:	93.39



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 20:27:19 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/90000
2022-08-08 20:28:43 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/90500
2022-08-08 20:30:05 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/91000
2022-08-08 20:31:29 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/91500
2022-08-08 20:32:53 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/92000
2022-08-08 20:34:16 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/92500
2022-08-08 20:35:38 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 21:01:06 - Accuracy Cosine Distance:   	93.02
2022-08-08 21:01:06 - Accuracy Manhattan Distance:	93.06
2022-08-08 21:01:06 - Accuracy Euclidean Distance:	93.06



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 21:01:54 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/101500
2022-08-08 21:03:16 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/102000
2022-08-08 21:04:38 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/102500
2022-08-08 21:05:58 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/103000
2022-08-08 21:07:21 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/103500
2022-08-08 21:08:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/104000
2022-08-08 21:10:04 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 21:34:48 - Accuracy Cosine Distance:   	93.07
2022-08-08 21:34:48 - Accuracy Manhattan Distance:	93.04
2022-08-08 21:34:48 - Accuracy Euclidean Distance:	93.06



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 21:34:58 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/112500
2022-08-08 21:36:19 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/113000
2022-08-08 21:37:41 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/113500
2022-08-08 21:39:02 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/114000
2022-08-08 21:40:26 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/114500
2022-08-08 21:41:48 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/115000
2022-08-08 21:43:11 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 22:08:22 - Accuracy Cosine Distance:   	93.43
2022-08-08 22:08:22 - Accuracy Manhattan Distance:	93.46
2022-08-08 22:08:22 - Accuracy Euclidean Distance:	93.43



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 22:09:11 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/124000
2022-08-08 22:10:33 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/124500
2022-08-08 22:11:56 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/125000
2022-08-08 22:13:20 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/125500
2022-08-08 22:14:41 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/126000
2022-08-08 22:16:03 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/126500
2022-08-08 22:17:24 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 22:41:55 - Accuracy Cosine Distance:   	93.24
2022-08-08 22:41:55 - Accuracy Manhattan Distance:	93.24
2022-08-08 22:41:55 - Accuracy Euclidean Distance:	93.24



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 22:42:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/135000
2022-08-08 22:43:28 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/135500
2022-08-08 22:44:48 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/136000
2022-08-08 22:46:11 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/136500
2022-08-08 22:47:34 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/137000
2022-08-08 22:48:55 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/137500
2022-08-08 22:50:16 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 23:15:32 - Accuracy Cosine Distance:   	93.32
2022-08-08 23:15:32 - Accuracy Manhattan Distance:	93.33
2022-08-08 23:15:32 - Accuracy Euclidean Distance:	93.30



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 23:16:23 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/146500
2022-08-08 23:17:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/147000
2022-08-08 23:19:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/147500
2022-08-08 23:20:27 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/148000
2022-08-08 23:21:48 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/148500
2022-08-08 23:23:12 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/149000
2022-08-08 23:24:34 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-08 23:49:16 - Accuracy Cosine Distance:   	93.44
2022-08-08 23:49:16 - Accuracy Manhattan Distance:	93.46
2022-08-08 23:49:16 - Accuracy Euclidean Distance:	93.44



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-08 23:49:29 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/157500
2022-08-08 23:50:53 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/158000
2022-08-08 23:52:16 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/158500
2022-08-08 23:53:37 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/159000
2022-08-08 23:55:00 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/159500
2022-08-08 23:56:20 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/160000
2022-08-08 23:57:42 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 00:23:00 - Accuracy Cosine Distance:   	93.04
2022-08-09 00:23:00 - Accuracy Manhattan Distance:	93.08
2022-08-09 00:23:00 - Accuracy Euclidean Distance:	93.07



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-09 00:23:54 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/169000
2022-08-09 00:25:16 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/169500
2022-08-09 00:26:36 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/170000
2022-08-09 00:27:58 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/170500
2022-08-09 00:29:21 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/171000
2022-08-09 00:30:43 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/171500
2022-08-09 00:32:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 00:56:44 - Accuracy Cosine Distance:   	93.32
2022-08-09 00:56:44 - Accuracy Manhattan Distance:	93.35
2022-08-09 00:56:44 - Accuracy Euclidean Distance:	93.34



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-09 00:57:00 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/180000
2022-08-09 00:58:24 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/180500
2022-08-09 00:59:46 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/181000
2022-08-09 01:01:09 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/181500
2022-08-09 01:02:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/182000
2022-08-09 01:03:56 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/182500
2022-08-09 01:05:20 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 01:30:52 - Accuracy Cosine Distance:   	93.31
2022-08-09 01:30:52 - Accuracy Manhattan Distance:	93.37
2022-08-09 01:30:52 - Accuracy Euclidean Distance:	93.33



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-09 01:31:49 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/191500
2022-08-09 01:33:11 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/192000
2022-08-09 01:34:33 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/192500
2022-08-09 01:35:54 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/193000
2022-08-09 01:37:18 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/193500
2022-08-09 01:38:40 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/194000
2022-08-09 01:40:01 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 02:05:09 - Accuracy Cosine Distance:   	93.46
2022-08-09 02:05:09 - Accuracy Manhattan Distance:	93.43
2022-08-09 02:05:09 - Accuracy Euclidean Distance:	93.46



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-09 02:05:26 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/202500
2022-08-09 02:06:51 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/203000
2022-08-09 02:08:18 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/203500
2022-08-09 02:09:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/204000
2022-08-09 02:11:08 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/204500
2022-08-09 02:12:32 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/205000
2022-08-09 02:13:56 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 02:39:47 - Accuracy Cosine Distance:   	93.63
2022-08-09 02:39:47 - Accuracy Manhattan Distance:	93.67
2022-08-09 02:39:47 - Accuracy Euclidean Distance:	93.65



Iteration:   0%|          | 0/11244 [00:00<?, ?it/s]

2022-08-09 02:40:44 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/214000
2022-08-09 02:42:08 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/214500
2022-08-09 02:43:31 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/215000
2022-08-09 02:44:55 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/215500
2022-08-09 02:46:19 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/216000
2022-08-09 02:47:42 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/216500
2022-08-09 02:49:06 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_Triple

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

Batches:   0%|          | 0/1889 [00:00<?, ?it/s]

2022-08-09 03:13:43 - Accuracy Cosine Distance:   	93.49
2022-08-09 03:13:43 - Accuracy Manhattan Distance:	93.51
2022-08-09 03:13:43 - Accuracy Euclidean Distance:	93.51

2022-08-09 03:13:43 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16/checkpoint/224880
CPU times: user 9h 41min 47s, sys: 1h 21min 58s, total: 11h 3min 45s
Wall time: 11h 22min 4s


In [23]:
logging.info("Evaluating model on test set")
test_evaluator = TripletEvaluator.from_input_examples(
    test_set, write_csv=True, show_progress_bar=True, name='pubmed-rct-test'
)
model.evaluate(test_evaluator)

2022-08-09 03:13:45 - Evaluating model on test set
2022-08-09 03:13:45 - TripletEvaluator: Evaluating the model on pubmed-rct-test dataset:


Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

Batches:   0%|          | 0/1883 [00:00<?, ?it/s]

2022-08-09 03:16:38 - Accuracy Cosine Distance:   	92.78
2022-08-09 03:16:38 - Accuracy Manhattan Distance:	92.89
2022-08-09 03:16:38 - Accuracy Euclidean Distance:	92.80



0.9288891839851271

In [24]:
model.save(output_path + '_last')

2022-08-09 03:16:38 - Save model to ./gdrive/Shareddrives/MODELS/scibert_scivocab_uncased_PubMedRCT_TripletAll-2022-08-08_15-45-16_last
