### Install Required Libraries

In [2]:
!pip install datasets huggingface_hub transformers pandas numpy sentence-transformers[train]

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting sentence-transformers[train]
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting scikit-learn (from sentence-transformers[train])
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy (from sentence-transformers[train])
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6

### Get the Dataset

from datasets import load_dataset

In [4]:
from datasets import load_dataset
ds = load_dataset('vblagoje/PubMedQA_instruction')
ds

README.md:   0%|          | 0.00/498 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/986k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/272458 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 272458
    })
    test: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 1000
    })
})

In [5]:
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,instruction,context,response,category
0,Are group 2 innate lymphoid cells ( ILC2s ) in...,Chronic rhinosinusitis (CRS) is a heterogeneou...,"As ILC2s are elevated in patients with CRSwNP,...",closed_qa
1,Does vagus nerve contribute to the development...,Phosphatidylethanolamine N-methyltransferase (...,Neuronal signals via the hepatic vagus nerve c...,closed_qa
2,Does psammaplin A induce Sirtuin 1-dependent a...,Psammaplin A (PsA) is a natural product isolat...,PsA significantly inhibited MCF-7/adr cells pr...,closed_qa
3,Is methylation of the FGFR2 gene associated wi...,This study examined links between DNA methylat...,We identified a novel biologically plausible c...,closed_qa
4,Do tumor-infiltrating immune cell profiles and...,Tumor microenvironment immunity is associated ...,Breast cancer immune cell subpopulation profil...,closed_qa


In [7]:
df = df[['instruction', 'context']][:10000]
df.shape

(10000, 2)

### Find the similarities in the dataset

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load an embedding model
model = SentenceTransformer("all-mpnet-base-v2")

# Encode all job descriptions
pubmed_context_embeddings = model.encode(df['context'].to_list())

# compute similarities
similarities = model.similarity(pubmed_context_embeddings, pubmed_context_embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
similarities

tensor([[ 1.0000,  0.1936,  0.0455,  ...,  0.3044,  0.0730,  0.1556],
        [ 0.1936,  1.0000,  0.1235,  ...,  0.1283,  0.1448,  0.1225],
        [ 0.0455,  0.1235,  1.0000,  ...,  0.0513,  0.2917,  0.0557],
        ...,
        [ 0.3044,  0.1283,  0.0513,  ...,  1.0000, -0.0067,  0.1110],
        [ 0.0730,  0.1448,  0.2917,  ..., -0.0067,  1.0000,  0.0759],
        [ 0.1556,  0.1225,  0.0557,  ...,  0.1110,  0.0759,  1.0000]])

### create dataset with positive and negative contexts

In [12]:
# match least context least similar to positive match as the negative match

# get sorted indexes of simiarlities
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)

# initialize list to store negative pairs
negative_pair_index_list = []

for i in range(len(similarities)):

    # Start with the smallest similarity index for the current row
    j = 0
    index = int(similarities_argsorted[i][j])

    # Ensure the index is unique
    while index in negative_pair_index_list:
        j += 1  # Move to the next smallest index
        index = int(similarities_argsorted[i][j])  # Fetch next smallest index

    negative_pair_index_list.append(index)

# add negative pairs to df
df['context_neg'] = df['context'].iloc[negative_pair_index_list].values

In [14]:
df.head()

Unnamed: 0,instruction,context,context_neg
0,Do competency assessment of primary care physi...,To design and test a program that assesses cli...,Static stretch is frequently observed in the l...
1,Is age an important determinant of the growth ...,The factors that regulate the growth hormone (...,We have previously reported the crucial roles ...
2,Is terlipressin more effective in decreasing v...,Terlipressin decreases portal pressure. Howeve...,Based on the theories of brain reserve and cog...
3,Do stem cells improve the quality of colonic a...,Stem cells have multiple ways of differentiati...,To investigate the effects of physical activit...
4,Is evolution of the renal function a better pr...,In recent years acute rejection has decreased ...,Atp13a2 (Park9) gene encodes a transmembrane l...


### Prepare the dataset for our finetuning

In [13]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train, validation, and test sets (e.g., 80% train, 10% validation, 10% test)
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1

# define train and validation size
train_size = int(train_frac * len(df))
valid_size = int(valid_frac * len(df))

# create train, validation, and test datasets
df_train = df[:train_size]
df_valid = df[train_size:train_size + valid_size]
df_test = df[train_size + valid_size:]

In [16]:
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(8000, 3)
(1000, 3)
(1000, 3)


In [18]:
from datasets import DatasetDict, Dataset

# Convert the pandas DataFrames back to Hugging Face Datasets
train_ds = Dataset.from_pandas(df_train)
valid_ds = Dataset.from_pandas(df_valid)
test_ds = Dataset.from_pandas(df_test)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

# push data to hub
dataset_dict.push_to_hub("pavanmantha/pumed-finetuning", token='your token')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/pavanmantha/pumed-finetuning/commit/1ba143a9087c7004813ce74a7f356cac4619a7a8', commit_message='Upload dataset', commit_description='', oid='1ba143a9087c7004813ce74a7f356cac4619a7a8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pavanmantha/pumed-finetuning', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pavanmantha/pumed-finetuning'), pr_revision=None, pr_num=None)

### Fetch the newly created dataset

In [19]:
from datasets import load_dataset
dataset_label = 'pavanmantha/pumed-finetuning'

# importing data
dataset = load_dataset(dataset_label)

README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

### create the evaluator for our finetuning

In [20]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TripletEvaluator

# import model
model_name = "sentence-transformers/all-distilroberta-v1"
model = SentenceTransformer(model_name)

# create evaluator
evaluator_valid = TripletEvaluator(
    anchors=dataset["validation"]["instruction"],
    positives=dataset["validation"]["context"],
    negatives=dataset["validation"]["context_neg"],
    name="ai-pubmed-validation",
)
evaluator_valid(model)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{'ai-pubmed-validation_cosine_accuracy': 1.0}

### create the training args

In [21]:
from sentence_transformers.losses import MultipleNegativesRankingLoss

loss = MultipleNegativesRankingLoss(model)

In [31]:
from sentence_transformers import SentenceTransformerTrainingArguments


num_epochs = 1
batch_size = 16
lr = 2e-5
finetuned_model_name = "distilroberta-pubmed-embeddings"

train_args = SentenceTransformerTrainingArguments(
    output_dir=f"models/{finetuned_model_name}",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=lr,
    warmup_ratio=0.1,
    batch_sampler="no_duplicates",  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
)

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.


### trigger finetuning process

In [32]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    loss=loss,
    evaluator=evaluator_valid,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Ai-pubmed-validation Cosine Accuracy
100,0.0152,0.008483,1.0


TrainOutput(global_step=125, training_loss=0.01464997124671936, metrics={'train_runtime': 60.9021, 'train_samples_per_second': 131.358, 'train_steps_per_second': 2.052, 'total_flos': 0.0, 'train_loss': 0.01464997124671936, 'epoch': 1.0})

### Push the model to the hub for future use

In [34]:
# push model to HF hub
model.push_to_hub(f"pavanmantha/{finetuned_model_name}", token='your token')

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

'https://huggingface.co/pavanmantha/distilroberta-pubmed-embeddings/commit/a81f563a8bfa0e74689418188e38d0fabc147257'

### Validate the newly created model

In [39]:
# import model
model = SentenceTransformer("pavanmantha/distilroberta-pubmed-embeddings")

# new query
query = "data scientist 6 year experience, LLMs, credit risk, content marketing"
query_embedding = model.encode(query)

# encode JDs
jd_embeddings = model.encode(dataset["test"]["context"][:10])

# compute similarities
similarities = model.similarity(query_embedding, jd_embeddings)

In [40]:
similarities.shape

torch.Size([1, 10])

In [41]:
similarities

tensor([[-0.0614,  0.0577, -0.0019, -0.0596, -0.0171, -0.0280,  0.0136, -0.0156,
          0.0256,  0.0020]])