In [2]:
!uv pip install sentence-transformers ipywidgets pandas datasets accelerate

[2mAudited [1m5 packages[0m [2min 284ms[0m[0m


In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments, SentenceTransformer, losses
from datasets import load_dataset
from huggingface_hub import login

In [None]:
# Read data/csw24.txt and convert line by line to csv
# each line is word   definition.
# convert to csv with two columns: word and definition
# import pandas as pd

# with open('data/csw24.txt', 'r') as file:
#     lines = file.readlines()

# # each line is word<tab>definition.
# # convert to csv with two columns: word and definition
# data = []
# for line in lines:
#     word, definition = line.strip().split('\t', 1)
#     data.append({'word': word, 'definition': definition})

# df = pd.DataFrame(data)

# assert len(df) == len(lines)
# df.to_csv('data/csw24.csv', index=False)


In [4]:
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
# model.max_seq_length = 256 # For trial run purposes

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Matryoshka Loss

Matryoshka Representation Learning trains embeddings at multiple dimensions simultaneously, ensuring smaller embeddings (prefixes of the full embedding) remain useful while optimizing the full dimension. This provides flexible quality/speed tradeoffs from a single model.

In [None]:
base_loss = losses.MultipleNegativesRankingLoss(model)
target_dims = [384, 256]
mrl_loss = losses.MatryoshkaLoss(model, base_loss, target_dims)

## Dataset

The CSW24 dictionary dataset contains word-definition pairs used for training. The dataset is split into train/validation/test sets for model training and evaluation.


In [None]:
DATA_LOCATION = "data/non_indian_words.csv"
dataset = load_dataset("csv", data_files=DATA_LOCATION)

# Split into train and temp (test+val)
splits = dataset['train'].train_test_split(test_size=0.2)  # 80% train, 20% temp
train_dataset = splits['train']
temp = splits['test']

# Split temp into val and test
temp_splits = temp.train_test_split(test_size=0.5)  # 50% val, 50% test
val_dataset = temp_splits['train']
test_dataset = temp_splits['test']

print("Train Dataset Size:", len(train_dataset))
print("Val Dataset Size:", len(val_dataset))
print("Test Dataset Size:", len(test_dataset))

Train Dataset Size: 222635
Val Dataset Size: 27829
Test Dataset Size: 27830


## Evaluator

The InformationRetrievalEvaluator measures how well the model retrieves the correct definition for each word using cosine similarity. It computes accuracy, precision, recall, and NDCG metrics at various top-K thresholds.


In [None]:
evaluator = InformationRetrievalEvaluator(
    queries={i: example['word'] for i, example in enumerate(val_dataset)},
    corpus={i: example['definition'] for i, example in enumerate(val_dataset)},
    relevant_docs={i: [i] for i in range(len(val_dataset))}, # Word i's def is always doc i
    name='dictionary-test'
)

## Trainer

The SentenceTransformerTrainer handles the training loop with the specified loss function, training arguments, and evaluator. It automatically manages batching, gradient updates, evaluation, and checkpointing during training.


In [None]:
!python -c "import accelerate; print(accelerate.__version__)"

0.24.0.dev0


In [None]:

training_args = SentenceTransformerTrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=64,
    num_train_epochs=1,
    fp16=True,
    learning_rate=2e-5,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=train_dataset,
    loss=mrl_loss,
    args=training_args,
    evaluator=evaluator
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Dictionary-test Cosine Accuracy@1,Dictionary-test Cosine Accuracy@3,Dictionary-test Cosine Accuracy@5,Dictionary-test Cosine Accuracy@10,Dictionary-test Cosine Precision@1,Dictionary-test Cosine Precision@3,Dictionary-test Cosine Precision@5,Dictionary-test Cosine Precision@10,Dictionary-test Cosine Recall@1,Dictionary-test Cosine Recall@3,Dictionary-test Cosine Recall@5,Dictionary-test Cosine Recall@10,Dictionary-test Cosine Ndcg@10,Dictionary-test Cosine Mrr@10,Dictionary-test Cosine Map@100
100,1.5353,No log,0.504826,0.67057,0.709771,0.743513,0.504826,0.223523,0.141954,0.074351,0.504826,0.67057,0.709771,0.743513,0.630627,0.593713,0.596473
200,1.2836,No log,0.546519,0.691377,0.720807,0.748536,0.546519,0.230459,0.144161,0.074854,0.546519,0.691377,0.720807,0.748536,0.654278,0.623322,0.626132
300,1.2305,No log,0.559217,0.698972,0.727532,0.755617,0.559217,0.232991,0.145506,0.075562,0.559217,0.698972,0.727532,0.755617,0.663688,0.633532,0.636208
400,1.1669,No log,0.56068,0.700356,0.729945,0.757081,0.56068,0.233452,0.145989,0.075708,0.56068,0.700356,0.729945,0.757081,0.665149,0.634996,0.637667
500,1.1904,No log,0.572271,0.704905,0.732239,0.758109,0.572271,0.234968,0.146448,0.075811,0.572271,0.704905,0.732239,0.758109,0.67144,0.642986,0.645754
600,1.0998,No log,0.573774,0.707951,0.734375,0.761709,0.573774,0.235984,0.146875,0.076171,0.573774,0.707951,0.734375,0.761709,0.673844,0.645033,0.647714
700,1.0655,No log,0.575277,0.708861,0.735285,0.762658,0.575277,0.236287,0.147057,0.076266,0.575277,0.708861,0.735285,0.762658,0.675059,0.646343,0.649096
800,1.095,No log,0.580934,0.711472,0.735997,0.763252,0.580934,0.237157,0.147199,0.076325,0.580934,0.711472,0.735997,0.763252,0.678071,0.650149,0.652928
900,1.1535,No log,0.585839,0.712658,0.738924,0.765427,0.585839,0.237553,0.147785,0.076543,0.585839,0.712658,0.738924,0.765427,0.681258,0.653691,0.656405
1000,1.0047,No log,0.586432,0.713331,0.73754,0.765071,0.586432,0.237777,0.147508,0.076507,0.586432,0.713331,0.73754,0.765071,0.681367,0.653972,0.65674


TrainOutput(global_step=3555, training_loss=1.0422707043954926, metrics={'train_runtime': 4517.6472, 'train_samples_per_second': 50.362, 'train_steps_per_second': 0.787, 'total_flos': 0.0, 'train_loss': 1.0422707043954926, 'epoch': 1.0})

In [None]:
FINAL_MODEL_REPO=""
model.save_pretrained(FINAL_MODEL_REPO)

## HuggingFace Hub Push

The trained model is pushed to the HuggingFace Hub for sharing and deployment. This allows the model to be easily loaded and used by others using the SentenceTransformer library.


In [None]:
login()
REPO_ID = ""
model.push_to_hub(repo_id=REPO_ID)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦