### Installing Libraries

In [None]:
# Install Pytorch & other libraries
!pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install --upgrade \
  "sentence-transformers>=3" \
  "datasets==2.19.1"  \
  "transformers==4.41.2"
!pip install markdownify
!pip install accelerate>=0.20.3 transformers
!pip install transformers[torch]

Collecting torch==2.1.2
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.2)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.1.2)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.1.2)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.1.2)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.1.2)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2

### Dataset Loading and Pre-Processing

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file below can be read from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

# Location of the JSON file under the mounted Drive.
file_path = "/content/drive/MyDrive/datasets/output.json"

# JSON text is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the runtime's locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Keys stripped from every record in `data`.
deleted = ['id', 'z_score', 'normalized_score', 'comment_normalized_score', 'combined_score']

for d in data:
    for delete in deleted:
        # A default of None makes the removal tolerant of records that lack the
        # key and keeps the cell re-runnable (a bare pop() raises KeyError the
        # second time the cell is executed).
        d.pop(delete, None)

In [None]:
import pandas as pd

# Tabular view of the loaded records.
df = pd.DataFrame(data)

In [None]:
import markdownify as md

# Run each text through markdownify: `body` becomes the `positive` column and
# `selftext` becomes the `anchor` column.
df['positive'] = df['body'].apply(md.markdownify)
df['anchor'] = df['selftext'].apply(md.markdownify)

  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')


In [None]:
# Remove the original columns from `df` in place.
df.drop(labels=['title', 'subreddit', 'selftext', 'body'], axis="columns", inplace=True)

In [None]:
# Sequential integer id (0 .. len(df) - 1) for every row.
df['id'] = list(range(len(df)))

In [None]:
# id -> text lookups: `corpus` holds the `positive` texts, `queries` the `anchor` texts.
corpus = dict(zip(df["id"], df["positive"]))
queries = dict(zip(df["id"], df["anchor"]))

# Each query's only relevant document is the corpus entry sharing its id.
rel_docs = {q_id: [q_id] for q_id in queries}

### Evaluate the current embedding model

In [None]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Base checkpoint and the embedding sizes evaluated/trained, largest first.
model_id = "BAAI/bge-base-en-v1.5"
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
embed_model = SentenceTransformer(
    model_name_or_path=model_id,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from itertools import islice

def slice_dic(dic, sub: int):
    """Return a new dict with the first ``sub`` entries of ``dic``, in iteration order.

    If ``dic`` has fewer than ``sub`` entries, all of them are returned.
    """
    return dict(islice(dic.items(), sub))

# Keep only the first 100 entries of each mapping for evaluation.
corpus_test, queries_test, rel_docs_test = (
    slice_dic(mapping, 100) for mapping in (corpus, queries, rel_docs)
)

In [None]:
# One retrieval evaluator per Matryoshka dimension: each scores the same test
# queries/corpus, with `truncate_dim` set to that dimension.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries_test,
        corpus=corpus_test,
        relevant_docs=rel_docs_test,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

# Bundle them so a single call runs all dimensions in order.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [None]:
# Baseline: score the base model before any fine-tuning.
# `results` is looked up by metric name in the next cell.
results=evaluator(embed_model)

In [None]:
# Report NDCG@10 for each embedding size.
for dim in matryoshka_dimensions:
    key = f"dim_{dim}_cosine_ndcg@10"
    print(key, results[key], sep=": ")

dim_768_cosine_ndcg@10: 0.5691344814006021
dim_512_cosine_ndcg@10: 0.580101489391867
dim_256_cosine_ndcg@10: 0.5567719171210964
dim_128_cosine_ndcg@10: 0.5382497269154309
dim_64_cosine_ndcg@10: 0.47068112214982427


### Finetuning the base Model

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hub token from the Colab secret named "hugging_hub" rather than hard-coding it.
token=userdata.get("hugging_hub")
# add_to_git_credential=True also stores the token in the configured git credential helper.
login(token=token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer

# Second instance of the base checkpoint, separate from `embed_model`.
# This is the instance the training losses are built around.
model = SentenceTransformer(
    model_id,
    # Ask the underlying transformers model for its "sdpa" attention implementation.
    model_kwargs={"attn_implementation":"sdpa"},
    # Metadata for the model card generated with the model.
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="BGE base Financial Matryoshka",
    ),
)



In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Ranking loss over `model`, wrapped so it is applied at every Matryoshka dimension.
inner_train_loss = MultipleNegativesRankingLoss(model=model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [None]:
# Write the frame to data.json, one JSON object per row.
df.to_json(path_or_buf="data.json", orient="records")

In [None]:
# Training configuration. Requires `accelerate` (installed in the first cell).
args = SentenceTransformerTrainingArguments(
    output_dir="bge-base-financial-matryoshka",  # checkpoints and the final model are written here
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 samples per optimizer step on each device
    per_device_eval_batch_size=4,
    warmup_ratio=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # presumably keeps duplicate texts out of a batch; confirm in the library docs
    eval_strategy="epoch",  # evaluate and checkpoint on the same schedule
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    # "dim_128" is the name given to the 128-dimension evaluator; the best checkpoint is chosen by its NDCG@10.
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",
)

In [None]:
from datasets import load_dataset
# Read data.json back through the "json" loader, taking its "train" split.
dataset = load_dataset("json", data_files="data.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
from sentence_transformers import SentenceTransformerTrainer

# The trainer must optimise the same instance the loss was built around.
# `train_loss` wraps `model`; passing `embed_model` here made the optimizer step
# parameters that never received gradients, so the saved "fine-tuned" model
# scored exactly like the baseline.
#
# NOTE(review): the evaluator's queries/corpus are the first 100 rows of the same
# data used for training, so post-training scores are not a held-out estimate.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    # Column order defines the loss inputs: MultipleNegativesRankingLoss expects
    # (anchor, positive), matching the evaluator (queries=anchor, corpus=positive).
    train_dataset=dataset.select_columns(["anchor", "positive"]),
    loss=train_loss,
    evaluator=evaluator,
)

In [None]:
trainer.train()

# Write the trained model to args.output_dir; it is reloaded from there for the
# final evaluation.
trainer.save_model()

# exist_ok=True: without it, pushing to a repository that already exists on the
# Hub fails with "409 Client Error: Conflict" at repo creation.
trainer.model.push_to_hub("bge-base-financial-matryoshka", exist_ok=True)

Epoch,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
0,4.1189,No log,0.38,0.63,0.71,0.74,0.38,0.21,0.142,0.074,0.38,0.63,0.71,0.74,0.569134,0.513139,0.524893,0.4,0.6,0.71,0.76,0.4,0.2,0.142,0.076,0.4,0.6,0.71,0.76,0.580101,0.522083,0.53217,0.37,0.58,0.66,0.76,0.37,0.193333,0.132,0.076,0.37,0.58,0.66,0.76,0.556772,0.492675,0.502808,0.38,0.53,0.65,0.72,0.38,0.176667,0.13,0.072,0.38,0.53,0.65,0.72,0.53825,0.481107,0.494161,0.3,0.47,0.56,0.66,0.3,0.156667,0.112,0.066,0.3,0.47,0.56,0.66,0.470681,0.411187,0.425313,0.425313


HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66671daa-0803b1e115a1f8b00e23671d;2d916d0d-dea7-4291-88c0-c8c4f257cabb)

You already created this model repo

In [None]:
# Upload the trained model; exist_ok=True lets the push reuse the repository if
# it already exists instead of failing with a 409 Conflict.
trainer.model.push_to_hub("bge-base-financial-matryoshka", exist_ok=True)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/phamkinhquoc2002/bge-base-financial-matryoshka/commit/e1947725d58c4b3386004d309538f3d27050fc69'

In [None]:
from sentence_transformers import SentenceTransformer

# Reload the model from the training output directory, on GPU when available.
fine_tuned_model = SentenceTransformer(
    model_name_or_path=args.output_dir,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Score it with the same evaluators used for the baseline.
results = evaluator(fine_tuned_model)

# Report NDCG@10 for each embedding size.
for dim in matryoshka_dimensions:
    key = f"dim_{dim}_cosine_ndcg@10"
    print(key, results[key], sep=": ")

dim_768_cosine_ndcg@10: 0.5691344814006021
dim_512_cosine_ndcg@10: 0.580101489391867
dim_256_cosine_ndcg@10: 0.5567719171210964
dim_128_cosine_ndcg@10: 0.5382497269154309
dim_64_cosine_ndcg@10: 0.47068112214982427
