In [1]:
from os import path

PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], '..'))
DATALAKE_PATH = path.abspath(path.join(PROJECT_ROOT, '..', '..', 'datalake'))
CBP_PROCESSED = path.abspath(path.join(DATALAKE_PATH, 'contrastivebertproj', 'processed'))
MODEL_DIR = path.abspath(path.join(PROJECT_ROOT, 'contrastivebert', 'classifier'))

FQFN_PROCESSED_DF = path.join(CBP_PROCESSED, 'flowers_dataset_with_descriptions.jsonl')

In [2]:
import pandas as pd

df = pd.read_json(f'file://{FQFN_PROCESSED_DF}', orient='records', lines=True)
print(f'df shape={df.shape}')

df = df[df["HUMAN_DESCRIPTION"].notna() & (df["HUMAN_DESCRIPTION"].str.strip() != "")]

print(f'df valid shape={df.shape}')
print(f'df columns={df.columns}')

df shape=(100, 6)
df valid shape=(100, 6)
df columns=Index(['FLOWER_NAME', 'PETAL_COLOR', 'PETAL_NUMBER', 'STEM_LENGTH',
       'LEAF_SHAPE', 'HUMAN_DESCRIPTION'],
      dtype='object')


In [3]:
from contrastivebert.trainer.contrastivebert_trainer import Trainer, TrainerConf

trainer = Trainer(df, TrainerConf())

2025-07-09 21:10:26,960 - tensorcraft - INFO - XLA Device Not Supported: No module named 'torch_xla'
2025-07-09 21:10:26,971 - tensorcraft - INFO - Pytorch version=2.6.0 preferred device=mps build with MPS support=True
2025-07-09 21:10:27,237 - tensorcraft - INFO - resolved device_name: mps compute_device: mps tensor_device: mps
2025-07-09 21:10:27,238 - tensorcraft - INFO - LM Components are being loaded from microsoft/graphcodebert-base...
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-07-09 21:10:28,232 - tensorcraft - INFO - LM Components successfully loaded for mps compute architecture


In [4]:
from time import time
start_time = time()

trainer.train()

print(f'Training duration: {time() - start_time:.4f} seconds')

Training Epochs:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1 Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   1/  4, LR:0.00002, Loss:0.14468


Epoch 2 Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   2/  4, LR:0.00002, Loss:0.05080


Epoch 3 Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   3/  4, LR:0.00003, Loss:0.04256


Epoch 4 Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   4/  4, LR:0.00005, Loss:0.02149
Training duration: 62.7312 seconds


## Compute Evaluation Metrics

In [5]:
from contrastivebert.classifier.onnx_exporter import WEIGHTS_FILE_NAME
from utils.pt_utils import save_model_weights

fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
save_model_weights(trainer.model, fqfn_model_save)

In [6]:
from os import path
from contrastivebert.classifier.contrastivebert_configuration import ModelConf
from contrastivebert.classifier.onnx_exporter import WEIGHTS_FILE_NAME
from contrastivebert.classifier.contrastivebert_classifier import ContrastiveSBERT
from utils.compute_device import DEVICES


fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
trained_model = ContrastiveSBERT(model_conf=ModelConf())
trained_model.load_model_weights(fqfn_model_save)
trained_model = trained_model.to(device=DEVICES['mps'])
trainer.model = trained_model


In [7]:
import json
from contrastivebert.classifier.onnx_exporter import EVAL_METRICS_FILE_NAME

fqfn_metrics = path.join(MODEL_DIR, EVAL_METRICS_FILE_NAME)
eval_metrics = trainer.evaluate(fqfn_metrics)
print(f'Evaluation metrics: {json.dumps(eval_metrics, indent=2)}')

Evaluation metrics: {
  "avg_pos_loss": 0.00064,
  "avg_pos_cosine": 0.99936
}


# measure inference time

In [8]:
import torch
import timeit

from contrastivebert.classifier.contrastivebert_configuration import ModelConf
from contrastivebert.classifier.onnx_exporter import WEIGHTS_FILE_NAME
from contrastivebert.classifier.contrastivebert_classifier import ContrastiveSBERT
from utils.compute_device import DEVICES

print(f'Model Class: {ContrastiveSBERT.__name__}')
model_conf = ModelConf()
fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
trained_model = ContrastiveSBERT(model_conf=model_conf)
trained_model.load_model_weights(fqfn_model_save)
trained_model = trained_model.to(device=DEVICES['mps'])

number_of_iterations = 100
with torch.no_grad():
    # Invoke the model with both tensors
    timer = timeit.Timer(stmt=lambda: trained_model(
        input_ids = torch.randint(low=0, high=trainer.model.ml_components.tokenizer.vocab_size, size=(1, ModelConf.input_size), device=DEVICES['mps'], dtype=torch.long),
        attention_mask = torch.randint(low=0, high=2, size=(1, ModelConf.input_size), device=DEVICES['mps'], dtype=torch.long)
    ))
    times = timer.repeat(repeat=1, number=number_of_iterations)  # repeat=1 to run 1000 iterations once

    average_time = sum(times) / (len(times) * number_of_iterations)

print(f'Average execution time: {average_time} seconds')
print()

Model Class: ContrastiveSBERT
Average execution time: 0.03711345667019486 seconds



## Compute System Metrics

In [9]:
import json
from contrastivebert.classifier.onnx_exporter import SYS_METRICS_FILE_NAME
from contrastivebert.classifier.contrastivebert_configuration import ModelConf

sys_metrics = {
    'input_size': ModelConf.input_size,
    'avg_inference_sec': round(average_time, 9),
    'parameter_count': trainer.model.parameter_count,
    'dataset.split_ratio': trainer.trainer_conf.dataset_split_ratio,
    'dataset_size.class_train': len(trainer.train_dataset),
    'dataset_size.class_test': len(trainer.test_dataset)
}

fqfn_sys_metrics = path.join(MODEL_DIR, SYS_METRICS_FILE_NAME)
with open(fqfn_sys_metrics, 'w+') as metric_file:
    json.dump(sys_metrics, metric_file, indent=2)
print(f'System metrics: {json.dumps(sys_metrics, indent=2)}')

System metrics: {
  "input_size": 512,
  "avg_inference_sec": 0.037113457,
  "parameter_count": 124645632,
  "dataset.split_ratio": 0.2,
  "dataset_size.class_train": 80,
  "dataset_size.class_test": 20
}
