In [1]:
from os import path

# Directory the notebook was started from. IPython exposes its directory
# history as `_dh`; when this code runs outside IPython (e.g. exported to a
# plain script) `_dh` is undefined, so fall back to the current directory
# instead of failing with a KeyError.
NOTEBOOK_DIR = globals().get('_dh', [path.curdir])[0]

# Date stamp embedded in the processed dataset's file name; change it here to
# point the notebook at a different snapshot.
PROCESSED_SNAPSHOT = '20250205'

# The project root is the parent of the notebook directory.
PROJECT_ROOT = path.abspath(path.join(NOTEBOOK_DIR, '..'))
# The datalake sits two levels above the project root.
DATALAKE_PATH = path.abspath(path.join(PROJECT_ROOT, '..', '..', 'datalake', 'mlpbertproj'))
# Fully-qualified file name of the processed JSON-lines dataset.
FQFN_PROCESSED_DF = path.join(DATALAKE_PATH, 'processed', f'processed_{PROCESSED_SNAPSHOT}.jsonl')
# Directory that receives the model weights and metric files written below.
MODEL_DIR = path.abspath(path.join(PROJECT_ROOT, 'mlpbertproj', 'classifier'))

In [2]:
import pandas as pd

# Read the JSON-lines dataset through an explicit file handle instead of a
# hand-built `file://` URL: a raw path pasted into a URL is re-interpreted
# (`#`, `?`, `%` escapes, Windows drive letters), whereas a file handle is read
# verbatim and a missing file surfaces as a plain FileNotFoundError.
with open(FQFN_PROCESSED_DF, encoding='utf-8') as processed_file:
    df_unified = pd.read_json(processed_file, orient='records', lines=True)
print(f'df shape={df_unified.shape}')
print(f'df columns={df_unified.columns}')

df shape=(704, 5)
df columns=Index(['file_name', 'fqfn', 'text_body', 'text_embeddings', 'label'], dtype='object')


In [3]:
from mlpbertproj.trainer.mlpbert_trainer import Trainer, TrainerConf

# Requires df_unified from the loading cell above; the trainer is configured
# with a default-constructed TrainerConf.
trainer_conf = TrainerConf()
trainer = Trainer(df_unified, trainer_conf)

In [4]:
# `time` stays imported so any other cell relying on it keeps working.
from time import perf_counter, time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time() is wall-clock and can jump if the system clock is adjusted.
start_time = perf_counter()

trainer.train()

print(f'Training duration: {perf_counter() - start_time:.4f} seconds')

Epoch:   1/ 12, LR:0.00060, Loss:0.20913
Epoch:   2/ 12, LR:0.00060, Loss:0.04689
Epoch:   3/ 12, LR:0.00059, Loss:0.02966
Epoch:   4/ 12, LR:0.00056, Loss:0.01936
Epoch:   5/ 12, LR:0.00051, Loss:0.01010
Epoch:   6/ 12, LR:0.00044, Loss:0.01521
Epoch:   7/ 12, LR:0.00037, Loss:0.00416
Epoch:   8/ 12, LR:0.00029, Loss:0.00266
Epoch:   9/ 12, LR:0.00022, Loss:0.00525
Epoch:  10/ 12, LR:0.00015, Loss:0.00279
Epoch:  11/ 12, LR:0.00010, Loss:0.00116
Epoch:  12/ 12, LR:0.00007, Loss:0.00242
Training duration: 0.4957 seconds


## Compute Evaluation Metrics

In [5]:
import json
from mlpbertproj.classifier.onnx_exporter import EVAL_METRICS_FILE_NAME

# Evaluate the trained model; the metrics file path inside MODEL_DIR is handed
# to evaluate(), and the returned metrics are echoed as indented JSON.
fqfn_metrics = path.join(MODEL_DIR, EVAL_METRICS_FILE_NAME)
eval_metrics = trainer.evaluate(fqfn_metrics)

eval_metrics_json = json.dumps(eval_metrics, indent=2)
print(f'Evaluation metrics: {eval_metrics_json}')

Evaluation metrics: {
  "accuracy": 1.0,
  "f1": 1.0,
  "precision": 1.0,
  "recall": 1.0,
  "roc-auc": 1.0
}


In [6]:
from mlpbertproj.classifier.onnx_exporter import WEIGHTS_FILE_NAME
from utils.pt_utils import save_model_weights

# Write the trained model's weights to WEIGHTS_FILE_NAME inside MODEL_DIR;
# the inference-timing cell below reloads them from the same path.
fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
save_model_weights(trainer.model, fqfn_model_save)

## Measure Inference Time

In [7]:
import torch
import timeit

from mlpbertproj.classifier.mlpbert_configuration import ModelConf
from mlpbertproj.classifier.onnx_exporter import WEIGHTS_FILE_NAME
from mlpbertproj.classifier.mlpbert_classifier import MlpBertModel
from utils.compute_device import DEVICES

print(f'Model Class: {MlpBertModel.__name__}')

# Rebuild the model from its configuration and reload the saved weights, so the
# timing below exercises the persisted artefact rather than the in-memory trainer model.
model_conf = ModelConf()
fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
trained_model = MlpBertModel(model_conf=model_conf)
trained_model.load_model_weights(fqfn_model_save)
# NOTE(review): if load_model_weights() does not already put the model in
# evaluation mode, switch it before timing — confirm against the classifier code.

number_of_iterations = 1000

# Build the single-row input once, outside the timed statement, so the
# measurement covers the forward pass only and not random-tensor allocation.
sample_input = torch.rand(size=(1, model_conf.input_size), device=DEVICES['cpu'])

with torch.no_grad():
    # Warm-up call: keeps one-off first-call overhead out of the measurement.
    trained_model(sample_input)

    timer = timeit.Timer(stmt=lambda: trained_model(sample_input))
    # repeat=1: a single batch of `number_of_iterations` forward passes.
    times = timer.repeat(repeat=1, number=number_of_iterations)

    # Mean seconds per forward pass across every timed call.
    average_time = sum(times) / (len(times) * number_of_iterations)

print(f'Average execution time: {average_time} seconds')
print()

2025-02-08 12:56:09,160 - tensorcraft - INFO - XLA Device Not Supported: No module named 'torch_xla'
2025-02-08 12:56:09,160 - tensorcraft - INFO - Pytorch version=2.6.0 preferred device=mps build with MPS support=True


Model Class: MlpBertModel
Average execution time: 2.702429099008441e-05 seconds



## Compute System Metrics

In [8]:
import json
from mlpbertproj.classifier.onnx_exporter import SYS_METRICS_FILE_NAME
from mlpbertproj.classifier.mlpbert_configuration import ModelConf


# Summary of the model and dataset. `average_time` comes from the
# inference-timing cell above, so that cell must have been run first.
sys_metrics = {
    'input_size': ModelConf.input_size,
    'avg_inference_sec': round(average_time, 9),
    'parameter_count': trainer.model.parameter_count,
    'dataset.split_ratio': trainer.trainer_conf.dataset_split_ratio,
}

# Count rows per label across both splits, accumulating the test and train
# counts into one `dataset_size.class_<label>` entry per label.
for ds in [trainer.test_dataset, trainer.train_dataset]:
    labels = ds.df['label']
    for label in labels.unique():
        metric_key = f'dataset_size.class_{label}'
        # int(...) keeps the value a plain Python int so json can serialise it.
        sys_metrics[metric_key] = sys_metrics.get(metric_key, 0) + int((labels == label).sum())

fqfn_sys_metrics = path.join(MODEL_DIR, SYS_METRICS_FILE_NAME)
# Write-only access with an explicit encoding; the file is only ever dumped to here.
with open(fqfn_sys_metrics, 'w', encoding='utf-8') as metric_file:
    json.dump(sys_metrics, metric_file, indent=2)
print(f'System metrics: {json.dumps(sys_metrics, indent=2)}')

System metrics: {
  "input_size": 768,
  "avg_inference_sec": 2.7024e-05,
  "parameter_count": 459521,
  "dataset.split_ratio": 0.2,
  "dataset_size.class_0": 350,
  "dataset_size.class_1": 354
}
