In [1]:
from os import path

# `_dh` is IPython's directory history; its first entry is the directory the kernel
# started in. The project root is that directory's parent.
PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], path.pardir))
# The datalake lives two levels above the project root.
DATALAKE_PATH = path.abspath(
    path.join(PROJECT_ROOT, path.pardir, path.pardir, 'datalake', 'imgproj')
)
# Directory that receives the weights and metrics files written further down.
MODEL_DIR = path.abspath(
    path.join(PROJECT_ROOT, 'imgproj', 'classifier')
)

In [2]:
import pandas as pd
from imgproj.classifier.img_configuration import ModelConf

# Processed dataset snapshot; the file name embeds ModelConf.image_size[0] (in px).
FQFN_PROCESSED_DF = path.join(DATALAKE_PATH, 'processed', f'processed_20250202.{ModelConf.image_size[0]}px.jsonl')

df = pd.read_json(f'file://{FQFN_PROCESSED_DF}', orient='records', lines=True, convert_dates=False)

# The labels must be exactly {0, 1}: checking only for "two distinct values" would let
# a mislabeled dataset (e.g. {1, 2}) through unnoticed.
found_labels = set(df['label'].unique().tolist())
assert found_labels == {0, 1}, f'Label column unique values should be 0 and 1, found {found_labels}'

original_len = df.shape[0]

print(f'ImgProj df shape={df.shape}')
print(f'ImgProj df columns={df.columns}')


ImgProj df shape=(502, 6)
ImgProj df columns=Index(['file_name', 'fqfn', 'img_grey', 'img_height', 'img_width', 'label'], dtype='object')


In [3]:
# Class balance: report the number of records for each label value.
for label in df['label'].unique():
    n_records = len(df[df['label'] == label])
    print(f'for label {label}: {n_records} #records')

for label 0: 250 #records
for label 1: 252 #records


In [4]:
# Work around a PyTorch 2.5 compilation error involving aten.var_mean.correction.
# NOTE(review): per the PyTorch docs, suppress_errors makes TorchDynamo fall back to
# eager execution when compilation fails instead of raising. The flag is process-wide,
# so it can also hide unrelated compile errors — revisit once the upstream issue is fixed.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [None]:
from imgproj.trainer.img_trainer import Trainer, TrainerConf
from imgproj.classifier.onnx_exporter import TMPL_EVAL_METRICS_FILE_NAME, TMPL_SYS_METRICS_FILE_NAME, TMPL_WEIGHTS_FILE_NAME

from datetime import datetime
import json

# Minute-resolution timestamp; later cells embed it in the TensorBoard log directory name.
run_id = datetime.now().strftime('%Y-%m-%dT%H-%M')

# The trainer is built from the full DataFrame and a default TrainerConf.
trainer = Trainer(df, TrainerConf())

print(trainer.model)
print(f'Model {type(trainer.model).__name__} number of parameters = {trainer.model.parameter_count:,}')

In [None]:
from pt_utils import capture_model_architecture
from imgproj.classifier.img_classifier import ImgClassifier
from imgproj.classifier.img_configuration import ModelConf

import torch
from torch.utils.tensorboard import SummaryWriter

COMPUTE_DEVICE_CPU = torch.device('cpu')

with SummaryWriter(log_dir=f'tensorboard.run/{ImgClassifier.__name__}', comment='model_architecture') as writer:
    # Dummy input of shape (1 batch, 1 color channel, image_height, image_width),
    # filled with uniform random values scaled to [0, 255).
    img_height, img_width = ModelConf.image_size[0], ModelConf.image_size[1]
    rnd_img_grey = torch.rand(size=(1, 1, img_height, img_width), device=COMPUTE_DEVICE_CPU) * 255

    # A fresh, CPU-resident model instance is handed to the helper together with the dummy input.
    architecture_model = ImgClassifier(ModelConf()).to(device=COMPUTE_DEVICE_CPU)
    capture_model_architecture(model=architecture_model, t=rnd_img_grey, writer=writer)

In [7]:
# Log every dataset image to TensorBoard under one tag per label ("Label_0", "Label_1").
# The log directory uses the same `tensorboard.run/` root as the other writers in this
# notebook (previously `../tensorboard.run/`, which put the images in a different tree).
with SummaryWriter(log_dir='tensorboard.run/img_dataset', comment='training dataset') as writer:
    for class_label in df['label'].unique():
        df_sub = df[df['label'] == class_label]

        # `idx` is the row's index in `df`; it is used as the TensorBoard step so each
        # image of a class gets its own slot under the class tag.
        for idx, row in df_sub.iterrows():
            # Grayscale pixel data as loaded from the JSONL file
            # (presumably a nested H x W list of 0-255 values — verify against the producer).
            img_grey = row['img_grey']

            # Add the channel dimension: [H, W] -> [1, H, W]
            img_tensor = torch.tensor(img_grey).unsqueeze(0)

            # Tensorboard requirement: normalize the pixel values to the range [0, 1]
            img_tensor = img_tensor.float() / 255.0

            # The outer loop variable is used for the tag; it is no longer overwritten
            # from the row inside the loop.
            writer.add_image(f"Label_{class_label}", img_tensor, idx, dataformats='CHW')


In [8]:
# Run training; the writer handed to trainer.train() points at a per-run directory
# whose name carries run_id.
training_log_dir = f'tensorboard.run/training.st_{run_id}'
with SummaryWriter(log_dir=training_log_dir, comment='training stats') as writer:
    trainer.train(writer)

Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

Batches of the epoch:   0%|          | 0/13 [00:00<?, ?it/s]

2025-02-07 13:56:09,101 - tensorcraft - INFO - Early stopping at epoch 16. Best loss: 0.00956


In [9]:
# Evaluate the trained model. The metrics file path is passed to trainer.evaluate()
# (presumably it persists the metrics there — verify in img_trainer).
eval_metrics_file_name = TMPL_EVAL_METRICS_FILE_NAME.format(ImgClassifier.__name__, ModelConf.image_size[0])
fqfn_eva_metrics = path.join(MODEL_DIR, eval_metrics_file_name)
eval_metrics = trainer.evaluate(fqfn_eva_metrics)

eval_metrics_json = json.dumps(eval_metrics, indent=2)
print(f'Evaluation metrics: {eval_metrics_json}')

# Record each metric as a scalar at step 0 in the same run directory used for training.
with SummaryWriter(log_dir=f'tensorboard.run/training.st_{run_id}', comment='evaluation stats') as writer:
    for metric_name in eval_metrics:
        writer.add_scalar(f'metrics/{metric_name}', eval_metrics[metric_name], global_step=0)

Evaluation metrics: {
  "accuracy": 0.8812,
  "f1": 0.8723,
  "precision": 1.0,
  "recall": 0.7736,
  "roc-auc": 0.987
}


In [10]:
# Switch the trainer to the CPU before the weights are saved and the CPU-only
# checks below are run.
# NOTE(review): assumes Trainer.to() also moves trainer.model — confirm in img_trainer.
trainer.to(device_name='cpu')

2025-02-07 13:56:09,938 - tensorcraft - INFO - resolved device_name: cpu compute_device: cpu tensor_device: cpu


In [11]:
from pt_utils import save_model_weights
# Weights file name is derived from the classifier class name and ModelConf.image_size[0].
weights_file_name = TMPL_WEIGHTS_FILE_NAME.format(ImgClassifier.__name__, ModelConf.image_size[0])
fqfn_model_save = path.join(MODEL_DIR, weights_file_name)
save_model_weights(trainer.model, fqfn_model_save)

In [12]:
# Smoke test: push one random (1, 1, H, W) image with values in [0, 255) through the
# trained model and show the raw output and its dtype.
rnd_img_grey = torch.rand(size=(1, 1, ModelConf.image_size[0], ModelConf.image_size[1]), device=COMPUTE_DEVICE_CPU) * 255

with torch.no_grad():  # no gradients are needed for a forward-only check
    result = trainer.model(rnd_img_grey)

print(f'Random result = {result} of type {result.dtype}')
print()


Random result = tensor([[-3.1575]]) of type torch.float32



# Measure inference time

In [13]:
import timeit

from imgproj.classifier.img_configuration import ModelConf
from imgproj.classifier.onnx_exporter import TMPL_WEIGHTS_FILE_NAME
from imgproj.classifier.img_classifier import ImgClassifier


# Reload the saved weights into a fresh model and measure the average time of a
# single-image forward pass on the CPU.
print(f'Model Class: {ImgClassifier.__name__}')
model_conf = ModelConf()
fqfn_model_save = path.join(MODEL_DIR, TMPL_WEIGHTS_FILE_NAME.format(ImgClassifier.__name__, ModelConf.image_size[0]))
trained_model = ImgClassifier(model_conf=model_conf)
trained_model.load_model_weights(fqfn_model_save)
# A freshly constructed torch module is in training mode. Switch to inference mode so
# any dropout is disabled and any batch-norm layers use (and do not update) their
# running statistics. Harmless if load_model_weights() already did this.
trained_model.eval()

number_of_iterations = 1000

# Build the input once, outside the timed statement, so that only the forward pass is
# measured and not the random-number generation.
sample_img_grey = torch.rand(size=(1, 1, model_conf.image_size[0], model_conf.image_size[1]), device=COMPUTE_DEVICE_CPU) * 255

with torch.no_grad():
    # Warm-up call: keeps one-time initialization cost out of the measurement.
    trained_model(sample_img_grey)

    timer = timeit.Timer(stmt=lambda: trained_model(sample_img_grey))
    times = timer.repeat(repeat=1, number=number_of_iterations)  # repeat=1 to run 1000 iterations once

    # Each entry of `times` is the total for `number_of_iterations` calls.
    average_time = sum(times) / (len(times) * number_of_iterations)
    print(f'Average execution time: {average_time} seconds')
    print()

Model Class: ImgClassifier
Loaded pretrained weights for efficientnet-b0
Average execution time: 0.04083447145800165 seconds



# Compute system metrics

In [14]:
# Collect model/system level metrics: input size, measured latency, parameter count
# and the dataset split ratio configured on the trainer.
sys_metrics = {
    'input_size': model_conf.input_size,
    'avg_inference_sec': round(average_time, 9),
    'parameter_count': trainer.model.parameter_count,
    'dataset.split_ratio': trainer.trainer_conf.dataset_split_ratio,
}

# One extra entry per label value with the number of records of that class.
sys_metrics.update({
    f'dataset_size.class_{label}': len(df[df['label'] == label])
    for label in df['label'].unique()
})

# Serialize once; the same JSON text is written to the metrics file and printed.
sys_metrics_json = json.dumps(sys_metrics, indent=2)

sys_metrics_file_name = TMPL_SYS_METRICS_FILE_NAME.format(ImgClassifier.__name__, ModelConf.image_size[0])
fqfn_sys_metrics = path.join(MODEL_DIR, sys_metrics_file_name)
with open(fqfn_sys_metrics, 'w+') as metric_file:
    metric_file.write(sys_metrics_json)
print(f'System metrics: {sys_metrics_json}')

System metrics: {
  "input_size": 57600,
  "avg_inference_sec": 0.040834471,
  "parameter_count": 4008253,
  "dataset.split_ratio": 0.2,
  "dataset_size.class_0": 250,
  "dataset_size.class_1": 252
}
