In [2]:
from os import path

# Project root: the parent of the first directory recorded in IPython's `_dh`
# directory-history global (presumably the directory the kernel started in).
PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], path.pardir))
# Project data folder, resolved two levels above the project root.
DATALAKE_PATH = path.abspath(
    path.join(PROJECT_ROOT, path.pardir, path.pardir, 'datalake', 'txtproj')
)
# Folder that the later cells use for model weights and metrics files.
MODEL_DIR = path.abspath(path.join(PROJECT_ROOT, 'txtproj', 'classifier'))

In [3]:
import json
import pandas as pd

# Fully qualified file names of the processed dataset (JSON Lines) and of the
# vocabulary (JSON); both live in the `processed` folder of the data lake.
FQFN_PROCESSED_DF = path.join(DATALAKE_PATH, 'processed', 'processed_20250203.jsonl')
FQFN_VOCABULARY = path.join(DATALAKE_PATH, 'processed', 'vocabulary_20250203.json')

# Pass the filesystem path itself rather than a hand-built `file://` URL:
# interpolating a path into a URL breaks on Windows drive letters and on paths
# containing URL-significant characters such as '#' or '%'.
# One JSON record per line; date-like columns are deliberately not converted.
df = pd.read_json(FQFN_PROCESSED_DF, orient='records', lines=True, convert_dates=False)

with open(FQFN_VOCABULARY, encoding='utf-8', mode='rt') as f:
    vocabulary = json.load(f)

# Quick sanity report of what was loaded.
print(f'TextProject df shape={df.shape}')
print(f'TextProject df columns={df.columns}')
print(f'TextProject vocabulary size={len(vocabulary)}')


TextProject df shape=(410, 4)
TextProject df columns=Index(['file_name', 'text', 'text_tfidf', 'label'], dtype='object')
TextProject vocabulary size=256


In [4]:
# Report the number of records per label, in order of first appearance.
for label_value in df['label'].unique():
    # Summing the boolean mask counts the rows carrying this label.
    record_count = int((df['label'] == label_value).sum())
    print(f'for label {label_value}: {record_count} #records')

for label 0: 210 #records
for label 1: 200 #records


In [18]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', message='.*No further splits with positive gain.*')

In [19]:
from trainer.txt_trainer import Trainer, TrainerConf
from txtproj.classifier.txt_classifier import LrTxtClassifier, SvmTxtClassifier, LgbmTxtClassifier
from txtproj.classifier.onnx_exporter import TMPL_EVAL_METRICS_FILE_NAME, TMPL_WEIGHTS_FILE_NAME
from txtproj.classifier.txt_configuration import ModelConf
import json
import numpy.random

# Train, evaluate and save each classifier, then smoke-test it on one random input.

# The model configuration does not depend on the classifier class: build it once
# instead of once per iteration.
model_conf = ModelConf()

# Dedicated, seeded generator for the smoke-test input: the printed result is the
# same on every run, and the global NumPy RNG state is not advanced between
# training runs.
smoke_test_rng = numpy.random.default_rng(seed=0)

for model_class in [LrTxtClassifier, SvmTxtClassifier, LgbmTxtClassifier]:
    print(f'Detector Class: {model_class.__name__}')
    # NOTE: `trainer` is intentionally left in the notebook namespace; the
    # system-metrics cell reads `trainer.trainer_conf` after this loop.
    trainer = Trainer(model_class, df, TrainerConf())
    trainer.train()

    # evaluate() is handed the path of the per-model metrics file and returns the
    # metrics (presumably it also persists them at that path — confirm in Trainer).
    fqfn_metrics = path.join(MODEL_DIR, TMPL_EVAL_METRICS_FILE_NAME.format(model_class.__name__))
    metrics = trainer.evaluate(fqfn_metrics)
    print(f'Evaluation metrics: {json.dumps(metrics, indent=2)}')

    # Save the trained weights under a file name derived from the class name.
    fqfn_model_save = path.join(MODEL_DIR, TMPL_WEIGHTS_FILE_NAME.format(model_class.__name__))
    trainer.model.save_model_weights(fqfn_model_save)

    print(f'Model input size = {model_conf.input_size}')

    # Smoke test: a single random row of the expected width must be accepted.
    smoke_input = smoke_test_rng.random(size=(1, model_conf.input_size))
    result = trainer.model.predict(smoke_input)
    print(f'Random result = {result} of type {result.dtype}')
    print()


Detector Class: LrTxtClassifier
predicted_labels.values=(array([0., 1.], dtype=float32), array([239, 171]))
true_labels.values=(array([0, 1]), array([210, 200]))
Evaluation metrics: {
  "accuracy": 0.9049,
  "f1": 0.8949,
  "precision": 0.9708,
  "recall": 0.83,
  "roc-auc": 0.9704
}
Model input size = 256
Random result = [1.] of type float32

Detector Class: SvmTxtClassifier
predicted_labels.values=(array([0., 1.], dtype=float32), array([233, 177]))
true_labels.values=(array([0, 1]), array([210, 200]))
Evaluation metrics: {
  "accuracy": 0.9195,
  "f1": 0.9125,
  "precision": 0.9718,
  "recall": 0.86,
  "roc-auc": 0.967
}
Model input size = 256
Random result = [1.] of type float32

Detector Class: LgbmTxtClassifier
[LightGBM] [Info] Number of positive: 163, number of negative: 165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `for

# Compute system metrics (inference latency, model size, dataset sizes)

In [20]:
from trainer.np_utils import NpEncoder
from txtproj.classifier.txt_configuration import ModelConf
import numpy.random
import timeit

from txtproj.classifier.onnx_exporter import TMPL_WEIGHTS_FILE_NAME, TMPL_SYS_METRICS_FILE_NAME

# Reload each saved classifier, time single-row inference and write a per-model
# "system metrics" JSON file next to the weights.

model_conf = ModelConf()
print(f'Model input size = {model_conf.input_size}')

number_of_iterations = 1000
repeat_count = 1  # run the block of `number_of_iterations` predictions once

# Values that are identical for every model are computed once, outside the loop.
# NOTE: `trainer` is the object left behind by the training cell, so that cell
# must have been executed first.
dataset_split_ratio = trainer.trainer_conf.dataset_split_ratio
dataset_sizes = {
    f'dataset_size.class_{label}': df[df['label'] == label].shape[0]
    for label in df['label'].unique()
}

for model_class in [LrTxtClassifier, SvmTxtClassifier, LgbmTxtClassifier]:
    print(f'Model Class: {model_class.__name__}')
    fqfn_model_save = path.join(MODEL_DIR, TMPL_WEIGHTS_FILE_NAME.format(model_class.__name__))
    trained_model = model_class(model_conf=model_conf)
    trained_model.load_model_weights(fqfn_model_save)

    # Generate all inputs up front so that only predict() is timed; drawing the
    # random numbers inside the timed statement would inflate the latency.
    # Exactly one (1, input_size) row is consumed per timed call.
    timing_inputs = iter(
        numpy.random.random(size=(repeat_count * number_of_iterations, 1, model_conf.input_size))
    )
    timer = timeit.Timer(stmt=lambda: trained_model.predict(next(timing_inputs)))
    times = timer.repeat(repeat=repeat_count, number=number_of_iterations)

    average_time = sum(times) / (len(times) * number_of_iterations)

    sys_metrics = {
        # Read from the instance actually handed to the model, not from the class.
        'vocabulary_size': model_conf.vocabulary_size,
        'avg_inference_sec': round(average_time, 9),
        'dataset.split_ratio': dataset_split_ratio,
    }
    # One model-specific size indicator per classifier type.
    if model_class == LrTxtClassifier:
        sys_metrics['parameter_count'] = trained_model._model.coef_.size + trained_model._model.intercept_.size
    elif model_class == SvmTxtClassifier:
        sys_metrics['support_vector_count'] = sum(trained_model._model.n_support_)
    elif model_class == LgbmTxtClassifier:
        sys_metrics['tree_count'] = trained_model._model.n_estimators_
    else:
        raise ValueError(f'Model class {model_class.__name__} not recognized')

    # Appended last so the key order of the written JSON stays the same.
    sys_metrics.update(dataset_sizes)

    fqfn_sys_metrics = path.join(MODEL_DIR, TMPL_SYS_METRICS_FILE_NAME.format(model_class.__name__))
    # NpEncoder is used so NumPy scalars in the metrics can be serialised.
    with open(fqfn_sys_metrics, 'w', encoding='utf-8') as metric_file:
        json.dump(sys_metrics, metric_file, indent=2, cls=NpEncoder)
    print(f'System metrics: {json.dumps(sys_metrics, indent=2, cls=NpEncoder)}')
    print()

Model input size = 256
Model Class: LrTxtClassifier
System metrics: {
  "vocabulary_size": 256,
  "avg_inference_sec": 3.9255e-05,
  "dataset.split_ratio": 0.2,
  "parameter_count": 257,
  "dataset_size.class_0": 210,
  "dataset_size.class_1": 200
}

Model Class: SvmTxtClassifier
System metrics: {
  "vocabulary_size": 256,
  "avg_inference_sec": 5.9557e-05,
  "dataset.split_ratio": 0.2,
  "support_vector_count": 161,
  "dataset_size.class_0": 210,
  "dataset_size.class_1": 200
}

Model Class: LgbmTxtClassifier
System metrics: {
  "vocabulary_size": 256,
  "avg_inference_sec": 0.000227704,
  "dataset.split_ratio": 0.2,
  "tree_count": 100,
  "dataset_size.class_0": 210,
  "dataset_size.class_1": 200
}

