In [19]:
# Optional one-time setup: uncomment to install the ONNX converters straight from GitHub.
# NOTE(review): these installs are unpinned (they track each repository's default branch).
# Prefer `%pip` over `!pip` so the packages land in the kernel's environment, and pin a
# tag or commit so a fresh run stays reproducible.
# !pip install --upgrade git+https://github.com/onnx/sklearn-onnx.git
# !pip install --upgrade git+https://github.com/onnx/onnxmltools

In [20]:
from os import path

# `_dh` is IPython's directory history; its first entry is taken here as the
# notebook's own directory (assumes the kernel was started there — confirm).
# The project root is that directory's parent.
PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], path.pardir))

# Dataset storage: <two levels above the project root>/datalake/xplainproj.
DATALAKE_PATH = path.abspath(
    path.join(PROJECT_ROOT, path.pardir, path.pardir, 'datalake', 'xplainproj')
)

# Fully-qualified file name of the processed dataset snapshot (JSON Lines).
FQFN_PROCESSED_DF = path.join(DATALAKE_PATH, 'processed', 'processed_20250207.jsonl')

# Classifier package directory; the model artifacts produced by this notebook
# (weights, evaluation metrics, system metrics) are written here.
MODEL_DIR = path.abspath(path.join(PROJECT_ROOT, 'xplainproj', 'classifier'))

In [21]:
import pandas as pd

# Load the processed dataset (JSON Lines: one record per line).
#
# The filesystem path is handed to pandas directly instead of being wrapped in a
# hand-built `file://` URL: a concatenated URL is not percent-encoded (characters
# such as `#` or `%` in the path are misread) and is malformed for Windows drive
# paths. The explicit existence check gives a clear error regardless of how the
# installed pandas version treats a non-existent path string.
if not path.isfile(FQFN_PROCESSED_DF):
    raise FileNotFoundError(f'Processed dataset not found: {FQFN_PROCESSED_DF}')

df = pd.read_json(FQFN_PROCESSED_DF, orient='records', lines=True)
print(f'df shape={df.shape}')
print(f'df columns={df.columns}')

df shape=(699, 11)
df columns=Index(['file_name', 'fqfn', 'text_body', 'label', 'longest_code_line_length',
       'median_code_line_length', 'lines_of_code', 'code_size_in_bytes',
       'ratio_of_comments_to_code', 'is_64base_content_present',
       'file_name_embedding'],
      dtype='object')


In [22]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', message='.*No further splits with positive gain.*')

In [23]:
from xplainproj.trainer.xplain_trainer import Trainer, TrainerConf

# Train on the processed DataFrame `df` loaded earlier in this notebook, using a
# default-constructed TrainerConf. The resulting `trainer` object is reused by the
# later cells (evaluation, weight export, system metrics).
trainer = Trainer(df, TrainerConf())
trainer.train()

[LightGBM] [Info] Number of positive: 289, number of negative: 270
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6177
[LightGBM] [Info] Number of data points in the train set: 559, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516995 -> initscore=0.068005
[LightGBM] [Info] Start training from score 0.068005
Test Accuracy: 0.8714285714285714


## Compute Evaluation Metrics

In [24]:
import json
from xplainproj.classifier.onnx_exporter import EVAL_METRICS_FILE_NAME

# Evaluate the trained model. The metrics file path inside MODEL_DIR is passed to
# `Trainer.evaluate` (presumably so the metrics are persisted there — confirm).
fqfn_metrics = path.join(MODEL_DIR, EVAL_METRICS_FILE_NAME)
eval_metrics = trainer.evaluate(fqfn_metrics)

# Show the returned metrics as indented JSON.
eval_metrics_json = json.dumps(eval_metrics, indent=2)
print(f'Evaluation metrics: {eval_metrics_json}')

Evaluation metrics: {
  "accuracy": 0.8714,
  "f1": 0.8525,
  "precision": 0.8387,
  "recall": 0.8667,
  "roc-auc": 0.9467
}


In [25]:
from xplainproj.classifier.onnx_exporter import WEIGHTS_FILE_NAME

# Persist the trained model's weights under MODEL_DIR. The system-metrics cell
# further down reloads the weights from this same path.
fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
trainer.model.save_model_weights(fqfn_model_save)


## Compute System Metrics

In [None]:
import itertools
import json
import numpy.random
import timeit

from utils.np_utils import NpEncoder
from xplainproj.classifier.onnx_exporter import SYS_METRICS_FILE_NAME, WEIGHTS_FILE_NAME
from xplainproj.classifier.xplain_configuration import ModelConf
from xplainproj.classifier.xplain_classifier import XplainClassifier

# Benchmark single-row inference latency of the saved model and record it,
# together with model and dataset facts, in the system-metrics JSON file.
# Relies on `path`, `MODEL_DIR` and `trainer` defined by earlier cells.
model_conf = ModelConf()

for model_class in [XplainClassifier]:
    print(f'Model Class: {model_class.__name__}')

    # Build a fresh instance and load the weights previously saved under MODEL_DIR.
    fqfn_model_save = path.join(MODEL_DIR, WEIGHTS_FILE_NAME)
    trained_model = model_class(model_conf=model_conf)
    trained_model.load_model_weights(fqfn_model_save)

    number_of_iterations = 1000

    # Draw all random input rows up front so the timed statement measures only
    # `predict`, not random-number generation. Iterating the 3-D array yields one
    # (1, input_size) row per call; `cycle` keeps the supply from running dry if
    # `repeat` is ever raised above 1.
    benchmark_inputs = itertools.cycle(
        numpy.random.random(size=(number_of_iterations, 1, model_conf.input_size))
    )
    timer = timeit.Timer(stmt=lambda: trained_model.predict(next(benchmark_inputs)))
    times = timer.repeat(repeat=1, number=number_of_iterations)  # repeat=1 to run 1000 iterations once

    # Each entry of `times` is the total for `number_of_iterations` calls.
    average_time = sum(times) / (len(times) * number_of_iterations)

    sys_metrics = {
        # Read from the same ModelConf instance the model and benchmark use,
        # rather than from the class object.
        'input_size': model_conf.input_size,
        'avg_inference_sec': round(average_time, 9),
        'dataset.split_ratio': trainer.trainer_conf.dataset_split_ratio,
    }
    if model_class is XplainClassifier:
        # NOTE(review): reaches into the private `_model` attribute; presumably the
        # fitted estimator exposing `n_estimators_` — confirm or add a public accessor.
        sys_metrics['tree_count'] = trained_model._model.n_estimators_
    else:
        raise ValueError(f'Model class {model_class.__name__} not recognized')

    # Per-class row counts, summed over the test and train splits.
    for ds in [trainer.test_dataset, trainer.train_dataset]:
        for label in ds.df['label'].unique():
            class_key = f'dataset_size.class_{label}'
            class_rows = ds.df[ds.df['label'] == label].shape[0]
            sys_metrics[class_key] = sys_metrics.get(class_key, 0) + class_rows

    # Serialize once; the same text is written to disk and echoed below.
    sys_metrics_json = json.dumps(sys_metrics, indent=2, cls=NpEncoder)
    fqfn_sys_metrics = path.join(MODEL_DIR, SYS_METRICS_FILE_NAME)
    with open(fqfn_sys_metrics, 'w', encoding='utf-8') as metric_file:
        metric_file.write(sys_metrics_json)
    print(f'System metrics: {sys_metrics_json}')
    print()


In [None]:
import numpy.random
from xplainproj.classifier.xplain_configuration import ModelConf

# Smoke test: run one prediction on a random feature row and show its dtype.
model_conf = ModelConf()
print(f'Model input size = {model_conf.input_size}')

# `trained_model` is the instance left bound by the loop in the system-metrics
# cell above, so that cell must have been executed first.
random_row = numpy.random.random(size=(1, model_conf.input_size))
result = trained_model.predict(random_row)
print(f'Result = {result} of type {result.dtype}')
