In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

In [11]:
# Let's load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')

# Let's specify the features and the target
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
# Select data based on variance (not the final version yet, for now just for testing)
selector = VarianceThreshold()

In [13]:
# Define a gradient boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

# Other classifiers such as RandomForestClassifier, LogisticRegression, etc. can be used as well

In [14]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

In [15]:
# Let's train a simple model
pipeline.fit(X_train, y_train)

# Let's evaluate the model
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9456040480708412


In [16]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

# Get model confidence for random test sample
print(y_pred_onnx[1])

Accuracy of the ONNX model:  0.9456040480708412
[{0: 0.999323844909668, 1: 0.0006761550903320312}, {0: 0.24301636219024658, 1: 0.7569836378097534}, {0: 0.9280160665512085, 1: 0.0719839334487915}, {0: 0.9873268008232117, 1: 0.01267319917678833}, {0: 0.9999923706054688, 1: 7.62939453125e-06}, {0: 0.9569653272628784, 1: 0.04303467273712158}, {0: 0.9963588118553162, 1: 0.003641188144683838}, {0: 0.9138126969337463, 1: 0.08618730306625366}, {0: 0.974804162979126, 1: 0.025195837020874023}, {0: 0.7449288368225098, 1: 0.25507116317749023}, {0: 0.9938682913780212, 1: 0.00613170862197876}, {0: 0.9952160120010376, 1: 0.004783987998962402}, {0: 0.9941954016685486, 1: 0.005804598331451416}, {0: 0.9888350367546082, 1: 0.011164963245391846}, {0: 0.9935358762741089, 1: 0.006464123725891113}, {0: 0.9999967813491821, 1: 3.2186508178710938e-06}, {0: 0.9997240900993347, 1: 0.0002759099006652832}, {0: 0.9878925085067749, 1: 0.012107491493225098}, {0: 0.9974952936172485, 1: 0.002504706382751465}, {0: 0.9996