In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from onnx import helper

In [59]:
# Read the complete dataset from disk
data = pd.read_csv('data/all_data.csv')

# The target frame carries both the risk value ('Ja') and the 'checked' label,
# so the two stay row-aligned after train_test_split shuffles the data.
y = data[['Ja', 'checked']]

# Features: every remaining column, cast to float32
X = data.drop(columns=['Ja', 'Nee', 'checked']).astype(np.float32)

# 75% / 25% train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [60]:
# Decision threshold: midpoint of the two values below
# (approx. boundary value in dataset).
threshold = (0.697013377682873 + 0.697021996059818) / 2.0

# Feature selector with default settings
selector = VarianceThreshold()

# Gradient boosting on depth-1 trees with a fixed seed
regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)

In [61]:
# Chain feature selection and regression into a single estimator
pipeline = Pipeline(
    steps=[
        ('feature selection', selector),
        ('regression', regressor),
    ]
)

# Train on the 'Ja' column only, then predict for the held-out rows
risk_target_train = y_train['Ja']
pipeline.fit(X_train, risk_target_train)
y_pred = pipeline.predict(X_test)

In [62]:
# Binarise the predictions at the threshold and score them against 'checked'
y_pred_checked = y_pred > threshold
original_accuracy = accuracy_score(y_test['checked'], y_pred_checked)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9243076923076923


In [63]:
# Convert the fitted pipeline to ONNX. The graph takes one float input named
# 'X' with a free first dimension and one column per feature in X.
n_features = X.shape[1]
input_spec = [('X', FloatTensorType((None, n_features)))]
onnx_model = convert_sklearn(pipeline, initial_types=input_spec, target_opset=12)

In [64]:
# Add nodes to the ONNX graph so that the model emits both the predicted risk
# value and a boolean 'checked' value indicating whether the risk value is
# higher than the provided threshold.
#
# NOTE: this cell mutates `onnx_model` in place. Re-run the conversion cell
# before re-running this one; otherwise the nodes and the extra output are
# appended a second time.

graph = onnx_model.graph
risk_output = graph.output[0]  # first output of the converted pipeline (the risk value)

# Scalar float constant holding the decision threshold.
threshold_node = helper.make_node(
    "Constant",
    inputs=[],
    outputs=["threshold"],
    value=helper.make_tensor("value", onnx.TensorProto.FLOAT, [], [threshold])
)
graph.node.append(threshold_node)

# Element-wise comparison: risk value > threshold.
greater_node = helper.make_node(
    "Greater",
    inputs=[risk_output.name, "threshold"],
    outputs=["boolean_output"]
)
graph.node.append(greater_node)

# Comparing against a scalar is element-wise, so the boolean output has the
# same shape as the risk output. Mirror the declared shape of that output
# instead of hard-coding [None]: skl2onnx regressors are expected to declare a
# 2-D [None, 1] output (TODO confirm for this converter version), in which case
# a rank-1 declaration would not match what the runtime actually returns.
risk_tensor_type = risk_output.type.tensor_type
if risk_tensor_type.HasField("shape"):
    boolean_shape = [
        dim.dim_value if dim.HasField("dim_value") else (dim.dim_param or None)
        for dim in risk_tensor_type.shape.dim
    ]
else:
    # No shape declared on the risk output: leave the new output's shape unspecified too.
    boolean_shape = None

boolean_output = helper.make_tensor_value_info(
    "boolean_output", onnx.TensorProto.BOOL, boolean_shape
)
graph.output.extend([boolean_output])

In [None]:
# Load the in-memory ONNX model into a runtime session and score the test set
sess = rt.InferenceSession(onnx_model.SerializeToString())
onnx_inputs = {'X': X_test.values.astype(np.float32)}
y_pred_onnx = sess.run(None, onnx_inputs)

# Output order:
#   y_pred_onnx[0] -> risk values
#   y_pred_onnx[1] -> boolean value indicating if high risk or not
high_risk_flags = y_pred_onnx[1]
accuracy_onnx_model = accuracy_score(y_test['checked'], high_risk_flags)
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9243076923076923


In [66]:
# Write the ONNX model (including the added boolean output) to disk
output_path = "model/regression_model.onnx"
onnx.save(onnx_model, output_path)