In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn
from skl2onnx import to_onnx
from onnx import helper
from onnx import TensorProto
import onnxruntime as rt
import onnx

In [None]:
# Read the raw dataset for model 2 from the data directory
data = pd.read_csv('../data/model2_data.csv')

# Summarise the dataset dimensions.
# Note: the second figure is the raw column count of the CSV, so it also
# counts the label columns ('Ja', 'Nee', 'checked') that are removed from the
# feature matrix further down.
n_rows, n_cols = data.shape
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")

In [None]:
# Separate the label columns from the feature columns.
# `y` keeps both 'Ja' and 'checked'; only 'checked' is used as the training
# target below. The feature matrix excludes every label column and is cast to
# float32, the dtype the ONNX input tensor is declared with.
target_columns = ['Ja', 'checked']
non_feature_columns = ['Ja', 'Nee', 'checked']

y = data[target_columns]
X = data.drop(columns=non_feature_columns).astype(np.float32)

# Hold out 25% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# The "checked" column is the target variable for both splits
y_train_checked, y_test_checked = y_train['checked'], y_test['checked']

In [None]:
# Define and train the Gradient Boosting *regressor*.
# A regressor (not a classifier) is fitted on the 'checked' target so that the
# model emits a continuous risk value; the high-risk decision is derived from
# that value with a threshold when the model is exported to ONNX.
gbr_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,  # fixed seed so training is reproducible
}
model = GradientBoostingRegressor(**gbr_params)

model.fit(X_train, y_train_checked)

## Save ONNX

In [5]:
# Convert the trained scikit-learn model to ONNX. The graph takes a single
# float32 input 'X' with one column per feature and a dynamic batch dimension.
onnx_model = convert_sklearn(
    model, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12
)

# Add nodes to the ONNX graph so that the model returns both the predicted
# risk value and a boolean 'checked' value that is True when the risk value is
# higher than the provided threshold.

threshold = 0.7  # Example threshold for high risk
graph = onnx_model.graph
risk_output = graph.output[0]  # the regressor's predicted risk value

# Scalar float constant holding the threshold.
threshold_node = helper.make_node(
    "Constant",
    inputs=[],
    outputs=["threshold"],
    value=helper.make_tensor("value", onnx.TensorProto.FLOAT, [], [threshold])
)
graph.node.append(threshold_node)

# risk > threshold, element-wise; the scalar threshold is broadcast.
greater_node = helper.make_node(
    "Greater",
    inputs=[risk_output.name, "threshold"],
    outputs=["boolean_output"]
)
graph.node.append(greater_node)

# Because the threshold is a scalar, the comparison result has exactly the
# shape of the risk output. Mirror that declared shape instead of hard-coding
# a rank-1 shape: a declaration that disagrees with the tensor actually
# produced makes the graph metadata inconsistent (runtime shape-mismatch
# warnings, failures under strict shape checking).
risk_tensor_type = risk_output.type.tensor_type
if risk_tensor_type.HasField("shape"):
    boolean_shape = [
        dim.dim_value if dim.HasField("dim_value") else (dim.dim_param or None)
        for dim in risk_tensor_type.shape.dim
    ]
else:
    # The converter declared no shape for the risk output; declare none here.
    boolean_shape = None

boolean_output = helper.make_tensor_value_info(
    "boolean_output", onnx.TensorProto.BOOL, boolean_shape
)
graph.output.extend([boolean_output])

In [None]:
# Run the exported ONNX graph on the held-out split
sess = rt.InferenceSession(onnx_model.SerializeToString())
onnx_inputs = {'X': X_test.values.astype(np.float32)}
y_pred_onnx = sess.run(None, onnx_inputs)

# The underlying model is a regressor, so the graph first predicts a risk
# value and then derives the boolean high-risk flag from it:
#   y_pred_onnx[0] -> predicted risk values
#   y_pred_onnx[1] -> boolean flag, True when the sample is high risk
true_checked = y_test['checked']
predicted_high_risk = y_pred_onnx[1]
accuracy_onnx_model = accuracy_score(true_checked, predicted_high_risk)
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

# Persist the ONNX model (including the added boolean output) to disk
onnx.save(onnx_model, "../model/model2.onnx")