In [46]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn
from skl2onnx import to_onnx
from onnx import helper
from onnx import TensorProto
import onnxruntime as rt
import onnx

In [47]:
# Load the biased data
bad_data = pd.read_csv('../data/bad_data.csv')

# Print dataset details
print(f"Number of samples: {bad_data.shape[0]}")
print(f"Number of features: {bad_data.shape[1]}")

Number of samples: 130000
Number of features: 318


In [48]:
# Prepare features and target
y_bad = bad_data[['Ja', 'checked']]
X_bad = bad_data.drop(['Ja', 'Nee', 'checked'], axis=1).astype(np.float32)

# Split into training and testing sets
X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
    X_bad, y_bad, test_size=0.25, random_state=1
)

# Extract the "Ja" column as the target variable
y_train_bad_checked = y_train_bad['checked']
y_test_bad_checked = y_test_bad['checked']

In [49]:
# Define and train the Gradient Boosting Classifier
bad_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

bad_model.fit(X_train_bad, y_train_bad_checked)



In [50]:
# Predict probabilities
y_pred_bad = bad_model.predict_proba(X_test_bad)[:, 1]

# Calculate metrics
roc_auc = roc_auc_score(y_test_bad_checked, y_pred_bad)
print(f"ROC AUC Score: {roc_auc:.2f}")

threshold_bad = 0.7  # Example threshold for high risk
accuracy_bad = accuracy_score(y_test_bad['checked'], y_pred_bad > threshold_bad)
print(f"Accuracy of Bad Model: {accuracy_bad:.2%}")

ROC AUC Score: 0.97
Accuracy of Bad Model: 84.92%


## Save ONNX

In [51]:
# Save the model to ONNX
onnx_bad_model = convert_sklearn(
    bad_model, initial_types=[('X', FloatTensorType((None, X_bad.shape[1])))],
    target_opset=12
)


In [52]:
# Test ONNX model
sess_bad = rt.InferenceSession(onnx_bad_model.SerializeToString())
y_pred_onnx = sess_bad.run(None, {'X': X_test_bad.values.astype(np.float32)})

# Convert probabilities to binary predictions
y_pred_onnx_binary = (y_pred_onnx[0] > threshold_bad).astype(int)

# Calculate accuracy for ONNX model
accuracy_onnx_model = accuracy_score(y_test_bad_checked, y_pred_onnx_binary)
print('Accuracy of the ONNX Bad Model: ', accuracy_onnx_model)

# %%
# Save the ONNX model
onnx.save(onnx_bad_model, "../model/bad_model.onnx")

Accuracy of the ONNX Bad Model:  0.9028
