In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


### Configuration and Constants

In [2]:
# Edges of the age buckets. They are handed to pd.cut with right=True further
# down, so the resulting intervals are (17, 25], (25, 45] and (45, 67].
AGE_BINS = [17, 25, 45, 67]
# One display label per interval (always one fewer label than bin edges).
AGE_LABELS = ["18-25", "25-45", "45-67"]

# Seed used for the train/test split so that results are reproducible.
RANDOM_STATE = 42

### Helper Functions

In [3]:
def load_features(features_path: str) -> list:
    """Return the names of the features flagged for exclusion.

    Reads the semicolon-separated file at ``features_path`` and collects the
    ``'Feature (nl)'`` value of every row whose ``'Include'`` column equals 0.
    The number of collected names is printed as a side effect.

    Args:
        features_path: Location of the feature-selection CSV.

    Returns:
        The excluded feature names, in the order they appear in the file.
    """
    feature_table = pd.read_csv(features_path, sep=';')
    not_included = feature_table['Include'] == 0
    excluded = feature_table.loc[not_included, 'Feature (nl)'].tolist()
    print(f"Number of features to exclude: {len(excluded)}")
    return excluded

def _is_whole_valued(values: pd.Series) -> bool:
    """Return True when every entry of ``values`` is a whole number.

    Columns that do not support the modulo operation (text, datetimes,
    categoricals, ...) are reported as not whole-valued instead of raising.
    """
    try:
        return bool(np.all(np.equal(np.mod(values, 1), 0)))
    except (TypeError, ValueError, NotImplementedError):
        return False


def _sorted_for_display(values: set) -> list:
    """Sort ``values`` for display, falling back to string order for mixed types."""
    try:
        return sorted(values)
    except TypeError:
        return sorted(values, key=str)


def classify_features(df: pd.DataFrame) -> pd.DataFrame:
    """Classify features into Binary, Integer, Continuous, or Unknown.

    Missing values are ignored when inspecting a column. The rules are applied
    in this order:

    * no non-missing values at all      -> ``"Unknown"``
    * all values are 0 or 1             -> ``"Binary"``
    * all values are whole numbers      -> ``"Integer"``
    * float/integer dtype otherwise     -> ``"Continuous"``
    * anything else (e.g. text columns) -> ``"Unknown"``

    Args:
        df: Frame whose columns should be classified.

    Returns:
        A frame with one row per column of ``df`` and the columns
        ``'Feature'``, ``'Type'`` and ``'Details'``. These three columns are
        present even when ``df`` has no columns.
    """
    summary = []
    for col in df.columns:
        col_data = df[col].dropna()
        unique_vals = set(col_data.unique())

        if not unique_vals:
            # An empty set is a subset of {0, 1}; without this guard a column
            # containing only missing values would be reported as Binary.
            feature_type = "Unknown"
            details = "Values: []"
        elif unique_vals.issubset({0, 1}):
            feature_type = "Binary"
            details = f"Values: {_sorted_for_display(unique_vals)}"
        elif _is_whole_valued(col_data):
            feature_type = "Integer"
            details = f"Range: {col_data.min()} to {col_data.max()}"
        elif col_data.dtype.kind in 'fi':  # float or integer
            feature_type = "Continuous"
            details = f"Range: {col_data.min()} to {col_data.max()}"
        else:
            feature_type = "Unknown"
            details = f"Values: {_sorted_for_display(unique_vals)}"

        summary.append({
            'Feature': col,
            'Type': feature_type,
            'Details': details
        })

    # Explicit columns keep the result usable when ``df`` has no columns.
    return pd.DataFrame(summary, columns=['Feature', 'Type', 'Details'])

def scale_features(df: pd.DataFrame, feature_names: list) -> pd.DataFrame:
    """Min-max scale the given columns and return the result as a new frame.

    The input frame is left untouched; scaling is applied to a copy so that
    re-running a notebook cell never rescales already-modified data.

    Args:
        df: Frame containing the columns to scale.
        feature_names: Names of the columns to rescale with ``MinMaxScaler``.
            May be empty, in which case an unmodified copy is returned.

    Returns:
        A copy of ``df`` in which the ``feature_names`` columns are replaced
        by their min-max scaled values.
    """
    scaled = df.copy()
    if len(feature_names) == 0:
        # MinMaxScaler rejects input with zero columns, so there is nothing
        # to fit when no features were selected.
        return scaled

    scaler = MinMaxScaler()
    scaled[feature_names] = scaler.fit_transform(scaled[feature_names])
    return scaled

### Main Workflow

In [None]:
# Columns marked as excluded in the feature-selection file
features_to_exclude = load_features("../data/features_to_include.csv")

# Read the complete dataset and report its size
data = pd.read_csv("../data/all_data.csv")
print(f"Number of samples: {len(data)}")
print(f"Number of features before exclusion: {len(data.columns)}")

# Work on a reduced frame without the excluded columns; `data` itself is kept as loaded
data_copy = data.drop(features_to_exclude, axis=1)
print(f"Number of features after exclusion: {len(data_copy.columns)}")

# Label every remaining column as Binary / Integer / Continuous / Unknown
feature_summary = classify_features(data_copy)

# Integer-typed columns are the ones to rescale. The age column is left in its
# raw units because it is binned against AGE_BINS later on.
integer_features = feature_summary.loc[
    (feature_summary['Type'] == 'Integer')
    & (feature_summary['Feature'] != 'persoon_leeftijd_bij_onderzoek'),
    'Feature',
].tolist()

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split, and it is not part of the pipeline that gets exported to ONNX, so
# inputs to the exported model must be scaled the same way. Confirm this is
# intended.
data_copy = scale_features(data_copy, integer_features)

In [5]:
# Targets: the model is fitted on 'Ja', accuracy is measured against 'checked'
y = data_copy.loc[:, ['Ja', 'checked']]
# Predictors: every column except the three outcome columns, cast to float32
# to match the float tensor input declared for the ONNX export
X = data_copy.drop(columns=['Ja', 'Nee', 'checked']).astype(np.float32)

# Hold out 25% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.25,
)

In [6]:
# Bucket the training rows by the age column using the configured bin edges;
# ages outside the outermost edges get no label (NaN).
# NOTE(review): `train_age_groups` is not referenced again in the cells shown
# here; confirm whether the age-group weighting is still needed.
train_age_groups = pd.cut(
    x=X_train['persoon_leeftijd_bij_onderzoek'],
    bins=AGE_BINS,
    right=True,
    labels=AGE_LABELS,
)

In [7]:
# Define and train the ridge regression model (linear regression with L2
# regularisation). It is fitted on the 'Ja' target and its output is turned
# into a yes/no decision with a fixed threshold.

# Regularisation strength (alpha). 1.0 is an example value that has not been
# tuned; cross-validation can be used to optimise it.
ridge_alpha = 1.0

# Create a Ridge regressor with the specified alpha
regressor = Ridge(
    alpha=ridge_alpha,
    fit_intercept=True,
    copy_X=True,
    random_state=RANDOM_STATE,  # shared seed from the configuration cell
)

# Single-step pipeline; this is the object handed to the ONNX converter below
pipeline = Pipeline([('regression', regressor)])

pipeline.fit(X_train, y_train['Ja'])

threshold = (0.697021996059818 + 0.697013377682873) / 2.0  # approx. boundary value in dataset

# Predict and evaluate: a prediction above the threshold counts as 'checked'
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test['checked'], y_pred > threshold)
print('Accuracy of the sklearn model: ', accuracy)

### Store to ONNX

In [8]:
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from skl2onnx import convert_sklearn
from onnx import helper
import onnxruntime as rt
import onnx

In [9]:
# Convert the fitted pipeline to ONNX. The graph takes one float tensor input
# named 'X' with a dynamic first dimension and one column per feature in X.
onnx_model = convert_sklearn(
    pipeline,
    initial_types=[
        ('X', FloatTensorType((None, X.shape[1]))),
    ],
    target_opset=12,
)

In [10]:
# Add nodes to the ONNX graph so that it returns both the predicted risk value
# and the 'checked' value, i.e. whether the risk value is higher than the
# provided threshold.

graph = onnx_model.graph

# Guard against re-running this cell: appending the nodes a second time would
# define the "threshold" and "boolean_output" tensors twice in the same graph,
# which makes the model invalid.
if all(output.name != "boolean_output" for output in graph.output):
    # Scalar constant holding the decision threshold
    threshold_node = helper.make_node(
        "Constant",
        inputs=[],
        outputs=["threshold"],
        value=helper.make_tensor("value", onnx.TensorProto.FLOAT, [], [threshold])
    )
    graph.node.append(threshold_node)

    # risk value > threshold, computed on the graph's original (first) output
    greater_node = helper.make_node(
        "Greater",
        inputs=[graph.output[0].name, "threshold"],
        outputs=["boolean_output"]
    )
    graph.node.append(greater_node)

    # NOTE(review): the flag is declared as a 1-D tensor; confirm this matches
    # the shape of the regression output it is derived from.
    boolean_output = helper.make_tensor_value_info("boolean_output", onnx.TensorProto.BOOL, [None])
    graph.output.extend([boolean_output])

In [None]:
# Run the exported graph on the held-out rows and score its boolean output
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx = sess.run(None, {'X': X_test.to_numpy(dtype=np.float32)})

# Output order: index 0 -> risk values, index 1 -> boolean high-risk flag
accuracy_onnx_model = accuracy_score(y_test['checked'], y_pred_onnx[1])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

In [12]:
# Write the ONNX model (risk value and boolean flag outputs) to disk
onnx.save(proto=onnx_model, f="../model/model_1.onnx")