In [31]:
import os

import numpy as np
import onnx
import onnxruntime as rt
import pandas as pd
import xgboost as xgb
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
)
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBClassifier

In [2]:
# Load the training and hold-out test tables from CSV.
data = pd.read_csv("data/train_augmented.csv")
test_data = pd.read_csv("data/test_en.csv")

# 'checked' is the target column; every other column is used as a feature.
X_train = data.drop(columns=['checked'])
y_train = data['checked']
X_test = test_data.drop(columns=['checked'])
y_test = test_data['checked']

# 5 folds repeated 10 times -> 50 (train_idx, val_idx) index pairs, fixed by the seed.
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=101)
folds = list(cv.split(X_train, y_train))

In [96]:
# Names of the per-fold quantities tracked for every data split.
metrics = ['auc', 'fpr', 'tpr', 'thresholds', 'importance']

# results[split][metric] is a list with one entry appended per CV fold.
# Each (split, metric) pair gets its own independent list.
results = {
    split: {metric: [] for metric in metrics}
    for split in ('train', 'val', 'test')
}

In [97]:
# Booster parameters passed to xgb.train: binary classification with
# log-loss as the evaluation metric (also the metric early stopping watches).
params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
)

In [98]:
# Cross-validated training: fit one booster per fold and record ROC/AUC on the
# fold's train part, its validation part, and the fixed hold-out test set.

# Start from a clean slate so that re-running this cell does not append a
# second batch of folds to the lists already stored in `results`.
importances = []
for split_results in results.values():
    for metric_values in split_results.values():
        metric_values.clear()

dtest = xgb.DMatrix(X_test, label=y_test)
for train, test in tqdm(folds, total=len(folds)):
    # `train` / `test` are positional row indices of this fold.
    dtrain = xgb.DMatrix(X_train.iloc[train, :], label=y_train.iloc[train])
    dval   = xgb.DMatrix(X_train.iloc[test, :], label=y_train.iloc[test])
    model  = xgb.train(
        dtrain                = dtrain,
        params                = params,
        # The last entry ('val') is the one early stopping monitors.
        evals                 = [(dtrain, 'train'), (dval, 'val')],
        num_boost_round       = 1000,
        verbose_eval          = False,
        early_stopping_rounds = 10,
    )
    # Per-feature scores of this fold's booster (features never split on are absent).
    importances.append(model.get_score())

    # Explicit split -> data mapping, so the pairing does not silently depend
    # on the key order of `results`.
    split_sets = {'train': dtrain, 'val': dval, 'test': dtest}
    for ds, dmatrix in split_sets.items():
        # NOTE(review): with the native Booster API, predict() appears to use all
        # trained rounds, including those after the best iteration. Pass
        # iteration_range=(0, model.best_iteration + 1) if best-iteration
        # predictions are intended -- confirm.
        y_preds              = model.predict(dmatrix)
        labels               = dmatrix.get_label()
        fpr, tpr, thresholds = roc_curve(labels, y_preds)
        results[ds]['fpr'].append(fpr)
        results[ds]['tpr'].append(tpr)
        results[ds]['thresholds'].append(thresholds)
        results[ds]['auc'].append(roc_auc_score(labels, y_preds))

  0%|          | 0/50 [00:00<?, ?it/s]

In [99]:
# Aggregate the per-fold importance dicts: for every feature keep how many
# folds reported it ("count") and the total of its scores ("sum").
importances_agg = {}
for fold_scores in importances:
    for feature, score in fold_scores.items():
        entry = importances_agg.get(feature)
        if entry is None:
            importances_agg[feature] = {"count": 1, "sum": score}
        else:
            entry["sum"] += score
            entry["count"] += 1

# Average each feature over the folds in which it appeared (not over all folds),
# store it back under "avg", and collect (average, feature) pairs.
scored_features = []
for feature, entry in importances_agg.items():
    entry["avg"] = entry["sum"] / entry["count"]
    scored_features.append((entry["avg"], feature))

# Highest average first; ties fall back to the feature name, also descending.
avg_importance = sorted(scored_features, reverse=True)

In [100]:

# Dump the (average score, feature name) pairs, highest average first.
print(avg_importance)

[(132.9, 'person_age_during_investigation'), (84.52, 'exemption_day_history_because_of_your_medical_conditions'), (73.9, 'personal_qualities_days_since_language_requirement'), (70.76, 'personal_qualities_days_since_last_updated'), (69.88, 'obstacle_days_physical_problems'), (67.04, 'obstacles_days_financial_problems'), (57.68, 'contact_subject_no_show'), (52.06, 'address_days_at_address'), (51.2, 'exemption_days_history_mean'), (49.68, 'relationship_child_age_difference_parent_first_child'), (49.42, 'appointment_number_of_words'), (49.34, 'relationship_partner_total_days_partner'), (48.52, 'contact_type_last_year_outgoing_documents'), (48.32, 'obstacle_days_psychological_problems'), (41.46, 'contacts_type_of_document_outgoing'), (38.6, 'typering_days_sum'), (36.64, 'relationship_other_current_costsharer'), (35.78, 'appointment_last_year_number_of_words'), (33.44, 'instrument_ladder_reason_termination_reason_sucuessful'), (33.2, 'instrument_ladder_history_activation'), (30.24, 'availabi

In [23]:
# XGBoost classifier. Depth and tree count appear to match the best parameters
# recorded from the (currently commented-out) grid search below.
model = xgb.XGBClassifier(n_estimators=200, max_depth=25)

# Preprocessing + classifier. Steps that were tried but are disabled:
#   ('selector', VarianceThreshold)
#   ('pca', PCA())
pipeline_steps = [
    ('standard_scaler', StandardScaler()),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Disabled hyper-parameter search, kept for reference:
# param_grid = {
#     # 'pca__n_components': [50, 100, 150],
#     'model__max_depth': [20, 25, 30],
#     'model__n_estimators': [175, 200]
# }

# grid = HalvingGridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=10)

In [27]:
# grid.fit(X_train, y_train)
# Grid search is disabled: fit the scaler + XGBoost pipeline directly on the
# whole training set.
pipeline.fit(X_train, y_train)

In [25]:
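# The grid search is disabled, so this cell no longer prints anything; the
# best-params output shown for it was produced by an earlier run.
# print(grid.best_params_)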


{'model__max_depth': 25, 'model__n_estimators': 200}


In [28]:
# Score the fitted scikit-learn pipeline on the hold-out test set; this is the
# baseline the exported ONNX model is compared against.
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9437065148640101


In [32]:
# Register an ONNX converter for XGBClassifier with skl2onnx so that a Pipeline
# containing it can be converted: output shapes are computed like a linear
# classifier's, the conversion itself is done by onnxmltools' convert_xgboost.
xgb_converter_options = {
    "nocl": [True, False],
    "zipmap": [True, False, "columns"],
}
update_registered_converter(
    XGBClassifier,
    "XGBoostXGBClassifier",
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options=xgb_converter_options,
)

In [34]:
# Fetch the fitted classifier through the pipeline rather than the notebook
# global `model`: the cross-validation cell rebinds `model` to a raw Booster
# (which has no get_booster()), so the global is only correct for one
# particular execution order.
booster = pipeline.named_steps['model'].get_booster()
original_feature_names = booster.feature_names
if original_feature_names is not None:
    # Replace the stored names with positional f0, f1, ... names before
    # conversion (presumably the naming scheme the XGBoost->ONNX converter
    # expects -- confirm against onnxmltools). This mutates the fitted model
    # in place.
    onnx_converter_conform_feature_names = [f"f{num}" for num in range(len(original_feature_names))]
    booster.feature_names = onnx_converter_conform_feature_names

In [35]:
# Convert the whole fitted pipeline (scaler + classifier) to ONNX.
# The graph takes one float32 input named 'X' with a free batch dimension and
# one column per training feature.
n_features = X_train.shape[1]
initial_types = [('X', FloatTensorType((None, n_features)))]
onnx_model = convert_sklearn(
    pipeline,
    initial_types=initial_types,
    target_opset=12,
)

In [36]:
# Run the exported model with ONNX Runtime and compare its accuracy with the
# scikit-learn pipeline's.
# Providers are named explicitly: ONNX Runtime builds that ship several
# execution providers refuse to create a session without them.
sess = rt.InferenceSession(
    onnx_model.SerializeToString(),
    providers=["CPUExecutionProvider"],
)

# Read the input name from the graph instead of repeating the literal used at
# conversion time.
input_name = sess.get_inputs()[0].name
y_pred_onnx = sess.run(None, {input_name: X_test.values.astype(np.float32)})

# y_pred_onnx is the list of all graph outputs; the first one is used as the
# predicted labels (presumably followed by the probabilities -- confirm).
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9437065148640101


In [37]:
# Persist the converted model. Create the target directory first so the cell
# also works on a fresh checkout where "model/" does not exist yet.
onnx_model_path = "model/model2.onnx"
os.makedirs(os.path.dirname(onnx_model_path), exist_ok=True)
onnx.save(onnx_model, onnx_model_path)