In [3]:
import json
import numpy as np
import pandas as pd
import logging
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from xgboost import XGBClassifier
import deepchecks

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from deepchecks.tabular import Dataset

from deepchecks.tabular.suites import full_suite

import config
from etl import load_train_test_data


# Progress messages are emitted through the logger named 'deepchecks'.
# NOTE(review): the recorded cell output shows every message twice (once plain,
# once timestamped) -- presumably two handlers are attached; confirm the
# logging configuration.
logger = logging.getLogger('deepchecks')

logger.info('Load data')
X_train, X_test, y_train, y_test = load_train_test_data(selected_feats_only=True)
label_col = 'target'

# Dataset is given a single frame plus the name of the label column, so the
# label is appended to *copies* of the feature frames; X_train / X_test stay
# label-free for fitting and scoring the model.
df_train = X_train.copy()
df_train[label_col] = y_train

df_test = X_test.copy()
df_test[label_col] = y_test

# Fail loudly (also under ``python -O``, where ``assert`` is stripped) if the
# two splits do not share the same columns in the same order.
if list(df_train.columns) != list(df_test.columns):
    raise ValueError(
        'train/test column mismatch: '
        f'{list(df_train.columns)} != {list(df_test.columns)}'
    )

# An explicit empty list is passed for cat_features -- presumably so that no
# column is treated as categorical; confirm against the feature set.
ds_train = Dataset(df_train, label=label_col, cat_features=[])
ds_test = Dataset(df_test, label=label_col, cat_features=[])

# Fit the classifier on the training split and evaluate it on the test split
# (the portion not involved in CV).
logger.info('load model')
params_model = config.get_params_model()
model = XGBClassifier(**params_model)
model.fit(X_train, y_train)
# Column 1 of predict_proba is used as the score passed to roc_auc_score
# (assumes a binary target whose second class is the positive one -- confirm).
y_pred_prob_test = model.predict_proba(X_test)[:, 1]
score_test = roc_auc_score(y_test, y_pred_prob_test)
# Report the score; previously it was computed but never shown.
logger.info('test ROC AUC: %.4f', score_test)


logger.info('run deepchecks suite')
suite = full_suite()
# Keep this call as the last expression of the cell so that the notebook
# renders the returned suite result.
suite.run(train_dataset=ds_train, test_dataset=ds_test, model=model)

deepchecks - INFO - Load data
2022-12-19 15:13:30,583 - deepchecks - INFO - Load data
deepchecks - INFO - load model
2022-12-19 15:13:31,712 - deepchecks - INFO - load model
deepchecks - INFO - run deepchecks suite
2022-12-19 15:13:39,550 - deepchecks - INFO - run deepchecks suite


Accordion(children=(VBox(children=(HTML(value='\n            <h1 id="summary_J1JQM5E412HX0S0OTN6IY2F86">Full S…