In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [2]:
us_clocks = pd.read_csv('us_clocks.csv')
predictors = ['listingIsTopRated', 'sellerFeedbackScore', 'sellerPositivePercent', 'sellerIsTopRated', 
              'endAtWeekend', 'endAtEvening', 'length', 'isBroken', 'isUsed', 'isLarge', 'freeShipping']

X = us_clocks.loc[:,predictors]
y = us_clocks.loc[:,'isSold']

In [3]:
model = GradientBoostingClassifier()
over = SMOTE()
under = RandomUnderSampler()
pipeline = Pipeline(steps=[("scaler", StandardScaler()), ('over', over), ('under', under), ('model', model)])
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

param_grid = {
    "over": ["passthrough", SMOTE(sampling_strategy=0.2, k_neighbors=1), SMOTE(sampling_strategy=0.5, k_neighbors=1),
            SMOTE(sampling_strategy=0.2, k_neighbors=3), SMOTE(sampling_strategy=0.5, k_neighbors=3),
            SMOTE(sampling_strategy=0.2, k_neighbors=5), SMOTE(sampling_strategy=0.5, k_neighbors=5)],
    "under": ["passthrough", RandomUnderSampler(sampling_strategy=0.5), RandomUnderSampler(sampling_strategy=0.8)],
    "model__n_estimators": [5, 30, 80, 150, 300],
    "model__max_depth": [1, 3, 7, 14, 20, 30]
}

search = GridSearchCV(pipeline, param_grid, scoring='balanced_accuracy', cv=cv, n_jobs=-1)
search.fit(X,y)

4050 fits failed out of a total of 9450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4050 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/liumukun/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/liumukun/anaconda3/lib/python3.8/site-packages/imblearn/pipeline.py", line 268, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/liumukun/anaconda3/lib/python3.8/site-packages/imblearn/pipeline.py", line 226, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/liumukun/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 349

In [5]:
print(search.best_params_)
print(search.best_score_)

{'model__max_depth': 7, 'model__n_estimators': 300, 'over': SMOTE(k_neighbors=1, sampling_strategy=0.5), 'under': 'passthrough'}
0.6815788400801623


In [15]:
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline

clf = GradientBoostingClassifier(max_depth=7, n_estimators=300)
model = regr.fit(X_train, y_train)

r = permutation_importance(model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X_train.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

sellerFeedbackScore0.093 +/- 0.011
endAtEvening0.034 +/- 0.005
isBroken0.023 +/- 0.006
listingIsTopRated0.011 +/- 0.002
