In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [3]:
uk_clocks = pd.read_csv('uk_clocks.csv')
predictors = ['listingIsTopRated', 'sellerFeedbackScore', 'sellerPositivePercent', 'sellerIsTopRated', 
              'endAtWeekend', 'endAtEvening', 'length', 'isBroken', 'isUsed', 'isLarge', 'freeShipping']

X = uk_clocks.loc[:,predictors]
y = uk_clocks.loc[:,'isSold']

In [16]:
model = GradientBoostingClassifier()
over = SMOTE()
under = RandomUnderSampler()
pipeline = Pipeline(steps=[("scaler", StandardScaler()), ('over', over), ('under', under), ('model', model)])
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

param_grid = {
    "over": ["passthrough", SMOTE(sampling_strategy=0.2, k_neighbors=1), SMOTE(sampling_strategy=0.5, k_neighbors=1),
            SMOTE(sampling_strategy=0.2, k_neighbors=3), SMOTE(sampling_strategy=0.5, k_neighbors=3),
            SMOTE(sampling_strategy=0.2, k_neighbors=5), SMOTE(sampling_strategy=0.5, k_neighbors=5)],
    "under": ["passthrough", RandomUnderSampler(sampling_strategy=0.5), RandomUnderSampler(sampling_strategy=0.8)],
    "model__n_estimators": [5, 30, 80, 150, 300],
    "model__max_depth": [1, 3, 7, 14, 20, 30]
}

search = GridSearchCV(pipeline, param_grid, scoring='balanced_accuracy', cv=cv, n_jobs=-1)
search.fit(X,y)

In [23]:
print(search.best_params_)
print(search.best_score_)

{'model__max_depth': 14, 'model__n_estimators': 150, 'over': 'passthrough', 'under': RandomUnderSampler(sampling_strategy=0.5)}
0.6141859078128995


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# over = RandomUnderSampler(sampling_strategy=0.5)
# X_train_, y_train_ = over.fit_resample(X_train, y_train)


In [49]:
from sklearn.inspection import permutation_importance

over = RandomUnderSampler(sampling_strategy=0.5)
clf = GradientBoostingClassifier(max_depth=14, n_estimators=150)
regr = Pipeline(steps=[('over',over), ('model',clf)])
model = regr.fit(X_train, y_train)

clf = GradientBoostingClassifier(max_depth=14, n_estimators=150)
model = clf.fit(X_train, y_train)

r = permutation_importance(model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X_train.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

sellerFeedbackScore0.062 +/- 0.013
sellerPositivePercent0.034 +/- 0.008
endAtEvening0.023 +/- 0.006
length  0.014 +/- 0.006
isUsed  0.013 +/- 0.005
isBroken0.011 +/- 0.004


In [43]:
balanced_accuracy_score(y_test,y_pred)

0.6116447985004686