In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

In [2]:
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb



In [3]:
data_dict = fetch_20newsgroups()

In [4]:
len(data_dict["data"])

11314

In [5]:
grid = {
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    "model__estimator__max_depth": list(range(3, 7)),
    "model__estimator__n_estimators": [2**i-1 for i in range(3, 7)],
    "model": [xgb.XGBClassifier(objective="multi:softmax"),
              lgb.LGBMClassifier()]
}

In [6]:
pipe = Pipeline([
    ("preprocessor", TfidfVectorizer()),
    ("model", None)
])

In [7]:
search = RandomizedSearchCV(pipe, grid, n_jobs=-1, cv=3, verbose=2, n_iter=20, scoring="f1_weighted")

In [8]:
search.fit(data_dict["data"], data_dict["target"])

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 74.8min finished




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor',
                                              TfidfVectorizer()),
                                             ('model', None)]),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'model': [XGBClassifier(base_score=None,
                                                                booster=None,
                                                                colsample_bylevel=None,
                                                                colsample_bynode=None,
                                                                colsample_bytree=None,
                                                                gamma=None,
                                                                gpu_id=None,
                                                                importance_type='gain',
                                                                interaction_cons

In [9]:
search.cv_results_

{'mean_fit_time': array([ 688.63064194,  506.48002927,  446.14740562,  587.95375872,
        1395.13122503,  841.82380096,  406.94069505,  393.23736699,
         372.93763502,  266.9672002 , 1469.30455152,  208.14298638,
         217.26009194, 1425.04973173,  195.8068099 ,  289.93396036,
         337.91788332,  834.11621817,  204.58773311,  458.21421528]),
 'std_fit_time': array([2.53404422e+00, 5.18376354e+00, 2.48396183e+01, 5.92518217e+00,
        6.96030463e+02, 8.23447510e+02, 1.17610243e+01, 4.64688239e+00,
        2.18456897e+00, 5.72655927e+01, 1.39007450e+02, 6.88180803e-01,
        1.92376996e+00, 1.12158102e+01, 7.38158339e+00, 5.49172995e+01,
        2.71966064e+01, 1.00049581e+01, 1.66286022e+01, 9.81826153e+00]),
 'mean_score_time': array([ 7.57462136, 21.43073042, 14.71191565,  5.54828723,  7.61680237,
         7.0834194 ,  5.45324429,  2.86497235,  2.6356287 ,  2.35349099,
         4.68307336,  3.44103686,  4.43430535,  4.20803436,  2.08655103,
         4.09960055,  5.2

In [10]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=10, ngram_range=(1, 3),
                                 stop_words='english')),
                ('model',
                 LGBMClassifier(estimator__max_depth=4,
                                estimator__n_estimators=7))])