In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

In [2]:
import xgboost as xgb
import lightgbm as lgb

In [3]:
data_dict = fetch_20newsgroups()

In [4]:
len(data_dict["data"])

11314

In [5]:
grid = {
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    "model__max_depth": list(range(3, 7)),
    "model__n_estimators": [2**i-1 for i in range(3, 7)],
    "model": [xgb.XGBClassifier(),
              lgb.LGBMClassifier()]
}

In [6]:
pipe = Pipeline([
    ("preprocessor", TfidfVectorizer()),
    ("model", None)
])

In [7]:
search = RandomizedSearchCV(pipe, grid, n_jobs=-1, cv=3, verbose=2, n_iter=10, scoring="f1_weighted")

In [8]:
search.fit(data_dict["data"], data_dict["target"])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  8.3min finished




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor',
                                              TfidfVectorizer()),
                                             ('model', None)]),
                   n_jobs=-1,
                   param_distributions={'model': [XGBClassifier(base_score=None,
                                                                booster=None,
                                                                colsample_bylevel=None,
                                                                colsample_bynode=None,
                                                                colsample_bytree=None,
                                                                gamma=None,
                                                                gpu_id=None,
                                                                importance_type='gain',
                                                                interaction_constraints=Non

In [9]:
search.cv_results_

{'mean_fit_time': array([207.48451694,  40.71611762, 249.30416417, 128.75765276,
        149.77507027, 169.65733059, 103.56240567, 104.77577734,
         74.4770546 ,  26.71790377]),
 'std_fit_time': array([ 1.62060967,  0.96014791,  1.49410472, 14.273824  ,  1.91419606,
         8.38470854,  1.44150046,  2.11910972,  9.05037262,  5.76454052]),
 'mean_score_time': array([8.35808897, 8.73633202, 7.3917222 , 3.58552965, 3.16723641,
        2.64488371, 5.28593477, 2.32623235, 1.62092145, 4.2360847 ]),
 'std_score_time': array([0.1779599 , 0.24532836, 0.47130644, 0.80671167, 0.3194547 ,
        0.28812725, 0.80153457, 0.21021487, 0.09710995, 0.69739189]),
 'param_preprocessor__stop_words': masked_array(data=['english', 'english', 'english', 'english', 'english',
                    'english', 'english', 'english', 'english', 'english'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=o

In [10]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=4, stop_words='english')),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=5, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=31,
                               n_jobs=8, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               validate_p

In [11]:
search.best_score_

0.8065849153677007

In [12]:
est = search.best_estimator_
est.fit(data_dict['data'], data_dict['target'])



Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=4, stop_words='english')),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=5, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=31,
                               n_jobs=8, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               validate_p

In [13]:
test = fetch_20newsgroups(subset="test")

In [14]:
from sklearn import metrics
pred = est.predict(test['data'])
print(metrics.classification_report(test['target'], pred))

              precision    recall  f1-score   support

           0       0.74      0.65      0.69       319
           1       0.62      0.67      0.65       389
           2       0.68      0.72      0.70       394
           3       0.60      0.70      0.65       392
           4       0.76      0.77      0.76       385
           5       0.77      0.66      0.71       395
           6       0.81      0.86      0.84       390
           7       0.84      0.78      0.81       396
           8       0.89      0.86      0.88       398
           9       0.85      0.87      0.86       397
          10       0.92      0.87      0.89       399
          11       0.90      0.82      0.86       396
          12       0.50      0.65      0.56       393
          13       0.83      0.76      0.79       396
          14       0.84      0.85      0.84       394
          15       0.84      0.88      0.86       398
          16       0.64      0.77      0.70       364
          17       0.95    