In [2]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

In [3]:
import xgboost as xgb
import lightgbm as lgb

In [4]:
data_dict = fetch_20newsgroups()

In [5]:
len(data_dict["data"])

11314

In [6]:
grid = [{
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    "model__max_depth": list(range(3, 7)),
    "model__n_estimators": [2**i-1 for i in range(3, 7)],
    "model": [xgb.XGBClassifier()]
}, {
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    "model__max_depth": list(range(3, 7)),
    "model__n_estimators": [2**i-1 for i in range(3, 7)],
    "model": [lgb.LGBMClassifier()]
}]

In [7]:
pipe = Pipeline([
    ("preprocessor", TfidfVectorizer()),
    ("model", None)
])

In [8]:
search = RandomizedSearchCV(pipe, grid, n_jobs=-1, cv=3, verbose=2, n_iter=5, scoring="f1_weighted")

In [9]:
search.fit(data_dict["data"], data_dict["target"])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 44.1min remaining: 38.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 131.3min finished




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor',
                                              TfidfVectorizer()),
                                             ('model', None)]),
                   n_iter=5, n_jobs=-1,
                   param_distributions=[{'model': [XGBClassifier(base_score=None,
                                                                 booster=None,
                                                                 colsample_bylevel=None,
                                                                 colsample_bynode=None,
                                                                 colsample_bytree=None,
                                                                 gamma=None,
                                                                 gpu_id=None,
                                                                 importance_type='gain',
                                                                 interact

In [10]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=10, ngram_range=(1, 2),
                                 stop_words='english')),
                ('model', LGBMClassifier(max_depth=6, n_estimators=63))])

In [11]:
search.best_score_

0.8232340073354566

In [12]:
est = search.best_estimator_
est.fit(data_dict['data'], data_dict['target'])

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=10, ngram_range=(1, 2),
                                 stop_words='english')),
                ('model', LGBMClassifier(max_depth=6, n_estimators=63))])

In [13]:
test = fetch_20newsgroups(subset="test")

In [14]:
from sklearn import metrics
pred = est.predict(test['data'])
print(metrics.classification_report(test['target'], pred))

              precision    recall  f1-score   support

           0       0.79      0.68      0.73       319
           1       0.65      0.71      0.68       389
           2       0.70      0.72      0.71       394
           3       0.61      0.71      0.65       392
           4       0.76      0.78      0.77       385
           5       0.83      0.68      0.75       395
           6       0.85      0.86      0.85       390
           7       0.84      0.80      0.82       396
           8       0.90      0.86      0.88       398
           9       0.89      0.88      0.88       397
          10       0.94      0.89      0.92       399
          11       0.91      0.83      0.87       396
          12       0.49      0.66      0.57       393
          13       0.84      0.81      0.83       396
          14       0.85      0.86      0.85       394
          15       0.84      0.91      0.88       398
          16       0.65      0.81      0.72       364
          17       0.97    