In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

In [2]:
data_dict = fetch_20newsgroups()

In [3]:
len(data_dict["data"])

11314

In [4]:
grid = [{
    "preprocessor": [CountVectorizer()],
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    'model': [LogisticRegression()],
    'model__penalty': ['l1', 'l2'],
    'model__max_iter': [1000],
    'model__C': np.logspace(-4, 4, 10)
    }, {
    "preprocessor": [TfidfVectorizer()],
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "preprocessor__stop_words":['english'],
    'model': [LogisticRegression()],
    'model__penalty': ['l1', 'l2'],
    'model__max_iter': [1000],
    'model__C': np.logspace(-4, 4, 10)
}]

In [5]:
pipe = Pipeline([
    ("preprocessor", None),
    ("model", None)
])

In [6]:
search = RandomizedSearchCV(pipe, grid, n_jobs=-1, cv=3, verbose=2, n_iter=10, scoring="f1_weighted")

In [7]:
search.fit(data_dict["data"], data_dict["target"])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.2min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor', None),
                                             ('model', None)]),
                   n_jobs=-1,
                   param_distributions=[{'model': [LogisticRegression()],
                                         'model__C': array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04]),
                                         'model__max_iter': [1000],
                                         'mo...
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04]),
                                         'model__max_iter': [1000],
                                         'model__penalty': ['l1', 'l2'],
                                         'preprocessor': [TfidfVectorizer(min_df=2,
                                                 

In [9]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                 stop_words='english')),
                ('model',
                 LogisticRegression(C=166.81005372000558, max_iter=1000))])

In [10]:
search.best_score_

0.9117377807076205

In [11]:
test = fetch_20newsgroups(subset="test")

In [12]:
est = search.best_estimator_
est.fit(data_dict['data'], data_dict['target'])

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                 stop_words='english')),
                ('model',
                 LogisticRegression(C=166.81005372000558, max_iter=1000))])

In [13]:
from sklearn import metrics
pred = est.predict(test['data'])
print(metrics.classification_report(test['target'], pred))

              precision    recall  f1-score   support

           0       0.86      0.79      0.82       319
           1       0.72      0.83      0.77       389
           2       0.78      0.74      0.76       394
           3       0.70      0.75      0.73       392
           4       0.82      0.85      0.83       385
           5       0.85      0.75      0.80       395
           6       0.80      0.91      0.85       390
           7       0.91      0.90      0.90       396
           8       0.96      0.96      0.96       398
           9       0.90      0.94      0.92       397
          10       0.97      0.97      0.97       399
          11       0.95      0.91      0.93       396
          12       0.79      0.79      0.79       393
          13       0.90      0.85      0.88       396
          14       0.91      0.92      0.92       394
          15       0.88      0.93      0.91       398
          16       0.78      0.91      0.84       364
          17       0.97    