In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

In [2]:
data_dict = fetch_20newsgroups()

In [3]:
len(data_dict["data"])

11314

In [4]:
grid = {
    "preprocessor": [CountVectorizer(), TfidfVectorizer()],
    "preprocessor__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "preprocessor__min_df": [2, 4, 7, 10],
    "model__estimator__C": [0.01, 0.1, 0.5 , 1, 3, 5, 10, 25, 50],
    "preprocessor__stop_words":['english'],
}

In [5]:
pipe = Pipeline([
    ("preprocessor", CountVectorizer()),
    ("model", OneVsRestClassifier(LogisticRegression(max_iter=1000)))
])

In [6]:
search = RandomizedSearchCV(pipe, grid, n_jobs=-1, cv=3, verbose=2, n_iter=20, scoring="f1_weighted")

In [7]:
search.fit(data_dict["data"], data_dict["target"])

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  8.6min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessor',
                                              CountVectorizer()),
                                             ('model',
                                              OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000)))]),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'model__estimator__C': [0.01, 0.1, 0.5,
                                                                1, 3, 5, 10, 25,
                                                                50],
                                        'preprocessor': [CountVectorizer(),
                                                         TfidfVectorizer(min_df=2,
                                                                         ngram_range=(1,
                                                                                      2),
                                                                    

In [8]:
search.cv_results_

{'mean_fit_time': array([ 58.41825724, 105.97255095, 123.42507521,  43.40573096,
        168.01206422, 216.63334346,  39.34012961,  93.22030536,
         29.83187342,  14.22215327,  16.24928331,  36.78483693,
         58.93230184,  23.10798796,  23.96666098,  25.72089267,
         17.93893361,  24.18307964,  29.53789409,  65.33497938]),
 'std_fit_time': array([0.61408216, 3.19579036, 4.05400163, 2.51145335, 6.218995  ,
        1.21219044, 0.69316225, 4.77459331, 0.13816609, 0.40638519,
        0.28180142, 0.55913289, 1.1185136 , 1.57392972, 0.39873733,
        1.59727259, 0.4779861 , 0.50279934, 0.55727951, 1.06998362]),
 'mean_score_time': array([4.93269388, 3.51377845, 3.94057409, 1.95364976, 4.06777541,
        4.17593431, 5.15534107, 3.990364  , 5.03308829, 2.56051811,
        3.88155071, 2.61512772, 3.88129814, 1.63543542, 1.56078005,
        1.55365864, 3.55033573, 1.51701903, 2.31845681, 1.45193164]),
 'std_score_time': array([0.07859608, 0.25142834, 0.45474147, 0.14180534, 0.18

In [9]:
search.best_estimator_

Pipeline(steps=[('preprocessor',
                 TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                 stop_words='english')),
                ('model',
                 OneVsRestClassifier(estimator=LogisticRegression(C=50,
                                                                  max_iter=1000)))])

In [10]:
search.best_score_

0.9125136533101775