In [1]:
# Load the training split of 20 Newsgroups, restricted to four categories.
from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=1337,  # fixed seed so the shuffled order is reproducible
)

In [2]:
# Vectorize the training documents into a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# Keep the fitted vectorizer in a name instead of discarding it: its learned
# vocabulary is needed to map any new documents into the same feature space
# as X_train_counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [3]:
# Re-weight the raw token counts with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer

# Fit on the count matrix first, then apply the fitted transformer to that
# same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [4]:
# Multinomial naive Bayes: a standalone fit on the precomputed features,
# then the same three steps wrapped in a single pipeline and evaluated.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Classifier trained directly on the TF-IDF matrix built in the earlier cells.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# Pipeline that vectorizes, re-weights and classifies raw text in one object.
text_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_pipeline.fit(twenty_train.data, twenty_train.target)

# Load the test split with the same categories and print the fraction of
# test documents whose predicted label matches the true one.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=1337,
)
predicted = text_pipeline.predict(twenty_test.data)
print(np.mean(predicted == twenty_test.target))

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [6]:
# Hinge-loss linear classifier trained with SGD, using the same
# vectorize -> tf-idf -> classify pipeline layout as the naive Bayes model.
from sklearn.linear_model import SGDClassifier

text_pipeline_2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            max_iter=5,
            tol=None,
            random_state=42,  # fixed seed for reproducible training
        ),
    ),
])
text_pipeline_2.fit(twenty_train.data, twenty_train.target)
predicted = text_pipeline_2.predict(twenty_test.data)
# Fraction of test documents classified correctly (displayed as cell output).
(predicted == twenty_test.target).mean()

0.9094540612516645

In [7]:
# Grid search over pipeline hyper-parameters with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV

# Keys follow the "<step name>__<parameter>" convention of the pipeline steps.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(
    estimator=text_pipeline_2,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
# The search is run on the first 400 training documents only.
gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

# Report the best cross-validated score, then the winning value of each
# searched parameter in alphabetical order.
print(gs_clf.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.9200000000000002
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
