In [1]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split -- and every score computed from it --
# is identical after Restart Kernel -> Run All.
RANDOM_STATE = 42

# load_files treats each sub-folder of the given directory as one category
# and returns the raw documents (bunch.data) with integer labels (bunch.target).
bunch = load_files("./data/20_newsgroups/")

# Hold out 20% of the documents for testing.
X_train, X_test, y_train, y_test = train_test_split(
    bunch.data, bunch.target, test_size=.2, random_state=RANDOM_STATE
)

In [2]:
# Names of the 20 newsgroups, listed alphabetically.
# NOTE(review): presumably this order mirrors the integer labels assigned by
# load_files (bunch.target_names) -- confirm against the dataset directory.
twenty_newsgroups_categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
]


In [3]:
# Sanity check: number of training documents, then number of test documents.
print(len(X_train), len(X_test), sep="\n")

15997
4000


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words step: learn the vocabulary from the training documents and
# turn each document into a row of token counts.
# decode_error="ignore" skips undecodable bytes instead of raising.
count_vect = CountVectorizer(decode_error="ignore")
X_train_counts = count_vect.fit_transform(X_train)

# Displayed as (number of documents, vocabulary size).
X_train_counts.shape

(15997, 174520)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off inverse-document-frequency weighting, so this
# step turns the count matrix into normalized term frequencies (tf) only,
# not tf-idf.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

# Same shape as the count matrix: the transformer only rescales values.
X_train_tf.shape

(15997, 174520)

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the term-frequency features.
# The trailing ';' keeps the cell from echoing the fitted estimator.
clf = MultinomialNB()
clf.fit(X_train_tf, y_train);

In [7]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the classifier on the training features;
# one score per fold.
cv_accuracy = cross_val_score(clf, X_train_tf, y_train, cv=5, scoring="accuracy")
cv_accuracy

array([0.87348084, 0.87702871, 0.86178862, 0.87100814, 0.87030075])

In [11]:
# A few hand-written sentences to sanity-check the trained classifier.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    "i love a girl but i don't know how to say her",
    'iranian says down with usa',
]

# Reuse the already-fitted vectorizer and transformer (transform, not
# fit_transform) so the new documents land in the training feature space.
X_new_counts = count_vect.transform(docs_new)
# NOTE: despite its name this holds plain tf features -- tf_transformer was
# created with use_idf=False.
X_new_tfidf = tf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Resolve label names through bunch.target_names, the name list produced by
# the same load_files call that produced the integer labels, rather than a
# hand-maintained list that could drift out of sync with the data directory.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, bunch.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos
"i love a girl but i don't know how to say her" => soc.religion.christian
'iranian says down with usa' => talk.politics.guns


To make the vectorizer => transformer => classifier chain easier to work with, scikit-learn provides a Pipeline class that behaves like a compound classifier:

In [13]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> transformer -> classifier into one estimator that is
# fitted directly on the raw training documents.
# TfidfTransformer() is left at its defaults here, so unlike the tf-only
# transformer built with use_idf=False, this pipeline applies idf weighting.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer(decode_error="ignore")),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])