In [None]:
!pip install -U scikit-learn

In [1]:
# Newsgroup names passed as the `categories` filter when fetching the dataset.
categories = [
    # religion / atheism
    'alt.atheism',
    # computing
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    # marketplace
    'misc.forsale',
    # recreation
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    # science
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    # society / talk
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Training split, restricted to `categories`; shuffled with a fixed seed so the
# document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training documents and, in the same call, turn them
# into the count matrix used by the following cells.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [4]:
# Look up the entry stored for the token 'space' in the fitted vocabulary
# (`.get` yields None if the token is absent).
count_vect.vocabulary_.get('space')

109061

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Transformer with IDF weighting switched off (use_idf=False), fitted on the
# training counts and applied to those same counts.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

In [6]:
# Default TfidfTransformer: fit on the training counts, then transform those
# same counts into the matrix the classifier is trained on.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with alpha=1, trained on the tf-idf matrix against
# the training labels.
clf = MultinomialNB(alpha=1)
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [9]:
# Single-word probe "documents" to run through the trained classifier.
wrds = [
    'Atheism',
    'Religion',
    'Documentary',
    'Sports',
    'CPU',
    'Hockey',
    'computer',
    'space',
    'orbit',
    'not',
    'never',
    'right',
    'left',
    'political',
    'launch',
    'explore',
    'moon',
    'kick',
]

# Reuse the already-fitted vectorizer and tf-idf transformer (transform only,
# no refitting) so the probe words land in the same feature space as training.
X_new_counts = count_vect.transform(wrds)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Each prediction is used as an index into `target_names` to get the group name.
target_names = twenty_train.target_names
for doc, category in zip(wrds, predicted):
    print('%r => %s' % (doc, target_names[category]))

'Atheism' => alt.atheism
'Religion' => soc.religion.christian
'Documentary' => talk.politics.guns
'Sports' => rec.sport.baseball
'CPU' => comp.sys.ibm.pc.hardware
'Hockey' => rec.sport.hockey
'computer' => comp.sys.ibm.pc.hardware
'space' => sci.space
'orbit' => sci.space
'not' => soc.religion.christian
'never' => talk.politics.guns
'right' => talk.politics.guns
'left' => rec.autos
'political' => alt.atheism
'launch' => sci.space
'explore' => sci.space
'moon' => sci.space
'kick' => rec.sport.hockey


In [10]:
from sklearn.pipeline import Pipeline

# Same three stages as the step-by-step cells above, bundled so that raw text
# goes in and predictions come out of a single estimator.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=1)),
]

text_clf = Pipeline(pipeline_steps).fit(twenty_train.data, twenty_train.target)

In [11]:
import numpy as np

# Test split, fetched with the same category filter and seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.7738980350504514