In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load both splits with identical stripping: headers, footers and quoted
# text are removed from every post.
removed_parts = ('headers', 'footers', 'quotes')
train, test = (
    fetch_20newsgroups(subset=split, remove=removed_parts)
    for split in ('train', 'test')
)

In [5]:
# Number of documents in the training split.
len(train.data)

11314

In [6]:
# Show only the first three lines of the first training document.
first_doc_lines = train.data[0].split("\n")
print(*first_doc_lines[:3], sep="\n")

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,


In [7]:
# Look up the category name for the first document's integer label.
first_label = train.target[0]
print(train.target_names[first_label])

rec.autos


In [8]:
# Integer labels of the first ten training documents; each one is an index
# into train.target_names.
train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [9]:
# Translate the first ten integer labels into their category names.
for label_id in train.target[:10]:
    category_name = train.target_names[label_id]
    print(category_name)

rec.autos
comp.sys.mac.hardware
comp.sys.mac.hardware
comp.graphics
sci.space
talk.politics.guns
sci.med
comp.sys.ibm.pc.hardware
comp.os.ms-windows.misc
comp.sys.mac.hardware


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and encode them as a
# token-count matrix in a single call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.data)
# Shape is (number of documents, number of vocabulary terms).
X_train_counts.shape

(11314, 101631)

In [11]:
# Column index assigned to the token 'algorithm' by the fitted vectorizer.
vocabulary = count_vect.vocabulary_
vocabulary.get('algorithm')

17881

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off the inverse-document-frequency reweighting,
# leaving normalized term frequencies only.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(11314, 101631)

In [13]:
# Transformer with default settings; fitted on the count matrix and applied
# to it in one call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The transform re-weights values only, so the shape matches the count matrix.
X_train_tfidf.shape

(11314, 101631)

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the tf-idf features.
# The trailing ';' keeps the notebook from echoing the fitted estimator.
clf = MultinomialNB()
clf.fit(X_train_tfidf, train.target);

In [15]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Only transform here: the new documents must be encoded with the vocabulary
# and weights already learned from the training data, not refitted.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

for doc, label_id in zip(docs_new, predicted):
    category_name = train.target_names[label_id]
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.sys.ibm.pc.hardware


In [16]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so raw text can be fed straight in.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

In [17]:
# Fit every pipeline step on the raw training text and its labels.
text_clf.fit(train.data, train.target)

In [18]:
import numpy as np

# Accuracy on the test split: the fraction of predictions that equal the
# true label.
docs_test = test.data
predicted = text_clf.predict(docs_test)
hits = predicted == test.target
np.mean(hits)

0.6062134891131173

In [19]:
from sklearn.linear_model import SGDClassifier

# Same vectorizer/tf-idf front end, now feeding a linear model trained with
# SGD (hinge loss, L2 penalty). random_state is pinned for repeatable results;
# tol=None turns off the tolerance-based stopping criterion.
sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd),
])

text_clf.fit(train.data, train.target)
predicted = text_clf.predict(docs_test)
hits = predicted == test.target
np.mean(hits)

0.6829527349973447

In [20]:
from sklearn import metrics

# Per-class precision / recall / F1 for the current `predicted` labels.
report = metrics.classification_report(
    test.target,
    predicted,
    target_names=test.target_names,
)
print(report)

# Rows are true classes, columns are predicted classes.
metrics.confusion_matrix(test.target, predicted)

                          precision    recall  f1-score   support

             alt.atheism       0.62      0.39      0.48       319
           comp.graphics       0.64      0.69      0.66       389
 comp.os.ms-windows.misc       0.66      0.61      0.63       394
comp.sys.ibm.pc.hardware       0.68      0.62      0.65       392
   comp.sys.mac.hardware       0.75      0.69      0.72       385
          comp.windows.x       0.78      0.71      0.74       395
            misc.forsale       0.48      0.85      0.61       390
               rec.autos       0.77      0.70      0.73       396
         rec.motorcycles       0.77      0.75      0.76       398
      rec.sport.baseball       0.82      0.79      0.80       397
        rec.sport.hockey       0.81      0.92      0.86       399
               sci.crypt       0.69      0.75      0.71       396
         sci.electronics       0.67      0.49      0.57       393
                 sci.med       0.77      0.78      0.78       396
         

array([[125,   2,   1,   1,   0,   2,  13,   4,   3,   4,   6,   6,   7,
          7,  19,  79,   6,  20,   2,  12],
       [  3, 269,  22,  10,   7,  19,  13,   1,   3,   5,   1,  13,   3,
          2,  12,   2,   1,   2,   1,   0],
       [  2,  16, 239,  31,  20,  21,  18,   1,   2,   3,   2,   4,   1,
          9,  11,   1,   4,   2,   5,   2],
       [  0,  17,  30, 244,  24,   6,  27,   3,   1,   1,   1,  11,  21,
          2,   2,   0,   0,   2,   0,   0],
       [  0,   8,   6,  30, 267,   6,  30,   5,   8,   1,   4,   6,   7,
          1,   4,   1,   1,   0,   0,   0],
       [  0,  43,  33,   4,   7, 279,  13,   0,   1,   0,   0,   5,   3,
          1,   5,   0,   0,   1,   0,   0],
       [  0,   4,   0,  12,   9,   0, 331,   7,   4,   2,   3,   1,   4,
          0,   5,   1,   5,   1,   0,   1],
       [  2,   2,   3,   1,   1,   3,  37, 277,  18,   5,   2,   3,  14,
          3,   7,   1,   7,   6,   2,   2],
       [  3,   1,   1,   1,   1,   0,  23,  20, 300,   5,   1,  

In [21]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the grid search. Each key has the form
# '<pipeline step>__<parameter>', using the 'vect', 'tfidf' and 'clf' step
# names of the pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [22]:
# Exhaustive search over `parameters` with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel on all available processors.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [23]:
# Run the search on the first 400 training documents only
# (presumably to keep the search quick; confirm before relying on the scores).
subset_size = 400
gs_clf = gs_clf.fit(train.data[:subset_size], train.target[:subset_size])

In [24]:
# Classify one sample sentence with the fitted search object and show the
# predicted category name.
predicted_id = gs_clf.predict(['God is love'])[0]
train.target_names[predicted_id]

'soc.religion.christian'

In [25]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it.
# A bare expression is only echoed when it is the last statement of a cell, so
# the score must be printed explicitly or it is silently discarded.
print("best_score_: %.4f" % gs_clf.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [26]:
# Full per-candidate cross-validation results: a dict of arrays holding
# fit/score timings and the parameter values tried.
gs_clf.cv_results_

{'mean_fit_time': array([0.10209961, 0.33044181, 0.10550256, 0.22898474, 0.06776147,
        0.2756444 , 0.06478896, 0.13528967]),
 'std_fit_time': array([0.0187182 , 0.0425899 , 0.02381067, 0.01678081, 0.01355075,
        0.02904098, 0.00809821, 0.0146898 ]),
 'mean_score_time': array([0.01958756, 0.03838363, 0.01488581, 0.02286129, 0.00998693,
        0.04022293, 0.01290283, 0.01736422]),
 'std_score_time': array([0.01298392, 0.01176287, 0.0033477 , 0.00825831, 0.0034412 ,
        0.01837923, 0.00465003, 0.00516075]),
 'param_clf__alpha': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001, 0.001],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__use_idf': masked_array(data=[True, True, False, False, True, True, False, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range'