In [1]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause

from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# Echo the module docstring.  In this notebook session that is IPython's
# auto-generated placeholder text (first line of the cell output).
print(__doc__)

# Display timestamped progress logs at INFO level and above
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)


###############################################################################
# Restrict the run to a small subset of the newsgroup categories
categories = ['alt.atheism', 'talk.religion.misc']
# To run the analysis on every category instead, uncomment the next line
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

# Fetch only the training subset, limited to the categories chosen above,
# then report how many documents and categories were loaded.
data = fetch_20newsgroups(categories=categories, subset='train')
print("%d documents" % (len(data.filenames),))
print("%d categories" % (len(data.target_names),))
print()

###############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier.  Steps run in the order listed: 'vect' -> 'tfidf' -> 'clf'.
# The step names double as the prefixes of the grid-search parameter keys
# (e.g. 'vect__max_df', 'clf__alpha').
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Pin the classifier's seed: SGD training is stochastic when the samples
    # are shuffled, so without a fixed random_state the reported best score
    # and best parameters can differ from one run to the next.
    ('clf', SGDClassifier(random_state=42)),
])

# Search space for the grid search; keys are "<pipeline step>__<parameter>".
# Each additional entry multiplies the number of candidate fits, so the
# commented-out options widen the exploration at a combinatorial cost in
# processing time.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000), #
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False), #
    #'tfidf__norm': ('l1', 'l2'), #
    'clf__alpha': (1e-5, 1e-6),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80), #
}

if __name__ == "__main__":
    # The search below runs with n_jobs=-1; multiprocessing requires the
    # fork to happen inside a __main__-protected block.

    # Tune the feature-extraction and classifier settings jointly.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    # Describe the run before starting it.
    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole fit (wall-clock seconds).
    t0 = time()
    grid_search.fit(data.data, data.target)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    # Report the best score, then only the tuned parameters, in sorted
    # key order, as set on the best estimator.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  58 out of  72 | elapsed:   14.2s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   15.9s finished


done in 17.301s

Best score: 0.943
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


In [9]:
# Print the raw text of the first document in the loaded training set.
print(data.data[0])

From: mangoe@cs.umd.edu (Charley Wingate)
Subject: Benediktine Metaphysics
Lines: 24

Benedikt Rosenau writes, with great authority:

>     IF IT IS CONTRADICTORY IT CANNOT EXIST.

"Contradictory" is a property of language.  If I correct this to


      THINGS DEFINED BY CONTRADICTORY LANGUAGE DO NOT EXIST

I will object to definitions as reality.  If you then amend it to

      THINGS DESCRIBED BY CONTRADICTORY LANGUAGE DO NOT EXIST

then we've come to something which is plainly false.  Failures in
description are merely failures in description.

(I'm not an objectivist, remember.)


-- 
C. Wingate        + "The peace of God, it is no peace,
                  +    but strife closed in the sod.
mangoe@cs.umd.edu +  Yet, brothers, pray for but one thing:
tove!mangoe       +    the marv'lous peace of God."



In [11]:
# Number of file paths in the loaded dataset (the same quantity the loading
# cell reports as the document count).
len(data.filenames)

857

In [12]:
# Show the category names of the loaded dataset.
data.target_names

['alt.atheism', 'talk.religion.misc']

In [16]:
# Inspect the container type returned by fetch_20newsgroups.
type(data)

sklearn.datasets.base.Bunch

In [30]:
# Pair each of the first ten file paths with its integer class label.
# The inputs are sliced before zipping and the result is materialised with
# list(), because on Python 3 zip() returns an iterator that cannot be
# subscripted; on Python 2 this yields the same ten pairs as before.
list(zip(data.filenames[:10], data.target[:10]))

[('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51267',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51139',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51245',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/talk.religion.misc/84200',
  1),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/talk.religion.misc/82815',
  1),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51294',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51253',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/talk.religion.misc/84186',
  1),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53459',
  0),
 ('/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51217',
  0)]

In [18]:
from sklearn.datasets.base import Bunch

In [19]:
# Create an empty Bunch instance; `b` is not referenced by any other cell
# shown here.
b = Bunch()

In [20]:
# List the fields available on the dataset Bunch.
data.keys()

['DESCR', 'data', 'target', 'target_names', 'filenames']

In [46]:
# Display the search object's `predict` attribute itself; there are no
# parentheses, so nothing is predicted here.
grid_search.predict

<function sklearn.grid_search.predict>