# Classification of the 20 newsgroups dataset

Dataset documentation: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [1]:
from sklearn.datasets import fetch_20newsgroups

# The five newsgroups kept for this experiment; every other group is excluded.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'talk.politics.guns',
    'sci.space',
    'talk.politics.misc',
]

# Load the train split, then the test split, with identical settings:
# headers, footers and quoted replies are removed from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=subset,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for subset in ('train', 'test')
)

from pprint import pprint

# Show the label names as the dataset reports them; note that this order
# is not the order in which `categories` was written above.
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


## Pipeline of transformers and final predictor

See http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

and http://scikit-learn.org/stable/modules/naive_bayes.html for a discussion of which naive Bayes (NB) variant is best suited to text.

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score


# Three-stage text classifier: raw token counts -> TF-IDF weighting ->
# multinomial naive Bayes. The step names are the lowercased class names,
# i.e. exactly what make_pipeline would assign automatically.
pipeline = Pipeline(steps=[
    # English stop words are dropped; max_df / min_df bound the document
    # frequency of the terms that are kept in the vocabulary.
    ('countvectorizer',
     CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)),
    ('tfidftransformer', TfidfTransformer()),
    ('multinomialnb', MultinomialNB()),
])

# Bare last expression so the notebook displays the configured pipeline.
pipeline

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='engl...alse, use_idf=True)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [3]:
# Fit every pipeline stage on the training split only.
pipeline.fit(newsgroups_train.data, newsgroups_train.target)

# Predict on both splits with the fitted pipeline.
train_pred = pipeline.predict(newsgroups_train.data)
test_pred = pipeline.predict(newsgroups_test.data)

# Accuracy on the data the model was fitted on vs. on the test split.
train_score = accuracy_score(newsgroups_train.target, train_pred)
test_score = accuracy_score(newsgroups_test.target, test_pred)

print(train_score, test_score)

0.820804550996 0.617216117216
