In [7]:
# Load the 20 Newsgroups text corpus through scikit-learn's dataset loader.
# The returned object is used throughout the notebook via news.data (the raw
# posts), news.target (numeric labels) and news.target_names (label names).
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')

In [3]:
# Number of records (posts in news.data) that need to be classified
print (len(news.data))

18846


In [4]:
# Number of categories (entries in news.target_names) the posts can be classified into
print (len(news.target_names))


20


In [5]:
# Show the category names; the numeric labels in news.target index into this list
print (news.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
# Preview the first ten posts: category name, then the first line found
# within the first 100 characters of the post.
for post, category_id in zip(news.data[:10], news.target[:10]):
    first_line = post[:100].split('\n')[0]
    category_name = news.target_names[category_id]
    print ('[%s]:\t\t "%s ..."' % (category_name, first_line))

[rec.sport.hockey]:		 "From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu> ..."
[comp.sys.ibm.pc.hardware]:		 "From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson) ..."
[talk.politics.mideast]:		 "From: hilmi-er@dsv.su.se (Hilmi Eren) ..."
[comp.sys.ibm.pc.hardware]:		 "From: guyd@austin.ibm.com (Guy Dawson) ..."
[comp.sys.mac.hardware]:		 "From: Alexander Samuel McDiarmid <am2o+@andrew.cmu.edu> ..."
[sci.electronics]:		 "From: tell@cs.unc.edu (Stephen Tell) ..."
[comp.sys.mac.hardware]:		 "From: lpa8921@tamuts.tamu.edu (Louis Paul Adams) ..."
[rec.sport.hockey]:		 "From: dchhabra@stpl.ists.ca (Deepak Chhabra) ..."
[rec.sport.hockey]:		 "From: dchhabra@stpl.ists.ca (Deepak Chhabra) ..."
[talk.religion.misc]:		 "From: arromdee@jyusenkyou.cs.jhu.edu (Ken Arromdee) ..."


In [16]:
# A simple way of training and evaluating a classifier against a test set:
from sklearn.model_selection import train_test_split
 
def train(classifier, X, y):
    """Fit ``classifier`` on a train split of ``X``/``y`` and report test results.

    The data is split 75% / 25% with a fixed ``random_state=33``, so repeated
    calls with the same ``X`` and ``y`` evaluate on the same held-out samples.
    The test-set accuracy and the predicted test labels are printed.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)``
        The model (or pipeline) to train; it is fitted in place.
    X : sequence
        Input samples.
    y : sequence
        Labels aligned with ``X``.

    Returns
    -------
    The same ``classifier`` object, after fitting.
    """
    # Local import so this cell does not rely on an import made in a later cell.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    classifier.fit(X_train, y_train)
    # Predict once and reuse the result. Calling classifier.score() and then
    # classifier.predict() pushed the whole test set through the pipeline
    # (vectorizer included) twice; for scikit-learn classifiers score() is the
    # accuracy of predict(), so the reported number is unchanged.
    y_pred = classifier.predict(X_test)
    print ("Accuracy: %s" % accuracy_score(y_test, y_pred))
    print ("Predicted: %s" %y_pred)
    return classifier
 

In [17]:
# Multinomial Naive Bayes classifier. Text classification is the most common use case for this classifier.
# For transforming the text into a feature vector we’ll have to use specific feature extractors 
# from the sklearn.feature_extraction.text. 
# TfidfVectorizer has the advantage of emphasizing the most important words for a given document.

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
 
# Baseline run: default TF-IDF features fed into a default multinomial Naive Bayes model.
trial1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

train(trial1, news.data, news.target)
# Accuracy: 0.846349745331
 

Accuracy: 0.846349745331
Predicted: [ 8 15 13 ...,  1 15 11]


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
# Ignore insignificant words. We can use NLTK’s stopwords list.
from nltk.corpus import stopwords
 
# Same two steps as trial1, but the vectorizer now drops NLTK's English stop words.
trial2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB()),
])

train(trial2, news.data, news.target)
# Accuracy: 0.877758913413 (value printed by the recorded run of this cell)

Accuracy: 0.877758913413
Predicted: [ 8 15 13 ...,  1 19 11]


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [None]:
# Tune the alpha parameter of the Naive Bayes classifier:
# use a low value (0.05) while keeping the stop-word filtering.

trial3 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=0.05)),
])

train(trial3, news.data, news.target)
# Accuracy: 0.909592529711
 

In [19]:
# Ignore words that appear in fewer than 5 documents of the collection
# (min_df is a document-frequency threshold, not a total-occurrence count):

trial4 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=5,
                                   stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=0.05)),
])

train(trial4, news.data, news.target)
# Accuracy: 0.902801358234 (value printed by the recorded run of this cell)
 

Accuracy: 0.902801358234
Predicted: [ 8 15 13 ...,  1 19 11]


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...     vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])

In [None]:
# We’ll use NLTK tokenizer to better split the text into words 
# and then bring the words to a base form using a stemmer. 
# We’ll also ignore the punctuation since word_tokenize doesn’t filter them out.


import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
 
def stemming_tokenizer(text):
    """Split ``text`` with NLTK's ``word_tokenize`` and Porter-stem every token.

    Returns the stemmed tokens as a list, in the order the tokenizer produced them.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, word_tokenize(text)))
 
# The vectorizer compares stop words against the tokens produced by `tokenizer`,
# i.e. against *stemmed* tokens. An unstemmed list therefore misses most stop
# words ("this" is emitted as "thi", "because" as "becaus"), and word_tokenize
# can turn a punctuation character into a different token (e.g. a double quote).
# Run every stop entry through the same tokenizer and keep both the original
# entries and their tokenized/stemmed forms.
raw_stop_entries = stopwords.words('english') + list(string.punctuation)
stemmed_stop_words = sorted(
    set(raw_stop_entries)
    | {token for entry in raw_stop_entries for token in stemming_tokenizer(entry)}
)

trial5 = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   stop_words=stemmed_stop_words)),
    ('classifier', MultinomialNB(alpha=0.05)),
])

train(trial5, news.data, news.target)
# Accuracy quoted for the earlier, unstemmed stop-word list: 0.910653650255
# (re-run this cell to obtain the figure for the corrected list)
 

In [None]:
from sklearn import metrics

# train() keeps its split and predictions local, so y_test / y_pred do not exist
# at notebook level. Rebuild the identical hold-out split (same test_size and
# random_state as train()) and predict with the most recently fitted pipeline;
# swap trial5 for any other fitted trial to report on that model instead.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)
y_pred = trial5.predict(X_test)

# news.target_names is a plain Python list (it has no .unique() method) and
# already holds one name per category, so it is passed as-is.
print(metrics.classification_report(y_test, y_pred, target_names=news.target_names))