In [16]:
# Loading the 20 newsgroups dataset
#
# Only these four newsgroups are used throughout the notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Training split restricted to the selected categories; the fixed seed makes
# the shuffled order repeatable between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [18]:
# Print the number of posts and the number of file names in the training split.
for per_post_field in (twenty_train.data, twenty_train.filenames):
    print(len(per_post_field))

2257
2257


In [19]:
# Show only the first three lines of the first training post.
first_post_lines = twenty_train.data[0].split("\n")
print("\n".join(first_post_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [11]:
# Translate the first post's integer label into its newsgroup name.
first_target = twenty_train.target[0]
twenty_train.target_names[first_target]

'comp.graphics'

In [13]:
# Integer labels of the first ten training posts; each one is an index into
# twenty_train.target_names.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [15]:
# Print the newsgroup name behind each of the first ten integer labels.
for target_idx in twenty_train.target[:10]:
    newsgroup = twenty_train.target_names[target_idx]
    print(newsgroup)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [20]:
#Extracting features from text files

#Tokenizing text with scikit-learn

from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training posts and build the document-term
# count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# (number of documents, vocabulary size)
X_train_counts.shape

(2257, 35788)

In [21]:
# Integer that the fitted vocabulary maps the token 'algorithm' to
# (.get returns None if the token was never seen).
count_vect.vocabulary_.get(u'algorithm')

4690

In [25]:
#From occurrences to frequencies

from sklearn.feature_extraction.text import TfidfTransformer
# use_idf=False switches off inverse-document-frequency weighting, so the
# transformer only rescales the raw counts.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Same shape as the count matrix.
X_train_tf.shape


(2257, 35788)

In [26]:
#Training a classifier
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes model on the frequency features.
clf = MultinomialNB()
clf = clf.fit(X_train_tf, twenty_train.target)

In [28]:
# Two unseen sentences to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the already-fitted vectorizer and transformer: call transform(), not
# fit_transform(), so the new documents are projected onto the training
# vocabulary.
X_new_counts = count_vect.transform(docs_new)
# NOTE: tf_transformer was created with use_idf=False, so despite the
# variable name no IDF weighting is applied here.
X_new_tfidf = tf_transformer.transform(X_new_counts)

In [29]:
# One predicted label per new document; each label is later used as an index
# into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [30]:
# Pair every new document with the name of its predicted newsgroup.
for new_doc, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (new_doc, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [33]:
#Building a pipeline
from sklearn.pipeline import Pipeline

# Chain vectorizer -> transformer -> classifier so raw text can be passed
# straight to fit() and predict().
nb_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_pipeline_steps)

In [34]:
# Train every pipeline stage on the raw training posts.
text_clf = text_clf.fit(
    twenty_train.data,
    twenty_train.target,
)

In [35]:
#Evaluation of the performance on the test set

import numpy as np
# Held-out split, restricted to the same categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy = fraction of test posts whose predicted label equals the true one.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.83488681757656458

In [37]:
from sklearn.linear_model import SGDClassifier

# Rebuild the pipeline with a linear SVM (hinge loss) trained by stochastic
# gradient descent as the final step.
#
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, where
# passing it raises a TypeError. `max_iter=5` together with `tol=None` keeps
# the intended five passes over the data with no early stopping.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge',
                          alpha=1e-3,
                          max_iter=5,
                          tol=None,
                          random_state=42)),
])

# Rebinding the fitted pipeline (fit returns the estimator itself) keeps the
# cell silent without overwriting IPython's special `_` variable.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [38]:
# Score the current text_clf pipeline on the same held-out documents.
predicted = text_clf.predict(docs_test)
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.9127829560585885

In [39]:
from sklearn import metrics

In [41]:
# Per-class precision / recall / F1, labelled with the newsgroup names.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [42]:
# Rows are the true classes and columns the predicted ones, in target_names
# order.
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [43]:
#Parameter tuning using grid search
from sklearn.model_selection import GridSearchCV
# Search grid: keys follow the `<pipeline step>__<parameter>` convention, so
# each entry targets one stage of text_clf. 2 x 2 x 2 = 8 candidates.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with / without IDF weighting
    clf__alpha=(1e-2, 1e-3),             # alpha values tried on the 'clf' step
)

In [44]:
# Exhaustive search over `parameters`; n_jobs=-1 uses all available CPU cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [46]:
# Run the search on just the first 400 training posts.
search_docs = twenty_train.data[:400]
search_targets = twenty_train.target[:400]
gs_clf = gs_clf.fit(search_docs, search_targets)

In [47]:
# The fitted search object can be used directly as a classifier.
predicted_idx = gs_clf.predict(['God is love'])[0]
twenty_train.target_names[predicted_idx]

'soc.religion.christian'

In [49]:
# Best score found by the search; it equals the highest entry of
# cv_results_['mean_test_score'].
gs_clf.best_score_

0.90000000000000002

In [50]:
# Parameter combination that achieved best_score_, keyed like `parameters`.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [51]:
# Full search log: a dict of arrays with one entry per candidate (timings,
# train/test scores and the parameter values tried).
gs_clf.cv_results_

{'mean_fit_time': array([ 3.02256529,  1.68807181,  0.18589004,  0.7381018 ,  0.22907058,
         0.77652947,  0.20285034,  0.67674224]),
 'mean_score_time': array([ 0.20483081,  0.31615384,  0.0910236 ,  0.24596016,  0.08251119,
         0.19645484,  0.09478601,  0.17430528]),
 'mean_test_score': array([ 0.8775,  0.875 ,  0.765 ,  0.78  ,  0.9   ,  0.89  ,  0.7675,  0.81  ]),
 'mean_train_score': array([ 0.99374372,  1.        ,  0.94123886,  0.97623272,  1.        ,
         1.        ,  0.98499057,  1.        ]),
 'param_clf__alpha': masked_array(data = [0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_tfidf__use_idf': masked_array(data = [True True False False True True False False],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_vect__ngram_range': masked_array(data = [(1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2