In [3]:
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups this notebook works with.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Training split only; shuffled with a fixed seed so the order is repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [10]:
# Show the first ten lines of the first training document ...
first_doc_lines = twenty_train.data[0].split('\n')
print('\n'.join(first_doc_lines[:10]))
# ... followed by the name of the category it is labelled with.
first_doc_label = twenty_train.target[0]
print(twenty_train.target_names[first_doc_label])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

comp.graphics


## extracting features
#### bag of words
For each document i, count the number of occurrences of each word w
and store it in a table x[i, j], where j is the index of word w in the vocabulary.
#### BoW limitations
Most words never occur in a given document, so most entries of x are zero; storing all of them results in an unnecessarily large dataset. Go sparse instead! Store only the non-zero entries of the feature vectors.


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and build the
# document-term count matrix (one row per document, one column per word).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Only a cell's final expression is displayed automatically, so the shape
# has to be printed explicitly or it is silently discarded.
print(X_train_counts.shape)

# Column index assigned to the word 'love' (None if it is not in the vocabulary).
count_vect.vocabulary_.get('love')

20537

#### Normalizing
Because longer documents will necessarily have more words, and thus more occurrences, than shorter documents, it is better to normalize each count by the total number of words in the document; this gives the term frequency (TF). Additionally down-weighting words that occur in many documents gives TF-IDF, term frequency times inverse document frequency.

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer
# Variant 1: term-frequency scaling without the idf weighting
# (use_idf=False), with fit and transform as two separate steps.
tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(X_train_tf.shape)

# Variant 2: full tf-idf weighting (use_idf defaults to True), with fit and
# transform combined into a single call. This is NOT numerically equivalent
# to variant 1; X_train_tfidf is the representation the classifier is trained on.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

### Training a Classifier: Naive Bayes


In [25]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes model on the tf-idf features and their labels.
naive_bayes = MultinomialNB()
clf = naive_bayes.fit(X_train_tfidf, twenty_train.target)

In [28]:
# Two unseen snippets to classify.
docs_new = [
    'Allah God Love',
    'Deep Learning object Recognition',
]

# Reuse the already-fitted vectorizer and tf-idf transformer (transform only,
# no refitting) so the new rows share the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Map each predicted class index back to its newsgroup name.
for text, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))

'Allah God Love' => soc.religion.christian
'Deep Learning object Recognition' => comp.graphics


### Building a Pipeline

In [32]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> naive Bayes so raw text can be passed
# straight to fit() and predict().
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [35]:
import numpy as np

# Score the naive Bayes pipeline on the test split (same categories and the
# same shuffle seed as the training split).
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: the fraction of test documents whose predicted label is correct.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.83488681757656458

In [37]:
from sklearn.linear_model import SGDClassifier
# Swap the naive Bayes step for a linear SVM (hinge loss, L2 penalty) trained
# by stochastic gradient descent; the two feature steps stay the same.
svm_step = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_step),
])
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [41]:
from sklearn import metrics

# `predicted` was last computed with the naive Bayes pipeline, but `text_clf`
# has since been rebound to the SGD pipeline. Predict again so the report
# describes the model that is currently fitted, even on a fresh
# Restart & Run All.
predicted = text_clf.predict(docs_test)

print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names = twenty_test.target_names))
# Rows are the true classes, columns are the predicted classes.
metrics.confusion_matrix(twenty_test.target, predicted)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

### Hyper Parameter Tuning

In [45]:
from sklearn.model_selection import GridSearchCV
# Search space: unigrams vs. unigrams + bigrams, tf-idf with or without the
# idf term, and two values of the SVM's alpha.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# n_jobs=-1 lets the search run on all available CPU cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [None]:
# The search is expensive, so fit it on only the first 400 training documents.
subset_docs = twenty_train.data[:400]
subset_labels = twenty_train.target[:400]
gs_clf = gs_clf.fit(subset_docs, subset_labels)

In [None]:
# Mean cross-validated score of the best parameter combination found by the search.
gs_clf.best_score_
