In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Download (or load from the local cache) the training split of 20 Newsgroups.
# random_state=42 is the library default, written out to make the shuffle order explicit.
tweety_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [3]:
# `tweety_train.target_names` lists all category names.
# Number of documents in the training split:
len(tweety_train['data'])

11314

In [25]:
# Re-check of the training-set size (same expression as the earlier cell).
len(tweety_train.data)

11314

In [4]:
# Show the first training post one line per list entry.
tweety_train.data[0].split("\n")

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu',
 'Organization: University of Maryland, College Park',
 'Lines: 15',
 '',
 ' I was wondering if anyone out there could enlighten me on this car I saw',
 'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 'early 70s. It was called a Bricklin. The doors were really small. In addition,',
 'the front bumper was separate from the rest of the body. This is ',
 'all I know. If anyone can tellme a model name, engine specs, years',
 'of production, where this car is made, history, or whatever info you',
 'have on this funky looking car, please e-mail.',
 '',
 'Thanks,',
 '- IL',
 '   ---- brought to you by your neighborhood Lerxst ----',
 '',
 '',
 '',
 '',
 '']

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer
Convert a collection of text documents to a matrix of token counts

This implementation produces a sparse representation of the counts using
scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer
that does some kind of feature selection then the number of features will
be equal to the vocabulary size found by analyzing the data.

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tweety_train.data)

In [7]:
print(X_train_counts.shape)    # 11314 documents (rows) x 130107 vocabulary terms (columns)
X_train_counts  

(11314, 130107)


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [8]:
# Token counts of the first document, kept as a 1 x n_features sparse row.
X_train_counts[0, :]

<1x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 89 stored elements in Compressed Sparse Row format>

### Dataframe

In [9]:
import pandas as pd

In [38]:
# Wrap the count matrix in a DataFrame WITHOUT densifying it: a dense
# 11314 x 130107 int64 array needs about 12 GB of RAM, while the sparse
# matrix only stores its ~1.8 million non-zero entries.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)
df = pd.DataFrame.sparse.from_spmatrix(X_train_counts, columns=feature_names)

## Second step = TfidfTransformer
Transform a count matrix to a normalized tf or tf-idf representation


In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
# TF-IDF re-weighting with the library defaults written out explicitly.
tfid_transformer = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [29]:
# Learn the IDF weights from the count matrix, then re-weight that same matrix.
X_train_tfid = tfid_transformer.fit(X_train_counts).transform(X_train_counts)

In [31]:
# Same shape as the count matrix: one row per document, one column per term.
X_train_tfid.shape

(11314, 130107)

In [37]:
# Print the container type, then the non-zero tf-idf weights of the first document.
print(type(X_train_tfid), X_train_tfid[0], sep="\n")

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 56979)	0.0574701540749
  (0, 75358)	0.353835013497
  (0, 123162)	0.259709024574
  (0, 118280)	0.211868072083
  (0, 50527)	0.0546142865886
  (0, 124031)	0.107987951542
  (0, 85354)	0.0369697850882
  (0, 114688)	0.0621407098631
  (0, 111322)	0.019156718025
  (0, 123984)	0.0368542926346
  (0, 37780)	0.381338912595
  (0, 68532)	0.0732581234213
  (0, 114731)	0.144472755128
  (0, 87620)	0.0356718631408
  (0, 95162)	0.0344713840933
  (0, 64095)	0.0354209242713
  (0, 98949)	0.160686060554
  (0, 90379)	0.0199288599566
  (0, 118983)	0.0370859780506
  (0, 89362)	0.065211743063
  (0, 79666)	0.109364012524
  (0, 40998)	0.0780136819692
  (0, 92081)	0.0991327449391
  (0, 76032)	0.0192194630522
  (0, 4605)	0.0633260395248
  :	:
  (0, 37565)	0.0343176044248
  (0, 113986)	0.176917506749
  (0, 83256)	0.0884438249646
  (0, 86001)	0.0700041144584
  (0, 51730)	0.0971474405798
  (0, 109271)	0.108447248221
  (0, 128026)	0.0606220958898
  (0, 96144)	0.108269044907
  

In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
# Train a multinomial Naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf = clf.fit(X_train_tfid, tweety_train.target)

# The same steps using a Pipeline (this time with English stop words removed)

In [44]:
from sklearn.pipeline import Pipeline
# Chain vectorizing, tf-idf weighting and the classifier so raw text can be
# passed straight to fit()/predict().
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# fit() returns the pipeline itself; the trailing semicolon keeps the cell output empty.
text_clf.fit(tweety_train.data, tweety_train.target);

In [45]:
import numpy as np
# Evaluate on the held-out test split; random_state=42 is the library default.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
predicted = text_clf.predict(twenty_test.data)
# Accuracy = fraction of test posts whose predicted label matches the true label.
(predicted == twenty_test.target).mean()

0.81691449814126393

# Stemming: 
From Wikipedia, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form. E.g. A stemming algorithm reduces the words “fishing”, “fished”, and “fisher” to the root word, “fish”.
We need NLTK, which can be installed with `pip install nltk`. NLTK comes with various stemmers (details on how stemmers work are out of scope for this article) which can help reduce words to their root form. Again, use this only if it makes sense for your problem.
Below I have used the Snowball stemmer, which works very well for the English language. 

In [47]:
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer; with ignore_stopwords=True stop words are
# left unstemmed. NOTE(review): this option presumably needs the NLTK
# "stopwords" corpus to be downloaded first - confirm in your environment.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the stock analyzer first, then stem each resulting token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [50]:
# Same pipeline shape as before, but with the stemming vectorizer and a
# Naive Bayes model that uses uniform class priors (fit_prior=False).
text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])
text_mnb_stemmed.fit(tweety_train.data, tweety_train.target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
# Accuracy on the test split.
(predicted_mnb_stemmed == twenty_test.target).mean()

0.81678173127987252

## Reference 
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.get_feature_names
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
- https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
 
Topic Modeling 
- https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
