A typical workflow for a text classification system is as follows:

    1. Train and test datasets
    2. Text Normalization
    3. Feature Extraction
    4. Model Training
    5. Model Prediction

In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import numpy as np

**1. Download dataset**

In [32]:
'''
Using 20 newsgroups dataset from Scikit-learn
https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
The 20 newsgroups dataset contains 18000 newsgroups posts on 20 topics split in two subsets: 
one for training and the other one for testing
'''

#Dowload train data:
train_data = fetch_20newsgroups(subset='train', shuffle=True)
test_data = fetch_20newsgroups(subset='test', shuffle=True)

In [33]:
# Display all the categories in the dataset
train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

**2. Feature Extraction**

In [34]:
# Using Count Vectorizer
count_vect = CountVectorizer()
train_vect = count_vect.fit_transform(train_data.data)
train_vect.shape

(11314, 130107)

In [35]:
#Using Tf-IDF
vectorizer = TfidfVectorizer()
tf_vectors = vectorizer.fit_transform(train_data.data)
tf_vectors.shape

(11314, 130107)

In [36]:
vectors_test = vectorizer.transform(test_data.data)

**3. Model Training**

**Naive bayes classifier for multinomial models**

In [37]:
# Using MultinomialNB (Naive bayes classifier for multinomial models)
clf = MultinomialNB(alpha=0.1)
clf.fit(tf_vectors, train_data.target)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [38]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB()),
])

In [39]:
##Performance evaluation

predicted = clf.predict(vectors_test)
np.mean(predicted == test_data.target)

metrics.f1_score(test_data.target, predicted, average='macro')

0.81388938732554

**Support Vector Machines (SVM)**

In [45]:
# Using SVM
clf_svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42))
                        ])
clf_svm.fit(train_data.data, train_data.target)
svm_pred = clf_svm.predict(test_data.data)
np.mean(svm_pred == test_data.target)

0.8240839086563994