Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
<br>
This notebook classifies text documents into four newsgroup categories using scikit-learn.

# Install and Import Modules

In [27]:
!pip install scikit-learn
!pip install numpy



In [40]:
import numpy as np
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

The dataset is called “Twenty Newsgroups”. Here is the official description, quoted from the website:
<br>
The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge, it was originally collected by Ken Lang, probably for his paper “Newsweeder: Learning to filter netnews,” though he does not explicitly mention this collection. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.

# Load Files and Labels

In [2]:
# Work with four of the twenty newsgroups only.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split for those categories; shuffle with a fixed seed so
# the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [6]:
# Class names in label order: an integer label i in `target` maps to target_names[i].
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [7]:
# Number of training documents loaded for the selected categories.
len(twenty_train.data)

2257

In [8]:
# Number of entries in `filenames`; the recorded output shows it equals len(twenty_train.data).
len(twenty_train.filenames)

2257

In [14]:
# display example data
# Split the first training document on newlines so that its header fields and
# body are listed one line per element.

twenty_train.data[0].split("\n")

['From: sd345@city.ac.uk (Michael Collier)',
 'Subject: Converting images to HP LaserJet III?',
 'Nntp-Posting-Host: hampton',
 'Organization: The City University',
 'Lines: 14',
 '',
 'Does anyone know of a good way (standard PC application/PD utility) to',
 'convert tif/img/tga files into LaserJet III format.  We would also like to',
 'do the same, converting to HPGL (HP plotter) files.',
 '',
 'Please email any response.',
 '',
 'Is this the correct group?',
 '',
 'Thanks in advance.  Michael.',
 '-- ',
 'Michael Collier (Programmer)                 The Computer Unit,',
 'Email: M.P.Collier@uk.ac.city                The City University,',
 'Tel: 071 477-8000 x3769                      London,',
 'Fax: 071 477-8565                            EC1V 0HB.',
 '']

In [15]:
# display target name of example data
# target[0] is the integer label of the first document; indexing target_names
# with it yields the readable newsgroup name.

twenty_train.target_names[twenty_train.target[0]]

'comp.graphics'

# Feature Extraction

In [17]:
# tokenizing text: transform text to feature vectors
# fit_transform learns the vocabulary from the training documents and returns
# the document-term count matrix in one step. count_vect is kept because the
# later prediction cell calls count_vect.transform on new documents.

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [19]:
# tf-idf transform
# Re-weight the raw token counts with tf-idf. The fitted transformer is kept
# because the later prediction cell calls tfidf_transformer.transform on new
# documents. The matrix shape is unchanged (see the recorded output).

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

# Model Training, Prediction, and Evaluation

In [21]:
# model: naive bayes
# Fit a multinomial Naive Bayes classifier on the tf-idf features. fit()
# returns the estimator itself, so `clf` is the trained model.

clf = MultinomialNB().fit(X=X_train_tfidf, y=twenty_train.target)

In [22]:
# Classify two hand-written sentences with the fitted Naive Bayes model.
docs_test = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the already-fitted vectorizer and tf-idf transformer: transform() only,
# no re-fitting, so the new documents share the training feature space.
X_test_counts = count_vect.transform(docs_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# One predicted class index per input document.
predicted = clf.predict(X_test_tfidf)

In [24]:
# Show each sentence next to the name of its predicted newsgroup.
for text, label_idx in zip(docs_test, predicted):
    label = twenty_train.target_names[label_idx]
    print(f"{text} => {label}")

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


In [26]:
# Bundle vectorizer, tf-idf transformer and classifier into a single estimator
# so raw text can be passed straight to fit() and predict().
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit all three stages on the raw training documents.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [32]:
# Score the fitted pipeline on the held-out test split of the same categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy = fraction of test documents whose predicted label equals the true one.
print(f"Mean accuracy score: {(predicted == twenty_test.target).mean()*100:.2f}%")

Mean accuracy score: 83.49%


In [34]:
# Second model: support vector machine, here an SGDClassifier with hinge loss
# and L2 penalty. The vectorizer and tf-idf stages are the same as before.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        tol=None,
        random_state=42,
    )),
])

# Fit the new pipeline; this rebinds text_clf, replacing the Naive Bayes one.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [35]:
# Predict the test documents with the refitted pipeline and report accuracy.
predicted = text_clf.predict(docs_test)
print(f"Mean accuracy score: {(predicted == twenty_test.target).mean()*100:.2f}%")

Mean accuracy score: 91.01%


In [38]:
# Performance analysis: per-class precision, recall and F1 for the latest
# predictions, labelled with the newsgroup names.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [39]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[256,  11,  16,  36],
       [  4, 380,   3,   2],
       [  5,  35, 353,   3],
       [  5,  11,   4, 378]], dtype=int64)

# Hyperparameter Optimization

In [46]:
# Grid-search space. Each key is '<pipeline step>__<parameter>', so the values
# are routed to the matching step of text_clf.
parameters = dict(
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
    clf__max_iter=(5, 10, 15),
)

In [47]:
# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available cores. Only the first 400 training documents are searched,
# presumably to keep the run short.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1).fit(
    twenty_train.data[:400],
    twenty_train.target[:400],
)

In [48]:
# check the best mean score
# best_score_ is the mean cross-validated score of the best parameter combination.

gs_clf.best_score_

0.9175000000000001

In [49]:
# List the winning value of every searched parameter, in alphabetical order.
for name in sorted(parameters):
    print(f"{name}: {gs_clf.best_params_[name]}")

clf__alpha: 0.001
clf__max_iter: 5
tfidf__use_idf: True


In [50]:
# run the ML pipeline using the best config
# The tuned values are read from gs_clf.best_params_ instead of being copied by
# hand, so this cell cannot drift from what the grid search actually selected.

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42, tol=None))
])
# best_params_ keys use the same '<step>__<param>' names as the pipeline steps
# (tfidf__use_idf, clf__alpha, clf__max_iter), so they can be applied directly.
text_clf.set_params(**gs_clf.best_params_)

# train on the full training set (the search itself only saw the first 400 documents)
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [51]:
# Predict the test documents with the tuned pipeline and report accuracy.
predicted = text_clf.predict(docs_test)
print(f"Mean accuracy score: {(predicted == twenty_test.target).mean()*100:.2f}%")

Mean accuracy score: 91.01%
