### Import required libraries

In [1]:
import numpy as np

In [2]:
import re

### Load input data

In [3]:
categories = ['talk.politics.guns', 'talk.politics.misc', 'talk.politics.mideast', 'talk.religion.misc']

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
data_trace = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [6]:
data_trace.target_names

['talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
len(data_trace.data)

1952

In [8]:
len(data_trace.filenames)

1952

In [9]:
print(data_trace.target_names[data_trace.target[0]])

talk.religion.misc


In [10]:
data_trace.data[0]

"From: psyrobtw@ubvmsb.cc.buffalo.edu (Robert Weiss)\nSubject: 6 Apr 93   God's Promise in John 16:24\nOrganization: University at Buffalo\nLines: 8\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsb.cc.buffalo.edu\n\n\n\n\tHitherto have ye asked nothing\n\tin my name:\n\task, and ye shall receive,\n\tthat your joy may be full.\n\n\tJohn 16:24\n"

In [11]:
print("\n".join(data_trace.data[0].split("\n")[:]))

From: psyrobtw@ubvmsb.cc.buffalo.edu (Robert Weiss)
Subject: 6 Apr 93   God's Promise in John 16:24
Organization: University at Buffalo
Lines: 8
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsb.cc.buffalo.edu



	Hitherto have ye asked nothing
	in my name:
	ask, and ye shall receive,
	that your joy may be full.

	John 16:24



In [12]:
data_trace.target[:10]

array([3, 0, 1, 3, 3, 1, 3, 1, 2, 3], dtype=int64)

In [13]:
for t in data_trace.target[:20]:
...     print(data_trace.target_names[t])

talk.religion.misc
talk.politics.guns
talk.politics.mideast
talk.religion.misc
talk.religion.misc
talk.politics.mideast
talk.religion.misc
talk.politics.mideast
talk.politics.misc
talk.religion.misc
talk.politics.mideast
talk.politics.mideast
talk.politics.misc
talk.politics.misc
talk.politics.guns
talk.politics.misc
talk.politics.mideast
talk.religion.misc
talk.politics.guns
talk.politics.guns


### The input files are texts. We need to:

> **1. Tokenize the strings to extract individual words. We can use white spaces and punctuation marks as token separators.**
<br>
> **2. Count the number of occurrences of tokens in each document.**
<br>
> **3. Convert counts to probabilities. This involves normalizing and possibly weighting.**



In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(data_trace.data)
train_counts.shape

(1952, 36379)

In [15]:
dictionary = vectorizer.vocabulary_
dictionary

{'from': 15280,
 'psyrobtw': 26855,
 'ubvmsb': 33598,
 'cc': 8663,
 'buffalo': 7816,
 'edu': 12943,
 'robert': 28687,
 'weiss': 35229,
 'subject': 31592,
 'apr': 5621,
 '93': 3596,
 'god': 15935,
 'promise': 26636,
 'in': 18112,
 'john': 19385,
 '16': 760,
 '24': 1787,
 'organization': 24582,
 'university': 33982,
 'at': 6061,
 'lines': 20843,
 'news': 23605,
 'software': 30682,
 'vax': 34508,
 'vms': 34823,
 'vnews': 34829,
 '41': 2388,
 'nntp': 23773,
 'posting': 26125,
 'host': 17430,
 'hitherto': 17196,
 'have': 16803,
 'ye': 35981,
 'asked': 5920,
 'nothing': 23903,
 'my': 23179,
 'name': 23310,
 'ask': 5917,
 'and': 5315,
 'shall': 29922,
 'receive': 27620,
 'that': 32567,
 'your': 36093,
 'joy': 19436,
 'may': 21847,
 'be': 6760,
 'full': 15351,
 'hambidge': 16588,
 'bms': 7342,
 'com': 9563,
 're': 27503,
 'gun': 16368,
 'control': 10225,
 'reply': 28115,
 'to': 32905,
 'bristol': 7699,
 'myers': 23182,
 'squibb': 31084,
 '94': 3670,
 'article': 5860,
 'c51l52': 8091,
 'bgo': 7

In [16]:
vectorizer.vocabulary_.get('algorithm')

5018

**We have the counts. Now to probabilities and naive Bayes.**

In [17]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_counts, data_trace.target)

In [18]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
new_counts = vectorizer.transform(docs_new)

In [19]:
predicted = clf.predict(new_counts)

In [20]:
print(data_trace.target_names[predicted[0]])
print(data_trace.target_names[predicted[1]])

talk.religion.misc
talk.politics.guns


**Apply on test data**

In [21]:
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
test_count = vectorizer.transform(docs_test)
predicted = clf.predict(test_count)
np.mean(predicted == twenty_test.target)

0.8547271329746349

**Obtain performance metrics**

In [22]:
from sklearn import metrics
print (metrics.classification_report(twenty_test.target,predicted,target_names=twenty_test.target_names))

                       precision    recall  f1-score   support

   talk.politics.guns       0.75      0.94      0.83       364
talk.politics.mideast       0.99      0.92      0.95       376
   talk.politics.misc       0.81      0.68      0.74       310
   talk.religion.misc       0.91      0.86      0.88       251

          avg / total       0.86      0.85      0.85      1301



In [23]:
from sklearn import metrics
metrics.confusion_matrix(twenty_test.target, predicted)

array([[341,   1,  12,  10],
       [  3, 345,  24,   4],
       [ 90,   1, 211,   8],
       [ 19,   3,  14, 215]], dtype=int64)