## Very naive and straightforward text categorization

In [8]:
import string
import collections
import glob
import codecs
import numpy as np
import scipy as sp
from scipy.sparse import lil_matrix

In [12]:
translator = str.maketrans("","", string.punctuation)
dirs = glob.glob('20news-bydate-train/*')

In [26]:
train_set = []
train_labels = []
total_counts = collections.Counter()
for directory in dirs:
    filenames = glob.glob(directory + '/*')
    print(directory, len(filenames))
    for filename in filenames:
        with codecs.open(filename, 'r', 'latin1') as f:
            lines = f.readlines()
            lines_proc = [y for x in lines for y in x.strip().lower().translate(translator).split()]
            count = collections.Counter(lines_proc)
            total_counts += count
            train_set.append(count)
            train_labels.append(directory)

In [14]:
vocab_counts = {w: c for w, c in total_counts.items() if c > 9}
vocab = {w: i for i,w in enumerate(sorted(vocab_counts.keys()))}

In [15]:
len(vocab)

17247

In [None]:
X = lil_matrix((len(train_set), len(vocab)))
for i_doc, doc in enumerate(train_set):
    for w,c in doc.items():
        if w in vocab:
            X[i_doc, vocab[w]] = c

In [18]:
labels_array = sorted(collections.Counter(train_labels).keys())
labels_dict = {l: i for i, l in enumerate(labels_array)}
y = np.array([labels_dict[x] for x in train_labels])

In [19]:
y.shape, X.shape

((11314,), (11314, 17247))

In [20]:
X

<11314x17247 sparse matrix of type '<class 'numpy.float64'>'
	with 1511629 stored elements in LInked List format>

In [21]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [22]:
model = MultinomialNB()
model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
model.coef_.shape

(20, 17247)

In [24]:
model.score(X, y)

0.94679158564610222

In [25]:
model = BernoulliNB()
model.fit(X, y)
model.score(X, y)

0.8195156443344529

In [39]:
test_set = []
test_labels = []
for directory in glob.glob('20news-bydate-test/*'):
    filenames = glob.glob(directory + '/*')
    print(directory, len(filenames))
    for filename in filenames:
        with codecs.open(filename, 'r', 'latin1') as f:
            lines = f.readlines()
            lines_proc = [y for x in lines for y in x.strip().lower().translate(translator).split() if y in vocab]
            count = collections.Counter(lines_proc)
            test_set.append(count)
            test_labels.append(directory)

20news-bydate-test/sci.electronics 393
20news-bydate-test/comp.graphics 389
20news-bydate-test/talk.religion.misc 251
20news-bydate-test/comp.windows.x 395
20news-bydate-test/alt.atheism 319
20news-bydate-test/talk.politics.misc 310
20news-bydate-test/comp.os.ms-windows.misc 394
20news-bydate-test/comp.sys.mac.hardware 385
20news-bydate-test/rec.autos 396
20news-bydate-test/rec.sport.baseball 397
20news-bydate-test/comp.sys.ibm.pc.hardware 392
20news-bydate-test/rec.motorcycles 398
20news-bydate-test/sci.space 394
20news-bydate-test/soc.religion.christian 398
20news-bydate-test/talk.politics.mideast 376
20news-bydate-test/sci.crypt 396
20news-bydate-test/talk.politics.guns 364
20news-bydate-test/misc.forsale 390
20news-bydate-test/rec.sport.hockey 399
20news-bydate-test/sci.med 396


In [36]:
X_test = lil_matrix((len(test_set), len(vocab)))
for i_doc, doc in enumerate(test_set):
    for w,c in doc.items():
        if w in vocab:
            X[i_doc, vocab[w]] = c

In [41]:
y_test = np.array([labels_dict['20news-bydate-train/' + x.split('/')[1]] for x in test_labels])

In [48]:
def test_model(model, name=None):
    if not name is None:
        print(name)
    model.fit(X, y)
    print("Train set: %.5f\tTest set: %.5f" % (model.score(X, y), model.score(X_test, y_test)) )

In [50]:
test_model(MultinomialNB(), 'MultinomialNB')
test_model(BernoulliNB(), 'BernoulliNB')

MultinomialNB
Train set: 0.91877	Test set: 0.05297
BernoulliNB
Train set: 0.79247	Test set: 0.05178


Oh my God! Nothing works! What do we do?..