## Very naive and straightforward text categorization

In [1]:
import string
import collections
import glob
import codecs
import numpy as np
import scipy as sp
from scipy.sparse import lil_matrix

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Translation table that deletes every ASCII punctuation character
# (each character in string.punctuation is mapped to None).
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [3]:
import nltk

In [4]:
# Snowball stemmer for English; my_tokenizer applies it to each surviving token.
stemmer = nltk.stem.SnowballStemmer('english')
# NLTK's English stop-word list, held in a set because my_tokenizer does a
# membership test against it for every token.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [5]:
# Demonstration: stem() is given a whole phrase here. As the output below shows,
# 'helicopters' is left untouched and only the trailing 'flying' becomes 'fli',
# so text has to be split into individual words before it is stemmed.
stemmer.stem("helicopters flying")

'helicopters fli'

In [6]:
def my_tokenizer(s):
    """Convert a raw text string into a list of stemmed tokens.

    Steps, in order:
      1. lower-case the text and delete punctuation using ``translator``;
      2. split it with NLTK's ``word_tokenize``;
      3. drop tokens that appear in ``stopwords`` (checked before stemming);
      4. stem each remaining token with ``stemmer``.

    Args:
        s: the document text.

    Returns:
        A list of stemmed tokens, in document order.
    """
    cleaned = s.lower().translate(translator)
    # Lazy filter: each token is checked against the stop-word set right
    # before it is stemmed, exactly one pass over the tokens.
    content_tokens = (tok for tok in nltk.tokenize.word_tokenize(cleaned) if tok not in stopwords)
    return [stemmer.stem(tok) for tok in content_tokens]

In [None]:
# List every training file: one sub-directory per newsgroup, one file per post.
[
    path
    for directory in glob.glob('data/20news-bydate/20news-bydate-train/*')
    for path in glob.glob(directory + '/*')
]

In [13]:
# Bag-of-words baseline: raw term counts read straight from the files.
# min_df=10 (an integer) keeps only terms that occur in at least 10 documents.
# NOTE(review): this cell reads from 'data/20news-bydate/20news-bydate-train/'
# while the TF-IDF cells read from '20news-bydate-train/' -- confirm which
# directory layout is the intended one.
cv = CountVectorizer(input='filename', encoding='latin1', min_df=10)
X = cv.fit_transform([x for directory in glob.glob('data/20news-bydate/20news-bydate-train/*') for x in glob.glob(directory + '/*')])
# The fitted term -> column-index mapping is stored in `vocabulary_`.
# `vocabulary` (no trailing underscore) is only the constructor argument,
# which is None here, so displaying it showed nothing.
cv.vocabulary_

In [23]:
# TF-IDF features over unigrams and bigrams, tokenized by my_tokenizer.
# Integer min_df / max_df are document counts: terms found in fewer than 10
# or more than 200 training documents are discarded.
tfv = TfidfVectorizer(
    input='filename',
    encoding='latin1',
    tokenizer=my_tokenizer,
    ngram_range=(1, 2),
    min_df=10,
    max_df=200,
)
# Fit the vocabulary and IDF weights on the training files and vectorize them.
X = tfv.fit_transform([
    path
    for directory in glob.glob('20news-bydate-train/*')
    for path in glob.glob(directory + '/*')
])

In [17]:
# Class names are the sub-directory names of the training split, sorted so
# that the name -> integer id mapping is the same on every run.
labels_array = sorted({directory.split('/')[-1] for directory in glob.glob('20news-bydate-train/*')})
labels_dict = dict(zip(labels_array, range(len(labels_array))))
# One label per training file. The directories and their files are walked
# with the same glob patterns used to build X, so rows of X and entries of y
# are expected to line up.
y = np.array([
    labels_dict[directory.split('/')[-1]]
    for directory in glob.glob('20news-bydate-train/*')
    for _ in glob.glob(directory + '/*')
])

In [10]:
# Vectorize the test split with the vocabulary and IDF weights already fitted
# on the training files (transform only -- nothing is re-fitted here).
X_test = tfv.transform([path for directory in glob.glob('20news-bydate-test/*') for path in glob.glob(directory + '/*')])
# One label per test file. The class name is the LAST path component of the
# newsgroup directory, the same rule used for the training labels. The
# previous hard-coded index [1] only worked while the glob pattern had exactly
# one directory level in front of the class name.
y_test = np.array([
    labels_dict[directory.split('/')[-1]]
    for directory in glob.glob('20news-bydate-test/*')
    for _ in glob.glob(directory + '/*')
])

In [18]:
# Sanity check: train/test matrices must share the feature dimension and each
# label vector must have one entry per row of its matrix.
tuple(arr.shape for arr in (X, y, X_test, y_test))

((11314, 10483), (11314,), (7532, 10483), (7532,))

In [19]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

model = MultinomialNB()
model.fit(X, y)

# Column index -> term, rebuilt from the fitted vocabulary mapping (term ->
# column index). This avoids get_feature_names(), which no longer exists in
# current scikit-learn releases.
vocab_array = sorted(tfv.vocabulary_, key=tfv.vocabulary_.get)

# Per-class log P(term | class). Row 0 belongs to class id 0, i.e. the first
# entry of labels_array. `feature_log_prob_` replaces the removed
# `MultinomialNB.coef_` alias.
class_log_prob = model.feature_log_prob_[0]

# The 20 most probable terms for that class. Log-probabilities are <= 0, so
# sorting by descending value matches the old "smallest absolute value first"
# ordering; a stable sort keeps ties in column order.
top_columns = np.argsort(-class_log_prob, kind='mergesort')[:20]
[(vocab_array[col], class_log_prob[col]) for col in top_columns]

[('god', -6.2189046232034997),
 ('atheist', -6.3988392865917323),
 ('moral', -6.4395034832551241),
 ('islam', -6.4580124877455729),
 ('keith', -6.491187027997869),
 ('write', -6.7550487363271898),
 ('one', -6.8238868903327816),
 ('peopl', -6.8291634263994982),
 ('say', -6.8784324388447491),
 ('atheism', -6.9563495174527779),
 ('articl', -6.9797632844082873),
 ('dont', -6.9995806838319048),
 ('religion', -7.0279409120691465),
 ('think', -7.0299183749454137),
 ('would', -7.0827037177702472),
 ('jon', -7.0853506268168882),
 ('schneider', -7.1022536790955177),
 ('believ', -7.1170541127290292),
 ('thing', -7.1721647529960961),
 ('object', -7.1743589046410321)]

In [20]:
def test_model(model, name=None):
    """Fit ``model`` on the training data and print its train and test scores.

    Uses the notebook-level ``X``, ``y``, ``X_test`` and ``y_test``.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``score(X, y)``
            (for scikit-learn classifiers ``score`` is the mean accuracy).
        name: optional label printed on its own line before the scores.

    Returns:
        None; the result is printed.
    """
    if name is not None:
        print(name)
    model.fit(X, y)
    train_score = model.score(X, y)
    test_score = model.score(X_test, y_test)
    print("Train set: %.5f\tTest set: %.5f" % (train_score, test_score))

# Evaluate both naive Bayes variants with default hyper-parameters.
for nb_name, nb_class in (('MultinomialNB', MultinomialNB), ('BernoulliNB', BernoulliNB)):
    test_model(nb_class(), nb_name)

MultinomialNB
Train set: 0.92514	Test set: 0.81784
BernoulliNB
Train set: 0.84285	Test set: 0.69331


In [21]:
from sklearn.svm import LinearSVC, SVC

# Linear SVM with default hyper-parameters, evaluated on the same features.
test_model(LinearSVC(), name='LinearSVC')
# RBF-kernel run left disabled by the author -- presumably too slow on this
# feature matrix; confirm before re-enabling.
# test_model(SVC(kernel='rbf'), 'SVC RBF')

LinearSVC
Train set: 0.99788	Test set: 0.82900


In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, evaluated on the same features.
test_model(LogisticRegression(), name='LogRegr')

LogRegr
Train set: 0.95395	Test set: 0.82209
