In [1]:
import string, glob, codecs
import collections
import numpy as np
import scipy as sp
import os

translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
dirs = glob.glob('data/20news-bydate/20news-bydate-train/*')

In [2]:
train_set = []
train_labels = []
total_counts = collections.Counter()

for directory in dirs:
    filenames = glob.glob(directory + '/*')
    for filename in filenames:
        with codecs.open(filename, 'r', 'latin1') as f:
            lines = f.readlines()
            lines_proc = [y for x in lines for y in x.strip().lower().translate(translator).split()]
            count = collections.Counter(lines_proc)
            total_counts += count
            train_set.append(count)
            train_labels.append(os.path.basename(os.path.dirname(filename)))
    print(directory + " " + str(len(filenames)))

data/20news-bydate/20news-bydate-train/alt.atheism 480
data/20news-bydate/20news-bydate-train/comp.graphics 584
data/20news-bydate/20news-bydate-train/comp.os.ms-windows.misc 591
data/20news-bydate/20news-bydate-train/comp.sys.ibm.pc.hardware 590
data/20news-bydate/20news-bydate-train/comp.sys.mac.hardware 578
data/20news-bydate/20news-bydate-train/comp.windows.x 593
data/20news-bydate/20news-bydate-train/misc.forsale 585
data/20news-bydate/20news-bydate-train/rec.autos 594
data/20news-bydate/20news-bydate-train/rec.motorcycles 598
data/20news-bydate/20news-bydate-train/rec.sport.baseball 597
data/20news-bydate/20news-bydate-train/rec.sport.hockey 600
data/20news-bydate/20news-bydate-train/sci.crypt 595
data/20news-bydate/20news-bydate-train/sci.electronics 591
data/20news-bydate/20news-bydate-train/sci.med 594
data/20news-bydate/20news-bydate-train/sci.space 593
data/20news-bydate/20news-bydate-train/soc.religion.christian 599
data/20news-bydate/20news-bydate-train/talk.politics.guns 

In [None]:
total_counts.most_common(50)

In [3]:
vocab_counts = {w : c for w , c in total_counts.items() if c > 9}
vocab = { w : i for i,w in enumerate(sorted(vocab_counts))}

In [4]:
X = np.zeros((len(train_set), len(vocab)))
for iDoc, doc in enumerate(train_set):
    for w,c in doc.items():
        if w in vocab:
            X[iDoc, vocab[w]] = c

In [5]:
labels_array = sorted(collections.Counter(train_labels).keys())
labels_dict = {l : i for i,l in enumerate(labels_array)}
y = np.array([labels_dict[x] for x in train_labels])

In [56]:
print(X.shape)
print(y.shape)

(11314, 20192)
(11314,)


In [57]:
X

<11314x20192 sparse matrix of type '<class 'numpy.float64'>'
	with 1667404 stored elements in LInked List format>

In [60]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [61]:
model  = MultinomialNB()
model.fit(X , y)
model.score(X , y)

0.90825525897118609

In [62]:
model  = BernoulliNB()
model.fit(X , y)
model.score(X , y)

0.82923811207353726

# Test Section

In [6]:
dirs = glob.glob('data/20news-bydate/20news-bydate-test/*')

test_set = []
test_labels = []
test_total_counts = collections.Counter()

for directory in dirs:
    filenames = glob.glob(directory + '/*')
    for filename in filenames:
        with codecs.open(filename, 'r', 'latin1') as f:
            lines = f.readlines()
            lines_proc = [y for x in lines for y in x.strip().lower().translate(translator).split()]
            count = collections.Counter(lines_proc)
            test_total_counts += count
            test_set.append(count)
            test_labels.append(os.path.basename(os.path.dirname(filename)))
    print(directory + " " + str(len(filenames)))

data/20news-bydate/20news-bydate-test/alt.atheism 319
data/20news-bydate/20news-bydate-test/comp.graphics 389
data/20news-bydate/20news-bydate-test/comp.os.ms-windows.misc 394
data/20news-bydate/20news-bydate-test/comp.sys.ibm.pc.hardware 392
data/20news-bydate/20news-bydate-test/comp.sys.mac.hardware 385
data/20news-bydate/20news-bydate-test/comp.windows.x 395
data/20news-bydate/20news-bydate-test/misc.forsale 390
data/20news-bydate/20news-bydate-test/rec.autos 396
data/20news-bydate/20news-bydate-test/rec.motorcycles 398
data/20news-bydate/20news-bydate-test/rec.sport.baseball 397
data/20news-bydate/20news-bydate-test/rec.sport.hockey 399
data/20news-bydate/20news-bydate-test/sci.crypt 396
data/20news-bydate/20news-bydate-test/sci.electronics 393
data/20news-bydate/20news-bydate-test/sci.med 396
data/20news-bydate/20news-bydate-test/sci.space 394
data/20news-bydate/20news-bydate-test/soc.religion.christian 398
data/20news-bydate/20news-bydate-test/talk.politics.guns 364
data/20news-b

In [7]:
X_test = np.zeros((len(test_set), len(vocab)))
for iDoc, doc in enumerate(test_set):
    for w,c in doc.items():
        if w in vocab:
            X_test[iDoc, vocab[w]] = c

In [8]:
y_test = np.array([labels_dict[x] for x in test_labels])

In [83]:
model  = MultinomialNB()
model.fit(X_test , y_test)
model.score(X_test , y_test)

0.94715878916622409

In [84]:
model  = BernoulliNB()
model.fit(X_test , y_test)
model.score(X_test , y_test)

0.82952734997344668

In [112]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler( with_mean=False ).fit(X_test)
X_test = scaler.transform(X_test)
model  = MultinomialNB()
model.fit(X_test , y_test)
model.score(X_test , y_test)

0.9880509824747743

In [121]:
from sklearn import preprocessing
model  = MultinomialNB()
model.fit(X , y)
model.score(X_test , y_test)

0.78850238980350507

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [10]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
#     "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
#     "Random Forest": RandomForestClassifier(n_estimators = 18),
    "Neural Net": MLPClassifier(alpha = 1),
    "Naive Bayes": GaussianNB()
}

In [None]:
no_classifiers = len(dict_classifiers.keys())

def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'test_score'] = test_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results

In [None]:
df_results = batch_classify(X, y, X_test, y_test)
display(df_results.sort_values(by='test_score', ascending=False))