In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from NBC import *

In [3]:
# define constants
# Root directory of the 20news-18828 corpus; the cells below expect one
# subdirectory per class, each containing one file per document.
# NOTE: the trailing slash is required -- later cells build file paths by
# plain string concatenation (PATH + relative_name).
PATH = "./20news-18828/"

In [4]:
# Collect every document path (relative to PATH, as "<class_dir>/<file>") and
# split them 80/20 into a training set and a held-out testing set.
RANDOM_SEED = 42  # fixed so the split, and therefore the reported accuracy, is reproducible

# os.listdir returns entries in arbitrary order, so sort them: otherwise the
# seeded split could still differ between machines or runs.
# Non-directory entries (e.g. a stray .DS_Store or README) are skipped because
# listing them as directories would raise NotADirectoryError.
subdirs = sorted(
    name for name in os.listdir(PATH)
    if os.path.isdir(os.path.join(PATH, name))
)
filepaths = []
for dir_ in subdirs:
    files = sorted(os.listdir(os.path.join(PATH, dir_)))
    filepaths.extend(os.path.join(dir_, fn) for fn in files)

training_set_fns, test_set_fns = train_test_split(
    filepaths, test_size=0.2, random_state=RANDOM_SEED
)

In [5]:
# Report how many documents ended up on each side of the split.
n_training_files = len(training_set_fns)
n_testing_files = len(test_set_fns)
print('The training set contains %d files' % n_training_files)
print('The testing set contains %d files' % n_testing_files)

The training set contains 15062 files
The testing set contains 3766 files


In [6]:
# Load the raw text of every training document into memory.
# Files are decoded as latin1, which accepts any byte sequence without error.
training_docs = []
for rel_path in training_set_fns:
    with open(PATH + rel_path, encoding='latin1') as handle:
        text = handle.read()
    training_docs.append(text)

In [7]:
# Preprocessing pipeline for the training documents:
#   tokenize -> lowercase + stem -> drop stop words.
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters
# ignore_stopwords=True: per the NLTK docs the stemmer leaves stop words
# unstemmed, so they can still be matched against `stop_words` afterwards.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


def filter_stop_words(doc):
    """Return a lazy iterator over the tokens of `doc` that are not stop words.

    `doc` is any iterable of tokens; membership is checked against the
    module-level `stop_words` set. The result is a one-shot `filter` object,
    so wrap it in `list(...)` if it must be traversed more than once.
    """
    def is_kept(token):
        return token not in stop_words
    return filter(is_kept, doc)


tokenized_docs = []
for raw_text in training_docs:
    tokens = tokenizer.tokenize(raw_text)                    # tokenization
    stems = [stemmer.stem(tok.lower()) for tok in tokens]    # normalization + stemming
    tokenized_docs.append(list(filter_stop_words(stems)))    # stop-word removal

In [8]:
# Load the whole testing set and run it through the same pipeline that was
# applied to the training documents (tokenize -> lowercase + stem -> drop
# stop words), reusing `tokenizer`, `stemmer` and `filter_stop_words`.
N = len(test_set_fns)  # number of test documents; used later to compute accuracy

whole_test_docs = []
for rel_path in test_set_fns:
    with open(PATH + rel_path, encoding='latin1') as handle:
        text = handle.read()
    whole_test_docs.append(text)

whole_test_tokenized_docs = []
for raw_text in whole_test_docs:
    tokens = tokenizer.tokenize(raw_text)                              # tokenization
    stems = [stemmer.stem(tok.lower()) for tok in tokens]              # normalization + stemming
    whole_test_tokenized_docs.append(list(filter_stop_words(stems)))   # stop-word removal

In [9]:
# Build the naive Bayes classifier from the tokenized training documents and
# their labels. A document's label is the directory part of its relative path.
labels = list(map(os.path.dirname, training_set_fns))
classifier = NaiveBayesClassifier(tokenized_docs, labels)

In [10]:
# Train the classifier. train() takes no arguments here, so it works from the
# tokenized documents and labels passed to NaiveBayesClassifier at construction.
# NOTE(review): what train() estimates is defined in the NBC module -- see there.
classifier.train()

In [11]:
# Predict a label for every preprocessed test document.
# `results[i]` is the prediction for `test_set_fns[i]`.
results = [classifier.classify(tokens) for tokens in whole_test_tokenized_docs]

In [12]:
# Compare each prediction with the true label (the directory part of the
# test file's relative path) and report accuracy = 1 - error rate.
true_labels = [os.path.dirname(rel_path) for rel_path in test_set_fns]
sum_ = sum(1 for predicted, actual in zip(results, true_labels) if predicted != actual)

error_rate = sum_ / N  # N is the number of test documents
print('The accuracy is: %f' % (1 - error_rate))

The accuracy is: 0.800319
