In [1]:
import os
import random

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from VectorSpaceModel import *
from KNNClassifier import *

In [2]:
# define constants
# Root directory of the corpus (the "20news-18828" dump). The trailing slash is
# required: other cells build file paths by plain string concatenation
# (PATH + name) rather than os.path.join.
PATH = "./20news-18828/"

In [3]:
# Generate the file paths of the training set and the test set.
# Paths are stored relative to PATH as "<category dir>/<file name>"; the first
# `splitting_pos` files of every category go to the training set and the
# remainder to the test set.
#
# os.listdir() returns entries in arbitrary order, so both the category listing
# and each file listing are sorted. Without this the "first 100 files" differ
# between machines/runs and the split is not reproducible.
splitting_pos = 100
subdirs = sorted(
    name for name in os.listdir(PATH)
    # skip stray regular files in the corpus root; listing them would raise
    if os.path.isdir(os.path.join(PATH, name))
)
training_set_fns = []
test_set_fns = []
for dir_ in subdirs:
    files = sorted(os.listdir(os.path.join(PATH, dir_)))
    training_set_fns.extend(os.path.join(dir_, fn) for fn in files[:splitting_pos])
    test_set_fns.extend(os.path.join(dir_, fn) for fn in files[splitting_pos:])

In [4]:
# Load the raw text of every training document.
def _read_training_doc(relative_path):
    """Return the full text of the file at PATH + relative_path, decoded as latin1."""
    with open(PATH+relative_path, encoding='latin1') as handle:
        return handle.read()

training_docs = [_read_training_doc(rel_path) for rel_path in training_set_fns]

In [5]:
# preprocess training docs
# English stop words, held in a set so the membership tests done in
# filter_stop_words are hash lookups.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm in the environment setup.
stop_words = set(stopwords.words('english'))
def filter_stop_words(doc):
    """Return the tokens of `doc` that are not stop words.

    Args:
        doc: iterable of token strings.

    Returns:
        A new list with every token that is not in the module-level
        `stop_words` set, in the original order.

    A list is returned rather than the lazy `filter` object: in Python 3 a
    `filter` object is a one-shot iterator, so a stored "document" would be
    empty after its first traversal and would support neither len() nor
    indexing.
    """
    return [w for w in doc if w not in stop_words]

# Turn every raw training document into a list of stemmed, lower-cased tokens
# with stop words removed.
tokenized_docs = []
tokenizer = RegexpTokenizer(r'\w+')  # a token is a maximal run of word characters
# NOTE(review): ignore_stopwords=True should leave stop words unstemmed, which is
# what lets the stop-word filter below still match them — confirm against NLTK docs.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
for doc in training_docs:
    word_tokens = tokenizer.tokenize(doc) # Tokenization
    word_stems = [stemmer.stem(w.lower()) for w in word_tokens] # Stemming/normalization
    # Stopword/controlled vocabulary filtering.
    # Materialise the result: a lazy filter object can be traversed only once,
    # so storing it would leave an empty "document" after its first use.
    word_without_stopwords = list(filter_stop_words(word_stems))
    tokenized_docs.append(word_without_stopwords)

In [6]:
# get the VSM representation of each document
# Pass 1: build one VectorSpaceModel per training document and feed its terms
# into the document-frequency accumulator.
# NOTE(review): accumulateDocumentFrequency and calIDF are called on the class,
# so they presumably keep corpus-wide state on VectorSpaceModel; re-running this
# cell without restarting the kernel would then count every document twice —
# confirm against the VectorSpaceModel module.
vsms = []
for doc in tokenized_docs:
    vsm = VectorSpaceModel(doc)
    VectorSpaceModel.accumulateDocumentFrequency(vsm.getTerms())
    vsms.append(vsm)

# Pass 2: IDF is computed only after every training document has been
# accumulated; per-document weighting and normalisation come after that.
VectorSpaceModel.calIDF()
for vsm in vsms:
    # NOTE(review): TF_Scale.MAXIMUM presumably scales term frequency by the
    # document's maximum term frequency — confirm.
    vsm.calWeight(TF_Scale.MAXIMUM)
    vsm.normalize()

In [7]:
# Select a reproducible random subset of the test set.
# A dedicated, seeded generator makes the sample identical after every kernel
# restart and leaves the global `random` state untouched.
SAMPLE_SEED = 42
N = len(test_set_fns)
# random.sample raises ValueError when k exceeds the population size, so the
# requested sample size is clamped to the number of available test files.
sample_num = min(100, N)
test_indices = random.Random(SAMPLE_SEED).sample(range(N), k=sample_num)

In [8]:
# Preprocess the selected test docs: first load the raw text of each sampled file.
sampled_paths = (PATH+test_set_fns[idx] for idx in test_indices)
test_docs = []
for doc_path in sampled_paths:
    with open(doc_path, encoding='latin1') as handle:
        raw_text = handle.read()
    test_docs.append(raw_text)

# Run the sampled test documents through the same tokenise / stem / stop-word
# steps that were applied to the training documents.
test_tokenized_docs = []
for doc in test_docs:
    word_tokens = tokenizer.tokenize(doc) # Tokenization
    word_stems = [stemmer.stem(w.lower()) for w in word_tokens] # Stemming/normalization
    # Stopword/controlled vocabulary filtering.
    # Materialise the result: a lazy filter object can be traversed only once,
    # so storing it would leave an empty "document" after its first use.
    word_without_stopwords = list(filter_stop_words(word_stems))
    test_tokenized_docs.append(word_without_stopwords)

In [9]:
# Build the VSM representation of every sampled test document.
# All models are constructed first; weighting and normalisation run afterwards,
# one document at a time.
test_vsms = [VectorSpaceModel(tokens) for tokens in test_tokenized_docs]

for test_vsm in test_vsms:
    test_vsm.calWeight(TF_Scale.MAXIMUM)
    test_vsm.normalize()

In [10]:
# Create the KNN classifier.
# The label of a training document is the directory part of its relative path.
labels = []
for rel_path in training_set_fns:
    category, _file_name = os.path.split(rel_path)
    labels.append(category)
k = 5
classifier = KNNClassifier(vsms, labels, k)

In [11]:
# training the classifier
# NOTE(review): what train() actually precomputes is defined in KNNClassifier,
# not in this notebook; here it is simply invoked before any classify() call.
classifier.train()

In [12]:
# Classify one sampled test document, then show which file it came from so the
# prediction can be compared with the file's category directory.
index = 1
x = test_vsms[index]
prediction = classifier.classify(x)
print(prediction)
source_file = test_set_fns[test_indices[index]]
print(source_file)

('sci.med', 1)
talk.politics.guns\54345
