In [1]:
import os

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from VectorSpaceModel import *

In [2]:
# Configuration: root directory of the corpus, one sub-directory per category.
# The trailing slash is required because later cells build file names by plain
# string concatenation (PATH + relative_name).
PATH = "./20news-18828/"

In [3]:
# Build the training/test file lists. Every entry is a path relative to PATH of
# the form "<category>/<file>".
#
# os.listdir() returns entries in arbitrary, platform-dependent order, so both
# the category list and each category's file list are sorted; otherwise the
# slice-based split below would pick different files on different machines.
subdirs = sorted(
    name for name in os.listdir(PATH)
    if os.path.isdir(os.path.join(PATH, name))  # ignore stray files such as .DS_Store
)
training_set_fns = []
test_set_fns = []
splitting_pos = 100  # the first `splitting_pos` files of each category form the training set
for dir_ in subdirs:
    files = sorted(os.listdir(os.path.join(PATH, dir_)))
    training_set_fns.extend(dir_ + '/' + fn for fn in files[:splitting_pos])
    test_set_fns.extend(dir_ + '/' + fn for fn in files[splitting_pos:])

In [4]:
# load the raw text of every training document, then of every test document
def _read_text(rel_path):
    """Return the whole content of the file at PATH + rel_path, decoded as latin1."""
    with open(PATH + rel_path, encoding='latin1') as handle:
        return handle.read()

training_docs = [_read_text(rel_path) for rel_path in training_set_fns]
test_docs = [_read_text(rel_path) for rel_path in test_set_fns]

In [5]:
# preprocessing text: tokenize -> lower-case -> drop stop words -> stem
stop_words = set(stopwords.words('english'))
def filter_stop_words(doc):
    """Return the tokens of `doc` that are not English stop words.

    Parameters:
        doc: an iterable of tokens (strings). Tokens are compared as-is against
            the lower-case stop-word list, so lower-case them beforehand.

    Returns:
        A list (not a lazy `filter` object), so the result can be traversed
        more than once and supports len()/indexing.
    """
    return [w for w in doc if w not in stop_words]

tokenized_docs = []
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
for doc in training_docs:
    word_tokens = tokenizer.tokenize(doc) # Tokenization
    # Stop words must be removed BEFORE stemming: the Porter stemmer rewrites
    # many of them (e.g. "this" -> "thi", "was" -> "wa"), after which they no
    # longer match the stop-word list and would leak into the vocabulary.
    word_without_stopwords = filter_stop_words(w.lower() for w in word_tokens) # Normalization + stopword filtering
    word_stems = [stemmer.stem(w) for w in word_without_stopwords] # Stemming
    tokenized_docs.append(word_stems)

In [7]:
# Build one VectorSpaceModel per preprocessed training document, then weight and
# normalise every model. calIDF() is deliberately called only after the first
# loop has passed every document's terms to accumulateDocumentFrequency().
# NOTE(review): accumulateDocumentFrequency() and calIDF() are invoked on the
# class rather than on an instance, so they presumably keep corpus-wide state on
# VectorSpaceModel. If so, re-running this cell without restarting the kernel
# would count every document a second time — confirm against VectorSpaceModel.py.
vsms = []
for doc in tokenized_docs:
    vsm = VectorSpaceModel(doc)
    # hand this document's terms (the keys of rawTF) to the class-level counter
    VectorSpaceModel.accumulateDocumentFrequency(vsm.rawTF.keys())
    vsms.append(vsm)

VectorSpaceModel.calIDF()  # presumably derives IDF values from the accumulated counts — confirm
for vsm in vsms:
    vsm.calWeight(TF_Scale.MAXIMUM)  # TF_Scale presumably comes from the VectorSpaceModel star-import; MAXIMUM semantics TODO confirm
    vsm.toUnit()  # presumably normalises the weight vector to unit length — confirm