In [1]:
import time
import pickle
import numpy as np
from nltk.corpus import reuters
from nltk.corpus import stopwords

In [2]:
# get the doc id for each doc in our corpus
doc_ids = reuters.fileids()
# number of documents; the corpus loop further down uses it as its bound over doc_ids
n_docs = len(doc_ids)
# bare last expression so the notebook displays the count
n_docs

10788

In [3]:
# load the cleaned corpus vocab
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source (e.g. produced locally by an earlier step).
# The context manager closes the file handle as soon as the load finishes.
with open('reuters-clean-vocab.p', 'rb') as vocab_file:
    V = pickle.load(vocab_file)

In [4]:
# get stop words so we can filter them out
# (stored as a set, so the per-token `w in stop_words` test in the corpus loop is a hash lookup)
stop_words = set(stopwords.words('english'))

In [5]:
# go through each doc in the corpus and represent it both as a
# BoW vector with counts (n_docs x len(V)) and as a list of vocab indices (n_docs x various)

# Build the word -> vocab index mapping once. Calling V.index(w) for every token
# rescans the vocab linearly each time; a dict lookup is constant time.
# setdefault keeps the FIRST index seen for a word, matching what V.index(w) returns.
# (assumes the vocab entries are hashable strings -- TODO confirm)
word_to_vidx = {}
for vocab_pos, vocab_word in enumerate(V):
    word_to_vidx.setdefault(vocab_word, vocab_pos)
vocab_size = len(V)

bow_vecs = []
vidx_vecs = []
for doc_id in doc_ids:
    cur_bow_vec = np.zeros(vocab_size)  # per-word counts for this doc
    cur_vidx_vec = []                   # vocab indices of kept tokens, in document order
    for w in reuters.words(doc_id):
        w = w.lower()
        # keep only alphabetic, non-stop-word tokens that are at least 4 characters long
        if not w.isalpha() or w in stop_words or len(w) < 4:
            continue
        vidx = word_to_vidx.get(w)
        if vidx is None:
            # same exception type V.index(w) raises for an out-of-vocab word
            raise ValueError('%r is not in the vocabulary V' % (w,))
        cur_bow_vec[vidx] += 1
        cur_vidx_vec.append(vidx)

    bow_vecs.append(cur_bow_vec)
    vidx_vecs.append(cur_vidx_vec)
    

In [6]:
# convert to numpy arrays and then confirm our understanding of the dimensions
# every BoW vector has the same length, so these stack into a regular 2-D array
bow_corpus = np.asarray(bow_vecs)

# The per-document index lists have different lengths, so they cannot form a
# regular 2-D array. NumPy no longer builds such a ragged array implicitly
# (older releases warned, newer ones raise ValueError), so allocate a 1-D
# object array explicitly and fill it one document at a time.
vidx_corpus = np.empty(len(vidx_vecs), dtype=object)
for doc_pos, doc_vidxs in enumerate(vidx_vecs):
    # dtype=int keeps documents with no surviving tokens as integer arrays too
    # (an empty list would otherwise become a float64 array)
    vidx_corpus[doc_pos] = np.asarray(doc_vidxs, dtype=int)

In [7]:
# expect (number of docs, vocab size): one BoW count vector per document
bow_corpus.shape

(10788, 26435)

In [8]:
# number of docs, followed by the number of kept tokens in the first two docs
# (per-doc lengths differ, unlike the fixed-width BoW rows)
len(vidx_corpus), len(vidx_corpus[0]), len(vidx_corpus[1])

(10788, 413, 55)

In [9]:
# write the BoW matrix to disk; the context manager guarantees the file is
# flushed and closed once the dump completes
with open('reuters-clean-corpus-bow.p', 'wb') as bow_file:
    pickle.dump(bow_corpus, bow_file)

In [10]:
# write the per-document vocab-index arrays to disk; the context manager
# guarantees the file is flushed and closed once the dump completes
with open('reuters-clean-corpus-vidx.p', 'wb') as vidx_file:
    pickle.dump(vidx_corpus, vidx_file)