In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import csv
from six import iteritems

## create and save the dictionary
This is the mapping from words to their integer ids; it serves as the lookup table for features.

In [None]:
# A token is, in order of preference: a run of word characters, a dollar
# amount such as $1.50, or any other run of non-whitespace characters.
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged instead of being parsed as (invalid) Python string escapes.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# English stop words plus a few punctuation/operator tokens that should be
# dropped from the vocabulary.
stops = list(stopwords.words('english'))
stops += ["=", "->", ".", ","]

In [None]:
# make the dictionary, a collection of statistics about all tokens in the corpus
# Each line of the corpus file is treated as one document. Lines are lowercased
# before tokenizing so the vocabulary lines up with the lowercase stop-word
# list and with the lowercased documents passed to doc2bow() later on.
# The context manager closes the file once the dictionary has consumed it.
with open('./datasets/corpus.csv') as corpus_file:
    dictionary = corpora.Dictionary(
        tokenizer.tokenize(line.lower()) for line in corpus_file
    )

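The [tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Corpora_and_Vector_Spaces.ipynb) includes the next step: removing stop words and tokens that occur in only one document from the dictionary. I assume that shrinking the vocabulary this way makes the bag-of-words perform better down the line.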

In [None]:
# ids of the stop words that actually occur in the vocabulary
known_tokens = dictionary.token2id
stop_ids = [known_tokens[word] for word in stops if word in known_tokens]

# ids of tokens whose document frequency is exactly one
once_ids = [
    token_id
    for token_id, doc_count in iteritems(dictionary.dfs)
    if doc_count == 1
]

# drop both groups from the dictionary in a single call
dictionary.filter_tokens(stop_ids + once_ids)

# renumber the remaining tokens so the id sequence has no gaps
dictionary.compactify()

In [None]:
# Write the dictionary built above to ./datasets/corpus.dict.
dictionary.save('./datasets/corpus.dict')  # store the dictionary, for future reference

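Define a class that represents the bag-of-words corpus in a memory-friendly way: it streams the corpus file and yields one document vector at a time instead of holding everything in memory.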

In [None]:
# memory-friendly bag-of-words
class BOW(object):
    """Stream the corpus file as bag-of-words vectors.

    Iterating over an instance reads ``./datasets/corpus.csv`` line by line,
    treats each line as one document, and yields the result of
    ``dictionary.doc2bow`` for that document. Only one line is processed at a
    time. The file is re-opened on every iteration, so an instance can be
    iterated more than once.

    Relies on the notebook-level ``dictionary`` and ``tokenizer`` objects.
    """

    def __iter__(self):
        # The context manager closes the file when iteration finishes or the
        # generator is closed, rather than leaving the handle dangling.
        with open('./datasets/corpus.csv') as corpus_file:
            for line in corpus_file:
                # One document per line. Lowercase it, then split it with the
                # notebook's tokenizer so punctuation is separated the same way
                # as when the dictionary was built; a plain whitespace split
                # would leave tokens such as "word," that doc2bow cannot match.
                yield dictionary.doc2bow(tokenizer.tokenize(line.lower()))

Now we can instantiate the bag-of-words corpus and consume it by iterating over it; here it is serialized to disk so it can be reused later.

In [None]:
# Creating the BOW object does not read anything yet: the corpus file is only
# opened when the object is iterated, which happens while it is serialized.
arxiv_bow = BOW()
corpora.MmCorpus.serialize('./datasets/corpus.mm', arxiv_bow)  # store to disk, for later use

Represent an unseen document as a bag-of-words using this dictionary to define the vector space

In [None]:
# Keep a handle on the dictionary's token -> feature ID map.
# Given a token, it gives the feature ID of that token.
token2id_map = dictionary.token2id

The function doc2bow() simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector of (word id, count) pairs. For example, a sparse vector of [(31, 1), (82, 1)] would read: in the document “all partial results results illustrated entropy”, the words all (id=31) and partial (id=82) each appear once; words that are not in the dictionary (because they don't appear in the corpus or were filtered out above) are ignored.

In [None]:
# Vectorize an unseen document with the existing dictionary.
new_doc = "all partial results results illustrated entropy"
# NOTE(review): the dictionary was built with RegexpTokenizer, not a whitespace
# split. The two agree here only because this sample contains no punctuation;
# confirm the tokenization before reusing this line on real documents.
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)