# Intro to Gensim
### Creating Corpora and Vector Space

#### Bag of words

In [56]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [57]:
from gensim import corpora

In [58]:
# Toy corpus: nine short document titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [59]:
# Common words that are dropped from every document during tokenization.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}

In [60]:
from pprint import pprint

In [61]:
# Lowercase each document, split it on whitespace, and drop the stopwords.
texts = [
    [token for token in doc.lower().split() if token not in stoplist]
    for doc in documents
]

In [62]:
# Display the tokenized documents (one list of tokens per document).
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


#### Counting frequency

Keep only the words whose frequency across the whole corpus is greater than 1.

In [63]:
# Tally how often every token occurs across all documents, then rebuild
# `texts` keeping only the tokens that were seen more than once.
from collections import defaultdict

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [word for word in doc if frequency[word] > 1]
    for doc in texts
]

In [64]:
# Display the token lists as they currently stand in `texts`.
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [65]:
# Build a gensim Dictionary from the token lists in `texts`.
dictionary = corpora.Dictionary(texts)
# Save the dictionary to disk.
# NOTE(review): the hard-coded /tmp path is not portable (e.g. Windows) — consider a configurable directory.
dictionary.save('/tmp/deerwester.dict')
# Printing reports the number of unique tokens plus a short sample of them.
print(dictionary)

2018-03-08 13:31:06,590 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-08 13:31:06,591 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2018-03-08 13:31:06,592 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2018-03-08 13:31:06,594 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [66]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [67]:
# A new document, not part of `documents`, to be converted with the existing dictionary.
new_doc = "Human computer interaction"

In [68]:
# Lowercase and whitespace-tokenize the new document, then convert it into a
# sparse bag-of-words vector of (token id, count) pairs.
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (1, 1)]


- `doc2bow()` counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector.
- The result `[(0, 1), (1, 1)]` reads as `[(<computer id>, <count>), (<human id>, <count>)]`: each pair is a token id and the number of times that token occurs in the new document — it is not an index of the document itself.
- Tokens that are in the dictionary but absent from the document are implicitly 0 and are left out of the sparse vector.

In [69]:
# Convert every tokenized document into its sparse bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]
# NOTE(review): hard-coded /tmp path — not portable across platforms.
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
# Display the vectors: one list of (token id, count) pairs per document.
pprint(corpus)

2018-03-08 13:31:07,606 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2018-03-08 13:31:07,607 : INFO : saving sparse matrix to /tmp/deerwester.mm
2018-03-08 13:31:07,608 : INFO : PROGRESS: saving document #0
2018-03-08 13:31:07,608 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2018-03-08 13:31:07,610 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


### Making things memory efficient

In [70]:
class MyCorpus(object):
    """Stream a text corpus from disk as bag-of-words vectors, one document at a time.

    The file is expected to hold one document per line, with tokens separated
    by whitespace. Each line is lowercased, split, and converted with the
    module-level ``dictionary`` (looked up when the corpus is iterated), so
    the whole corpus never has to be held in memory.

    Parameters
    ----------
    path : str, optional
        Location of the corpus file. Defaults to ``'mycorpus.txt'``.
    """

    def __init__(self, path='mycorpus.txt'):
        self.path = path

    def __iter__(self):
        """Yield the bag-of-words vector of each line in the corpus file."""
        # The context manager closes the file handle as soon as iteration ends
        # (or the generator is discarded) instead of leaving it open.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

- `yield` returns a value from a generator while suspending it, so its local variables are not destroyed and execution resumes from the same point on the next request.
- Generators are functions that produce their values lazily, one item at a time, instead of building a whole list in memory.
- Thus `__iter__` processes one line of the file at a time and yields its vector, which keeps memory usage low.

In [71]:
# Creating the object reads nothing yet; the file is only opened when the object is iterated.
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# Printing shows only the object's default repr, not the documents.
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x105931c18>


In [72]:
# Iterating the corpus yields one bag-of-words vector per line of the file.
for vector in corpus_memory_friendly:
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


This class uses the dictionary we created above to generate a vector for each line of the input text (documents related to the training documents).

But wait! We still had to load a large amount of text into memory to create the `dictionary` in the first place!

Not necessarily: we can build the dictionary in a similar streaming fashion to solve the issue.

In [74]:
from six import iteritems

# Build the dictionary by streaming the file one line at a time, so the full
# text is never held in memory. The `with` block closes the file handle once
# the dictionary has consumed every line, instead of leaving it open.
with open('mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)
# collect the ids of the stop words that made it into the dictionary
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
# collect the ids of the words whose document frequency is 1
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
# remove stop words and words that occur only once
dictionary.filter_tokens(stop_ids + once_ids)
# presumably reassigns ids to close the gaps left by the removed tokens — confirm against gensim docs
dictionary.compactify()
pprint(dictionary.token2id)

2018-03-08 13:32:44,070 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-08 13:32:44,071 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)


{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}
