# Gensim: Dictionary Generation
## Source: https://radimrehurek.com/gensim/wiki.html

In [1]:
import logging
# Configure root logging at INFO level; every record is rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [2]:
from gensim import corpora, models, similarities
import re


In [3]:
# Gather the sorted paths of the corpus text files, ignoring any directory
# entry whose name starts with a dot.
import os

corpus_path = os.path.join('data')
filenames = sorted(
    os.path.join(corpus_path, entry)
    for entry in os.listdir(corpus_path)
    if not entry.startswith('.')
)
len(filenames)

62

In [4]:
#Data Cleaning
def remove_punc2(text):
    """Return ``text`` with ASCII punctuation marks and digits stripped out.

    Every character that appears in the blacklist below is dropped; all other
    characters (letters, whitespace, backslashes, non-ASCII symbols, ...) are
    kept in their original order.
    """
    blacklist = '!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`1234567890'
    kept = [symbol for symbol in text if symbol not in blacklist]
    return "".join(kept)


In [5]:
# Need a function that can supply each document to gensim one at a time; __iter__ is the key.
## Weaker implementation for the time being: all documents are stored in one big in-memory list
## TODO: use re
## TODO: do it on the fly in the next step


# Read every corpus file into memory and strip punctuation/digits from it.
# NOTE: the whole corpus is held in this one list, so it only suits corpora
# that fit in memory.
new_document = []
for path in filenames:
    # Skip hidden files (e.g. macOS's .DS_Store) regardless of which directory
    # the corpus lives in, instead of comparing against one hard-coded path.
    if os.path.basename(path).startswith('.'):
        continue
    # The context manager closes the handle even if read() raises; previously
    # one file handle per document was opened and never closed.
    with open(path, 'r') as infile:
        new_document.append(remove_punc2(infile.read()))

In [6]:

# Build the token -> id dictionary from the lower-cased, whitespace-split
# documents, then persist it.
# NOTE: this snapshot is written before the stop-word / once-only filtering done
# later in the notebook, so the file on disk holds the unfiltered vocabulary.
new_dictionary = corpora.Dictionary(
    document.lower().split() for document in new_document
)
new_dictionary.save('/tmp/dictionary.dict')

In [7]:
# Sanity check: how many cleaned documents are held in memory.
len(new_document)
#new_document[4]
#len(new_dictionary)

62

In [8]:
# Words to drop from the vocabulary. The literal is split on whitespace and
# turned into a set, so the repeated entries and ragged spacing are harmless.
stoplist=set('''court supreme in as xx xxx
                on or by that is was are were no not vs be being been has have had case 
                he she I one every least less many now ever never say says said also get
                go goes just made make put see seen whether like well back even still way
                take since another however two three four five six seven eight nine ten first second new old high 
                long and but if or because as until while  of at by for with about against
                between into through during before after above below to from up down in out
                on off over under  again further then once  here there when where why how
                all any both each few more most other some such  no nor not only own same
                so than too very what which who whom this that these those they them their
                theirs themselves it its itself she her herself he his him himself you your
                yours yourself yourselves our ours ourselves us thou we me my myself i am 
                is are was were be been being have has had having do does did doing will 
                would shall should can could may might must ought a an the'''.split())

# Ids of the stop words that actually occur in the dictionary.
stop_ids = [new_dictionary.token2id[stopword] for stopword in stoplist
            if stopword in new_dictionary.token2id]
#non_alpha_id = [new_dictionary.token2id[nonalphaword] for nonalphaword in stoplist
#if nonalphaword.isalpha()==False]

# Ids of tokens whose document frequency is exactly 1.
# dict.items() is available on both Python 2 and Python 3, whereas the
# previously used dict.iteritems() raises AttributeError on Python 3.
once_ids = [tokenid for tokenid, docfreq in new_dictionary.dfs.items() if docfreq == 1]

# Remove both groups of tokens in place, then compact the id space.
new_dictionary.filter_tokens(stop_ids + once_ids)# + non_alpha_id)
new_dictionary.compactify()

In [9]:
class MyCorpus(object):
    """Stream the documents as bag-of-words vectors, one at a time.

    Each iteration lower-cases and whitespace-splits one entry of the global
    ``new_document`` list and converts it with ``new_dictionary.doc2bow``, so
    the vectorised corpus is produced lazily rather than built up front.
    """

    def __iter__(self):
        for raw_text in new_document:
            tokens = raw_text.lower().split()
            yield new_dictionary.doc2bow(tokens)

#print [stopword for stopword in stoplist]
corpus_small_memory = MyCorpus()
# Persist the streamed corpus via gensim's MmCorpus serializer
# (other corpus formats are available).
corpora.MmCorpus.serialize('/tmp/sercorpusnew.mm', corpus_small_memory)

In [10]:
# Show the token -> id mapping of the dictionary.
# NOTE(review): this prints the entire vocabulary; consider displaying only a
# small sample to keep the notebook output readable.
print(new_dictionary.token2id) #without stopwords; ASCII punctuation and digits were stripped during cleaning

{u'deferment': 0, u'foul': 1, u'narcotic': 2, u'clotted': 3, u'hanging': 4, u'increase': 5, u'granting': 6, u'eligible': 7, u'electricity': 8, u'party\u201d': 9, u'lord': 10, u'regional': 11, u'stipulate': 12, u'bringing': 13, u'differentiated': 108, u'basics': 15, u'commented': 16, u'specially': 17, u'pulse': 18, u'attended': 5668, u'sailed': 20, u'errors': 21, u'naidu': 22, u'contributed': 23, u'increasing': 24, u'specialist': 25, u'misjudged': 26, u'reported': 27, u'\xa0\xa0\xa0scr\xa0': 28, u'aftaruddin': 29, u'\u2026\u2026': 30, u'affiliated': 31, u'substance': 32, u'k': 33, u'deferring': 34, u'vicepresident': 35, u'\u2026\u201d': 36, u'reports': 37, u'controversy': 38, u'military': 39, u'cancellation': 40, u'criticism': 41, u'divide': 42, u'classification': 43, u'explained': 44, u'hamar': 45, u'summons': 46, u'psychological': 47, u'unit': 48, u'cheating': 49, u'spoke': 50, u'impartiality': 51, u'vol': 1150, u'therefore': 53, u'unfortunately': 5711, u'strike': 54, u'successful': 5

In [11]:
#print(list(corpus_small_memory))

In [345]:
#for vector in corpus_small_memory: # load one vector into memory at a time
#...     print len(vector)