In [1]:
import pymongo
from redis import Redis

# Shared service handles used by the cells below.
# NOTE(review): 'this_redis' / 'this_mongo' are presumably service hostnames
# resolvable from this kernel's environment — confirm against the deployment.
REDIS = Redis(host='this_redis')
mongo_client = pymongo.MongoClient(host='this_mongo')
# Subscript access selects the same 'corpus' database as attribute access.
corpus_db = mongo_client['corpus']

In [2]:
cd ..

/home/jovyan


In [3]:
import pandas as pd

In [4]:
# Load the corpus from a pickle file, relative to the current working directory.
# Presumably a DataFrame with 'sentence' and 'title' columns, judging by the
# records displayed further down — confirm against the file.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from
# a trusted source.
corpus = pd.read_pickle('data/corpus.p')

In [5]:
# Convert the corpus into a list with one dict per row, keyed by column name.
corpus_df_records = corpus.to_dict(orient='records')

In [6]:
corpus_df_records[:4]

[{'sentence': 'In the great green room There was a telephone And a red balloon',
  'title': 'Goodnight, Moon'},
 {'sentence': 'And a picture of-', 'title': 'Goodnight, Moon'},
 {'sentence': 'The cow jumping over the moon', 'title': 'Goodnight, Moon'},
 {'sentence': 'And there were three little bears sitting on chairs And two little kittens',
  'title': 'Goodnight, Moon'}]

In [7]:
# Drop the collection first so that re-running this cell does not insert the
# corpus a second time.
corpus_db.documents.drop()
# NOTE(review): PyMongo adds an '_id' key to each inserted dict in place, so
# corpus_df_records is mutated by this call — keep that in mind if the records
# are reused afterwards.
corpus_db.documents.insert_many(corpus_df_records)

<pymongo.results.InsertManyResult at 0x7f1fd4b1d7e0>

In [8]:
# Number of documents now stored in the collection. Collection.count() is
# deprecated (and removed in PyMongo 4); count_documents with an empty filter
# is the supported way to count every document.
corpus_db.documents.count_documents({})

170

In [10]:
corpus_db.documents.find_one()

{'_id': ObjectId('5a91c4d357afe300612de3a6'),
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon'}

In [11]:
corpus_db.documents.find_one({'tokens' : {'$exists': False}})

{'_id': ObjectId('5a91c4d357afe300612de3a6'),
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon'}

# `MAPPER`

In [12]:
def tokenize(document, strip_chars=',.'):
    """Split a sentence into whitespace-separated tokens.

    Parameters
    ----------
    document : str
        The text to tokenize.
    strip_chars : str, optional
        Characters deleted from ``document`` before splitting. Defaults to
        ``',.'`` (commas and periods). Pass a longer string (for example
        ``string.punctuation``) to strip more punctuation.

    Returns
    -------
    list of str
        The tokens in their original order and original case. An empty or
        whitespace-only document yields an empty list.
    """
    # A single translate pass deletes every unwanted character, whatever the
    # length of strip_chars, instead of chaining one .replace() per character.
    deletion_table = str.maketrans('', '', strip_chars)
    return document.translate(deletion_table).split()

def MAPPER(document):
    """Map step: lazily yield a ``(word, 1)`` pair per token of ``document``.

    Tokens come from ``tokenize(document)`` and are emitted in order, one
    pair per occurrence (repeated words are not merged here).
    """
    yield from ((token, 1) for token in tokenize(document))

In [13]:
# Fetch any one document that has no 'tokens' field yet and preview the
# (word, 1) pairs MAPPER emits for its sentence.
doc = corpus_db.documents.find_one({'tokens': {'$exists': False}})
[pair for pair in MAPPER(doc['sentence'])]

[('In', 1),
 ('the', 1),
 ('great', 1),
 ('green', 1),
 ('room', 1),
 ('There', 1),
 ('was', 1),
 ('a', 1),
 ('telephone', 1),
 ('And', 1),
 ('a', 1),
 ('red', 1),
 ('balloon', 1)]

In [14]:
# Map phase: tokenize every document that has not been processed yet, one
# document per round trip, and store the (word, 1) pairs back on the document.
unprocessed_filter = {'processed' : {'$exists': False}}
doc = corpus_db.documents.find_one(unprocessed_filter)
while doc:
    id_filter = { '_id' : doc['_id'] }
    tokens = list(MAPPER(doc['sentence']))
    # Setting 'processed' takes this document out of unprocessed_filter, so
    # the next query moves on to a different document.
    update = { '$set' : {'tokens' : tokens, 'processed' : 'tokenized'} }
    corpus_db.documents.update_one(id_filter, update)
    # Re-query with the same filter that started the loop (previously an
    # ad-hoc {'tokens': {'$exists': False}} filter was used here, which could
    # drift out of sync with unprocessed_filter).
    # find_one returns None once no unprocessed document remains, and a None
    # `doc` is what ends the while loop.
    doc = corpus_db.documents.find_one(unprocessed_filter)

In [15]:
corpus_db.documents.find_one()

{'_id': ObjectId('5a91c4d357afe300612de3a6'),
 'processed': 'tokenized',
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon',
 'tokens': [['In', 1],
  ['the', 1],
  ['great', 1],
  ['green', 1],
  ['room', 1],
  ['There', 1],
  ['was', 1],
  ['a', 1],
  ['telephone', 1],
  ['And', 1],
  ['a', 1],
  ['red', 1],
  ['balloon', 1]]}

# `COLLECTOR`

In [16]:
def COLLECTOR(document, vocabulary):
    """Collect step: push one document's tokens into Redis.

    Parameters
    ----------
    document : dict
        A document carrying a ``'tokens'`` list whose items are
        ``[word, count]`` pairs.
    vocabulary : str
        Key of the Redis set that accumulates every distinct word seen.

    For each token the word is added to the ``vocabulary`` set, and the count
    is appended to the right of the Redis list keyed by the word itself, so a
    word seen twice ends up stored as ``word -> [1, 1]``.
    """
    # Iterate the document that was passed in. The earlier version read the
    # notebook-global `doc`, which only worked when the caller happened to
    # pass that same object.
    for token in document['tokens']:
        word, *counts = token
        # A set keeps only the unique words.
        REDIS.sadd(vocabulary, word)
        # Equivalent to REDIS.rpush(*token): append on the right (vs. lpush).
        REDIS.rpush(word, *counts)

In [17]:
# Match documents whose 'processed' field is exactly 'tokenized'.
tokenized_filter = {'processed' : 'tokenized'}
# Fetch one matching document (None if there is none); the trailing bare
# expression displays it as the cell output.
doc = corpus_db.documents.find_one(tokenized_filter)
doc

{'_id': ObjectId('5a91c4d357afe300612de3a6'),
 'processed': 'tokenized',
 'sentence': 'In the great green room There was a telephone And a red balloon',
 'title': 'Goodnight, Moon',
 'tokens': [['In', 1],
  ['the', 1],
  ['great', 1],
  ['green', 1],
  ['room', 1],
  ['There', 1],
  ['was', 1],
  ['a', 1],
  ['telephone', 1],
  ['And', 1],
  ['a', 1],
  ['red', 1],
  ['balloon', 1]]}

In [18]:
# Collect phase: push every tokenized document's tokens into Redis, then mark
# the document as counted so that tokenized_filter no longer matches it.
# The update is the same for every document, so build it once.
update = { '$set' : {'processed' : 'counted'} }
while doc:
    id_filter = { '_id' : doc['_id'] }
    COLLECTOR(doc, 'corpus_vocab')
    corpus_db.documents.update_one(id_filter, update)
    # find_one returns None once no tokenized document is left, ending the loop.
    doc = corpus_db.documents.find_one(tokenized_filter)

In [21]:
# Read back every member of the 'corpus_vocab' Redis set.
# NOTE(review): members come back as bytes (see the b'...' values displayed
# below) and in no particular order, so this 20-item preview is not stable
# across runs.
vocabulary = REDIS.smembers('corpus_vocab')
list(vocabulary)[:20]

[b'chairs',
 b'place',
 b'came',
 b'once',
 b'green',
 b'small',
 b'amazing',
 b'pick',
 b'walked',
 b'could',
 b'himself',
 b'over',
 b'All',
 b'went!',
 b'Max',
 b'climb"',
 b'hug',
 b'brown',
 b'house',
 b'bears']

In [20]:
# Show the full Redis list (index 0 through -1, i.e. every element) stored
# under each of five vocabulary words: one entry per recorded occurrence.
for word in list(vocabulary)[:5]:
    print(REDIS.lrange(word, 0, -1))

[b'1', b'1', b'1']
[b'1', b'1']
[b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'1']
[b'1', b'1', b'1']
[b'1', b'1']


# `REDUCER`

In [22]:
def REDUCER(word):
    """Reduce step: total the counts stored in the Redis list keyed by ``word``.

    Every element of the list is converted with ``int`` and added up; a key
    with no elements gives 0.
    """
    total = 0
    for raw_count in REDIS.lrange(word, 0, -1):
        total += int(raw_count)
    return total

In [23]:
# Pair every vocabulary word (decoded from bytes to str) with its total count.
word_counts = [(raw_word.decode(), REDUCER(raw_word)) for raw_word in vocabulary]

In [24]:
# Most frequent words first; words with equal counts keep their existing
# relative order (the sort is stable).
word_counts = sorted(word_counts, key=lambda pair: pair[1], reverse=True)

In [25]:
word_counts[:10]

[('and', 98),
 ('the', 97),
 ('a', 56),
 ('And', 41),
 ('said', 31),
 ('to', 30),
 ('he', 30),
 ('was', 24),
 ('tree', 21),
 ('of', 20)]