In [97]:
import pandas as pd
import numpy as np
import os
import json
import re
from progressbar import ProgressBar
from multiprocessing import Process, Queue
import threading
import enchant
import pickle

In [34]:
DATA_ROOT = '/Users/ivan/Documents/sp_17/reddit_data'
DATA_YEARS = ['2007']

RAW_DATA_FILES = [os.listdir(os.path.join(DATA_ROOT, 'raw_data', year)) for year in DATA_YEARS]

RAW_DATA_ABS_FILES = []

for i in range(len(DATA_YEARS)):
    for j in range(len(RAW_DATA_FILES[i])):
        if RAW_DATA_FILES[i][j].startswith('.'):
            pass
        else:
            RAW_DATA_ABS_FILES.append( os.path.join(DATA_ROOT, 'raw_data' , DATA_YEARS[i], RAW_DATA_FILES[i][j]))
RAW_DATA_FILES = RAW_DATA_ABS_FILES
RAW_DATA_ABS_FILES = []

In [13]:
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE   = re.compile(r"\d")

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w for w in words if w]

In [109]:
def add_to_word_dictionary(split_sentence, dictionary):
    for word in split_sentence:
        if word.lower() in dictionary:
            dictionary[word.lower()] += 1
        else:
            dictionary[word.lower()] = 1

In [98]:
def process_file(file, word_q):
    print("Current PID: " + str(os.getpid()))
    word_dict = {}
    pbar = ProgressBar()
    with open(file) as f:
        for line in pbar(f):
            add_to_word_dictionary(basic_tokenizer(json.loads(line)['body']), word_dict)
    word_q.put(word_dict)

## Testing multiple processes

In [55]:
word_dicts = []
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    word_dict = {}
    p = Process(target=process_file, args=(file, word_dict,))
    p.start()
    word_dicts.append(word_dict)

Current PID: 14700
Current PID: 14699
Current PID: 14701


| 663 Elapsed Time: 0:00:33                                                    
| 1307 Elapsed Time: 0:01:06                                                   
| 1370 Elapsed Time: 0:01:09                                                   


In [62]:
word_dicts = []
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    print("Thread Count: %i" % threading.active_count())
    word_dict = {}
    p = Process(target=process_file, args=(file, word_dict,))
    p.start()
    word_dicts.append(word_dict)

Thread Count: 5
Thread Count: 5
Thread Count: 5
Current PID: 14856
Current PID: 14857
Current PID: 14855


\ 630 Elapsed Time: 0:00:31                                                    
| 1066 Elapsed Time: 0:00:54                                                   
| 1110 Elapsed Time: 0:00:56                                                   


1. All cores running at about 90%
2. Takes 0:33, 1:06, 1:09 for the three to be done.
3. Brandon is done in 15 seconds! 

## Using just one process

In [63]:
print("Current PID: " + str(os.getpid()))
word_dict = {}
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    with open(file) as f:
        for line in pbar(f):
            add_to_dictionary(basic_tokenizer(json.loads(line)['body']), word_dict)

/ 4 Elapsed Time: 0:00:00                                                      

Current PID: 12842


| 281 Elapsed Time: 0:00:14                                                    
| 847 Elapsed Time: 0:00:42                                                    
| 1449 Elapsed Time: 0:01:12                                                   


1. Cores running at about 35 % average usage.
2. Takes 0:12, 0:40, 1:11 for the tree to be done.
3. Using the threading package we see that in this one process there are 5 threads running. 
4. Brandons computer takes 0:06, 0:20, 0:35 secnods to complete

## Trying to generate scores

1. Create a dictionary that has KEY: name, value: score

## Using one process

In [81]:
def comment_score(comment, word_freq):
    if type(comment) == str:
        comment = basic_tokenizer(comment)
    word_count = len(comment)
    score = 0
    for word in comment:
        if not d.check(word):
            try: 
                score = score + 1.0/word_freq[word]
            except ZeroDivisionError:
                score = score + 1.0
    try:
        return score / word_count
    except ZeroDivisionError:
        return 1

In [83]:
score_dict = {}
pbar = ProgressBar()
d = enchant.Dict('en_US')
for file in RAW_DATA_FILES:
    print(threading.active_count())
    with open(file) as f:
        for line in pbar(f):
            data = json.loads(line)
            score_dict[data['name']] = comment_score((data['body']), word_dict)

| 3 Elapsed Time: 0:00:00                                                      

5


| 3223 Elapsed Time: 0:02:48                                                   
| 3226 Elapsed Time: 0:02:48                                                   

5


| 9127 Elapsed Time: 0:08:05                                                   
| 9131 Elapsed Time: 0:08:05                                                   

5


| 16389 Elapsed Time: 0:14:23                                                  


## Pickling the data

In [92]:
with open(os.path.join(DATA_ROOT, 'compressed_data' , '07_08' , 'score_dict'), 'wb') as pickle_file:
    pickle.dump(score_dict, pickle_file)
    
with open(os.path.join(DATA_ROOT, 'compressed_data' , '07_08' , 'word_dict'), 'wb') as pickle_file:
    pickle.dump(word_dict, pickle_file)

## Using multiple processes

In [93]:
def process_file_comments(file, score_dicts):
    print("Current PID: " + str(os.getpid()))
    score_dict = {}
    with open(file) as f:
        for line in pbar(f):
            data = json.loads(line)
            score_dict[data['name']] = comment_score((data['body']), word_dict)
    score_dicts.append(score_dict)

In [95]:
score_dicts = []
pbar = ProgressBar()
d = enchant.Dict('en_US')
for file in RAW_DATA_FILES:
    print(threading.active_count())
    p = Process(target=process_file_comments, args=(file, score_dicts,))
    p.start()
    

5
5
5
Current PID: 15694
Current PID: 15693
Current PID: 15695


| 6191 Elapsed Time: 0:05:35                                                   
| 10476 Elapsed Time: 0:09:42                                                  
| 11381 Elapsed Time: 0:10:07                                                  


1. Using multiple process in this case is actually noticeable.
2. Since there is more work here than just IO, which is probably the bottleneck in the previous test(converting to words).
3. 14:23 -> 10:07, so a non-negligible improvement.
4. The code is not fully functional since the number score_dicts list is empty.

## Working with Queue objects to store return value

In [106]:
word_q = Queue()
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    p = Process(target=process_file, args=(file, word_q,))
    p.start()

Current PID: 17214
Current PID: 17213
Current PID: 17212


| 611 Elapsed Time: 0:00:31                                                    
| 988 Elapsed Time: 0:00:50                                                    
| 1033 Elapsed Time: 0:00:52                                                   


In [None]:
print(word_q.get())

## Generating a dictionary of sentences

### Using multiple processes

In [117]:
def process_file_sentences(file, q):
    print("Current PID: " + str(os.getpid()))
    sentence_dict = {}
    pbar = ProgressBar()
    with open(file) as f:
        for line in pbar(f):
            data = json.loads(line)
            sentence_dict[data['name']] = (data['body'])
    q.put(sentence_dict)

In [118]:
sentence_q = Queue()
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    p = Process(target=process_file_sentences, args=(file, sentence_q,))
    p.start()

Current PID: 17487
Current PID: 17488


/ 0 Elapsed Time: 0:00:00                                                      

Current PID: 17489


| 170 Elapsed Time: 0:00:08                                                    
| 307 Elapsed Time: 0:00:15                                                    
| 316 Elapsed Time: 0:00:16                                                    


## Using one process

In [123]:
sentence_dict = {}
pbar = ProgressBar()
for file in RAW_DATA_FILES:
    with open(file) as f:
        for line in pbar(f):
            data = json.loads(line)
            sentence_dict[data['name']] = (data['body'])

| 67 Elapsed Time: 0:00:03                                                     
| 237 Elapsed Time: 0:00:11                                                    
| 402 Elapsed Time: 0:00:20                                                    


1. Dealing with merging the results from using multiple processes takes longer than just running one process. 

In [126]:
with open(os.path.join(DATA_ROOT, 'compressed_data' , '07_08' , 'sentence_dict'), 'wb') as pickle_file:
    pickle.dump(sentence_dict, pickle_file)

## Using the sentence dictionary to generate the word_dict and sentence_dict

In [129]:
new_word_dict = {}
pbar = ProgressBar()
for sentence in pbar(sentence_dict.keys()):
    add_to_word_dictionary(basic_tokenizer(sentence_dict[sentence]), new_word_dict)

100% (886802 of 886802) |#################| Elapsed Time: 0:01:07 Time: 0:01:07


In [130]:
tokenized_sentence_dict = {}
pbar = ProgressBar()
for sentence in pbar(sentence_dict.keys()):
    tokenized_sentence_dict[sentence] = basic_tokenizer(sentence_dict[sentence])

100% (886802 of 886802) |#################| Elapsed Time: 0:01:03 Time: 0:01:03


In [132]:
new_word_dict = {}
pbar = ProgressBar()
for sentence in pbar(tokenized_sentence_dict.keys()):
    add_to_word_dictionary(tokenized_sentence_dict[sentence], new_word_dict)

100% (886802 of 886802) |#################| Elapsed Time: 0:00:43 Time: 0:00:43
