## Parse data files from PushShift.io for selected subreddits using PyWren

Importing the necessary packages.

In [3]:
import json
import bz2
import pywren

Configuring PyWren: creating its config file, IAM role, and S3 bucket, then deploying the Lambda function (PyWren itself was imported above).

In [4]:
# Creating appropriate pywren buckets
# One-time PyWren setup, run through the shell. Going by the subcommand names,
# the steps are: write the config, create the role, create the bucket, and
# deploy the Lambda function.
# NOTE(review): the output recorded below this cell is a traceback, so at
# least one of these steps failed on the last run -- confirm the AWS
# credentials/config before relying on the PyWren section.
! pywren create_config 
! pywren create_role
! pywren create_bucket
! pywren deploy_lambda

Traceback (most recent call last):
  File "/Users/angelicabosko/anaconda3/bin/pywren", line 8, in <module>
    sys.exit(main())
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/pywren/scripts/pywrencli.py", line 711, in main
    return cli() # pylint: disable=no-value-for-parameter
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 610, in invoke
    ret

Listing the subreddits to pull data from.

In [None]:
# Subreddits whose comments are kept; every other subreddit is ignored.
target_subreddits = [
    'TheRedPill',
    'Feminism',
    'technews',
]

Creating a function to grab all the comments for the target subreddits.

In [None]:
# comment parser for single file (month)
# Code adapted from: https://github.com/AhmedSoli/Reddit-Politics/blob/master/01_Content_Analysis/PreProcessing/.ipynb_checkpoints/010_ExtractCommentsTextCorpus-checkpoint.ipynb
from IPython.display import display, clear_output
import json
import os
import multiprocessing
import pickle
import bz2

def extract_comments(file_path, subreddits=None):
    '''
    Collect the comments of the selected subreddits from one monthly dump.

    The dump is read as newline-delimited JSON, one comment object per line.
    Each object must provide the 'subreddit', 'body' and 'created_utc' keys.

    Parameters
    ----------
    file_path : str
        Path to the dump. Paths ending in '.bz2' are decompressed on the fly;
        any other path is read as plain text.
    subreddits : iterable of str, optional
        Subreddits to keep. Defaults to the notebook-level
        ``target_subreddits`` list.

    Returns
    -------
    dict
        Maps every requested subreddit to a list of ``(body, created_utc)``
        tuples in file order. Subreddits without comments map to ``[]``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a comment lacks one of the required keys.
    '''
    if subreddits is None:
        subreddits = target_subreddits
    month_comments = {sub: [] for sub in subreddits}

    # Pick the opener up front. Previously it was only bound for '.bz2' paths,
    # so any other path failed with UnboundLocalError at the `with` statement.
    opener = bz2.open if file_path.endswith('.bz2') else open

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with opener(file_path, 'rt', encoding='utf-8') as comments_info:
        for line in comments_info:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            comment = json.loads(line)
            subreddit = comment['subreddit']
            # Dict membership is O(1) and the keys are exactly the targets.
            if subreddit in month_comments:
                month_comments[subreddit].append(
                    (comment['body'], comment['created_utc'])
                )

    return month_comments

Using PyWren to parallelize comment scraping per month. 

In [None]:
# One PushShift comment dump per month of 2016: RC_2016-01.bz2 ... RC_2016-12.bz2
pushshift_files = ['RC_2016-{:02d}.bz2'.format(month) for month in range(1, 13)]

# Map extract_comments over the monthly files with PyWren's default executor,
# producing one future per file.
pwex = pywren.default_executor()
futures = pwex.map(func = extract_comments, iterdata = pushshift_files)

# Gather the per-month results from the futures.
monthly_files = pywren.get_all_results(futures)

## Parse data files from PushShift.io for selected subreddits using a local computer

Code is adapted from:

https://github.com/AhmedSoli/Reddit-Politics/blob/master/01_Content_Analysis/PreProcessing/.ipynb_checkpoints/010_ExtractCommentsTextCorpus-checkpoint.ipynb

In [5]:
# Subreddits whose comments are extracted from the PushShift dumps
target_subreddits = [
    'TheRedPill',
    'Feminism',
    'technews',
]

In [17]:
from IPython.display import display, clear_output
import json
import os
import multiprocessing
import pickle
import bz2

def extract_comments(file_path, subreddits=None):
    '''
    Extract the comments of the selected subreddits from one monthly dump
    and pickle them to disk.

    The dump is read as newline-delimited JSON, one comment object per line.
    Each object must provide the 'subreddit', 'body' and 'created_utc' keys.
    The result is a dict mapping each requested subreddit to a list of
    ``(body, created_utc)`` tuples. It is written to
    ``reddit_comments/comments_corpus_<name>.pickle``, where ``<name>`` is the
    input file name without directory or extension
    (e.g. ``push_shift/RC_2016-01.bz2`` -> ``RC_2016-01``).

    Parameters
    ----------
    file_path : str
        Path to the dump. Paths ending in '.bz2' are decompressed on the fly;
        any other path is read as plain text.
    subreddits : iterable of str, optional
        Subreddits to keep. Defaults to the notebook-level
        ``target_subreddits`` list.

    Returns
    -------
    None
        The result is only written to disk, so calling this as the last
        expression of a cell does not dump the corpus into the output.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a comment lacks one of the required keys.
    '''
    if subreddits is None:
        subreddits = target_subreddits
    month_comments = {sub: [] for sub in subreddits}

    # Pick the opener up front. Previously it was only bound for '.bz2' paths,
    # so any other path failed with UnboundLocalError at the `with` statement.
    opener = bz2.open if file_path.endswith('.bz2') else open

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with opener(file_path, 'rt', encoding='utf-8') as comments_info:
        for i, line in enumerate(comments_info):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                comment = json.loads(line)
                subreddit = comment['subreddit']
                if subreddit in month_comments:
                    month_comments[subreddit].append(
                        (comment['body'], comment['created_utc'])
                    )

            # display progress every million lines, replacing the previous output
            if i % 1000000 == 0:
                clear_output(wait=True)
                print(file_path,i)

    # Derive the month tag from the file name itself rather than from fixed
    # character offsets, which were only right for a 'push_shift/' prefix.
    month_tag = os.path.splitext(os.path.basename(file_path))[0]

    # serialize the result for this month through pickling
    os.makedirs("reddit_comments", exist_ok=True)
    out_path = "reddit_comments/comments_corpus_" + month_tag + ".pickle"
    # The context manager closes the file even if pickling fails.
    with open(out_path, "wb") as pickle_out:
        pickle.dump(month_comments, pickle_out)
    

In [18]:
# Running Locally: January 2016
# Times the extraction of one month with wall-clock timestamps.
import time

start = time.time()
# Writes reddit_comments/comments_corpus_RC_2016-01.pickle; returns nothing.
extract_comments('push_shift/RC_2016-01.bz2')
# Despite its name, end_time holds the elapsed duration in seconds.
end_time = time.time() - start

end_time

push_shift/RC_2016-01.bz2 61000000


1099.5419058799744

In [19]:
# Opening the pickle file: January 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-01.pickle", "rb") as pickle_in:
    Jan_2016 = pickle.load(pickle_in)

In [59]:
#Jan_2016

In [21]:
# Running Locally: February 2016
# Writes reddit_comments/comments_corpus_RC_2016-02.pickle; returns nothing.

extract_comments('push_shift/RC_2016-02.bz2')

push_shift/RC_2016-02.bz2 59000000


In [22]:
# Opening the pickle file: February 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-02.pickle", "rb") as pickle_in:
    Feb_2016 = pickle.load(pickle_in)

In [60]:
#Feb_2016

In [24]:
# Running Locally: March 2016
# Writes reddit_comments/comments_corpus_RC_2016-03.pickle; returns nothing.

extract_comments('push_shift/RC_2016-03.bz2')

push_shift/RC_2016-03.bz2 63000000


In [25]:
# Opening the pickle file: March 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-03.pickle", "rb") as pickle_in:
    Mar_2016 = pickle.load(pickle_in)

In [61]:
#Mar_2016

In [27]:
# Running Locally: April 2016
# Writes reddit_comments/comments_corpus_RC_2016-04.pickle; returns nothing.

extract_comments('push_shift/RC_2016-04.bz2')

push_shift/RC_2016-04.bz2 64000000


In [28]:
# Opening the pickle file: April 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-04.pickle", "rb") as pickle_in:
    Apr_2016 = pickle.load(pickle_in)

In [62]:
#Apr_2016

In [30]:
# Running Locally: May 2016
# Writes reddit_comments/comments_corpus_RC_2016-05.pickle; returns nothing.

extract_comments('push_shift/RC_2016-05.bz2')

push_shift/RC_2016-05.bz2 65000000


In [31]:
# Opening the pickle file: May 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-05.pickle", "rb") as pickle_in:
    May_2016 = pickle.load(pickle_in)

In [63]:
#May_2016

In [33]:
# Running Locally: June 2016
# Writes reddit_comments/comments_corpus_RC_2016-06.pickle; returns nothing.

extract_comments('push_shift/RC_2016-06.bz2')

push_shift/RC_2016-06.bz2 65000000


In [34]:
# Opening the pickle file: June 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-06.pickle", "rb") as pickle_in:
    Jun_2016 = pickle.load(pickle_in)

In [64]:
#Jun_2016

In [36]:
# Running Locally: July 2016
# Writes reddit_comments/comments_corpus_RC_2016-07.pickle; returns nothing.

extract_comments('push_shift/RC_2016-07.bz2')

push_shift/RC_2016-07.bz2 66000000


In [37]:
# Opening the pickle file: July 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-07.pickle", "rb") as pickle_in:
    Jul_2016 = pickle.load(pickle_in)

In [65]:
#Jul_2016

In [39]:
# Running Locally: August 2016
# Writes reddit_comments/comments_corpus_RC_2016-08.pickle; returns nothing.

extract_comments('push_shift/RC_2016-08.bz2')

push_shift/RC_2016-08.bz2 69000000


In [40]:
# Opening the pickle file: August 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-08.pickle", "rb") as pickle_in:
    Aug_2016 = pickle.load(pickle_in)

In [66]:
#Aug_2016

In [42]:
# Running Locally: September 2016
# Writes reddit_comments/comments_corpus_RC_2016-09.pickle; returns nothing.

extract_comments('push_shift/RC_2016-09.bz2')

push_shift/RC_2016-09.bz2 67000000


In [43]:
# Opening the pickle file: September 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-09.pickle", "rb") as pickle_in:
    Sep_2016 = pickle.load(pickle_in)

In [67]:
#Sep_2016

In [45]:
# Running Locally: October 2016
# Writes reddit_comments/comments_corpus_RC_2016-10.pickle; returns nothing.

extract_comments('push_shift/RC_2016-10.bz2')

push_shift/RC_2016-10.bz2 71000000


In [46]:
# Opening the pickle file: October 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-10.pickle", "rb") as pickle_in:
    Oct_2016 = pickle.load(pickle_in)

In [68]:
#Oct_2016

In [48]:
# Running Locally: November 2016
# Writes reddit_comments/comments_corpus_RC_2016-11.pickle; returns nothing.

extract_comments('push_shift/RC_2016-11.bz2')

push_shift/RC_2016-11.bz2 71000000


In [49]:
# Opening the pickle file: November 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-11.pickle", "rb") as pickle_in:
    Nov_2016 = pickle.load(pickle_in)

In [69]:
#Nov_2016

In [70]:
# Running Locally: December 2016
# Writes reddit_comments/comments_corpus_RC_2016-12.pickle; returns nothing.

extract_comments('push_shift/RC_2016-12.bz2')

push_shift/RC_2016-12.bz2 72000000


In [71]:
# Opening the pickle file: December 2016
# The context manager closes the file handle once the corpus is loaded.
with open("reddit_comments/comments_corpus_RC_2016-12.pickle", "rb") as pickle_in:
    Dec_2016 = pickle.load(pickle_in)

In [73]:
#Dec_2016

### Storing all comments into one corpus

In [90]:
push_shift = ['RC_2016-01.bz2', 'RC_2016-02.bz2', 'RC_2016-03.bz2',
                   'RC_2016-04.bz2', 'RC_2016-05.bz2', 'RC_2016-06.bz2',
                   'RC_2016-07.bz2', 'RC_2016-08.bz2', 'RC_2016-09.bz2',
                   'RC_2016-10.bz2', 'RC_2016-11.bz2', 'RC_2016-12.bz2']


comments_corpus = {}
# The monthly results stay pickled on disk for storage reasons; here they are
# merged into a single list of (body, created_utc) tuples per subreddit.
# Sorting the file names keeps the months in chronological order.
for file in sorted(push_shift):
    # file[:-4] drops the '.bz2' suffix to recover the monthly pickle's name.
    # The context manager closes each monthly file as soon as it is loaded
    # instead of leaving twelve handles open.
    with open("reddit_comments/comments_corpus_" + file[:-4] + ".pickle", "rb") as pickle_in:
        comments_corpus_temp = pickle.load(pickle_in)
    for subreddit in comments_corpus_temp:
        # Some selected subreddits might not be present in every file, so the
        # per-subreddit list is created on first sight.
        # created_utc is kept next to each comment in case a time-series
        # analysis (e.g. mapping key events) is wanted later.
        comments_corpus.setdefault(subreddit, []).extend(
            comments_corpus_temp[subreddit]
        )
        # Running total for this subreddit after adding the current month.
        print(subreddit,len(comments_corpus[subreddit]))

# Persist the merged corpus; the file is closed even if pickling fails.
with open("reddit_comments/comments_corpus_final.pickle", "wb") as pickle_out:
    pickle.dump(comments_corpus, pickle_out)

TheRedPill 48964
Feminism 2722
technews 260
TheRedPill 98519
Feminism 5528
technews 493
TheRedPill 145073
Feminism 8431
technews 757
TheRedPill 187494
Feminism 10931
technews 993
TheRedPill 232865
Feminism 14248
technews 1234
TheRedPill 281897
Feminism 16884
technews 1489
TheRedPill 325336
Feminism 19570
technews 1760
TheRedPill 366283
Feminism 22379
technews 2072
TheRedPill 411257
Feminism 25746
technews 2405
TheRedPill 451480
Feminism 28512
technews 2653
TheRedPill 492092
Feminism 31604
technews 2854
TheRedPill 535804
Feminism 35754
technews 3126
