## Parse data files from PushShift.io for selected subreddits using PyWren

Importing the necessary packages.

In [3]:
import json
import bz2
import pywren

Importing PyWren and configuring PyWren appropriately.

In [4]:
# Creating appropriate pywren buckets
! pywren create_config 
! pywren create_role
! pywren create_bucket
! pywren deploy_lambda

Traceback (most recent call last):
  File "/Users/angelicabosko/anaconda3/bin/pywren", line 8, in <module>
    sys.exit(main())
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/pywren/scripts/pywrencli.py", line 711, in main
    return cli() # pylint: disable=no-value-for-parameter
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/angelicabosko/anaconda3/lib/python3.7/site-packages/click/core.py", line 610, in invoke
    ret

Listing Subreddits to pull data from.

In [None]:
target_subreddits = ['TheRedPill', 'Feminism', 'technews']

Creating a function to grab all the comments for the target subreddits.

In [None]:
# comment parser for single file (month)
# Code adapted from: https://github.com/AhmedSoli/Reddit-Politics/blob/master/01_Content_Analysis/PreProcessing/.ipynb_checkpoints/010_ExtractCommentsTextCorpus-checkpoint.ipynb
from IPython.display import display, clear_output
import json
import os
import multiprocessing
import pickle
import bz2

def extract_comments(file_path):
    '''
    '''
    month_comments = {}
    for sub in target_subreddits:
        month_comments[sub] = []
        
    if '.bz2' in file_path:
        o = bz2.open

    with o(file_path,'rt') as comments_info:
            for i,comment in enumerate(comments_info):
                # load comment into json object
                comment = json.loads(comment)
#               # checking if comment is located in desired subreddit 
                if comment['subreddit'] in target_subreddits:
                    # append the body of the comment and time created
                    month_comments[comment['subreddit']].append((comment['body'], comment['created_utc']))

                    
    # Returning the month_comments dictionary
    return month_comments

Using PyWren to parallelize comment scraping per month. 

In [None]:
pushshift_files = ['RC_2016-01.bz2', 'RC_2016-02.bz2', 'RC_2016-03.bz2',
                   'RC_2016-04.bz2', 'RC_2016-05.bz2', 'RC_2016-06.bz2',
                   'RC_2016-07.bz2', 'RC_2016-08.bz2', 'RC_2016-09.bz2',
                   'RC_2016-10.bz2', 'RC_2016-11.bz2', 'RC_2016-12.bz2']

pwex = pywren.default_executor()
futures = pwex.map(func = extract_comments, iterdata = pushshift_files)

monthly_files = pywren.get_all_results(futures)

## Parse data files from PushShift.io for selected subreddits using local computer

Code is adapted from:

https://github.com/AhmedSoli/Reddit-Politics/blob/master/01_Content_Analysis/PreProcessing/.ipynb_checkpoints/010_ExtractCommentsTextCorpus-checkpoint.ipynb

In [5]:
# Subreddits we are pulling data from
target_subreddits = ['TheRedPill', 'Feminism', 'technews']

In [17]:
from IPython.display import display, clear_output
import json
import os
import multiprocessing
import pickle
import bz2

def extract_comments(file_path):
    '''
    This function extracts all of the comments necessary from
    each subreddit from the file path given.
    '''
    month_comments = {}
    for sub in target_subreddits:
        month_comments[sub] = []
    if '.bz2' in file_path:
        o = bz2.open

    with o(file_path,'rt') as comments_info:
            for i,comment in enumerate(comments_info):
                # load comment into json object
                comment = json.loads(comment)
                # check if comment is contained within desired subreddit
                if comment['subreddit'] in target_subreddits:
                    # append the body of the comment and the time created
                    month_comments[comment['subreddit']].append((comment['body'], comment['created_utc']))
                
                 # display progress
                if i % 1000000 == 0:
                    clear_output(wait=True)
                    print(file_path,i)
                    
    # serialize the result for this month through pickling
    pickle_out = open("reddit_comments/comments_corpus_" + file_path[11:-4] + ".pickle","wb")
    pickle.dump(month_comments, pickle_out)
    pickle_out.close()
    

In [18]:
# Running Locally: January 2016
import time

start = time.time()
extract_comments('push_shift/RC_2016-01.bz2')
end_time = time.time() - start

end_time

push_shift/RC_2016-01.bz2 61000000


1099.5419058799744

In [19]:
# Opening the pickle file: January 2016
Jan_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-01.pickle","rb"))

In [20]:
Jan_2016

{'TheRedPill': [('Why the fuck would you be friends with an ex? ', 1451606442),
  ('Great story full of RP truths. Thanks. Consider making a post on the TRP front page someday.',
   1451606479),
  ('Lets suss this out. \n\nFor one; she\'s either got a high paying job or a clueless Beta doormat in her pocket. Either way, the "Resource" box is checked. \n\nWhich means the only male she needs is a big, strong bull of a man. You won\'t find those in Silicon Valley. \n\nThis effect will become more pronounced with time. The typical AF/BB system only works when men of both categories are accessible to women. Soon women will begin out-earning men en masse  just as society begins to winnow out any sign of masculine behavior in male kids and teenages. \n\n Which means women will be on a ruthless hunt for Alphas, and with their resouce needs addressed they will totally ignore any man with even moderate Beta tendencies. Emily Holt is merely the vanguard of this dynamic. ',
   1451606497),
  ('Sou

In [21]:
# Running Locally: February 2016

extract_comments('push_shift/RC_2016-02.bz2')

push_shift/RC_2016-02.bz2 59000000


In [22]:
# Opening the pickle file: February 2016
Feb_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-02.pickle","rb"))

In [23]:
Feb_2016

{'TheRedPill': [('[removed]', 1454284808),
  ("&gt;**Self Perception = Frame**\n\n&gt;Frame is the ego boundry which you set for yourself\n\nThanks for the insight, I didn't consider the obvious parallels. The way I see it, holding frame is acknowledging your image and not allowing it to shatter.",
   1454284924),
  ('[removed]', 1454285018),
  ('redpillschool is the reason that when people hear "the red pill" they think of misogyny instead of The Matrix\n\n(although the manosphere did use the term \'red pill\' before it was used here, but it was a more general term, and its much more well known from the reddit version now)',
   1454285092),
  ('No... She wants the AF to reveal himself as a BB to justify a branch swing to a more valuable AF',
   1454285210),
  ("You're getting ahead of yourself.  \n\nDon't talk.  Until you learn what to talk about, say nothing. Better to be quiet and thought a fool that to speak and remove all doubt.   If you repeatedly find yourself saying stupid shit

In [24]:
# Running Locally: March 2016

extract_comments('push_shift/RC_2016-03.bz2')

push_shift/RC_2016-03.bz2 63000000


In [25]:
# Opening the pickle file: March 2016
Mar_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-03.pickle","rb"))

In [26]:
Mar_2016

{'TheRedPill': [('She threw that threat around numerous times. Dodged a bullet.',
   1456790502),
  ('Thanks for this. Really explains alot', 1456790550),
  ("But, it's OK to put 20 rounds into his face? Nope, sorry, you're wrong on this one. We have squads of  killers, professional, to be sure, but, they are out in the boonies killing. Not killing professional officers, in dress blues, but, insurgents/guerillas/rebels. Taking a whiz on a dead enemy is quite within bounds.\n\n Now, if they did it at a US-Taliban banquet, that may be out of line...",
   1456790573),
  ('This is great. The cons of crazy ass thoughts are still more appealing than a crazy ass bitch. ',
   1456790610),
  ('Dr. Phil, make way for your newest guest.', 1456790918),
  ("&gt; Pissing on a corpse is disgusting behaviour.\n\nHow does this rank with putting some bullets into the guy? You are missing the whole point. The soldiers do it to show contempt for the pissee. Like the Somalians who were dragging the bodies 

In [27]:
# Running Locally: April 2016

extract_comments('push_shift/RC_2016-04.bz2')

push_shift/RC_2016-04.bz2 64000000


In [28]:
# Opening the pickle file: April 2016
Apr_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-04.pickle","rb"))

In [29]:
Apr_2016

{'TheRedPill': [("I think you'd be surprised at just how many powerful and influential alpha men have taken engaged in questionable sexual acts. Ever hear of Bohemian Grove? ",
   1459468846),
  ('Definitely.  My friend references Tarzan when talking about this kind of thing.  He calls it "swinging from vine to vine".  Most women will never let go of one thing, unless they feel like something else is certain.  \n\nI would have played that shit the same way he did, just out of my natural arrogance to think, "fuck that, I\'m allowing some FWB to come into my house."  But I didn\'t think of it as a shit test, which it was.  She was testing him for neediness, which in hypergamy translates to weakness.  \n\nGood catch man.',
   1459468847),
  ('I agree with the message kinda but this comes off as BP garbage.',
   1459468873),
  ('[deleted]', 1459468894),
  ('This is exactly why "nice guys" aren\'t nice at all.  They are trying to leverage friendship into sex.  Its disgusting to men and wome

In [30]:
# Running Locally: May 2016

extract_comments('push_shift/RC_2016-05.bz2')

push_shift/RC_2016-05.bz2 65000000


In [31]:
# Opening the pickle file: May 2016
May_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-05.pickle","rb"))

In [32]:
May_2016

{'TheRedPill': [("Interesting mindset - guy's job: chase, girl's job: resist. I feel like chasing comes off as needy though. Perhaps you mean a guy's job is to advance, and the girl's job is to resist?\n\nReflecting, my actions must come from a scarcity mindset. *If I fuck this up, I have no other pussy to get at*, instead of an abundance mindset - *If she doesn't want my cock, then fuck it, because these 5 other plates will*",
   1462060919),
  ('Weird, I was just talking about this topic to a buddy last night.',
   1462060997),
  ("While you're at it, pick up [Thinking Fast and Slow](https://en.wikipedia.org/wiki/Thinking,_Fast_and_Slow). It'll make you infinitely more aware of your day to day decision making. It's one of those things that you can't unsee.",
   1462061068),
  ("I've been in interviews that asked a lot of these kinds of questions.  Its to gauge your ability to handle stress and the unexpected.\n\nI hate them. They prove nothing.",
   1462061321),
  ('&gt; Got rid of m

In [33]:
# Running Locally: June 2016

extract_comments('push_shift/RC_2016-06.bz2')

push_shift/RC_2016-06.bz2 65000000


In [34]:
# Opening the pickle file: June 2016
Jun_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-06.pickle","rb"))

In [35]:
Jun_2016

{'TheRedPill': [("Yeah it's really very simple.\n\nWomen want the same masculine polarity we want, but say they want the opposite.\n\n&amp;nbsp;",
   1464739310),
  ("Including soda cans. Don't drink lacroix", 1464739310),
  ("Short is different from average height; you're still fairly tall-",
   1464739323),
  ('I read your article. The gist of it seems to be to just use negs to knock her self esteem down. ',
   1464739345),
  ('[removed]', 1464739394),
  ("This is Redpill gold.  Especially the woman that had the affair, even though she knew he would eventually leave her, and even defined her life before him, and after him.\n\nSo much in the article.  One woman decided to 'take risks' with birth control. Lol, I wonder what that means?",
   1464739408),
  ('[deleted]', 1464739441),
  ('how dare men deprive these women the chance to divorce them.', 1464739441),
  ("There's an interesting study done with a rigged game of Monopoly and shows how those who have advantages tend to downplay t

In [36]:
# Running Locally: July 2016

extract_comments('push_shift/RC_2016-07.bz2')

push_shift/RC_2016-07.bz2 66000000


In [37]:
# Opening the pickle file: July 2016
Jul_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-07.pickle","rb"))

In [38]:
Jul_2016

{'TheRedPill': [("I think you missed the concept of spreading roots while also growing branches. One is just as Portland as the other. What you're describing is growing roots but not growing branches. The only way this post won't help someone is if they don't understand it fully.",
   1467331276),
  ('So you are Cuban American American.', 1467331276),
  ("Most of us are following Law 38. Hell, most decently intelligent east-european / west-asians I've talked to do so. \n\nThey will only take off that mask when they can be absolutely sure that no consequence sare going to arise from *going offroad*.\n\nThe majority of people not following Law 38 are the elderly who see through the kind of shit the govt. is feeding us right away - Natural RPers, if you will. However, the majority of aforementioned people (the ones I got to meet anyway) hang out in bars and are piss drunk by lunch... so theres that.\n\nSource: Born and raised in Germany",
   1467331277),
  ('&gt; any playful busting of ch

In [39]:
# Running Locally: August 2016

extract_comments('push_shift/RC_2016-08.bz2')

push_shift/RC_2016-08.bz2 69000000


In [40]:
# Opening the pickle file: August 2016
Aug_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-08.pickle","rb"))

In [41]:
Aug_2016

{'TheRedPill': [('[removed]', 1470009642),
  ("Either you lack life experience or you're not very intelligent. One of those factors is likely the reason this sounds weird to you. ",
   1470009717),
  ('Yes, this.  OP actually validated the fallen brother\'s suicide by valuing his shit relationship over the duty to properly honor a friend\'s memory.  Attending funerals helps the living- it validates the agonizing grief of those closest to the deceased.  By placing his worthless woman\'s "FEELINGS" above his friend\'s family\'s feelings, he proved to possess a desperation for validation that probably borders on pathological co-dependence.  \n\nOP should not leave this site, he need it more than anyone.  But he should read the sidebar, three times fully and completely, before posting even a question, much less a submitted read.  Because otherwise, other noobs are comparatively going to think they are heroes for insisting on going to... funerals.',
   1470009723),
  ("&gt; I have never hea

In [42]:
# Running Locally: September 2016

extract_comments('push_shift/RC_2016-09.bz2')

push_shift/RC_2016-09.bz2 67000000


In [43]:
# Opening the pickle file: September 2016
Sep_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-09.pickle","rb"))

In [44]:
Sep_2016

{'TheRedPill': [('[removed]', 1472688338),
  ('I saw [this](http://religionnews.com/2016/08/30/how-the-christian-masculinity-movement-is-ruining-men) garbage posted yesterday.',
   1472688811),
  ("Online dating can be a useful tool.  If only 1 out of 10 responds, so what?  Message 20 girls then, and get a couple of replies.  Ask if she can meet you someplace you're going anyway, like the bar you have a drink at on the way home from work, or the coffee shop you go to on the way to work.  Sometimes the girls you meet will be catfish, sometimes not.  With the slut epidemic going on, most of them will suck your dick on the first date.\n\nWith dating sites taking on tinder-like features, it makes it really easy.  You have a stack of girls nearby who say they think you're attractive.  Cool.  Ask a couple to meet up with you.  Bang a new one easily every week with minimal effort.\n\nWith all the practice you'll get, eventually you'll become completely desensitized to the shame normally felt 

In [45]:
# Running Locally: October 2016

extract_comments('push_shift/RC_2016-10.bz2')

push_shift/RC_2016-10.bz2 71000000


In [46]:
# Opening the pickle file: October 2016
Oct_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-10.pickle","rb"))

In [47]:
Oct_2016

{'TheRedPill': [('[removed]', 1475280161),
  ("You're on to something here", 1475280176),
  ('[removed]', 1475280270),
  ("&gt;But her illustrations make these guys look – well – disgusting really.\n\nI didn't think so. I thought they just looked like average dudes.",
   1475280273),
  ("Reading this makes me wonder went so fucking wrong in my life that I can't understand all of this social interaction stuff naturally without being told each explicit detail of the science behind it. Brilliant took either way.",
   1475280316),
  ('Very astute. \n\nWithout the *cheat codes*, one thinks he needs to convince her to have sex.\n\nTruthfully, womyns be dying to get half naked and titliate a dominant man. Its their point of existence and they derive value solely from the resources thet persuade men to give them',
   1475280343),
  ("I have an 8 inch cock and smash her cervix.  I talk dirty and strangle her, while also staying ripped.\n\nI'm not worried about my sex life",
   1475280417),
  ('

In [48]:
# Running Locally: November 2016

extract_comments('push_shift/RC_2016-11.bz2')

push_shift/RC_2016-11.bz2 71000000


In [49]:
# Opening the pickle file: November 2016
Nov_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-11.pickle","rb"))

In [50]:
Nov_2016

{'TheRedPill': [('[deleted]', 1477958405),
  ("&gt;And they know, they fucking — know — they have no idea, somewhere, deep down in that lizard brain inside her pretty little head.\n\nThis in spades.  The more RP I get, the more women just do WTF I tell them.  Not all women of course.  Not strangers.  Women that I have some sort of relationship.  Doesn't have to be sexual or romantic, but some sort of established thing.\n\nMan, if I'd known that in school, all that poon.  Sigh.",
   1477958429),
  ('Overall great post, but your first point, in my opinion, can be altered slighty. The bulk of the exercises you do should be heavier barbell exercises, but lighter dumbbell work is also beneficial when applied correctly.\n\nThat being said, the majority of "alternative" training are useless gimmicks.',
   1477958440),
  ('Fuck off with your politics. This is TRP not TheDonald.', 1477958445),
  ("Not trying to white knight but you can't be serious", 1477958557),
  ("Stop trying to reason your 

In [56]:
# Running Locally: December 2016

extract_comments('push_shift/RC_2016-12.bz2')

push_shift/RC_2016-12.bz2 0


OSError: Invalid data stream

In [None]:
# Opening the pickle file: December 2016
Dec_2016 = pickle.load(open("reddit_comments/comments_corpus_RC_2016-12.pickle","rb"))

In [None]:
Dec_2016

### Storing all comments into one corpus

In [38]:
comments_corpus = {}
# Keep file pickleing for storage reasons. 
# Storing all monthly comments into one corpus for each individual subreddit
for file in sorted(push_shift):
    comments_corpus_temp = pickle.load(open("reddit_commments/comments_corpus_" + file[11:-4] + ".pickle","rb")) # loading in monthly files without file type
    for subreddit in comments_corpus_temp:
        if subreddit not in comments_corpus: # some selected subreddits might not be present in files
            comments_corpus[subreddit] = []
        # Add comment to entire corpus of comments for a single subreddit. This might be why we want to collect utc, incase we want to run time series on the data to map key events
        comments_corpus[subreddit].extend(
            comments_corpus_temp[subreddit]
        )
        print(subreddit,len(comments_corpus[subreddit]))
                        
pickle_out = open("reddit_comments/comments_corpus.pickle","wb")
pickle.dump(comments_corpus, pickle_out)
pickle_out.close()

NameError: name 'push_shift' is not defined