In [46]:
import pandas as pd
import datetime as dt
import time
import requests

In [158]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 2):
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

In [159]:
investing = query_pushshift('investing')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&size=500&after=60d
Query Complete!


In [156]:
bogleheads = query_pushshift('Bogleheads')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=180d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=210d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=240d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Bogleheads&size=500&after=270d
Querying from: https://api.pushshift.io/reddit/se

r/Investing and r/Bogleheads are asymmetrical in size. I had to adjust the number of iterations to achieve equal data sets for the two subreddits. After much consdieraton I decided that having such different timeframes for the two is ok as the two mainly discuss investing fundamentals and the idea of "staying the course" in investing. If these had been subreddits dedicated to growth stocks or short term gains it may have had a greater affect on my results. 

In [157]:
#extended query to go back (make sure the two subreddits are balanced). 
bogleheads.shape

(1029, 9)

In [160]:
investing.shape

(1000, 9)

In [161]:
investing.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,"Free market capitalism, I’m Gen Z and honestly...",[removed],investing,1584573611,KellyCriterionHedged,13,0,True,2020-03-18
1,Tell me why I shouldn’t purchase a substantial...,[deleted],investing,1584573629,[deleted],10,1,True,2020-03-18
2,Can my 401k retirement account trigger a wash ...,Hypothetical scenario:\n\nI am looking to sell...,investing,1584573685,philosophytautology,9,0,True,2020-03-18
3,GILD.. The DD is in the link... FDA message to...,\n\nThe DD is in the link.. Why the fuck shou...,investing,1584573694,TawALittlePuttyTat,2,0,True,2020-03-18
4,Good time to invest?,Do y'all think there will be a raise in the ma...,investing,1584573941,rcmiddle,10,0,True,2020-03-18


In [135]:
bogleheads.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,25 year old with extra $10k in a bear market,I’m a 25 year old who set aside $10k earlier i...,Bogleheads,1584458843,cjone236,21,1,True,2020-03-17
1,This is what Jack Bogle said in 2011 during th...,I was doing some reading today and came across...,Bogleheads,1584486268,logicson,44,3,True,2020-03-17
2,27 year old 80% in VCAIX - time to ramp up DCA...,Lazy investor wanting to eventually get to an ...,Bogleheads,1584557617,Jacks_Tortoise,7,1,True,2020-03-18
3,How do you all resist the temptation to buy in...,I've been a Boglehead for a few years now and ...,Bogleheads,1584569244,SwoleBuddha,74,3,True,2020-03-18
4,Anyone lose their job and ready to sell everyt...,"400,000k is better than no k",Bogleheads,1584576682,Dodge1992,11,3,True,2020-03-18


In [166]:
df_reddit = pd.concat(objs = [investing, bogleheads], ignore_index = True )

In [170]:
df_reddit.shape

(2029, 9)

In [171]:
df_reddit.to_csv('./data/df_reddit.csv', index = False)