In [1]:
# Load modules
import csv
import datetime as dt
import json
import os
import time

import pandas as pd
import requests
from psaw import PushshiftAPI # https://github.com/dmarx/psaw


In [2]:
# ## Just seeing how this API works...
# api = PushshiftAPI()
# gen = api.search_submissions(limit=100)
# results = list(gen)

In [3]:
# Functions

## Submissions
def crawl_page(subreddit: str, last_posttime = None):
    """Fetch one page of submissions from a subreddit via the Pushshift API.

    :param subreddit: The subreddit to crawl.
    :param last_posttime: ``created_utc`` value of the last submission already
        downloaded. When given, it is sent as the ``before`` filter so the
        request continues past that submission; ``None`` requests the first
        (newest) page.
    :return: The value stored under the ``"data"`` key of the JSON response.
    :raises Exception: If the server answers with a non-OK status code.
    """

    url = "https://api.pushshift.io/reddit/search/submission"

    # Results are requested newest-first so that paging can walk backwards in time.
    params = {"subreddit": subreddit,
              "size": 500,
              "sort": "desc",
              "sort_type": "created_utc"}

    # "Scroll down" by only asking for submissions created before the last one seen.
    if last_posttime is not None:
        params["before"] = last_posttime

    results = requests.get(url, params=params)

    if not results.ok:
        # something wrong happened
        raise Exception("Server returned status code {}".format(results.status_code))
    return results.json()["data"]


def crawl_subreddit(subreddit, max_submissions = 200000):
    """Crawl submissions from a subreddit, one page at a time.

    Pages are requested through ``crawl_page`` until an empty page comes back
    or at least ``max_submissions`` submissions have been collected.

    :param subreddit: The subreddit to crawl.
    :param max_submissions: The maximum number of submissions to download.
    :return: A list of at most ``max_submissions`` submissions.
    """

    all_submissions = [] # accumulates the submissions of every page
    # None requests the first page; afterwards this holds the created_utc of
    # the last submission on the most recent page, used as the paging cursor.
    last_posttime = None

    while len(all_submissions) < max_submissions:
        current_submissions = crawl_page(subreddit, last_posttime)
        if len(current_submissions) == 0:
            # an empty page means there is nothing further to fetch
            break
        last_posttime = current_submissions[-1]["created_utc"]
        all_submissions += current_submissions

        #time.sleep(3)

        if len(all_submissions) % 10000 == 0: # to track progress for big pulls
            print(len(all_submissions))
    # The last page may overshoot the limit, so trim to the requested size.
    return all_submissions[:max_submissions]

## Comments
def crawl_comments(subreddit, max_comments = 10000000):
    """Download every comment of a subreddit through psaw.

    :param subreddit: The subreddit to crawl.
    :param max_comments: Intended cap on the number of comments. It is
        currently NOT enforced: the whole result stream is consumed.
    :return: A pandas data frame with one row per comment, built from each
        comment object's ``d_`` attribute.
    """
    client = PushshiftAPI()
    collected = []

    # Drain the search results, reporting progress every 10000 comments.
    for count, comment in enumerate(client.search_comments(subreddit = subreddit), start = 1):
        collected.append(comment)
        if count % 10000 == 0:
            print(count)

    # Turn the comment objects into plain records for the data frame.
    records = [item.d_ for item in collected]
    return pd.DataFrame(records)





In [4]:
### SUBMISSIONS ### 

api = PushshiftAPI() 

url = "https://api.pushshift.io/reddit/search/submission"

# Get number of submissions in entire subreddit
# requests.get(url, params = {"subreddit": "TheRedPill", "size": 0, "aggs" : "subreddit"}).json()["aggs"]
# Result (May 15th): 112,196 posts in the entire subreddit


# Get submissions
submissions = crawl_subreddit("TheRedPill")

# Date used to label the output file: the day before the crawl finished.
# `today` must be defined here, otherwise this cell fails on a fresh kernel.
today = dt.date.today()
yesterday = today - dt.timedelta(days = 1)

# Save data as .json
os.chdir("/Users/mariajoseherrera/Documents/Admin/yahb/Turing Institute/trpred/data/raw/submissions")# change wd
filename = "submissions-" + str(yesterday) + ".json" # create filename 

with open(filename, 'w', encoding='utf-8') as f: # write file
    json.dump(submissions, f, ensure_ascii = False, indent=4)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000


<function BufferedWriter.close>

In [5]:
# Sanity check: number of submissions held in `submissions`
len(submissions)

112207

In [None]:
### COMMENTS ###

# Get number of comments in entire subreddit
#requests.get(url, params = {"subreddit": "TheRedPill", "size": 0, "aggs" : "subreddit"}).json()["aggs"]
# Result (May 15th): 3022067

# Get comments
df_comments = crawl_comments('TheRedPill')

# Save data as .json
os.chdir("/Users/mariajoseherrera/Documents/Admin/yahb/Turing Institute/trpred/data/raw/comments")# change wd
# `yesterday` is the date label computed in the submissions cell
filename = "comments-" + str(yesterday) + ".json" # create filename 

# Write the data frame returned by crawl_comments (the list named `comments`
# only exists inside that function).
df_comments.to_json(filename)

In [None]:
# ## ORIGINAL CODE FROM K.Ren (dissertation in progress)
# import pandas as pd
# import requests
# import json
# import csv
# import time
# import datetime as dt
# from psaw import PushshiftAPI

# url = "https://api.pushshift.io/reddit/search/submission"
# params = {"subreddit": "depressed"}
# submissions = requests.get(url, params = params)

# api = PushshiftAPI()

# gen = api.search_submissions(limit=100)
# results = list(gen)

# def crawl_page(subreddit: str, last_page = None):
#     """Crawl a page of results from a given subreddit.
#     :param subreddit: The subreddit to crawl.
#     :param last_page: The last downloaded page.
#     :return: A page or results.
#     """
#     params = {"subreddit": subreddit, "size": 500, "sort": "desc", "sort_type": "created_utc"}
#     if last_page is not None:
#         if len(last_page) > 0:
#             # resume from where we left at the last page
#             params["before"] = last_page[-1]["created_utc"]
#         else:
#             # the last page was empty, we are past the last page
#             return []
#     results = requests.get(url, params)
#     if not results.ok:
#         # something wrong happened
#         raise Exception("Server returned status code {}".format(results.status_code))
#     return results.json()["data"]

# def crawl_subreddit(subreddit, max_submissions = 200000):
#     """Crawl submissions from a subreddit.
#     :param subreddit: The subreddit to crawl.
#     :param max_submissions: The maximum number of submissions to download.
#     :return: A list of submissions.
#     """
#     submissions = []
#     last_page = None
#     while last_page != [] and len(submissions) < max_submissions:
#         last_page = crawl_page(subreddit, last_page)
#         submissions += last_page
#         #time.sleep(3)
#         if len(submissions) % 10000 == 0:
#             print(len(submissions))
#     return submissions[:max_submissions]

# def crawl_comments(subreddit, max_comments = 10000000):
    
#     gen = api.search_comments(subreddit=subreddit)
#     comments = []
#     for c in gen:
#         comments.append(c)
        
#         if len(comments) % 10000 == 0:
#             print(len(comments))
# #         # Omit this 
# #         if len(comments) >= max_comments:
# #             break
            
#     # If you really want to: pick up where we left off to get the rest of the results.
#     if False:
#         for c in gen:
#             comments.append(c)
#     df = pd.DataFrame([obj.d_ for obj in comments])
#     return df


# requests.get(url, params = {"subreddit": "SuicideWatch", "size": 0, "aggs" : "subreddit"}).json()["aggs"]

In [None]:
# Other sources:
# source: https://medium.com/@RareLoot/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563
# source: https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4

