Full documentation of the pushshift API can be found here -> https://github.com/pushshift/api

In [3]:
import requests
import praw # install this
import pandas as pd
import datetime as dt
import time
import math
from IPython.display import clear_output
import timeit
import csv
import numpy as np
import pmaw # install this
from pmaw import PushshiftAPI

Each subreddit on Reddit has submissions, and each submission has associated comments. You cannot fetch both at the same time, but you can first get all the submissions in a subreddit for a date range, and then use each submission's id to look up its comment ids and fetch those comments. So there are two steps; we show the first step here.

In [4]:
# pmaw client used for the submission and comment searches in the cells below.
api = PushshiftAPI()

# --- Query configuration -----------------------------------------------------
subreddit = 'wallstreetbets'  # subreddit whose submissions are fetched
limit = 1000000               # passed as `limit` to the submission search
comment_threshold = 10        # keep only submissions with more comments than this

# The window boundaries are pinned to UTC. Calling .timestamp() on a naive
# datetime interprets it in the machine's local timezone, which would shift the
# window by the local UTC offset and make the results machine-dependent.
start_date = dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc)  # year, month, day
end_date = dt.datetime(2020, 1, 5, tzinfo=dt.timezone.utc)    # year, month, day

# Use this if you want to count from today and get say the last 30 days of submissions
# end_date = dt.datetime.now(dt.timezone.utc)
# timespan = dt.timedelta(days=30)
# start_date = end_date - timespan
# print(start_date, '|', end_date)

after = int(start_date.timestamp())  # epoch seconds: subs after this date, i.e. start
before = int(end_date.timestamp())   # epoch seconds: subs before this date, i.e. end

# print(after, before)

In [5]:
# Step 1: request the subreddit's submissions created inside the
# [after, before] window, up to `limit` results.
submissions = api.search_submissions(
    subreddit=subreddit,
    limit=limit,
    before=before,
    after=after,
)

# Tabulate the results and derive a datetime column from the epoch-second
# `created_utc` field.
submissions_df = pd.DataFrame(submissions)
submissions_df = submissions_df.assign(
    date=pd.to_datetime(submissions_df['created_utc'], unit='s')
)
print(len(submissions_df))  # number of submissions fetched, before filtering

# Keep only submissions with strictly more than `comment_threshold` comments.
submissions_df = submissions_df.loc[submissions_df['num_comments'] > comment_threshold]

INFO:pmaw.PushshiftAPIBase:999198 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 13 - Batches: 2 - Items Remaining: 1
INFO:pmaw.PushshiftAPIBase:1 result(s) not found in Pushshift
801


In [6]:
# submissions_df[['id', 'title', 'num_comments']].sort_values('num_comments', ascending=False)[:10]

Here we show step 2, where we look up all the comment ids for a submission id and then fetch those comments.

In [7]:
# Ids of the submissions that survived the comment-count filter; the trailing
# expression displays how many there are.
submission_list = [sub_id for sub_id in submissions_df['id']]
len(submission_list)

271

In [8]:
# Fetch every comment of one submission (the first entry of `submission_list`):
#   1. ask Pushshift's comment_ids endpoint for the ids of that submission's comments,
#   2. download those comments with the search_comments function in the api.
submission_id = submission_list[0]
api_endpt = f'https://api.pushshift.io/reddit/submission/comment_ids/{submission_id}'
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports HTTP errors directly instead of letting them
# surface as a JSON decode error or KeyError on the next line.
response = requests.get(api_endpt, timeout=30)
response.raise_for_status()
comment_ids = list(response.json()['data'])
comments = api.search_comments(ids=comment_ids)
comments_df = pd.DataFrame(comments)
comments_df['submission_id'] = submission_id  # tag each comment with its parent submission
print(len(comments_df))

INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
16
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
44
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
62
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
78
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
165


In [None]:
# Loop to fetch the comments of all remaining submissions and add them to `comments_df`.
# `comments_df` is expected to already hold the comments of submission_list[0],
# which is why iteration starts at the second submission.
# NOTE: re-running this cell without first re-creating `comments_df` appends
# the same comments a second time.
comment_frames = [comments_df]
running_total = len(comments_df)
try:
    for submission_id in submission_list[1:]:
        api_endpt = f'https://api.pushshift.io/reddit/submission/comment_ids/{submission_id}'
        # Bounded wait plus an explicit status check, so a stalled or failed
        # request stops the loop with a clear HTTP error.
        response = requests.get(api_endpt, timeout=30)
        response.raise_for_status()
        comment_ids = list(response.json()['data'])
        comments = api.search_comments(ids=comment_ids)
        temp_df = pd.DataFrame(comments)
        temp_df['submission_id'] = submission_id
        # Collect the per-submission frames and concatenate once at the end:
        # DataFrame.append was removed in pandas 2.0, and growing a frame
        # inside a loop copies all accumulated rows on every iteration.
        comment_frames.append(temp_df)
        running_total += len(temp_df)
        print(running_total)  # cumulative number of comments fetched so far
finally:
    # Runs even if the loop is interrupted or a request fails, so comments
    # fetched up to that point are kept in `comments_df`.
    comments_df = pd.concat(comment_frames)