In [1]:
import calendar
import json
import os
from datetime import datetime

import pandas as pd
import requests

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### What are your parameters?

In [2]:
def get_pushshift_data(after, before, subreddit, max_retries=5) -> list:
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    after, before
        Bounds of the search window; they are converted with ``str()`` and sent
        as the ``after`` / ``before`` query parameters (callers in this
        notebook pass Unix timestamps).
    subreddit
        Subreddit name, sent as the ``subreddit`` query parameter.
    max_retries : int, default 5
        Extra attempts made when a response does not have HTTP status 200.

    Returns
    -------
    list
        The value stored under the ``'data'`` key of the JSON response
        (presumably a list of submission dicts -- callers iterate over it).
        An empty list is returned when every attempt fails or when the
        request / JSON decoding raises.
    """
    URL = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' + str(subreddit)
           + '&after=' + str(after) + '&before=' + str(before))
    # Kept outside the try block so the error branch can always print it,
    # even when requests.get() itself raises.
    r = None
    try:
        # One initial attempt plus at most `max_retries` retries.
        for _ in range(1 + max_retries):
            print(URL)
            r = requests.get(URL)
            if r.status_code == 200:
                # Only successful responses are parsed; error pages are not
                # guaranteed to be valid JSON.
                return json.loads(r.text, strict=False)['data']
    except Exception as error:
        print('Error while accessing API')
        print(error)
        print(r)
    return []

In [3]:

def collect_clean_data(subpost, columns) -> pd.Series:
    """Extract the fields of interest from one submission record.

    Parameters
    ----------
    subpost : dict
        One submission as returned by the API. The keys ``id``, ``title``,
        ``url``, ``author``, ``created_utc`` and ``permalink`` are required;
        ``selftext`` and ``link_flair_text`` are optional.
    columns : list of str
        Index labels for the returned Series. The values are produced in the
        fixed order postId, title, body, url, author, created, permalink,
        flair, so ``columns`` must have eight labels in that order.

    Returns
    -------
    pd.Series
        The extracted values indexed by ``columns``.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``subpost``.
    """
    # NOTE: fromtimestamp() without a tz argument yields a naive datetime in
    # the machine's local timezone, not UTC.
    created = datetime.fromtimestamp(subpost['created_utc'])
    # 'score' and 'num_comments' are deliberately not collected (see the
    # notes at the end of the notebook about Pushshift not refreshing them).
    values = [
        subpost['id'],
        subpost['title'],
        subpost.get('selftext', ''),            # body defaults to an empty string
        subpost['url'],
        subpost['author'],
        created,
        subpost['permalink'],
        subpost.get('link_flair_text', 'NaN'),  # flair defaults to the string 'NaN'
    ]
    return pd.Series(values, index=columns)

In [4]:


def loop_between_dates(after: int, before: int, subreddit: str, max_empty_retries: int = 5) -> pd.DataFrame:
    """Page through the API between two timestamps and collect every post.

    Parameters
    ----------
    after, before : int
        Unix timestamps bounding the window. ``after`` is advanced to the
        ``created_utc`` of the last post of each page.
    subreddit : str
        Subreddit to query.
    max_empty_retries : int, default 5
        How many times an empty page is re-requested before the window is
        considered exhausted.

    Returns
    -------
    pd.DataFrame
        One row per post with the columns postId, title, body, url, author,
        created, permalink and flair (empty when the API returns nothing).
    """
    columns = ['postId', 'title', 'body', 'url', 'author', 'created', 'permalink', 'flair']
    rows_list = []

    # First call of the API with the original after / before interval and the subreddit
    data = get_pushshift_data(after, before, subreddit)

    # While the API calls are returning something, we keep scraping
    while len(data) > 0:
        rows_list.extend(collect_clean_data(subpost, columns) for subpost in data)

        # The timestamp of the last post scraped becomes the new 'after' bound
        print(str(datetime.fromtimestamp(data[-1]['created_utc'])))
        after = data[-1]['created_utc']

        # New request with the new 'after' bound
        data = get_pushshift_data(after, before, subreddit)
        nb_try = 0

        print('Je suis le premier stop : ' + str(len(data)))
        # Sometimes the API doesn't return anything, so we try a few more times to be sure.
        # `and` is required here: `|` binds tighter than `<` and would turn the
        # condition into the chained comparison `len(data) < (1 | nb_try) < 5`,
        # which can discard small non-empty pages.
        while len(data) < 1 and nb_try < max_empty_retries:
            data = get_pushshift_data(after, before, subreddit)
            print("I am trying again to get data, but didn't receive anything from the API.")
            nb_try += 1
            print("J'ai tout ça de data : " + str(len(data)) + " et j'ai essayé : " + str(nb_try))

        # Printing number of posts returned by the API (max = 25)
        print('Je suis le deuxième stop : ' + str(len(data)))

    return pd.DataFrame(rows_list, columns=columns)

In [5]:
def count_nb_months(after: int, before: int) -> int:
    """Return the number of calendar-month boundaries between two timestamps.

    Both Unix timestamps are converted to local time; only their year and
    month are compared, so the day of the month is ignored (e.g. Dec 31 to
    Apr 1 of the following year gives 4).
    """
    start = datetime.fromtimestamp(after)
    end = datetime.fromtimestamp(before)
    return (end.year - start.year) * 12 + (end.month - start.month)

def add_one_month(date_timestamp: int) -> int:
    """Return the Unix timestamp one calendar month after ``date_timestamp``.

    The timestamp is interpreted in local time and the time of day is kept.
    Rules for the day of the month:

    * the last day of a month maps to the last day of the next month
      (Jan 31 -> Feb 28/29 -> Mar 31), which keeps month-end windows chained;
    * any other day maps to the same day of the next month, clamped to that
      month's length (Jan 30 -> Feb 28, Jan 1 -> Feb 1).

    Month lengths come from ``calendar.monthrange``, so leap years are handled.
    """
    current = datetime.fromtimestamp(date_timestamp)

    # divmod rolls December (12) over to January of the following year.
    year_carry, month_index = divmod(current.month, 12)
    next_year = current.year + year_carry
    next_month = month_index + 1

    days_in_current = calendar.monthrange(current.year, current.month)[1]
    days_in_next = calendar.monthrange(next_year, next_month)[1]

    if current.day == days_in_current:
        next_day = days_in_next
    else:
        next_day = min(current.day, days_in_next)

    shifted = current.replace(year=next_year, month=next_month, day=next_day)
    return int(shifted.timestamp())

def save_dataframe(df, after: datetime, before: datetime,
                   subreddit: str = 'france', output_dir: str = 'csv_exports'):
    """Write ``df`` to ``<output_dir>/<subreddit>_<YYYYMMDD>_<YYYYMMDD>.csv``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to export (written without its index, UTF-8 encoded).
    after, before : datetime
        Bounds of the scraped window; they are formatted as ``YYYYMMDD`` to
        build the file name.
    subreddit : str, default 'france'
        Prefix of the file name.
    output_dir : str, default 'csv_exports'
        Destination directory; it is created when it does not exist yet,
        because ``to_csv`` does not create missing directories.
    """
    os.makedirs(output_dir, exist_ok=True)
    csv_file_name = f"{subreddit}_{after:%Y%m%d}_{before:%Y%m%d}.csv"
    df.to_csv(os.path.join(output_dir, csv_file_name), index=False, encoding="utf-8")

In [6]:
# Sanity check: year of the Unix timestamp 1640905200 once converted to local time.
datetime.fromtimestamp(1640905200).strftime("%Y")

'2021'

In [7]:
# Manual check of add_one_month: start from midnight on 2022-01-31 (local time)
# and display the shifted timestamp as a datetime.
jan_31 = datetime(2022, 1, 31)
test = add_one_month(int(jan_31.timestamp()))
datetime.fromtimestamp(test)

datetime.datetime(2022, 2, 28, 0, 0)

In [8]:
# Experiment on the retry condition. Python parses `len_data < 1 | nb_test < 5`
# as the chained comparison written out below, because `|` binds tighter than `<`.
# The loop therefore stops after 4 iterations, as soon as `1 | nb_test` reaches 5.
# A guard meaning "no data and fewer than 5 tries" needs `and` instead of `|`.
nb_test = 0
len_data = 0
while len_data < (1 | nb_test) < 5:
    print('hihi')
    nb_test += 1

hihi
hihi
hihi
hihi


In [9]:
# len_data is never modified by the loop in the previous cell, so it is still 0.
len_data == 0

True

In [10]:
# Scraping window (local time) and target subreddit.
after = int(datetime.timestamp(datetime(2021, 12, 31)))
before = int(datetime.timestamp(datetime(2022, 4, 1)))
subreddit = 'france'

nb_files_created = 0
nb_months = count_nb_months(after, before)
print(nb_months)

# One CSV file per window of at most one month. Each window ends at
# add_one_month(window_start), capped at `before`, so the last file stops
# exactly at `before` and a range inside a single calendar month still
# produces one file. `after` itself is left untouched so the cell can be
# re-run without resetting it.
window_start = after
while window_start < before:
    window_end = min(add_one_month(window_start), before)
    df = loop_between_dates(window_start, window_end, subreddit)
    save_dataframe(df, datetime.fromtimestamp(window_start), datetime.fromtimestamp(window_end))
    print("J'ai fais un tour after : " + str(window_start) + " before " + str(before))
    window_start = window_end
    nb_files_created += 1

3
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640991600&before=1643410800
2022-01-01 06:55:20
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1641016520&before=1643410800
Je suis le premier stop : 25
Je suis le deuxième stop : 25
2022-01-01 12:51:38
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1641037898&before=1643410800
Je suis le premier stop : 25
Je suis le deuxième stop : 25
2022-01-01 15:06:49
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1641046009&before=1643410800
Je suis le premier stop : 25
Je suis le deuxième stop : 25
2022-01-01 17:46:01
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1641055561&before=1643410800
Je suis le premier stop : 25
Je suis le deuxième stop : 25
2022-01-01 20:15:11
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1641064511&before=1643410800
Je suis le premier stop : 25
Je suis le deux

### Can't get score and comment data
Pushshift ingests comments once, in real time, as soon as they are created. It doesn't update them afterwards, so at that time the score is 1. There are various reasons why Pushshift might have a score other than 1: it was behind and ingested the comment after it had been voted on, or at some point in the past there was a second ingest that updated comment scores 24 hours later. But neither of those is true for the beta API.

### Possible solution
If up to date data on things like score and num_comments etc is important, you can always combine Pushshift with the Reddit formal API via praw to get the best of both worlds. You use psaw and pushshift syntax to do things like retrieve large numbers of posts and/or specify date ranges, but the actual metadata for each post ID retrieved from Pushshift is then retrieved from Reddit itself.

This combined method is slower than just using Pushshift, but if you want to just make one API call and make sure you get the up to date metadata, it works well. To do this though you do need reddit API creds. See here for code for combining psaw and praw.

https://psaw.readthedocs.io/en/latest/#demo-usage-python

One thing to be aware of when looking at controversial topics: Pushshift retains all removed and deleted comments. So when you see a lot more reported comments via the Pushshift API than via Reddit itself, it's almost certainly because a lot of those comments were either removed or deleted. And, of course, if you try to retrieve those via the combination method described above, you won't get anything, because the code is trying to retrieve metadata for a post ID that exists on Pushshift but no longer exists on Reddit itself.

Source : https://www.reddit.com/r/pushshift/comments/ofteoo/beta_api_inconsistencies_in_results/