In [5]:
import pandas as pd
import requests
import json
from datetime import datetime
import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### What are your parameters?

In [32]:
def get_pushshift_data(after, before, subreddit) -> list:
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    after, before : int
        Unix timestamps placed in the ``after`` / ``before`` query parameters.
    subreddit : str
        Name of the subreddit to search.

    Returns
    -------
    list
        The content of the ``data`` field of the JSON answer, or an empty
        list when the API could not be reached or kept answering with a
        non-200 status.
    """
    # One initial attempt plus up to 5 retries, then we give up.
    max_retries = 5
    url = (
        'https://api.pushshift.io/reddit/submission/search/?subreddit=' + str(subreddit)
        + '&after=' + str(after)
        + '&before=' + str(before)
    )

    for attempt in range(max_retries + 1):
        if attempt > 0:
            # Sleeping for 1 sec between attempts avoids spamming the API
            time.sleep(1)
        print(url)
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                return json.loads(r.text, strict=False)['data']
        except (requests.RequestException, ValueError, KeyError) as err:
            # Network failure, invalid JSON, or no 'data' field in the answer.
            print('Error while accessing API')
            print(err)
            return []

    # Every attempt returned a non-200 status.
    return []

In [33]:
def collect_clean_data(subpost, columns) -> pd.Series:
    """Turn one raw submission dict into a row usable in a DataFrame.

    Parameters
    ----------
    subpost : dict
        One submission as returned by the API. The keys ``title``, ``url``,
        ``author``, ``id``, ``created_utc`` and ``permalink`` are required;
        ``link_flair_text`` and ``selftext`` are optional.
    columns : list of str
        Index labels for the returned Series. The values are produced in the
        order postId, title, body, url, author, created, permalink, flair,
        so ``columns`` must list its 8 labels in that order.

    Returns
    -------
    pd.Series
        The extracted values, indexed by ``columns``.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``subpost``.
    """
    # Optional fields: a missing flair is stored as the string 'NaN',
    # a missing body as an empty string.
    flair = subpost.get('link_flair_text', 'NaN')
    body = subpost.get('selftext', '')

    # `created_utc` is converted with datetime.fromtimestamp, i.e. to a naive
    # datetime expressed in the local timezone of the machine.
    created = datetime.fromtimestamp(subpost['created_utc'])

    # 'score' and 'num_comments' are deliberately not collected here.
    values = [
        subpost['id'],
        subpost['title'],
        body,
        subpost['url'],
        subpost['author'],
        created,
        subpost['permalink'],
        flair,
    ]
    return pd.Series(values, index=columns)

In [34]:
def loop_between_dates(after: int, before: int, subreddit: str) -> pd.DataFrame:
    """Scrape every submission of a subreddit between two timestamps.

    The API is queried repeatedly; after each page, ``after`` is moved to the
    ``created_utc`` of the last post received, until a page comes back empty.

    Parameters
    ----------
    after, before : int
        Unix timestamps bounding the period to scrape.
    subreddit : str
        Name of the subreddit.

    Returns
    -------
    pd.DataFrame
        One row per submission, with the columns postId, title, body, url,
        author, created, permalink and flair.
    """
    columns = ['postId','title','body','url','author','created','permalink','flair']
    # How many extra calls are made when a page comes back empty.
    max_empty_retries = 5
    rows_list = []

    # First call of the API with the original after / before interval
    data = get_pushshift_data(after, before, subreddit)

    # While the API calls are returning something, we keep scraping
    while len(data) > 0:
        rows_list.extend(collect_clean_data(subpost, columns) for subpost in data)

        # The timestamp of the last post scraped becomes the new 'after' value
        after = data[-1]['created_utc']
        print(str(datetime.fromtimestamp(after)))

        # New request with the new 'after' value
        data = get_pushshift_data(after, before, subreddit)

        # Sometimes the API doesn't return anything, so an empty answer is
        # retried a few times before concluding that the period is exhausted.
        # Both conditions are required: stop as soon as data arrives OR the
        # retry budget is spent.
        nb_try = 0
        while len(data) < 1 and nb_try < max_empty_retries:
            print("I am trying again to get data, but didn't receive anything from the API.")
            data = get_pushshift_data(after, before, subreddit)
            nb_try += 1

        # Number of posts returned by the API for this page
        print(str(len(data)))

    return pd.DataFrame(rows_list, columns=columns)

In [35]:
def count_nb_months(after: int, before: int) -> int:
    """Return the number of calendar-month boundaries between two timestamps.

    Only the year and month of each timestamp are compared (days are
    ignored), so two timestamps inside the same calendar month give 0.
    Both timestamps are interpreted in the local timezone.

    Parameters
    ----------
    after, before : int
        Unix timestamps of the start and the end of the period.

    Returns
    -------
    int
        ``(year difference) * 12 + (month difference)``; negative when
        ``before`` lies in an earlier month than ``after``.
    """
    start = datetime.fromtimestamp(after)
    end = datetime.fromtimestamp(before)
    return (end.year - start.year) * 12 + (end.month - start.month)

def add_one_month(date_timestamp: int) -> int:
    """Return the timestamp falling one calendar month after ``date_timestamp``.

    The result keeps the same local wall-clock time and day of month. When
    the following month is shorter (e.g. 31 January -> February), the day is
    clamped to the last day of that month.

    Parameters
    ----------
    date_timestamp : int
        Unix timestamp, interpreted in the local timezone.

    Returns
    -------
    int
        Unix timestamp one calendar month later.
    """
    # Local import: `calendar` is not part of the notebook's import cell.
    import calendar

    current = datetime.fromtimestamp(date_timestamp)
    # December rolls over to January of the following year.
    year = current.year + current.month // 12
    month = current.month % 12 + 1
    # Clamp the day so that e.g. the 31st maps onto the 28th/29th/30th.
    day = min(current.day, calendar.monthrange(year, month)[1])
    return int(current.replace(year=year, month=month, day=day).timestamp())

def save_dataframe(df, after : datetime, before : datetime):
    """Write ``df`` to ``csv_exports/france_<after>_<before>.csv``.

    ``after`` and ``before`` are datetimes; each one is rendered as
    ``YYYYMMDD`` to build the file name. The file is written without the
    index, encoded as UTF-8.
    """
    # The dates are turned back into compact day stamps for the file name
    start_tag, end_tag = (moment.strftime("%Y%m%d") for moment in (after, before))
    target = 'csv_exports' + '/' + 'france_' + start_tag + '_' + end_tag + '.csv'
    df.to_csv(target, index = False, encoding="utf-8")

### Complete scrape between two given dates, saving the data to files

In [23]:
def scrap_save_between_dates(after:int, before:int, subreddit:str):
    """Scrape a subreddit between two timestamps, saving one CSV per chunk.

    The period is cut into monthly chunks: every chunk but the last one
    spans one month (as computed by ``add_one_month``), and the last chunk
    runs up to ``before``. Each chunk is scraped with ``loop_between_dates``
    and written with ``save_dataframe``.

    Parameters
    ----------
    after, before : int
        Unix timestamps bounding the period to scrape.
    subreddit : str
        Name of the subreddit.
    """
    nb_months = count_nb_months(after, before)
    # A period that starts and ends inside the same calendar month has a
    # month count of 0 but still needs one chunk; an empty or inverted
    # period needs none.
    nb_chunks = max(nb_months, 1) if after < before else 0
    print(nb_chunks)

    # Loop to make a file every month
    for chunk_index in range(nb_chunks):
        if after >= before:
            # Nothing left to scrape (can only happen if a chunk was clamped).
            break
        if chunk_index == nb_chunks - 1:
            chunk_end = before
        else:
            # Never let an intermediate chunk run past the requested end.
            chunk_end = min(add_one_month(after), before)

        df = loop_between_dates(after, chunk_end, subreddit)
        save_dataframe(df, datetime.fromtimestamp(after), datetime.fromtimestamp(chunk_end))
        after = chunk_end

In [None]:
# Scrape r/france from 1 September 2021 up to 31 December 2021.
# The naive datetimes are converted to integer Unix timestamps.
after = int(datetime(2021, 9, 1).timestamp())
before = int(datetime(2021, 12, 31).timestamp())
subreddit = 'france'

scrap_save_between_dates(after, before, subreddit)

### Update the scraped files from the last scraped post up to a given date

In [6]:
import glob
def all_csv_to_df():
    """Load every CSV file of the ``csv_exports`` folder into one DataFrame.

    Files are read in sorted name order and concatenated with a fresh
    0..n-1 index.

    Returns
    -------
    pd.DataFrame
        The concatenation of all files, or an empty DataFrame when the
        folder contains no ``.csv`` file.
    """
    path = "csv_exports"

    # A forward slash works on every platform (glob normalises it on
    # Windows); sorting makes the reading order deterministic.
    filenames = sorted(glob.glob(path + "/*.csv"))
    print('File names:', filenames)

    # Read everything first and concatenate once, instead of growing a
    # DataFrame file by file.
    frames = []
    for file in filenames:
        print("\nReading file = ",file)
        frames.append(pd.read_csv(file))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [25]:
def append_from_last_scrap(df, before:int, subreddit = 'france'):
    """Resume scraping from the newest post already present in ``df``.

    ``df`` must have a ``created`` column holding strings formatted as
    ``YYYY-MM-DD HH:MM:SS``. The largest value becomes the start of the new
    scrape; ``before`` is the timestamp at which the scrape stops.
    """
    # Most recent creation date -> datetime -> integer Unix timestamp
    latest_created = df['created'].max()
    resume_from = datetime.strptime(latest_created, "%Y-%m-%d %H:%M:%S").timestamp()

    # From here on it is the same scrape as for a complete run
    scrap_save_between_dates(int(resume_from), before, subreddit)

In [36]:
# Scrape everything published between the newest saved post and now

before = int(datetime.now().timestamp())
df = all_csv_to_df()
append_from_last_scrap(df, before)

File names: ['csv_exports\\france_20210901_20211002.csv', 'csv_exports\\france_20211002_20211031.csv', 'csv_exports\\france_20211031_20211231.csv', 'csv_exports\\france_20211231_20220131.csv', 'csv_exports\\france_20220131_20220228.csv', 'csv_exports\\france_20220228_20220331.csv', 'csv_exports\\france_20220331_20220401.csv']

Reading file =  csv_exports\france_20210901_20211002.csv

Reading file =  csv_exports\france_20211002_20211031.csv

Reading file =  csv_exports\france_20211031_20211231.csv

Reading file =  csv_exports\france_20211231_20220131.csv

Reading file =  csv_exports\france_20220131_20220228.csv

Reading file =  csv_exports\france_20220228_20220331.csv

Reading file =  csv_exports\france_20220331_20220401.csv
1
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1648763897&before=1650977038
2022-04-01 04:15:19
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1648779319&before=1650977038
25
2022-04-01 08:50:07
https://api.pus

### Can't get score and comment data
Pushshift ingests comments once, in real time, as soon as they are created, and doesn't update them afterwards. At that moment the score is 1. There are a few reasons why Pushshift might hold a score other than 1: ingestion was running behind, so the comment had already been voted on when it was ingested; or, at some point in the past, there was a second ingest that updated comment scores 24 hours later. Neither of those is true for the beta API.

### Possible solution
If up to date data on things like score and num_comments etc is important, you can always combine Pushshift with the Reddit formal API via praw to get the best of both worlds. You use psaw and pushshift syntax to do things like retrieve large numbers of posts and/or specify date ranges, but the actual metadata for each post ID retrieved from Pushshift is then retrieved from Reddit itself.

This combined method is slower than just using Pushshift, but if you want to just make one API call and make sure you get the up to date metadata, it works well. To do this though you do need reddit API creds. See here for code for combining psaw and praw.

https://psaw.readthedocs.io/en/latest/#demo-usage-python

One thing to be aware of when looking at controversial topics: Pushshift retains all removed and deleted comments. So when you see many more comments reported via the Pushshift API than via Reddit itself, it's almost certainly because a lot of those comments were either removed or deleted. And, of course, if you try to retrieve those via the combined method described above, you won't get anything, because the code is trying to retrieve metadata for a post ID that exists on Pushshift but no longer exists on Reddit itself.

Source : https://www.reddit.com/r/pushshift/comments/ofteoo/beta_api_inconsistencies_in_results/

### To get comments

https://www.reddit.com/r/pushshift/comments/qufgqa/get_all_comments_from_a_post_id/