In [9]:
import datetime
import json
from pathlib import Path

import pandas as pd
import requests

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### What are your parameters?

In [10]:
# Scraping parameters. The two date bounds and the subreddit name are handed
# to loop_between_dates(); the date bounds are also reused to build the name
# of the CSV export.
subreddit = 'france'
after, before = '2021-12-31', '2022-02-01'

In [11]:
def get_pushshift_data(after, before, subreddit, timeout=30) -> list:
    """Fetch one page of submissions from the Pushshift search endpoint.

    The request URL is printed before the request is sent.

    Parameters
    ----------
    after, before
        Bounds of the query window. Each value is converted with ``str()``
        and sent as the ``after`` / ``before`` query parameter.
    subreddit
        Name of the subreddit, sent as the ``subreddit`` query parameter.
    timeout
        Seconds to wait for the server before giving up; passed to
        ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns
    -------
    list
        The value stored under the ``'data'`` key of the JSON response
        (the caller treats it as a list of submission dicts).

    Raises
    ------
    requests.RequestException
        On a connection problem, a timeout or an HTTP error status.
    ValueError
        If the response body is not valid JSON.
    KeyError
        If the decoded response has no ``'data'`` key.
    """
    url = (
        'https://api.pushshift.io/reddit/submission/search/?subreddit=' + str(subreddit)
        + '&after=' + str(after)
        + '&before=' + str(before)
    )
    print(url)
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx / 5xx answers into an exception instead of trying to parse them.
        r.raise_for_status()
        # strict=False lets the decoder accept control characters inside strings.
        payload = json.loads(r.text, strict=False)
    except (requests.RequestException, ValueError) as exc:
        # Report the failure, then let the real error propagate: there is no
        # payload to return, and hiding the cause makes debugging harder.
        print('Error while accessing API')
        print(exc)
        raise
    return payload['data']

In [12]:

def collect_clean_data(subpost, columns) -> pd.Series:
    """Extract the fields of interest from one submission.

    Parameters
    ----------
    subpost : dict
        One submission as returned by the API. The keys ``title``, ``url``,
        ``author``, ``id``, ``created_utc`` and ``permalink`` are required;
        ``link_flair_text`` and ``selftext`` are optional.
    columns
        Index labels for the returned Series. The values are laid out as
        postId, title, body, url, author, created, permalink, flair, so the
        labels must be given in that order.

    Returns
    -------
    pd.Series
        The eight extracted values, labelled with ``columns``.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``subpost``.
    """
    title = subpost['title']
    url = subpost['url']
    # Optional fields: fall back to a placeholder when the key is absent.
    flair = subpost.get('link_flair_text', 'NaN')
    body = subpost.get('selftext', '')
    author = subpost['author']
    post_id = subpost['id']
    # NOTE(review): fromtimestamp() without a tz argument yields a naive
    # datetime in the machine's local timezone, although the field is named
    # 'created_utc' -- confirm that local time is what is wanted here.
    created = datetime.datetime.fromtimestamp(subpost['created_utc'])
    permalink = subpost['permalink']
    # 'score' and 'num_comments' are deliberately not collected.
    return pd.Series(
        [post_id, title, body, url, author, created, permalink, flair],
        index=columns,
    )

In [13]:


def loop_between_dates(after, before, subreddit) -> pd.DataFrame:
    """Collect every submission of a subreddit between two dates.

    The API is queried page by page: after each page, the ``created_utc``
    value of its last post becomes the new ``after`` bound, and the loop
    stops as soon as a request returns no posts. The size of each page and
    the timestamp of its last post are printed as progress information.

    Parameters
    ----------
    after, before
        Bounds of the query window, forwarded to ``get_pushshift_data``.
    subreddit
        Name of the subreddit, forwarded to ``get_pushshift_data``.

    Returns
    -------
    pd.DataFrame
        One row per submission, with the columns postId, title, body, url,
        author, created, permalink and flair.
    """
    columns = ['postId', 'title', 'body', 'url', 'author', 'created', 'permalink', 'flair']
    rows_list = []

    # First call of the API with the original after / before interval and the subreddit
    data = get_pushshift_data(after, before, subreddit)

    # While the API calls are returning something, keep scraping
    while len(data) > 0:
        for subpost in data:
            rows_list.append(collect_clean_data(subpost, columns))

        # Number of posts returned by this call
        print(len(data))

        # The timestamp of the last post scraped replaces the old 'after' value.
        # NOTE(review): this assumes each page is ordered oldest-first, so that
        # data[-1] is the most recent post of the page -- confirm against the API.
        last_created = data[-1]['created_utc']
        print(str(datetime.datetime.fromtimestamp(last_created)))
        after = last_created

        # New request with the new 'after' bound
        data = get_pushshift_data(after, before, subreddit)

    return pd.DataFrame(rows_list, columns=columns)

In [14]:
# Scrape every submission in the configured window. The bare name on the last
# line makes the notebook render the resulting DataFrame.
sub_clean_data = loop_between_dates(after=after, before=before, subreddit=subreddit)
sub_clean_data


https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=2021-12-31&before=2022-02-01
25
2021-12-31 08:31:47
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640935907&before=2022-02-01
25
2021-12-31 11:35:19
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640946919&before=2022-02-01
25
2021-12-31 14:17:12
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640956632&before=2022-02-01
25
2021-12-31 16:14:16
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640963656&before=2022-02-01
25
2021-12-31 18:46:45
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640972805&before=2022-02-01
25
2021-12-31 21:10:59
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640981459&before=2022-02-01
25
2022-01-01 01:07:32
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640995652&before=2022-02-01
25
2022-01-01 1

Unnamed: 0,postId,title,body,url,author,created,permalink,flair
0,rsg4ft,L’Université de Paris sommée de changer de nom...,,https://www.lemonde.fr/societe/article/2021/12...,kanetix,2021-12-31 01:01:42,/r/france/comments/rsg4ft/luniversité_de_paris...,
1,rsgl2u,"Maxime Vachier-Lagrave, champion du monde en B...",,/r/chess/comments/rs9hfr/congrats_to_the_world...,mauricesarin,2021-12-31 01:24:20,/r/france/comments/rsgl2u/maxime_vachierlagrav...,Actus
2,rsgnbi,why is it so rare to get a 20 in the french un...,I started teaching an english course at a fren...,https://www.reddit.com/r/france/comments/rsgnb...,CauliflowerFew1515,2021-12-31 01:27:40,/r/france/comments/rsgnbi/why_is_it_so_rare_to...,Culture
3,rsgos1,Vos meilleures recettes de grogs ?,Après un joli noël familial ou le papa noël m'...,https://www.reddit.com/r/france/comments/rsgos...,BAGUETTOR,2021-12-31 01:29:41,/r/france/comments/rsgos1/vos_meilleures_recet...,
4,rsgqvm,"Parlons bien, parlons jouets !","Bonjour à tous. \n\nAvec les fêtes passées, et...",https://www.reddit.com/r/france/comments/rsgqv...,CreditGlittering,2021-12-31 01:32:26,/r/france/comments/rsgqvm/parlons_bien_parlons...,
...,...,...,...,...,...,...,...,...
6025,shfgod,Les gens qui jouent de la music à haute voix p...,"J'écris ça en live, un mec viens de garer sa v...",https://www.reddit.com/r/france/comments/shfgo...,hooveer,2022-02-01 00:18:08,/r/france/comments/shfgod/les_gens_qui_jouent_...,
6026,shfkyn,"Bonjour, croissant dough puff pastry question.",Do you add sugar to the dough or is it an amer...,https://www.reddit.com/r/france/comments/shfky...,Hylfnur,2022-02-01 00:22:54,/r/france/comments/shfkyn/bonjour_croissant_do...,
6027,shfogk,Pele,,https://rarible.com/token/0xc9154424b823b10579...,thekarnabou,2022-02-01 00:27:19,/r/france/comments/shfogk/pele/,
6028,shg05a,Uni Exchange Advice,Bonjour! :-) I am an Arab-Canadian girl in uni...,https://www.reddit.com/r/france/comments/shg05...,initfortheinf0,2022-02-01 00:41:42,/r/france/comments/shg05a/uni_exchange_advice/,


In [15]:
# Exporting the dataframe as a CSV.
# The file name is built from the subreddit parameter (rather than a fixed
# 'france_' prefix) so it always matches the data that was scraped.
csv_file_name = str(subreddit) + '_' + str(after) + '_' + str(before) + '.csv'
# Create the export folder when it is missing so the cell works on a fresh checkout.
export_dir = Path('csv_exports')
export_dir.mkdir(parents=True, exist_ok=True)
sub_clean_data.to_csv(export_dir / csv_file_name, index=False, encoding="utf-8")

### Can't get score and comment data
Pushshift ingests comments once, in real time, as soon as they are created. It doesn't update them afterwards. So at that time, the score is 1. There are various reasons why Pushshift might record a score other than 1: it was running behind and ingested the comment after it had been voted on, or at some point in the past there was a second ingest that updated comment scores 24 hours later. But neither of those is true for the beta API.

### Possible solution
If up to date data on things like score and num_comments etc is important, you can always combine Pushshift with the Reddit formal API via praw to get the best of both worlds. You use psaw and pushshift syntax to do things like retrieve large numbers of posts and/or specify date ranges, but the actual metadata for each post ID retrieved from Pushshift is then retrieved from Reddit itself.

This combined method is slower than just using Pushshift, but if you want to just make one API call and make sure you get the up to date metadata, it works well. To do this though you do need reddit API creds. See here for code for combining psaw and praw.

https://psaw.readthedocs.io/en/latest/#demo-usage-python

One thing to be aware of when looking at controversial things, though: Pushshift retains all removed and deleted comments. So when you see a lot more reported comments via the Pushshift API than via Reddit itself, it's almost certainly because a lot of those comments were either removed or deleted. And, of course, if you try to retrieve those via the combination method described above, you won't get anything, because the code is trying to retrieve metadata for a post ID that exists on Pushshift but no longer exists on Reddit itself.

Source : https://www.reddit.com/r/pushshift/comments/ofteoo/beta_api_inconsistencies_in_results/