In [1]:
import datetime
import json
from urllib.parse import urlencode

import pandas as pd
import requests

import warnings
# Ignore every FutureWarning raised from this point on; the filter is keyed on
# the warning category only, so it is not limited to any single library.
warnings.simplefilter(action='ignore', category=FutureWarning)

### What are your parameters?

In [2]:
# Bounds of the collection window; they are forwarded as the `after` / `before`
# query parameters of the Pushshift search endpoint.
after, before = '2021-12-31', '2022-01-02'
# Name of the subreddit whose submissions are collected.
subreddit = 'france'

In [3]:
def get_pushshift_data(after, before, subreddit, timeout=30) -> list:
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    after, before
        Lower / upper bound of the search window, sent as the ``after`` and
        ``before`` query parameters. Any value accepted by ``str()`` works;
        this notebook passes both ``'YYYY-MM-DD'`` strings and epoch-second
        integers.
    subreddit
        Subreddit name, sent as the ``subreddit`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    list
        Whatever the JSON payload stores under its ``'data'`` key
        (presumably a list of submission dicts — confirm against the API).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx / 5xx status.
    KeyError
        If the decoded payload has no ``'data'`` key.
    """
    # urlencode percent-escapes the values, so characters such as '&', '#'
    # or spaces in a parameter can no longer corrupt the query string.
    query = urlencode({'subreddit': subreddit, 'after': after, 'before': before})
    URL = 'https://api.pushshift.io/reddit/submission/search/?' + query
    print(URL)
    r = requests.get(URL, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    r.raise_for_status()
    # strict=False lets the decoder accept raw control characters inside strings.
    data = json.loads(r.text, strict=False)
    return data['data']

In [4]:

def collect_clean_data(subpost, columns) -> pd.Series:
    """Flatten one submission dict into a labelled row.

    Parameters
    ----------
    subpost : dict
        A single submission. The keys ``title``, ``url``, ``author``, ``id``,
        ``score``, ``created_utc``, ``num_comments`` and ``permalink`` are
        required; ``link_flair_text`` and ``selftext`` are optional.
    columns : list
        Index labels for the returned Series. Values are emitted in the order
        postId, title, body, url, author, score, created, num_com, permalink,
        flair, so ``columns`` must hold ten labels in that order.

    Returns
    -------
    pd.Series
        The ten extracted values indexed by ``columns``.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``subpost``.
    """
    title = subpost['title']
    url = subpost['url']
    # Optional fields: fall back to the same placeholders the notebook has
    # always used. Note the flair placeholder is the *string* 'NaN', which
    # pandas does not treat as a missing value.
    flair = subpost.get('link_flair_text', 'NaN')
    body = subpost.get('selftext', '')
    author = subpost['author']
    postId = subpost['id']
    score = subpost['score']
    # fromtimestamp() without a tz yields a naive datetime in the machine's
    # local timezone, so 'created' depends on where the notebook is run.
    created = datetime.datetime.fromtimestamp(subpost['created_utc'])
    num_com = subpost['num_comments']
    permalink = subpost['permalink']
    return pd.Series(
        [postId, title, body, url, author, score, created, num_com, permalink, flair],
        index=columns,
    )

In [5]:


def loop_between_dates(after, before, subreddit) -> pd.DataFrame:
    """Collect every submission of ``subreddit`` between ``after`` and ``before``.

    The API is queried page by page: after each page, the ``created_utc`` of
    its last post becomes the next ``after`` bound, until a page comes back
    empty. This assumes each page is ordered by ascending ``created_utc`` —
    TODO confirm against the API's default sort.

    Parameters
    ----------
    after, before
        Bounds of the search window, forwarded to ``get_pushshift_data``.
    subreddit
        Subreddit name, forwarded to ``get_pushshift_data``.

    Returns
    -------
    pd.DataFrame
        One row per collected submission, with the columns postId, title,
        body, url, author, score, created, num_com, permalink, flair.
        Empty (but with those columns) when the first page has no posts.
    """
    columns = ['postId','title','body','url','author','score','created','num_com','permalink','flair']
    rows_list = []
    # Cursor sent with the previous request; None until a page was processed.
    last_cursor = None

    # First call of the API with the original after / before bounds and the subreddit
    data = get_pushshift_data(after, before, subreddit)

    # While the API keeps returning posts, we keep scraping
    while len(data) > 0:
        page_cursor = data[-1]['created_utc']

        # Safety net: a page that ends on the very timestamp we just used as
        # the 'after' bound means the cursor is not advancing; stop instead
        # of requesting the same page forever.
        if page_cursor == last_cursor:
            break

        rows_list.extend(collect_clean_data(subpost, columns) for subpost in data)

        # Number of posts returned by this call (25 per page in the sample run)
        print(len(data))

        # Timestamp of the last post scraped; it replaces the old 'after' value
        print(str(datetime.datetime.fromtimestamp(page_cursor)))
        last_cursor = page_cursor

        # New request with the new 'after' bound
        data = get_pushshift_data(page_cursor, before, subreddit)

    return pd.DataFrame(rows_list, columns=columns)

In [6]:
# Run the paginated collection for the window and subreddit configured above.
sub_clean_data = loop_between_dates(after=after, before=before, subreddit=subreddit)
# Bare last expression so the notebook renders the resulting frame.
sub_clean_data


https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=2021-12-31&before=2022-01-02
25
2021-12-31 08:31:47
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640935907&before=2022-01-02
25
2021-12-31 11:35:19
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640946919&before=2022-01-02
25
2021-12-31 14:17:12
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640956632&before=2022-01-02
25
2021-12-31 16:14:16
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640963656&before=2022-01-02
25
2021-12-31 18:46:45
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640972805&before=2022-01-02
25
2021-12-31 21:10:59
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640981459&before=2022-01-02
25
2022-01-01 01:07:32
https://api.pushshift.io/reddit/submission/search/?subreddit=france&after=1640995652&before=2022-01-02
25
2022-01-01 1

Unnamed: 0,postId,title,body,url,author,score,created,num_com,permalink,flair
0,rsg4ft,L’Université de Paris sommée de changer de nom...,,https://www.lemonde.fr/societe/article/2021/12...,kanetix,1,2021-12-31 01:01:42,0,/r/france/comments/rsg4ft/luniversité_de_paris...,
1,rsgl2u,"Maxime Vachier-Lagrave, champion du monde en B...",,/r/chess/comments/rs9hfr/congrats_to_the_world...,mauricesarin,1,2021-12-31 01:24:20,0,/r/france/comments/rsgl2u/maxime_vachierlagrav...,Actus
2,rsgnbi,why is it so rare to get a 20 in the french un...,I started teaching an english course at a fren...,https://www.reddit.com/r/france/comments/rsgnb...,CauliflowerFew1515,1,2021-12-31 01:27:40,0,/r/france/comments/rsgnbi/why_is_it_so_rare_to...,Culture
3,rsgos1,Vos meilleures recettes de grogs ?,Après un joli noël familial ou le papa noël m'...,https://www.reddit.com/r/france/comments/rsgos...,BAGUETTOR,1,2021-12-31 01:29:41,0,/r/france/comments/rsgos1/vos_meilleures_recet...,
4,rsgqvm,"Parlons bien, parlons jouets !","Bonjour à tous. \n\nAvec les fêtes passées, et...",https://www.reddit.com/r/france/comments/rsgqv...,CreditGlittering,1,2021-12-31 01:32:26,0,/r/france/comments/rsgqvm/parlons_bien_parlons...,
...,...,...,...,...,...,...,...,...,...,...
319,rtvm4b,EXCLUSIF. Covid-19 : voici les nouvelles règle...,,https://www.lejdd.fr/Societe/Sante/exclusif-co...,Fearless-Cricket3297,1,2022-01-02 00:29:43,0,/r/france/comments/rtvm4b/exclusif_covid19_voi...,Covid-19
320,rtvnld,Nos 10 meilleures offres de Velo électrique po...,,https://www.choisirsonvelo.fr/12860462/nos-10-...,berliozd,1,2022-01-02 00:31:34,0,/r/france/comments/rtvnld/nos_10_meilleures_of...,
321,rtvskd,"Un moyen pour ""tester"" la prise en charge d'un...","Bonjour tout le monde,\n\nPassionnée d'animaux...",https://www.reddit.com/r/france/comments/rtvsk...,pyooon,1,2022-01-02 00:38:12,0,/r/france/comments/rtvskd/un_moyen_pour_tester...,
322,rtvvk5,Covid 19: cas détectés versus décès,Depuis quelques temps je me dis que les statis...,https://www.reddit.com/r/france/comments/rtvvk...,pstomi,1,2022-01-02 00:42:15,0,/r/france/comments/rtvvk5/covid_19_cas_détecté...,
