### Get comments from a post ID

To get all the comments of the post with ID "ucbjgz", query: https://api.pushshift.io/reddit/comment/search/?link_id=ucbjgz&limit=100&q=*

The limit is currently capped at 100, so we need to loop if we want more (see https://www.reddit.com/r/pushshift/comments/qufgqa/get_all_comments_from_a_post_id/)

In [5]:
import pandas as pd
import requests
import json
from datetime import datetime
import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def get_pushshift_data(link_id, limit=100, max_retries=5, retry_delay=1) -> 'list | str':
    """Fetch the comments of one Reddit post from the Pushshift API.

    The request is sent once and, if the server answers with a status code
    other than 200, repeated up to ``max_retries`` more times with a pause
    of ``retry_delay`` seconds before each retry.

    Args:
        link_id: ID of the post whose comments are requested (e.g. "ucbjgz").
        limit: Value sent as the ``limit`` query parameter.
        max_retries: Number of extra attempts after a non-200 response.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The value stored under the ``'data'`` key of the JSON response, or
        an empty string when every attempt failed, the request raised, or
        the response body could not be decoded.
    """
    url = (
        'https://api.pushshift.io/reddit/comment/search/'
        f'?link_id={link_id}&limit={limit}&q=*'
    )

    for attempt in range(max_retries + 1):
        if attempt:
            # Only wait before retries, never before the first request.
            time.sleep(retry_delay)

        print(url)
        try:
            r = requests.get(url)
        except requests.RequestException as exc:
            # Network-level failure: report it and give up.
            print('Error while accessing API')
            print(exc)
            return ''

        if r.status_code != 200:
            # HTTP error: try again until the retry budget is used up.
            continue

        try:
            # strict=False tolerates control characters inside JSON strings.
            return json.loads(r.text, strict=False)['data']
        except (ValueError, KeyError, TypeError) as exc:
            print('Error while accessing API')
            print(exc)
            return ''

    # Every attempt returned a non-200 status.
    return ''

In [None]:
def collect_clean_data(comment, columns) -> pd.Series:
    """Turn one raw API record into a labelled ``pd.Series`` row.

    Args:
        comment: Mapping describing one record. The keys 'id', 'title',
            'url', 'author', 'created_utc' and 'permalink' are required;
            'selftext' and 'link_flair_text' are optional.
        columns: Labels used as the index of the returned Series. The values
            are emitted in the order postId, title, body, url, author,
            created, permalink, flair, so ``columns`` must hold eight labels
            in that order.

    Returns:
        A Series whose values are the extracted fields and whose index is
        ``columns``. A missing 'link_flair_text' becomes the string 'NaN'
        and a missing 'selftext' becomes ''.

    Raises:
        KeyError: If one of the required keys is absent from ``comment``.
    """
    # NOTE(review): 'title', 'selftext' and 'link_flair_text' look like post
    # fields rather than comment fields - confirm the records passed in
    # actually carry them.
    flair = comment.get('link_flair_text', 'NaN')
    body = comment.get('selftext', '')

    # fromtimestamp() without a tz argument yields a naive local-time value.
    # NOTE(review): the source key is 'created_utc' - confirm local time is
    # what downstream code expects.
    created = datetime.fromtimestamp(comment['created_utc'])

    values = [
        comment['id'],
        comment['title'],
        body,
        comment['url'],
        comment['author'],
        created,
        comment['permalink'],
        flair,
    ]
    return pd.Series(values, index=columns)

### We now want to get comments from all the posts we scraped

In [6]:
import glob
import os

# Folder holding the CSV exports of the scraped posts
path = "csv_exports"

# List every .csv file in that folder. os.path.join builds the pattern with
# the platform's separator, and sorting makes the concatenation order
# deterministic.
filenames = sorted(glob.glob(os.path.join(path, "*.csv")))
print('File names:', filenames)

# Read each file once and concatenate in a single call, which avoids
# re-copying the accumulated frame on every iteration.
frames = []
for file in filenames:
    print("\nReading file = ", file)
    frames.append(pd.read_csv(file))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSV file.
if frames:
    all_titres = pd.concat(frames, ignore_index=True)
else:
    all_titres = pd.DataFrame()
all_titres

File names: ['csv_exports\\france_20210901_20211002.csv', 'csv_exports\\france_20211002_20211031.csv', 'csv_exports\\france_20211031_20211231.csv', 'csv_exports\\france_20211231_20220131.csv', 'csv_exports\\france_20220131_20220228.csv', 'csv_exports\\france_20220228_20220331.csv', 'csv_exports\\france_20220331_20220401.csv', 'csv_exports\\france_20220331_20220426.csv']

Reading file =  csv_exports\france_20210901_20211002.csv

Reading file =  csv_exports\france_20211002_20211031.csv

Reading file =  csv_exports\france_20211031_20211231.csv

Reading file =  csv_exports\france_20211231_20220131.csv

Reading file =  csv_exports\france_20220131_20220228.csv

Reading file =  csv_exports\france_20220228_20220331.csv

Reading file =  csv_exports\france_20220331_20220401.csv

Reading file =  csv_exports\france_20220331_20220426.csv


Unnamed: 0,postId,title,body,url,author,created,permalink,flair
0,pfg0r5,"Bravo la France, on bat tous les records",,https://i.redd.it/hsbrhsc0kpk71.jpg,Feisty-Ad-7455,2021-09-01 00:05:43,/r/france/comments/pfg0r5/bravo_la_france_on_b...,Société
1,pfgezw,CD blue ray raye,[removed],https://www.reddit.com/r/france/comments/pfgez...,Successful_Sport_565,2021-09-01 00:27:12,/r/france/comments/pfgezw/cd_blue_ray_raye/,
2,pfh52l,"Dans la Creuse, des tags anti-vaccins et antis...",,https://www.liberation.fr/politique/dans-la-cr...,Tigrafr,2021-09-01 01:07:33,/r/france/comments/pfh52l/dans_la_creuse_des_t...,Paywall
3,pfh9pl,Une histoire de manipulation et de violences,[removed],https://www.reddit.com/r/france/comments/pfh9p...,temp7894561230,2021-09-01 01:14:51,/r/france/comments/pfh9pl/une_histoire_de_mani...,
4,pfhakb,Pourquoi la gestion de la désinformation par R...,Suite à [ce poteau](https://www.reddit.com/r/f...,https://www.reddit.com/r/france/comments/pfhak...,dixfoisdix,2021-09-01 01:16:08,/r/france/comments/pfhakb/pourquoi_la_gestion_...,Méta
...,...,...,...,...,...,...,...,...
42266,ucb6cz,De la guerre à la drogue à la prévention des a...,,/r/AddictionsFR/comments/uc5v6q/de_la_guerre_à...,unmalepourunbien,2022-04-26 14:14:47,/r/france/comments/ucb6cz/de_la_guerre_à_la_dr...,Société
42267,ucbjgz,Claude Francois - Si j'avais un marteau - 1963,,https://www.youtube.com/watch?v=Y7nrcb3oGkc,not_franck_the_cook,2022-04-26 14:34:43,/r/france/comments/ucbjgz/claude_francois_si_j...,Musique
42268,ucboq7,Le cabinet de Brigitte Macron déclenche l’insp...,,https://www.mediapart.fr/journal/france/260422...,ThymEtRomarin,2022-04-26 14:42:17,/r/france/comments/ucboq7/le_cabinet_de_brigit...,Paywall
42269,ucboz4,Mon dada : détourner la culture pop américaine...,,https://v.redd.it/hrm5kn7l0vv81,shalli,2022-04-26 14:42:38,/r/france/comments/ucboz4/mon_dada_détourner_l...,Cinéma Séries


In [None]:
# Plain Python list of every value in the 'postId' column
post_ids = all_titres.loc[:, 'postId'].tolist()
post_ids