# Acquiring post lists for subreddit
In this notebook, we will use PushShift to get links to all posts made within a given timeframe in each of our subreddits of interest (one subreddit per run, chosen in the Setup cell).

In [1]:
import requests
import pandas as pd
import json
import praw
from praw.models import MoreComments
import re
from datetime import date, timedelta
import time
import os
from configparser import ConfigParser

## Setup

In [5]:
# Subreddit to collect and the date range to cover. The query loop further
# down starts its first window on start_date and starts no window on or
# after end_date.
subreddit = 'naturalhair'
start_date = date(2018, 1, 1)
end_date = date(2022, 3, 28)

# One text file per query window is written under output/post_lists/<subreddit>/.
# makedirs creates any missing parent directories in one call and, with
# exist_ok=True, does nothing when the directory is already there, so this
# cell can be re-run safely.
post_list_dir = os.path.join('output', 'post_lists', subreddit)
os.makedirs(post_list_dir, exist_ok=True)

## Iterate over date range

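This code goes week by week: for each week it queries PushShift for the posts made in our subreddit during that week, then saves the URL of every post returned to a text file named after the week's start date. Weeks whose file already exists are skipped, so the cell can simply be re-run to resume after a failure.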

The amount of time covered by each query can be changed by replacing the 7 in `daterange` **and** the matching `timedelta(days=7)` in the loop with however many days you want to use; the two values must stay equal, otherwise consecutive queries will overlap or leave gaps. The larger the number, the more time each query will take (and the more time wasted if the query fails for whatever reason — lost internet connection, etc.).

In [6]:
def daterange(start_date, end_date, step_days=7):
    """Yield the first day of each query window between two dates.

    Parameters
    ----------
    start_date : datetime.date
        First day of the first window.
    end_date : datetime.date
        Exclusive upper bound: no window starts on or after this date.
    step_days : int, optional
        Number of days between consecutive window starts. Defaults to 7
        (one week), which is the step this function previously hard-coded.

    Yields
    ------
    datetime.date
        ``start_date``, ``start_date + step_days``, ... for as long as the
        value is strictly before ``end_date``. Nothing is yielded when
        ``end_date`` is not after ``start_date``.

    Raises
    ------
    ValueError
        If ``step_days`` is smaller than 1 (raised on first iteration).
    """
    if step_days < 1:
        raise ValueError("step_days must be a positive number of days")

    total_days = (end_date - start_date).days
    for offset in range(0, total_days, step_days):
        yield start_date + timedelta(days=offset)



def _fetch_post_links(subreddit, after, before):
    """Ask Pushshift for the submissions in one time window and return their links.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    after, before : int
        Epoch-second bounds passed straight through as the ``after`` and
        ``before`` query parameters.

    Returns
    -------
    list of str or None
        The ``full_link`` of every item under the response's ``data`` key,
        or ``None`` when the response has no ``data`` key.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    ValueError
        If the response body is not valid JSON.
    KeyError, TypeError
        If an item under ``data`` has no ``full_link`` entry.
    """
    # NOTE(review): no `size` or pagination is requested, so a busy window may
    # come back truncated if the API caps results per request -- confirm.
    response = requests.get(
        "https://api.pushshift.io/reddit/search/submission/",
        # Let requests build and escape the query string.
        params={"subreddit": subreddit, "sort": "desc", "after": after, "before": before},
        # Without a timeout a stalled connection would block the loop forever.
        timeout=60,
    )
    response.raise_for_status()

    payload = response.json()  # parse the body once and reuse it
    if 'data' not in payload:
        return None
    return [post['full_link'] for post in payload['data']]


# Length of each query window. This must equal the step used by daterange,
# otherwise consecutive windows overlap or leave gaps.
window = timedelta(days=7)

for single_date in daterange(start_date, end_date):
    output_filename = 'output/post_lists/' + subreddit + "/" + single_date.strftime("%Y-%m-%d") + ".txt"

    # An existing file means this window was already fetched; skipping it lets
    # the cell be re-run to pick up where a previous run stopped.
    if os.path.isfile(output_filename):
        continue

    # sleep so we aren't antagonizing pushshift too much
    time.sleep(5)

    # NOTE(review): the last window can extend past end_date because the
    # window length is fixed.
    single_date2 = single_date + window
    print(single_date.strftime("%Y-%m-%d") + " to " + single_date2.strftime("%Y-%m-%d"))

    # time.mktime interprets the date as local midnight, so the exact window
    # boundaries depend on the timezone of the machine running the notebook.
    start = int(time.mktime(single_date.timetuple()))
    # Subtract one second so the window ends just before the next one starts.
    end = int(time.mktime(single_date2.timetuple()) - 1)

    try:
        links = _fetch_post_links(subreddit, start, end)
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Best effort: report the failure and move on. No file is written, so
        # this window is retried the next time the cell runs. Ctrl-C is
        # deliberately not caught here, so the loop can still be interrupted.
        print("exception: %r" % (err,))
        continue

    if links is None:
        # Response parsed but had no 'data' key; leave the window for a retry.
        print("no 'data' in response")
        continue

    # An empty result still produces an (empty) file, marking the window done.
    # File-system errors are left to propagate: they are not transient, and a
    # partially written file would be mistaken for a completed window.
    with open(output_filename, 'w') as f:
        f.writelines("%s\n" % item for item in links)


2018-01-01 to 2018-01-08
2018-01-08 to 2018-01-15
2018-01-15 to 2018-01-22
2018-01-22 to 2018-01-29
2018-01-29 to 2018-02-05
2018-02-05 to 2018-02-12
2018-02-12 to 2018-02-19
2018-02-19 to 2018-02-26
2018-02-26 to 2018-03-05
2018-03-05 to 2018-03-12
2018-03-12 to 2018-03-19
2018-03-19 to 2018-03-26
2018-03-26 to 2018-04-02
2018-04-02 to 2018-04-09
2018-04-09 to 2018-04-16
2018-04-16 to 2018-04-23
2018-04-23 to 2018-04-30
2018-04-30 to 2018-05-07
2018-05-07 to 2018-05-14
2018-05-14 to 2018-05-21
2018-05-21 to 2018-05-28
2018-05-28 to 2018-06-04
2018-06-04 to 2018-06-11
2018-06-11 to 2018-06-18
2018-06-18 to 2018-06-25
2018-06-25 to 2018-07-02
2018-07-02 to 2018-07-09
2018-07-09 to 2018-07-16
2018-07-16 to 2018-07-23
2018-07-23 to 2018-07-30
2018-07-30 to 2018-08-06
2018-08-06 to 2018-08-13
2018-08-13 to 2018-08-20
2018-08-20 to 2018-08-27
2018-08-27 to 2018-09-03
2018-09-03 to 2018-09-10
2018-09-10 to 2018-09-17
2018-09-17 to 2018-09-24
2018-09-24 to 2018-10-01
2018-10-01 to 2018-10-08
