# Start Here. Collect the Data

In [1]:
import pandas as pd
import time
import requests
import os
import csv

In [2]:
# Base endpoint of the Pushshift Reddit search API
url = 'https://api.pushshift.io/reddit/search/'

# Subreddits that data is collected for
subreddits = [
    'interestingasfuck',
    'mildlyinteresting',
]

In [3]:
# CODE TAKEN FROM STACKOVERFLOW. USER GTOWNROWER
# https://stackoverflow.com/questions/41420742/python-repeatedly-writing-to-a-csv-file-from-a-dictionary-with-header-equal-to

# This function appends a dictionary to a csv file. If the file is empty, the headers are written first; otherwise only the row (the values of the keys) is added
def dict_to_csv(post_dict, file):
    """Append one dictionary as a row to ``../data/{file}.csv``.

    The header row (the dictionary's keys) is written first when nothing has
    been written to the file yet; otherwise only the values are appended.

    Args:
        post_dict: Mapping of column name to value for a single row.
        file: Base name of the csv file inside ``../data`` (without the
            ``.csv`` extension).
    """
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' end up with a blank line after every row.
    with open(f'../data/{file}.csv', 'a', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, post_dict.keys())
        # A position of 0 means the file is still empty, so it needs a header.
        if f.tell() == 0:
            w.writeheader()
        w.writerow(post_dict)
In [4]:
# function to gather data
# The function takes in:
#   A list of subreddits to gather data on
#   What kind of data to pull (submission or comment)
#   How many cycles the function will run for (pulls 100 entries each cycle)
# The function writes to a csv file and returns a dataframe
def pushshift_query(subreddits, kind, cycles):
    """Pull entries from the Pushshift API, append them to a csv, and return them.

    For every subreddit the function requests up to ``cycles`` pages of 100
    entries, paging backwards in time. Each entry is appended to
    ``../data/{kind}.csv`` as soon as it is received, so an interrupted run
    keeps what it already collected and a later call continues from the
    oldest entry saved for that subreddit.

    Args:
        subreddits: List of subreddit names to gather data on.
        kind: What to pull, 'submission' or 'comment'. It is appended to the
            API url and used as the csv file name.
        cycles: Maximum number of requests per subreddit (100 entries each).

    Returns:
        A DataFrame of all entries collected by this call, or the string
        'Error Occurred' if the API answers with a non-200 status code.
    """
    csv_path = f'../data/{kind}.csv'

    # Oldest timestamp already saved for each subreddit. A missing or empty
    # file just means there is nothing to resume from.
    oldest_saved = {}
    if os.path.exists(csv_path) and os.stat(csv_path).st_size != 0:
        saved = pd.read_csv(csv_path)
        for name, oldest in saved.groupby('subreddit')['time'].min().items():
            if pd.notna(oldest):
                oldest_saved[str(name).lower()] = int(oldest)

    posts = []
    # comments carry their text in 'body', submissions in 'title'
    title_or_body = 'body' if kind == 'comment' else 'title'

    # iterates over the subreddit list
    for subreddit in subreddits:
        # Every subreddit pages back from its own oldest saved entry. Sharing
        # one timestamp across subreddits would make the next subreddit start
        # at the previous one's oldest entry and skip everything newer.
        current_time = oldest_saved.get(subreddit.lower())

        # for each subreddit, pulls data for the amount of 'cycles' that was input
        for _ in range(cycles):
            # params that get updated and injected into the url
            params = {
                'subreddit': subreddit,
                'size': 100,
                # requests leaves out parameters whose value is None
                'before': current_time,
            }

            # sends a request to the API
            res = requests.get(url + kind, params)

            # Checks the response. If not 200, exit function with error message
            if res.status_code != 200:
                return 'Error Occurred'

            batch = res.json()['data']
            if not batch:
                # Nothing older is available; repeating the same request for
                # the remaining cycles would only return the same empty page.
                break

            # gathers the subreddit, title or body, auth, and time of every entry
            for post in batch:
                post_dict = {
                    'subreddit': post['subreddit'],
                    title_or_body: post[title_or_body],
                    'auth': post['author'],
                    'time': post['created_utc'],
                }
                posts.append(post_dict)

                # calls the function to append a dictionary to a csv
                dict_to_csv(post_dict, kind)

                # remembers the oldest entry seen so the next request continues from there
                if current_time is None or post_dict['time'] < current_time:
                    current_time = post_dict['time']

            print(f'Current data frame has {len(posts)} rows')
            # sleeps for 3 seconds to let the servers rest
            time.sleep(3)

    # returns a data frame of all the posts collected for this function call
    return pd.DataFrame(posts)

In [7]:
# pushshift_query(subreddits, 'comment', 100)

In [6]:
# pushshift_query(subreddits, 'submission', 50)

# Go to the file "comment_cleaning.ipynb" next to clean the data