## Data Collection

This notebook outlines the steps taken to collect submissions from the r/thewalkingdead and r/harrypotter subreddits using the Pushshift API.

### Library Imports

In [1]:
# Standards
import pandas as pd
import numpy as np

# API
import requests

# Automating
import time
import datetime
import warnings
import sys

### Data Collection Function

In [None]:
# Pushshift submission-search endpoint.
# NOTE(review): no later cell visible in this notebook reads `url`;
# get_posts defines its own base URL. Confirm whether this cell is still needed.
url = 'https://api.pushshift.io/reddit/search/submission'

In [2]:
def get_posts(subreddit, n_iter, epoch_right_now, wait_seconds=100, batch_size=500):
    """Page backwards in time through a subreddit's submissions via Pushshift.

    Each request asks for submissions created before a moving cutoff. After
    every non-empty response the cutoff is moved to the oldest `created_utc`
    in that response, so successive requests walk further into the past.

    Parameters
    ----------
    subreddit : str
        Subreddit name without the ``r/`` prefix.
    n_iter : int
        Maximum number of requests to send.
    epoch_right_now : int
        Unix epoch (seconds) used as the initial ``before`` cutoff.
    wait_seconds : int or float, default 100
        Pause between consecutive requests. No pause is taken before the
        first request or after the last one.
    batch_size : int, default 500
        Value sent as the ``size`` request parameter.
        NOTE(review): the server may cap this; the outputs recorded in this
        notebook show 500 rows for 5 requests, which suggests a cap of 100
        per response -- confirm against the API documentation.

    Returns
    -------
    pandas.DataFrame
        All batches concatenated row-wise (each batch keeps its own 0-based
        index). An empty DataFrame if no request returned any rows.

    Raises
    ------
    requests.HTTPError
        If any response has a 4xx/5xx status code.
    """
    # Bare endpoint only: requests builds the query string from `params`.
    # (Embedding '?subreddit=' here as well would send an empty duplicate
    # 'subreddit' parameter alongside the real one.)
    base_url = 'https://api.pushshift.io/reddit/search/submission/'

    # One DataFrame per successful request.
    df_list = []

    # Moving cutoff, used to iterate in reverse through time.
    current_time = epoch_right_now

    for request_number in range(n_iter):
        # Throttle between requests; sleeping after the final one would
        # only delay the return.
        if request_number > 0:
            time.sleep(wait_seconds)

        res = requests.get(
            base_url,
            params={
                'subreddit': subreddit,
                'size': batch_size,
                # NOTE(review): purpose of 'lang' is unclear (the original
                # author marked it '???'); kept so the request is unchanged.
                'lang': True,
                # Pull everything from the current cutoff backward.
                'before': current_time,
            },
        )
        # Fail loudly on HTTP errors instead of on a confusing JSON/KeyError.
        res.raise_for_status()

        df = pd.DataFrame(res.json()['data'])

        # No more submissions before the cutoff: stop instead of failing on
        # the missing 'created_utc' column of an empty frame.
        if df.empty:
            break

        df_list.append(df)

        # Move the cutoff back to the oldest post in this batch.
        current_time = int(df['created_utc'].min())

    # pd.concat raises on an empty list, so handle "nothing fetched" explicitly.
    if not df_list:
        return pd.DataFrame()

    # Return one dataframe for all requests.
    return pd.concat(df_list, axis=0)

# Adapted from Tim Book's lesson example

### Using the function on the subreddits

In [10]:
# r/thewalkingdead: submissions before epoch 1601390824 (2020-09-29 UTC).
twd = get_posts(
    subreddit='thewalkingdead',
    n_iter=5,
    epoch_right_now=1601390824,
)

In [11]:
# Display (rows, columns) of the pull as a quick sanity check.
twd.shape

(500, 84)

In [12]:
# r/thewalkingdead: submissions before epoch 1569989715 (2019-10-02 UTC).
twd1 = get_posts(
    subreddit='thewalkingdead',
    n_iter=5,
    epoch_right_now=1569989715,
)

In [18]:
# Display (rows, columns) of the pull as a quick sanity check.
twd1.shape

(500, 74)

In [17]:
# r/thewalkingdead: submissions before epoch 1538453715 (2018-10-02 UTC).
twd2 = get_posts(
    subreddit='thewalkingdead',
    n_iter=5,
    epoch_right_now=1538453715,
)

In [19]:
# Display (rows, columns) of the pull as a quick sanity check.
twd2.shape

(500, 78)

In [14]:
# r/thewalkingdead: submissions before epoch 1506917715 (2017-10-02 UTC).
twd3 = get_posts(
    subreddit='thewalkingdead',
    n_iter=5,
    epoch_right_now=1506917715,
)

In [20]:
# Display (rows, columns) of the pull as a quick sanity check.
twd3.shape

(500, 45)

In [15]:
# r/thewalkingdead: submissions before epoch 1475381715 (2016-10-02 UTC).
twd4 = get_posts(
    subreddit='thewalkingdead',
    n_iter=5,
    epoch_right_now=1475381715,
)

In [21]:
# Display (rows, columns) of the pull as a quick sanity check.
twd4.shape

(500, 37)

In [22]:
# Stack the five r/thewalkingdead pulls row-wise. pd.concat's default outer
# join keeps the union of columns, so a column missing from a pull is NaN there.
all_df = pd.concat(
    [twd, twd1, twd2, twd3, twd4],
)

In [24]:
# Display (rows, columns) of the combined r/thewalkingdead frame.
all_df.shape

(2500, 97)

In [25]:
# Persist the combined r/thewalkingdead pulls to disk.
# NOTE(review): to_csv writes the row index by default; confirm downstream
# readers expect that extra leading column.
all_df.to_csv(path_or_buf='twd.csv')

In [3]:
# r/harrypotter: submissions before epoch 1601390824 (2020-09-29 UTC).
hp1 = get_posts(
    subreddit='harrypotter',
    n_iter=5,
    epoch_right_now=1601390824,
)

In [4]:
# Display (rows, columns) of the pull as a quick sanity check.
hp1.shape

(500, 81)

In [5]:
# r/harrypotter: submissions before epoch 1569989715 (2019-10-02 UTC).
hp2 = get_posts(
    subreddit='harrypotter',
    n_iter=5,
    epoch_right_now=1569989715,
)

In [6]:
# Display (rows, columns) of the pull as a quick sanity check.
hp2.shape

(500, 74)

In [7]:
# r/harrypotter: submissions before epoch 1538453715 (2018-10-02 UTC).
hp3 = get_posts(
    subreddit='harrypotter',
    n_iter=5,
    epoch_right_now=1538453715,
)

In [8]:
# Display (rows, columns) of the pull as a quick sanity check.
hp3.shape

(500, 79)

In [9]:
# r/harrypotter: submissions before epoch 1506917715 (2017-10-02 UTC).
hp4 = get_posts(
    subreddit='harrypotter',
    n_iter=5,
    epoch_right_now=1506917715,
)

In [10]:
# Display (rows, columns) of the pull as a quick sanity check.
hp4.shape

(500, 44)

In [11]:
# r/harrypotter: submissions before epoch 1475381715 (2016-10-02 UTC).
hp5 = get_posts(
    subreddit='harrypotter',
    n_iter=5,
    epoch_right_now=1475381715,
)

In [12]:
# Display (rows, columns) of the pull as a quick sanity check.
hp5.shape

(500, 37)

In [14]:
# Stack the five r/harrypotter pulls row-wise. pd.concat's default outer
# join keeps the union of columns, so a column missing from a pull is NaN there.
all_hp = pd.concat(
    [hp1, hp2, hp3, hp4, hp5],
)

In [15]:
# Persist the combined r/harrypotter pulls to disk.
# NOTE(review): to_csv writes the row index by default; confirm downstream
# readers expect that extra leading column.
all_hp.to_csv(path_or_buf='harrypotter.csv')