In [1]:
# citing : Sara Soueidan - Project 3, Tim Book - API Code; Zoom recording DSIR-824 

In [23]:
# imports 
import pandas as pd 
import numpy as np

# APi
import requests

# automating - use for time based operations
import time
import datetime
import warnings
import sys
from time import sleep

In [3]:
# Pushshift submission-search endpoint; the subreddit is supplied through the
# request params, so the URL itself carries no (empty) query string
url = 'https://api.pushshift.io/reddit/search/submission/'

In [4]:
# query parameters sent along with the request
params = dict(
    subreddit='wow',
    size=50,
    lang=True,
    before=1615836061,
)

In [5]:
# send the GET request with the query parameters, then display the HTTP status code
res = requests.get(url, params=params)
res.status_code

200

In [6]:
# parse the response body as JSON and display the resulting dictionary
res.json()

{'data': [{'all_awardings': [],
   'allow_live_comments': False,
   'author': 'NEM-Furious',
   'author_flair_css_class': None,
   'author_flair_richtext': [],
   'author_flair_text': None,
   'author_flair_type': 'text',
   'author_fullname': 't2_2i6sznjr',
   'author_patreon_flair': False,
   'author_premium': False,
   'awarders': [],
   'can_mod_post': False,
   'contest_mode': False,
   'created_utc': 1615836006,
   'domain': 'i.redd.it',
   'full_link': 'https://www.reddit.com/r/wow/comments/m5siy9/john_hight_congratulated_the_player_who_got_all/',
   'gildings': {},
   'id': 'm5siy9',
   'is_crosspostable': True,
   'is_meta': False,
   'is_original_content': False,
   'is_reddit_media_domain': True,
   'is_robot_indexable': True,
   'is_self': False,
   'is_video': False,
   'link_flair_background_color': '',
   'link_flair_css_class': 'humor-meme',
   'link_flair_richtext': [{'e': 'text', 't': 'Humor / Meme'}],
   'link_flair_template_id': 'b578b03c-494b-11ea-befc-0ebeb8f4878f

In [7]:
# build a DataFrame from the list of records stored under the 'data' key of the JSON response
df_wow = pd.DataFrame(res.json()['data'])
# preview the first 3 rows
df_wow.head(3)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,wls,media,media_embed,removed_by_category,secure_media,secure_media_embed,author_flair_background_color,author_flair_text_color,author_flair_template_id,media_metadata
0,[],False,NEM-Furious,,[],,text,t2_2i6sznjr,False,False,...,6,,,,,,,,,
1,[],False,Gnnz,,[],,text,t2_tgzf9,False,False,...,6,,,,,,,,,
2,[],False,ColonialList796,,[],,text,t2_9zjw0ej9,False,False,...,6,,,,,,,,,


In [8]:
# list every column name available in the DataFrame
df_wow.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subr

In [9]:
# narrow the frame down to the columns of interest, then preview the first five rows
df_wow = df_wow.loc[:, ['subreddit', 'selftext', 'title', 'created_utc', 'media_only', 'author']]
df_wow.head(5)

Unnamed: 0,subreddit,selftext,title,created_utc,media_only,author
0,wow,,John Hight congratulated the player who got al...,1615836006,False,NEM-Furious
1,wow,"Hello,\n\nI have just began my journey with Wo...",What is end-game all about?,1615835954,False,Gnnz
2,wow,,Free 4K wallpaper for the homies,1615835431,False,ColonialList796
3,wow,,POKIMANE EPIC GAMER COMPILATION,1615835346,False,Preston341
4,wow,,This is my home now!,1615834985,False,chesucat


In [57]:
# API helper that automates paging backwards through Pushshift comment search
def get_posts(subreddit, n_iter, epoch_right_now, sleep_seconds=60):
    """Collect comments from one subreddit by paging backwards through Pushshift.

    Each request asks for up to 80 comments created before a moving cutoff;
    after every page the cutoff is moved to the oldest ``created_utc`` seen so
    the next request continues further back in time.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    n_iter : int
        Maximum number of requests to send.
    epoch_right_now : int
        Epoch timestamp used as the initial ``before`` cutoff.
    sleep_seconds : int or float, default 60
        Pause after each request. Pass 0 to disable the pause.

    Returns
    -------
    pandas.DataFrame
        One row per comment with the columns ``subreddit``, ``body``,
        ``comment_type``, ``created_utc``, ``author`` and ``submission_title``.
        An empty frame with these columns is returned when nothing was fetched.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    # the subreddit travels in `params`, so the URL carries no query string
    base_url = 'https://api.pushshift.io/reddit/search/comment/'
    kept_columns = ['subreddit', 'body', 'comment_type', 'created_utc', 'author']
    # one DataFrame per successful request
    df_list = []
    # moving cutoff: only comments created before this epoch are requested
    current_time = epoch_right_now
    for _ in range(n_iter):
        res = requests.get(
            base_url,
            params={
                'subreddit': subreddit,
                'size': 80,
                'lang': True,
                'before': current_time,
            },
            # never hang indefinitely on an unresponsive server
            timeout=30,
        )
        # fail loudly on HTTP errors instead of on an unparsable body
        res.raise_for_status()
        records = res.json()['data']
        # an empty page means there is nothing older left to fetch
        if not records:
            break
        # copy so the column assignment below works on an independent frame
        df = pd.DataFrame(records)[kept_columns + ['permalink']].copy()
        # the sixth '/'-separated piece of the permalink is used as the
        # submission title; too-short permalinks yield NaN instead of raising
        df['submission_title'] = df['permalink'].str.split('/').str[5]
        df = df.drop(columns='permalink')
        df_list.append(df)
        # move the cutoff back to the oldest comment just retrieved
        current_time = int(df['created_utc'].min())
        # pause after every request (including the last) to space out calls
        if sleep_seconds > 0:
            time.sleep(sleep_seconds)
    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not df_list:
        return pd.DataFrame(columns=kept_columns + ['submission_title'])
    # return one df for all requests
    return pd.concat(df_list, axis=0)

In [48]:
%%time
# single-request trial run against r/sportsbook; the %%time magic above reports how long it takes
sportsbook = get_posts('sportsbook',1, 1616950200)

Wall time: 1.35 s


In [49]:
# confirm that get_posts returned a pandas DataFrame
type(sportsbook)

pandas.core.frame.DataFrame

In [58]:
# run 3 requests per subreddit and stack the results into one DataFrame
reddit_df = pd.concat(
    [get_posts(name, 3, 1616963400) for name in ('sportsbook', 'dfsports')]
)

In [59]:
# class-balance check: count how many comments came from each subreddit
# (the combined frame is written out to csv in a later cell)
reddit_df['subreddit'].value_counts()

dfsports      240
sportsbook    240
Name: subreddit, dtype: int64

In [60]:
# save the combined comments to csv; the DataFrame index is written as well (to_csv default)
reddit_df.to_csv('reddit_comments.csv')

---

This section takes the four CSV files created above and combines them into one large DataFrame.

In [3]:
# load the first two comment exports
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('Comments/reddit_comments.csv', 'Comments/reddit_comments_1.csv')
)

In [4]:
# stack the first two exports row-wise into a single DataFrame
df = pd.concat([df1, df2])

In [5]:
# check the shape (rows, columns) of the combined frame
df.shape

(3200, 7)

In [6]:
# read in the 3rd export file
df3 = pd.read_csv('Comments/reddit_comments_2.csv')

In [7]:
# append the 3rd export; NOTE: df is rebuilt from itself, so re-running this cell appends df3 again
df = pd.concat([df, df3])

In [8]:
# re-check the shape after adding the 3rd export
df.shape

(4800, 7)

In [9]:
# read in the 4th export file
df4 = pd.read_csv('Comments/reddit_comments_3.csv')

In [10]:
# append the 4th export; NOTE: df is rebuilt from itself, so re-running this cell appends df4 again
df = pd.concat([df, df4])

In [11]:
# re-check the shape after adding the 4th export
df.shape

(6400, 7)

In [12]:
# check class balance: number of rows per subreddit
df['subreddit'].value_counts()

dfsports      3200
sportsbook    3200
Name: subreddit, dtype: int64

In [13]:
# export the combined DataFrame to the Comments folder
df.to_csv('Comments/reddit_comments_final.csv')