# About this notebook:

This is where we use the requests library and pandas to retrieve our data from the pushshift api, and then save it as a csv file.

Endpoints can be found here:
https://pushshift.io/api-parameters/

### Table of Contents:
- [Imports and Useful Functions](#Imports-and-Useful-Functions)
- [Scraping /r/DadJokes](#Scraping-/r/DadJokes)
- [Scraping /r/Jokes](#Scraping-/r/Jokes)
- [Scraping /r/MommaJokes](#Scraping-/r/MommaJokes)
- [Scraping /r/CleanJokes](#Scraping-/r/CleanJokes)
- [Scraping /r/DirtyJokes](#Scraping-/r/DirtyJokes)

# Imports and Useful Functions

In [1]:
# importing the requeststs library and pandas

import requests
import pandas as pd

In [2]:
# scrape_reddit function:
# input is url for endpoint to scrape, and parameters for that endpoint
# output is dataframe of subreddit submissions

def scrape_reddit(url, params):
    res = requests.get(url, params)
    # 200 means the website gave us what we wanted and everything is ok
    print('Status code: ', res.status_code)

    # saving our requested data as a variable
    data = res.json()

    # creating a list of dictionaries, each dictionary is one post
    posts = data['data'] 
    print('You have a list of ', len(posts), ' posts.')
    
    #youngest submission utc
    youngest = posts[0]['created_utc']
    print('Your most recent submission was created ', youngest)
    
    # oldest submission utc
    oldest = posts[-1]['created_utc']
    print('Your oldest submission was created ', oldest)
    
    # turn it into a dataframe!
    df = pd.DataFrame(posts)
    
    return df, youngest, oldest


In [3]:
# joins multiple scrapes together into one dataframe

def merge_reddit_dfs(list_of_dfs):
    # getting the columns that are common to both dataframes
    list_of_cols = [df.columns for df in list_of_dfs]
    common_cols = set(list_of_cols[0]).intersection(*list_of_cols)  # the star does list expansion, gets rid of the [] in list
    
    # getting list of the dataframes with just those common columns
    list_of_dfs_common_cols = [df[common_cols] for df in list_of_dfs]
    
    #merging them all together
    merged_df = pd.concat(list_of_dfs_common_cols)
    
    return merged_df

In [1]:
# scrapes N entries from a subreddit and joins them together into one dataframe
# N should ideally be a multiple of 500 
# subreddit should be string name of the subreddit

def scrape_N_subs(N, subreddit):
    # will put each scrape in this list then merge them
    list_of_dfs = []
    
    # initial scrape
    sub_url = 'https://api.pushshift.io/reddit/search/submission'
    params = {
        'subreddit' : subreddit,
        'size' : 500
    }
    first_scrape = scrape_reddit(sub_url, params)
    list_of_dfs.append(first_scrape[0])
    
    # next scrapes
    n = int(N/500)
    for i in range(n-1):
        params = {'subreddit' : subreddit,
                  'size' : 500,
                  'before': first_scrape[2]
        }
        if first_scrape[0].shape[0] < 500:
            list_of_dfs.append(first_scrape[0])
            big_df = merge_reddit_dfs(list_of_dfs)
            return big_df
        
        else:
            next_scrape = scrape_reddit(sub_url, params)
            first_scrape = next_scrape
            list_of_dfs.append(first_scrape[0])
    
    # merge them all 
    big_df = merge_reddit_dfs(list_of_dfs)
    
    return big_df

# Scraping /r/DadJokes

In [7]:
dad_jokes_5000 = scrape_N_subs(5000, 'dadjokes')

Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587602951
Your oldest submission was created  1587391477
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587391373
Your oldest submission was created  1587156339
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587155720
Your oldest submission was created  1586922099
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1586921791
Your oldest submission was created  1586702675
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1586702508
Your oldest submission was created  1586478853
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1586478328
Your oldest submission was created  1586262381
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1586262341
Your

In [8]:
# Checking the shape
dad_jokes_5000.shape

(5000, 62)

In [9]:
# saving as a csv file
dad_jokes_5000.to_csv('../data/dad_jokes_5000.csv', index = False)

# Scraping /r/Jokes

In [10]:
jokes_5000 = scrape_N_subs(5000, 'jokes')

Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587603167
Your oldest submission was created  1587537841
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587537787
Your oldest submission was created  1587475986
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587475983
Your oldest submission was created  1587404729
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587404601
Your oldest submission was created  1587338631
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587338617
Your oldest submission was created  1587279167
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587279084
Your oldest submission was created  1587206841
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587206735
Your

In [12]:
jokes_5000.shape

(5000, 60)

In [13]:
# saving as a csv file
jokes_5000.to_csv('../data/jokes_5000.csv', index = False)

# Scraping /r/MommaJokes

In [11]:
momma_jokes_5000 = scrape_N_subs(5000, 'mommajokes')

Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587647159
Your oldest submission was created  1446059155
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1446056672
Your oldest submission was created  1441775887
Status code:  200
You have a list of  186  posts.
Your most recent submission was created  1441480002
Your oldest submission was created  1393611335


In [14]:
momma_jokes_5000.shape

(1372, 31)

In [15]:
momma_jokes_1372 = momma_jokes_5000

In [16]:
# saving as a csv file
momma_jokes_1372.to_csv('../data/momma_jokes_1372.csv', index = False)

# Scraping /r/CleanJokes

In [55]:
clean_jokes_5000 = scrape_N_subs(5000, 'cleanjokes')

Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587686893
Your oldest submission was created  1576873832
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1576870290
Your oldest submission was created  1564617422
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1564613409
Your oldest submission was created  1556620361
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1556616051
Your oldest submission was created  1549313207
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1549292959
Your oldest submission was created  1540182881
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1540182837
Your oldest submission was created  1531015800
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1530968228
Your

In [56]:
clean_jokes_5000.shape

(5000, 39)

In [57]:
# saving as a csv file
clean_jokes_5000.to_csv('../data/clean_jokes_5000.csv', index = False)

# Scraping /r/DirtyJokes

In [58]:
dirty_jokes_5000 = scrape_N_subs(5000, 'dirtyjokes')

Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1587691384
Your oldest submission was created  1579088194
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1579075794
Your oldest submission was created  1570677946
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1570673793
Your oldest submission was created  1562854506
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1562845439
Your oldest submission was created  1554737922
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1554704805
Your oldest submission was created  1539950702
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1539884642
Your oldest submission was created  1513476202
Status code:  200
You have a list of  500  posts.
Your most recent submission was created  1513463725
Your

In [59]:
dirty_jokes_5000.shape

(4276, 24)

In [60]:
# saving as a csv file
dirty_jokes_5000.to_csv('../data/dirty_jokes_5000.csv', index = False)

# Comments

In [20]:
# getting first 500 comments 

url = 'https://api.pushshift.io/reddit/search/comment'

params = {
        'subreddit' : 'jokes',
        'size' : 1000,
        'nest_level' : 1,
        'after' : 1587537841,
        'before' : 1587603167
    }

jokes_comments = scrape_reddit(url, params)

Status code:  200
You have a list of  1000  posts.
Your most recent submission was created  1587537852
Your oldest submission was created  1587562910


In [21]:
jc_df = jokes_comments[0]
jc_df['link_id'].value_counts()


t3_g5s8fo    575
t3_g5thmx     66
t3_g5xr6u     18
t3_g5wsca     11
t3_g5ymcy     11
            ... 
t3_fvvcrf      1
t3_g5vthy      1
t3_g3478i      1
t3_g5s0of      1
t3_g5yvlp      1
Name: link_id, Length: 131, dtype: int64

In [22]:
jokes_5000 = pd.read_csv('../data/jokes_5000.csv')

In [24]:
linked_sub_com = []
for sub_id in jokes_5000['id']:
    for com_id in jc_df['link_id']:
        if sub_id in com_id:
            linked_sub_com.append(sub_id)

In [26]:
len(linked_sub_com)

994

In [29]:
com_id_no_t3 = []
for com_id in jc_df['link_id']:
    com_id_no_t3.append(com_id.strip('t3_'))

In [52]:
comment_col = []
for sub_id in set(linked_sub_com):
    label = 't3_'+ sub_id
    comment_body = jc_df.loc[jc_df['link_id'] == label,'body'].head(1)
    comment_body = list(comment_body)
    comment_col.append([sub_id, *comment_body])

126

In [38]:
comment_body = jc_df.loc[jc_df['link_id'] == 't3_g5vthy','body'].head(1)

In [47]:
things = list(stuff)

In [49]:
molly = []
molly.append(*things)
molly

['both bark at me']

In [None]:
train.loc[train['Pclass'] == 3, 'Fare']

In [None]:
jokes_comments.shape

In [None]:
cols = jokes_comments.columns

In [None]:
id_cols = []
for col in cols:
    if 'id' in col:
        id_cols.append(col)

In [None]:
id_cols

In [None]:
joke_cols = jokes.columns
joke_id_cols = []
for col in joke_cols:
    if 'id' in col:
        joke_id_cols.append(col)

In [None]:
joke_id_cols

In [None]:
jokes['id'].head()

In [None]:
jokes_comments[l5]

In [None]:
jokes_comments['link_id']

In [None]:
l5 = [('g5qyrw' in lid) for lid in jokes_comments['link_id']]

In [None]:
l5

# Comment id's for submissions:

In [None]:
# getting first 500 
url = 'https://api.pushshift.io/reddit/submission/comment_ids/{base36 submission id}'
params = {
        'subreddit' : 'dadjokes',
    }

dad_jokes_sub_com = scrape_reddit_subs(url, params)

In [None]:

params = {
        'subreddit' : 'Jokes',
    }
res = requests.get('https://api.pushshift.io/reddit/submission/comment_ids/{base36 submission id}', params)

In [None]:
res.status_code