# Reddit API Data Collection
###### By: Nick Gayliard

In [3]:
import requests
import time
import pandas as pd
import numpy as np
import re
import json
import pdb

### GET requests

In [4]:
url = 'https://www.reddit.com/r/nba.json'

req = requests.get(url)

In [5]:
req

<Response [429]>

https://httpstatuses.com/429

### Requests with parameters / queries

The reddit API returned a 429 (Too Many Requests) error because our request did not set a 'User-agent' header. For the reddit API that value can be anything. Requirements like this differ from API to API, and some APIs need no extra headers at all. Many APIs will require a private key, given to you by the company. Be sure to PROTECT your API keys, especially ones attached to bank accounts / credit cards (e.g. Amazon Web Services and Google API keys).

In [6]:
req = requests.get(url, headers = {'User-agent' : 'Nick'})

In [7]:
req.status_code

200

In [8]:
req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\\n\\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? \\"Kobe Br

#### Sample URL with a query

In [9]:
req2 = requests.get(url, headers = {'User-agent' : 'Nick'}, params = {'after' : 't3_bor3tn'})

In [10]:
req2.url

'https://www.reddit.com/r/nba.json?after=t3_bor3tn'

##### Everything after the '?' symbol in the URL is a query for specific information from the API. You need to check the API documentation to see what variables you can use to grab what information.

In [11]:
req2.url

'https://www.reddit.com/r/nba.json?after=t3_bor3tn'

In [12]:
req2.headers

{'Content-Type': 'application/json; charset=UTF-8', 'x-ua-compatible': 'IE=edge', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'set-cookie': 'loid=000000000040rxcmlv.2.1561579925287.Z0FBQUFBQmRFOUdWUzUxTjhBb3BlYnBTSGRkZlE4ZFBVREF5LUhoUjV3MDJObjBCUHd1RTMwLW9remdjclVCRUVtSVlQbUhjU05QTG9saFBsZlhmeEJISnFiSnNKbkZWdU9pNEJOVkh2RTN6OGJsZE5XRU52LVB1LURNSFFCc0JBTHRyTHhJUmJuZGU; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Fri, 25-Jun-2021 20:12:05 GMT; secure, session_tracker=OBymXmLJBJZ3p2yJTG.0.1561579925287.Z0FBQUFBQmRFOUdWbTFLNkpOODFtMjNQUFNfM2t0V0ZjeF81czhPTDhMMmQybHFfTjRKZHhQZ1Q5bUZ1bnB6ME1DUlVTTzB4TjdWZkNjUzhWSTRqaENBa2FDYzdzOXFmSXhvV3JFSzVYdF9ESEM1blFseVhpak5yemxFV25FT0dFQUhubTlnOElab2c; Domain=reddit.com; Max-Age=7199; Path=/; expires=Wed, 26-Jun-2019 22:12:05 GMT; secure, edgebucket=JNWUuD5Mkye0VOhPKS; Domain=reddit.com; Max-Age=63071999; Path=/;  secure', 'access-control-allow-origin': '*', 'access-control-expose-he

### Another reason to not use pd.read_json()

In [97]:
req.text

'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Danny Green on why Marc Gasol didn\'t have a speech at the Raptors parade: \\"He\'s like \'I\'m drunk bro, that\'s my song!\' The real Memphis came out in Marc when he started drinking. I think he bit me at one point too. That\'s when I told Matt, \'You can\'t give Marc the mic. He might say something crazy.\\"](https://streamable.com/73m1y) | [(Comments)](https://reddit.com/r/nba/comments/c5crq0)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydim

In [95]:
df = pd.read_json(req.text)

In [96]:
df

Unnamed: 0,kind,data
after,Listing,t3_c5g63a
before,Listing,
children,Listing,"[{'kind': 't3', 'data': {'approved_at_utc': No..."
dist,Listing,27
modhash,Listing,


In [81]:
json.loads(req.content).keys()

dict_keys(['kind', 'data'])

### Let's check out our request content

In [82]:
# .content is the raw response body as a bytes object (note the b'...' prefix in the output):
# it has not been decoded to text or parsed as JSON yet

req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Danny Green on why Marc Gasol didn\'t have a speech at the Raptors parade: \\"He\'s like \'I\'m drunk bro, that\'s my song!\' The real Memphis came out in Marc when he started drinking. I think he bit me at one point too. That\'s when I told Matt, \'You can\'t give Marc the mic. He might say something crazy.\\"](https://streamable.com/73m1y) | [(Comments)](https://reddit.com/r/nba/comments/c5crq0)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydi

#### Convert it to json and navigate through the json to the data we want

In [13]:
page_pull = req.json()

In [14]:
page_pull

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 27,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'nba',
     'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Top Highlights:\n\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\n\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\n\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\n\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? "Kobe Bryan

In [15]:
page_pull.keys()

dict_keys(['kind', 'data'])

In [16]:
page_pull['data']

{'modhash': '',
 'dist': 27,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'nba',
    'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Top Highlights:\n\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\n\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\n\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\n\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? "Kobe Bryant"](https://streamable.com/yocnw) 

In [18]:
page_pull['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [21]:
page_pull['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'nba',
   'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Top Highlights:\n\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\n\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\n\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\n\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? "Kobe Bryant"](https://streamable.com/yocnw) | [(Comments)](https://reddit.com/r/nba/comme

In [23]:
page_pull['data']['children'][7]['data'].keys()

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'banned_by', 'author_flair_type', 'domain', 'selftext_html', 'likes', 'suggested_sort', 'banned_at_utc', 'view_count', 'archived', 'no_follow', 'is_crosspostable', 'pinned', 'over_18', 'all_awardings', 'media_only', 'can_gild'

In [18]:
len(page_pull['data']['children'])

27

Fields of interest in each post: name, subreddit, selftext, title, num_comments, url, score

In [54]:
# When you are indexing deeply into json, it can help to make variable names for certain levels of indexing
# that you plan on reusing, to improve readability and make sure you don't make indexing errors as often

post_list = page_pull['data']['children']

In [55]:
post_list[1].keys()

dict_keys(['kind', 'data'])

In [56]:
for post in post_list:
    print(post['data']['name'])

t3_c28033
t3_c4zm74
t3_c4u2cf
t3_c4uf08
t3_c4polz
t3_c4wanm
t3_c4ocgk
t3_c4mvl6
t3_c4zrki
t3_c4u6tc
t3_c4r087
t3_c4st9t
t3_c4qd00
t3_c4q2ju
t3_c4t4fy
t3_c4q2pw
t3_c4wcsj
t3_c4zx0k
t3_c4tz31
t3_c4slyu
t3_c4uoi0
t3_c4qp00
t3_c4nz6o
t3_c4v582
t3_c4n7cv
t3_c4x8yd
t3_c4q85n


In [57]:
post_list[0]['data']['title']

'2019-20 NBA Free Agents and Team Roster Tracker'

### Scrape and build a dictionary to make a dataframe

In [59]:
# Sloppy way! Too much indexing in loop: the same post_list[count]['data'] lookup is
# spelled out three times on one line, which is hard to read and easy to mistype.
# Result: {post name: [title, num_comments]}

post_dict = {}

for count, post in enumerate(post_list):
    post_dict[post_list[count]['data']['name']] = [post_list[count]['data']['title'], post_list[count]['data']['num_comments']]

In [60]:
# CLEAN WAY - using an indexer variable!!
# Builds {post name: [title, num_comments]}; the nested 'data' dict is looked up once
# per post and then reused for every field.

post_dict = {}

for post in post_list:
    post_indexer = post['data']
    post_dict[post_indexer['name']] = [post_indexer['title'], post_indexer['num_comments']]

In [61]:
# One row per post (orient='index') instead of building a column per post and transposing:
# the transpose turns every column into object dtype, whereas this keeps num_comments numeric.
df = pd.DataFrame.from_dict(post_dict, orient='index', columns=['title', 'num_comments'])
df

Unnamed: 0,title,num_comments
t3_c28033,2019-20 NBA Free Agents and Team Roster Tracker,499
t3_c4zm74,[GAME THREAD] NBA Awards,1224
t3_c4u2cf,[Jalen Rose] Paul Pierce I just ran out of TP ...,528
t3_c4uf08,[Wojnarowski] Portland is trading Evan Turner ...,770
t3_c4polz,"Charles Barkley on the Ballfather: ""Wherever B...",606
t3_c4wanm,"Got destroyed by kids today, just wanted to sa...",144
t3_c4ocgk,Shaq dreamshaking Michael Jordan out of his shoes,620
t3_c4mvl6,[Lewenberg] Canada Basketball will make it off...,430
t3_c4zrki,[Golliver] Mavericks' Luka Doncic named Rookie...,139
t3_c4u6tc,Kawhi's game-winner gets the Lego treatment,76


## Put it in a function!

In [None]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

headers = {'User-agent' : 'Nick'}

def scraper_bike(url, pages=40, pause=.2):
    """Collect posts from a reddit ``.json`` listing, following its pagination.

    Each request passes the previous page's ``after`` cursor so the next page
    is returned. Requests are sent with the module-level ``headers`` dict.

    Parameters
    ----------
    url : str
        A reddit listing url ending in ``.json``.
    pages : int, default 40
        Maximum number of pages to request.
    pause : float, default 0.2
        Seconds to wait between consecutive requests.

    Returns
    -------
    list
        The ``children`` entries of every page fetched, in request order.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status (e.g. 429).
    """
    posts = []
    after = None  # no cursor yet; requests leaves None-valued params out of the query

    for page in range(pages):
        pagepull = requests.get(url = url, params = {'after' : after}, headers = headers)
        # fail loudly on an error status instead of trying to parse an error body as a listing
        pagepull.raise_for_status()

        listing = pagepull.json()['data']
        posts.extend(listing['children'])

        after = listing['after']
        if after is None:
            # no next-page cursor: another request would go out without 'after' and
            # fetch the first page again, duplicating posts
            break

        # sleep is a best practice (probably not necessary for such a small scrape)
        time.sleep(pause)

    return posts

In [None]:
nba_post_list = scraper_bike('https://www.reddit.com/r/nba.json')

In [None]:
len(nba_post_list)

In [None]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    """Convert scraped posts into a DataFrame indexed by each post's 'name'.

    Parameters
    ----------
    post_list : list of dict
        Posts as found under ``['data']['children']`` of a listing; each item
        must have a 'data' dict with 'name', 'subreddit', 'title' and 'selftext'.

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'title', 'selftext', one row per distinct 'name'.
        When a name occurs more than once, the row keeps the position of its
        first occurrence and the values of its last. An empty ``post_list``
        gives an empty frame with the same three columns.
    """
    columns = ['subreddit', 'title', 'selftext']

    # keyed by 'name' so repeated posts collapse into a single row
    rows_by_name = {}
    for post in post_list:
        fields = post['data']
        rows_by_name[fields['name']] = [fields[col] for col in columns]

    if not rows_by_name:
        # nothing scraped: still hand back a frame with the expected columns
        return pd.DataFrame(columns=columns)

    return pd.DataFrame.from_dict(rows_by_name, orient='index', columns=columns)

In [None]:
posts_to_df(nba_post_list)

## Couple extra functions for simplicity in running

In [None]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Run ``scrape_func`` on ``url`` and return the scraped posts as a DataFrame.

    ``scrape_func`` is called with ``url`` as its only argument; whatever it
    returns is passed straight to ``posts_to_df``.
    """
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

### Function to scrape and save to csv. HIGHLY recommended whenever you gather data online: keep a copy locally (and remotely too, if you want to be secure)

In [None]:
# NOTE: THE CSV TO IMPORT DOES NOT HAVE TO EXIST YET.
# IF IT IS MISSING, THE FRESH SCRAPE IS SAVED ON ITS OWN.

# scrape, import csv, concat, drop duplicate, and output to csv

# takes in scraper function, url, csv filename to import, csv filename to output

# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    """Scrape ``url``, merge the result with previously saved posts, and save to csv.

    Parameters
    ----------
    scrape_func : callable
        Called as ``scrape_func(url)``; its result is passed to ``posts_to_df``.
    url : str
        Url handed to ``scrape_func``.
    import_file : str
        Csv of earlier results. Its index is read from the column pandas
        labels 'Unnamed: 0'. If the file does not exist, only the new scrape
        is written.
    export_file : str
        Csv path the combined frame is written to.

    Returns
    -------
    None
        The result is written to ``export_file``.
    """
    scrape_df = posts_to_df(scrape_func(url))

    try:
        imported_df = pd.read_csv(import_file, index_col = 'Unnamed: 0')
    except FileNotFoundError:
        # first run: nothing saved yet, so the scrape itself is the whole dataset
        concat_df = scrape_df
    else:
        # saved rows go first so that keep='first' below prefers them over re-scraped copies
        concat_df = pd.concat([imported_df, scrape_df])

    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]

    concat_df.to_csv(export_file)