# Step 1: Data Scraping

## 0. Import libraries

In [14]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. Load credentials

This loads the `credentials.json` file in each of our local repos.

In [15]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

# Obtaining a token

In [16]:
# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password", "username": credentials["reddit_username"], "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [17]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = r.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxMzA1NDkwLjEwMDUwMSwiaWF0IjoxNzAxMjE5MDkwLjEwMDUwMSwianRpIjoiSHBzZDg3U3RLQUlxd1ZqMEUxRGZNbGFfemwyajVnIiwiY2lkIjoibWhUbV82eEVUNzVkOWhmWkJrS0ZYQSIsImxpZCI6InQyXzE2ZmE0MiIsImFpZCI6InQyXzE2ZmE0MiIsImxjYSI6MTQ5MDI0NzcyNzAxMSwic2NwIjoiZUp5S1Z0SlNpZ1VFQUFEX193TnpBU2MiLCJmbG8iOjl9.MUA-g6lhG2c1xVMMqri-SFg4AFZTAiHL2slDYVy4QYo42d3jVtTiv_yMJGo6eVCXaRDhGPfGqYE1FjOYAJ9R0D5dzywSkfrh5c5d7U0Mt6r8cV9g4AT5m-rf-YMIjVZ9BMewHYXv-0Dz3I9uJKtFBtW7Dhr22QcgRC7Gl_5DxhXplT6vNyEqr4B7EcUAXozjzlNbyTSLe_ZLAoa_CG9DdaslhePsZNEOhfXoaRgYG9vUFP0X5KpnfdUpvabtJC0LjQtvr0IZWV3hL6-5d4kXFuiV-KS872sk-CwWZ9PvMNHzN3BfplR9VNPQBTpvSZY76oR4hgzT2pjCPtmZqilt2g',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [18]:
my_token = response.json()['access_token']

From now on, all my requests need to be followed by these HTTP HEADERS:

In [19]:
headers = {"Authorization": f"bearer {my_token}", "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

Sending our first request with the token for flair=Recipe

In [20]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'limit': 100, 'q': f'flair_name:"{flair_name}"'}
response = r.get(f"{BASE_ENDPOINT}/r/{subreddit_name}", headers=headers, params=params)

# response.json()


{'kind': 'Listing',
 'data': {'after': 't3_16ofo8m',
  'dist': 102,
  'modhash': None,
  'geo_filter': None,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'recipes',
     'selftext': 'Hello! Due to large amounts of blog spam and YT spam, we have included an automated message to remind users that we auto filter content. Send the mods a modmail when your formatted text recipe is posted and we can look at manually approving your post. Thanks !',
     'author_fullname': 't2_3fv3c05j',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': '[MOD PSA] All link posts require a formatted text recipe. All posts are autoremoved until mod approved.',
     'link_flair_richtext': [],
     'subreddit_name_prefixed': 'r/recipes',
     'hidden': False,
     'pwls': 6,
     'link_flair_css_class': None,
     'downs': 0,
     'thumbnail_height': None,
     'top_awarded_type': None,
     'hide_score': False,
     

paginating as limit is 100

In [21]:
# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data
while data['data']['after'] is not None:
    after_id = data['data']['after']
    response = r.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/top?limit=100&after={after_id}", headers=headers)
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])



saving the data to a json file

In [22]:
# Saving `data` to file called `.json`
with open("./../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data, f)