# Step 1: Data Scraping

## 0. Import libraries

In [12]:
import os
import json
import requests as re

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. Load credentials

This loads the `credentials.json` file in each of our local repos.

In [13]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

# Obtaining a token

In [14]:
# Set up authentication parameters 
client_auth = re.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password", "username": credentials["reddit_username"], "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [15]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = re.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxMjk5MDkyLjc0MzE0MiwiaWF0IjoxNzAxMjEyNjkyLjc0MzE0MiwianRpIjoiOXhxZHhDSGpIUUIzVlFzLUlBTjFvUnljNm8yOFpnIiwiY2lkIjoiQmVvRVNfeUhwNDJXWXF0aUNBeHVhZyIsImxpZCI6InQyXzhwNHl1NzBrIiwiYWlkIjoidDJfOHA0eXU3MGsiLCJsY2EiOjE2MDQxNDY3NTU0MjQsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.AFTUynhbU5Wm-neyGQdEXpKs_GqJCkA2WUU2_VDyq1OmCwAhMvuztagX6eTM29czlFGbadNh7zthCABs-_KrtxlmnGUjdhRf_BBzUeb8sYecajNFiLadvyGHSIWqolTQzrCvYRkglLI-GvQnTUxanAvqb024FKXM_NPKnteLXP729CODCPhEDF8AwM0jS8m1zFTQSmDfJ27ZYvMyePW9eqm_qmwfCHmR__i51isyMpPde7bElkQv-ViTxRJmabs25RHmdfY057KAfScMpj_dr1xfZkGbll4W2lny3YSHvxMpwxCQUJFbS3Ho5DC0cX6uICZhCVqWZWAlYgSOEM48GA',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

saving our token

In [16]:
my_token = response.json()['access_token']

From now on, all my requests need to be followed by these HTTP HEADERS:

In [17]:
headers = {"Authorization": f"bearer {my_token}", "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

Sending our first request with the token for flair=Recipe

In [21]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'limit': 100, 'q': f'flair_name:"{flair_name}"'}
response = re.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/top", headers=headers, params=params)

response.json()

{'kind': 'Listing',
 'data': {'after': None,
  'dist': 1,
  'modhash': None,
  'geo_filter': '',
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'recipes',
     'selftext': '',
     'author_fullname': 't2_2elyzmmv',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': 'Quick &amp; Easy Nut Brittle',
     'link_flair_richtext': [{'e': 'text', 't': 'Recipe'}],
     'subreddit_name_prefixed': 'r/recipes',
     'hidden': False,
     'pwls': 6,
     'link_flair_css_class': 'recipe',
     'downs': 0,
     'top_awarded_type': None,
     'hide_score': False,
     'name': 't3_1866xrq',
     'quarantine': False,
     'link_flair_text_color': 'dark',
     'upvote_ratio': 0.94,
     'author_flair_background_color': None,
     'subreddit_type': 'public',
     'ups': 14,
     'total_awards_received': 0,
     'media_embed': {},
     'author_flair_template_id': None,
     'is_original_content': False,
     'use