# Step 1: Data Scraping

## 0. Import libraries

In [56]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. Load credentials

This loads the `credentials.json` file in each of our local repos.

In [57]:
# Relative location of the JSON file that holds the Reddit app and account credentials
credentials_file_path = "../credentials.json"

# Parse the JSON content of that file into the `credentials` variable
with open(credentials_file_path, mode="r") as credentials_file:
    credentials = json.load(credentials_file)

## 2. Obtaining a token

In [58]:
# A single session object, reused for the requests below
s = r.Session()

# The app's client id and secret are sent as HTTP Basic authentication
client_auth = r.auth.HTTPBasicAuth(
    credentials["app_client_id"],
    credentials["app_client_secret"],
)

# Form fields for the HTTP POST: the "password" grant type plus the Reddit account login
post_data = {
    "grant_type": "password",
    "username": credentials["reddit_username"],
    "password": credentials["reddit_password"],
}

# Reddit's API asks clients to identify themselves in the User-Agent header
headers = {
    "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
}

In [59]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST: app credentials go in as basic auth, account credentials as form data
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)

# Fail loudly on an HTTP error instead of carrying a bad response into the next cells
response.raise_for_status()

# Display the payload with the token masked: cell outputs are saved inside the
# notebook file, so showing the raw response would leak a usable bearer token
{key: ("<redacted>" if key == "access_token" else value)
 for key, value in response.json().items()}

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [60]:
# Parse the token response once and make sure it actually contains a token.
# If the request was rejected, the payload has no 'access_token' key and a bare
# lookup would only raise an unexplained KeyError.
token_payload = response.json()
if "access_token" not in token_payload:
    raise RuntimeError(f"Reddit did not return an access token: {token_payload}")

my_token = token_payload["access_token"]

From now on, every request must be sent with these HTTP headers:

In [61]:
# Build the two header values separately, then assemble the dict sent with authenticated requests
authorization_value = f"bearer {my_token}"
user_agent_value = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"

headers = {
    "Authorization": authorization_value,
    "User-Agent": user_agent_value,
}

## 3. Sending our first request

We will limit our search to 3 posts first, to test whether our GET request works.

In [62]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

# Query for posts carrying the flair, newest first, 3 results per page.
# restrict_sr=1 keeps the search inside the subreddit named in the URL.
params = {'q': f'flair_name:"{flair_name}"',
          'limit': 3,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# Stop here on an HTTP error (e.g. an expired token or rate limiting) rather
# than failing later with a confusing KeyError when the JSON is parsed
response.raise_for_status()

# Uncomment to inspect the raw payload:
# response.json()

We will first fetch 3 pages in total (the page requested above plus 2 more), before increasing the number of pages or paginating to the end.

In [63]:
# Initialize an empty list to store the posts from all pages
all_data = []

# Page 1 was already fetched by the previous request
data = response.json()
all_data.extend(data['data']['children'])

# Number of additional pages to request after the first one.
# To paginate to the end instead, loop `while data['data']['after'] is not None:`
N_EXTRA_PAGES = 2

for i in range(N_EXTRA_PAGES):
    after_id = data['data']['after']

    # `after` is None once the listing is exhausted. requests drops None-valued
    # params, so asking again would return the first page and add duplicates.
    if after_id is None:
        print("No more pages available")
        break

    params["after"] = after_id
    print(f"Requesting Page {i+2}")
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

    # Surface HTTP errors immediately instead of as a KeyError on the payload
    response.raise_for_status()
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [64]:
# Sanity check: 3 pages were requested with limit=3 each, so at most 9 posts are expected
len(all_data)

9

## 4. Saving the data to JSON

In [65]:
# Write the collected posts to disk as a single JSON array
output_file_path = "../data/all_data_flair_is_recipe.json"

with open(output_file_path, mode="w") as output_file:
    json.dump(all_data, fp=output_file)