In [None]:
### NOTES ###
# This code is far from perfect. It works well enough for the purpose it was designed for.
# However, I am far from the best programmer, and am far more comfortable in R than Python. 
# That is all just to say that this is not textbook code, but I hope it is good enough to help someone scrape some data.
# If you do see errors here or in the other scripts, please reach out to me at matt.meister@colorado.edu

## The best way to use this to learn: ##
# The best way is to take code out of the for loops it is in, and run things piece by piece
# Do so with the REI website up alongside, preferably while able to see the source code (i.e., right click "inspect")

## Citing ##
# If you use this code or data in a way that feels necessary to cite, please do so as:
# Meister, M., & Reinholtz, N. (2022). Quality in Context: Evidence that Consumption Context Influences User-Generated Product Ratings. Available at SSRN 4155522.
# The most updated version of that paper can be found here: https://drive.google.com/file/d/1BfDzIxTsCtQOMRwkbAlK12xDKWDXSbSN/view?usp=sharing

In [1]:
# Start by importing necessary stuff
import json # A lot of data online is stored as jsons
import pandas as pd
import re # Regular expressions, to help us parse html, json.
import requests # To pull internet pages
import time # To allow us to "sleep" the scraping, so as not to overwhelm the servers
import os # To create folders
import math
from bs4 import BeautifulSoup # To make html easier to parse

In [3]:
# Give yourself something to not look like a bot
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'}

# Scrape categories
# The timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops right here with a clear error if REI refuses the
# request, instead of quietly ending up with an empty list of categories.
categories_page = requests.get('https://www.rei.com/categories', headers = headers, timeout = 30)
categories_page.raise_for_status()
html = BeautifulSoup(categories_page.text, 'html.parser')

# Find overarching categories.
# I don't want them, since they'll lead to a lot of duplicate scraping
larger_list = html.find_all('div', class_="all-categories__parent-header")
larger_categories = [] # This is where we'll put overarching categories.
for tag in larger_list:
    parent_link = tag.find('a') # Look the link up once; headers without one are skipped
    if parent_link:
        larger_categories.append("https://www.rei.com" + parent_link.get('href', ''))

# Find all categories
product_list = html.find_all('a', class_='cdr-link_13-5-3 cdr-link--standalone_13-5-3')

# Keep only the category name (the part of the link after "/c/"), leaving out:
#   - links that are not category pages (they would produce broken URLs and file names later)
#   - overarching categories
#   - categories we already have (no point scraping the same one twice)
products = [] # We'll put products in here
for link in product_list:
    href = link.get('href', '') # .get() so a link without an href does not raise a KeyError
    if not href.startswith('/c/'):
        continue
    if "https://www.rei.com" + href in larger_categories:
        continue
    category = href[len('/c/'):]
    if category and category not in products:
        products.append(category)

# Set up the rest of the scraping!
# Set the base url to call reviews from their hosting site
url = 'https://api.bazaarvoice.com/data/batch.json'
# (The browser-like `headers` defined near the top of this cell are what keep these requests from looking like a bot.)
    
# Function to create a directory if it doesn't exist
def create_directory(directory):
    """Make sure the folder `directory` exists, and print what was done.

    Parameters
    ----------
    directory : str
        Path of the folder to create. Missing parent folders are created too,
        so nested paths such as 'output/PRICES' work.

    Raises
    ------
    FileExistsError
        If `directory` already exists but is a file rather than a folder.
        Failing here is clearer than failing later when a CSV is written into it.
    """
    if os.path.isdir(directory):
        print(f"Directory '{directory}' already exists.")
        return
    # exist_ok=True: do not fail if the folder appears between the check above and this call
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created.")

# Make sure every output folder is there before anything gets written into it
for folder in ('PRICES', 'JSONs', 'QUESTIONS', 'REVIEWS', 'OVRs'):
    create_directory(folder)

# ---------------------------------------------------------------------------
# Settings and small helpers used by the scraping loop below
# ---------------------------------------------------------------------------

# Passkey for the reviews API. This changes semi-frequently, so it can also be
# supplied through the REI_BV_PASSKEY environment variable without editing the code.
# TO GET PARAMETER INFO:
# 1) GO TO A PRODUCT ON REI.COM
# 2) RIGHT CLICK
# 3) INSPECT ELEMENT (ON SAFARI)
# 4) GO TO NETWORK > BATCH.JSON > HEADERS
BV_PASSKEY = os.environ.get('REI_BV_PASSKEY', 'thvpbov9ywkkl4nkhbeq0wm1i')

REQUEST_TIMEOUT = 30 # Seconds to wait for an answer, so a stalled connection cannot hang the scrape
PAGE_PAUSE = 1 # Seconds to rest after each results page
API_PAUSE = 8 # Seconds to rest after each API call, so that you don't overload REI

# Patterns are compiled once here rather than once per product tile.
# The second group of the href pattern is the product id, whatever its length.
HREF_PATTERN = re.compile(r'href="(/product/(\d+)/[a-zA-Z0-9-]+)"')
# Prices may contain a thousands separator ("$1,299.00"), so commas are allowed.
FULL_PRICE_PATTERN = re.compile(r'data-ui="full-price">(\$[\d,.]+)')
SALE_PRICE_PATTERN = re.compile(r'data-ui="sale-price">(\$[\d,.]+)')
COMPARE_PRICE_PATTERN = re.compile(r'data-ui="compare-at-price">(\$[\d,.]+)')

PRICE_COLUMNS = ['pid', 'href', 'full_price', 'sale_price', 'compare_price']


def _first_group(pattern, text):
    """Return what `pattern` captures first in `text`, or 'NA' when it does not match."""
    found = pattern.search(text)
    return found.group(1) if found else 'NA'


def _get_listing(product, page=None):
    """Download one results page of category `product` and return it parsed.

    Returns None (after printing why) when the page could not be downloaded.
    Always rests for PAGE_PAUSE seconds afterwards.
    """
    listing_url = f'https://www.rei.com/c/{product}?pagesize=90'
    if page is not None:
        listing_url += f'&page={page}'
    try:
        response = requests.get(listing_url, headers = headers, timeout = REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Could not load {listing_url}: {error!r}")
        return None
    finally:
        time.sleep(PAGE_PAUSE)
    return BeautifulSoup(response.text, 'html.parser')


def _count_result_pages(listing_html):
    """Work out how many results pages a category has from its results counter.

    The second number in the counter is used as the page size and the third as
    the total. Falls back to a single page when the counter is missing or
    cannot be read that way.
    """
    counter = listing_html.find('div', {'data-id': 'pagination-test-count'})
    if counter is None:
        return 1
    # Commas are accepted inside a number in case the counter uses thousands separators
    numbers = [int(number.replace(',', ''))
               for number in re.findall(r'\b\d[\d,]*\b', counter.get_text(strip=True))]
    if len(numbers) < 3 or numbers[1] == 0:
        return 1
    return math.ceil(numbers[2] / numbers[1])


def _parse_jsonp(text):
    """Take the JSON out of a `JSONPHandler(...)` answer and decode it.

    Raises ValueError when `text` has no parentheses or the content is not valid JSON.
    """
    return json.loads(text[text.index('(') + 1:text.rindex(')')])


def _field(record, *keys):
    """Follow `keys` into the nested dict `record`.

    Returns 'NA' when a key is missing along the way or the value found is an
    empty dict; otherwise returns the value itself.
    """
    value = record
    for key in keys:
        if not isinstance(value, dict) or key not in value:
            return 'NA'
        value = value[key]
    return 'NA' if value == {} else value


# ---------------------------------------------------------------------------
# The scraping loop: one pass per category
# ---------------------------------------------------------------------------
for product in products:
    price_rows = [] # One dict per product tile found in this category

    # Get the first results page, to see how many pages we have
    first_listing = _get_listing(product)
    if first_listing is None:
        continue # Nothing can be scraped for this category right now; move on

    # Go through each page, scrape all product links and prices
    for page in range(1, _count_result_pages(first_listing) + 1):
        listing = _get_listing(product, page)
        if listing is None:
            continue

        for html_content in listing.find_all('li', class_ = "VcGDfKKy_dvNbxUqm29K"):
            html_product = str(html_content)
            href_match = HREF_PATTERN.search(html_product)
            price_rows.append({
                'pid': href_match.group(2) if href_match else 'NA',
                'href': href_match.group(1) if href_match else 'NA',
                'full_price': _first_group(FULL_PRICE_PATTERN, html_product),
                'sale_price': _first_group(SALE_PRICE_PATTERN, html_product),
                'compare_price': _first_group(COMPARE_PRICE_PATTERN, html_product),
            })

    # Create the data frame of prices for each category, and write it.
    pricesDF = pd.DataFrame(price_rows, columns = PRICE_COLUMNS)
    pricesDF.to_csv(f'PRICES/REI_{product}_prices.csv')

    # Products to look up: every real id once, in the order found.
    # Tiles without a product link ('NA') have nothing to look up.
    pids = list(dict.fromkeys(row['pid'] for row in price_rows if row['pid'] != 'NA'))

    # Now the good stuff! Call the API to get reviews, questions, other stuff
    # NOTE(review): each product is asked for once, with 'limit.q1' = 20 questions and
    # 'limit.q2' = 100 reviews starting at offset 0, so anything beyond that is not fetched.
    for pid in pids:
        params = {
            'passkey': BV_PASSKEY,
            'apiversion': '5.5',
            'displaycode': '15372-en_us',
            'resource.q0': 'products',
            'filter.q0': f'id:eq:{pid}',
            'stats.q0': 'questions,reviews',
            'filteredstats.q0': 'questions,reviews',
            'filter_questions.q0': 'contentlocale:eq:en*,en_US',
            'filter_answers.q0': 'contentlocale:eq:en*,en_US',
            'filter_reviews.q0': 'contentlocale:eq:en*,en_US',
            'filter_reviewcomments.q0': 'contentlocale:eq:en*,en_US',
            'resource.q1': 'questions',
            'filter.q1': f'productid:eq:{pid}',
            'sort.q1': 'totalanswercount:desc',
            'stats.q1': 'questions',
            'filteredstats.q1': 'questions',
            'include.q1': 'authors,products,answers',
            'filter_questions.q1': 'contentlocale:eq:en*,en_US',
            'filter_answers.q1': 'contentlocale:eq:en*,en_US',
            'limit.q1': '20',
            'offset.q1': '0',
            'limit_answers.q1': '10',
            'resource.q2': 'reviews',
            'filter.q2': f'productid:eq:{pid}',
            'sort.q2': 'submissiontime:desc',
            'stats.q2': 'reviews',
            'filteredstats.q2': 'reviews',
            'include.q2': 'authors,products,comments',
            'filter_reviews.q2': 'contentlocale:eq:en*,en_US',
            'filter_reviewcomments.q2': 'contentlocale:eq:en*,en_US',
            'filter_comments.q2': 'contentlocale:eq:en*,en_US',
            'limit.q2': '100',
            'offset.q2': '0',
            'limit_comments.q2': '20',
            'callback': 'JSONPHandler'
        }

        # Get that stuff!
        # One bad answer should not end the whole scrape, so it is reported and skipped.
        # If this happens for every product, check the passkey first.
        try:
            req = requests.get(url, headers = headers, params = params, timeout = REQUEST_TIMEOUT)
            batched_results = _parse_jsonp(req.text)['BatchedResults']
            product_results = batched_results['q0']['Results']
            question_results = batched_results['q1']['Results']
            review_results = batched_results['q2']['Results']
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            print(f"Skipping product {pid}: could not read the API answer ({error!r})")
            time.sleep(API_PAUSE)
            continue

        if product_results: # The API knows this product
            # Write the raw JSON so you can use it later if needed
            products_json = product_results[0]
            with open(f'JSONs/REI_{pid}_batched_results.json', 'w') as outfile:
                json.dump(products_json, outfile)

            # Scrape questions to use if desired
            questions = pd.DataFrame(question_results)
            questions.to_csv(f'QUESTIONS/REI_{pid}_questions.csv')  # Write them as csv

            # Scrape reviews
            reviews = pd.DataFrame(review_results)
            reviews.to_csv(f'REVIEWS/REI_{pid}_reviews.csv')

            # Product overall information (one row; 'NA' wherever the API gave nothing)
            ovr = pd.DataFrame([{
                'Name': _field(products_json, 'Name'),
                'Active': _field(products_json, 'Active'),
                'Brand': _field(products_json, 'Brand', 'Id'),
                'reviews': _field(products_json, 'TotalReviewCount'),
                'pid': f'{pid}',
                'ovr': _field(products_json, 'ReviewStatistics', 'AverageOverallRating'),
                'desc': _field(products_json, 'Description'),
                'ProductPageUrl': _field(products_json, 'ProductPageUrl'),
                'category': f'{product}',
            }])
            ovr.to_csv(f'OVRs/REI_{pid}_ovr.csv')

        # Rest after every API call, including the ones that returned nothing,
        # so that you don't overload REI
        time.sleep(API_PAUSE)

Directory 'PRICES' already exists.
Directory 'JSONs' already exists.
Directory 'QUESTIONS' already exists.
Directory 'REVIEWS' already exists.
Directory 'OVRs' already exists.
