# Nike Footwear Scraping

Scrape footwear product information from the [official Nike website](https://www.nike.com/w/shoes-y7ok). The following information is obtained:
* Product title and subtitle
* Product special label (e.g. Best Seller, Coming Soon, Sustainable Materials, Member Access, etc.)
* Direct URL to the individual product page
* Prices (original and discounted)
* Product description
* Color choices / number of color choices
* Number of reviews
* Average rating for the product based on the reviews


In [1]:
from time import sleep
import json
import re
import requests

from selenium import webdriver
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

In [14]:
from utils import NikeProductCard

## Open driver and pull html

Use the Selenium Chrome driver to open the listing page and scroll down through all product listings:

In [3]:
# A real browser (Selenium) is used instead of `requests` because the listing
# page only loads more products as it is scrolled; otherwise only the first
# 24 shoes are returned.
driver = webdriver.Chrome()
try:
    driver.get("https://www.nike.com/w/shoes-y7ok")
    sleep(2)  # Allow 2 seconds for the web page to open

    scroll_pause_time = 1  # in seconds
    # Height of the screen, used as the distance scrolled on each step
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1

    while True:
        # Scroll down by one more screen height each iteration
        driver.execute_script(
            "window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i)
        )
        i += 1
        sleep(scroll_pause_time)

        # Re-read the total scroll height after every scroll, because it can
        # grow as more products are loaded into the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")

        # Stop once the next scroll target lies beyond the end of the page
        if screen_height * i > scroll_height:
            break

    # Create a BeautifulSoup object from the fully scrolled page's HTML
    nike_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always close the browser (also when scrolling fails) so that no
    # Chrome / chromedriver process is left running
    driver.quit()

print(nike_soup.prettify()[:500])

<html class="js-focus-visible" data-js-focus-visible="" lang="en">
 <head>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </s


In [10]:
# def get_label(tag): 
#     # e.g. "Best Seller", "Coming Soon", "Just In", "Sold out", "Member Access",
#     # "Sustainable Materials", "Launching in SNKRS", "Available in SNKRS", 
#     # "Customize", etc.
#     label_tag = tag.select_one('figure .product-card__info .product-card__messaging')
#     if label_tag is None:
#         return ""
#     return label_tag.text

# def get_title(tag):
#     return tag.select_one("figure .product-card__title").text

# def get_subtitle(tag):
#     # e.g. "Shoes", "Men's Shoes", "Women's Shoes", 
#     # "Big Kids' Shoes", "Basketball Shoes", etc.
#     return tag.select_one("figure .product-card__subtitle").text

# def get_count(tag):
#     # Number of colors
#     # e.g. "2 Colors"
#     return tag.select_one("figure .product-card__product-count").text

# def get_reduced_price(tag):
#     reduced_pricetag = tag.find(attrs = {'data-test': 'product-price-reduced'})
#     if reduced_pricetag is None:
#         return "" # full price (see get_price())
#     return reduced_pricetag.text

# def get_price(tag):
#     pricetag = tag.find(attrs = {'data-test': 'product-price'})
#     if pricetag is None: # will result in N/A in dataframe => need to drop
#         return None
#     return pricetag.text

# def get_url(tag):
#     return tag.select_one('figure a').get('href')

# ### below is for individual shoe pages
# def get_page(url):
#     sleep(1)
#     return BeautifulSoup(requests.get(url, 'html.parser').content)

# def get_description(page):
#     description = page.select_one('.description-preview p')
#     if description is None:
#         return None
#     return description.text

# def get_colors(page):
#     all_colors = page.select('.colorway-images img')
#     if all_colors is None:
#         return None
#     return '; '.join([c.get('alt') for c in all_colors])

# def get_reviews(page):
#     review_section = page.find(attrs = {'data-test': 'reviewsAccordionClick'})
#     if review_section is None:
#         return None, None
#     review_title = review_section.select_one('h3').text
#     n_reviews = review_title[review_title.index('(') + 1 : review_title.index(')')]
#     avg_stars = review_section.select_one('div').get('aria-label')
#     return n_reviews, avg_stars

In [4]:
def parse_nike_shoes(nike_soup):
    """Build one record per product card found in the parsed listing page.

    Args:
        nike_soup: Parsed listing page; every element matching the CSS
            selector ``.product-card__body`` is wrapped in a
            ``NikeProductCard``.

    Returns:
        A list of dicts, one per card, with the keys ``label``, ``title``,
        ``subtitle``, ``num_colors``, ``price``, ``reduced_price``, ``url``,
        ``description``, ``colors``, ``n_reviews`` and ``avg_stars``.
    """
    shoes_tags = nike_soup.select('.product-card__body')
    shoes_cards = [NikeProductCard(tag) for tag in shoes_tags]

    shoes_info = []
    for card in shoes_cards:
        record = {
            'label': card.get_label(),
            'title': card.get_title(),
            'subtitle': card.get_subtitle(),
            'num_colors': card.get_count(),
            'price': card.get_price(),
            'reduced_price': card.get_reduced_price(),
            'url': card.url,
            'description': card.get_description(),
            'colors': card.get_colors(),  # each color is separated by "; "
        }
        # Look up the review info only once per card and reuse the result for
        # both fields, instead of calling get_review_info() twice.
        review_info = card.get_review_info()
        record['n_reviews'] = review_info[0]
        record['avg_stars'] = review_info[1]
        shoes_info.append(record)
    return shoes_info

In [15]:
def get_reduced_price(card_tag):
    """Return the reduced-price text of a product card, or NaN if there is none.

    Args:
        card_tag: Object whose ``find(attrs=...)`` is used to look up the
            element carrying ``data-testid="product-price-reduced"``.

    Returns:
        The ``text`` of the matching element, or ``np.nan`` when ``find``
        returns ``None``.
    """
    match = card_tag.find(attrs={'data-testid': 'product-price-reduced'})
    if match is not None:
        return match.text
    return np.nan

def get_price(card_tag):
    """Return the price text of a product card, or NaN if there is none.

    Args:
        card_tag: Object whose ``find(attrs=...)`` is used to look up the
            element carrying ``data-testid="product-price"``.

    Returns:
        The ``text`` of the matching element, or ``np.nan`` when ``find``
        returns ``None`` (which shows up as a missing value in a DataFrame).
    """
    match = card_tag.find(attrs={'data-testid': 'product-price'})
    if match is not None:
        return match.text
    return np.nan

In [20]:
# Parse every product card, then tabulate the records (one row per card).
nike_shoes_list = parse_nike_shoes(nike_soup)
nike_shoes_df = pd.DataFrame(data=nike_shoes_list)
print("Total number of shoes:", nike_shoes_df.shape[0])

Total number of shoes: 1659


In [21]:
# Show five randomly chosen rows as a quick sanity check of the scraped data.
nike_shoes_df.sample(n=5)

Unnamed: 0,label,title,subtitle,num_colors,price,reduced_price,url,description,colors,n_reviews,avg_stars
1498,,Nike Air Force 1 '07 LX NBHD,Men's Shoes,1 Color,$140,,https://www.nike.com/t/air-force-1-07-lx-nbhd-...,Tumbled leather. Premium canvas. Easy-to-style...,,0,0.0
523,,Nike Tiempo Legend 9 Academy MG,Multi-Ground Soccer Cleats,2 Colors,$75,,https://www.nike.com/t/tiempo-legend-9-academy...,"1 of our lightest Tiempos to date, the Nike Ti...",Black/Summit White/Light Photo Blue/Dark Smoke...,33,4.3
681,,Nike Pegasus Trail 4,Men's Trail Running Shoes,5 Colors,$140,,https://www.nike.com/t/pegasus-trail-4-mens-tr...,"Running is your daily ritual, taking you from ...",Neutral Olive/Stadium Green/Phantom/Light Bone...,179,4.6
1555,,Nike Air Force 1 Mid '07 LX,Men's Shoes,1 Color,$150,$111.97,https://www.nike.com/t/air-force-1-mid-07-lx-m...,The holi'yays just keep coming. Unbox the AF1 ...,,2,5.0
1194,,Nike Zoom LeBron 3,Men's Shoes,1 Color,$185,$129.97,https://www.nike.com/t/zoom-lebron-3-mens-shoe...,Stay sharp in the Nike Zoom LeBron 3. Classic ...,,0,0.0


In [22]:
# Persist the raw scrape; the DataFrame index is written as the first column.
nike_shoes_df.to_csv(path_or_buf="../data/nike_raw.csv")