In [None]:
from bs4 import BeautifulSoup
import requests
import re
import random
import time
import numpy as np
import pandas as pd
import os
import pickle

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from collections import defaultdict
from tqdm import tqdm

# User agent set-up

In [None]:
######fake useragent for import#######
# `ua` is the shared fake_useragent generator; custom_agent() reads `ua.random`
# from it each time it builds a request header.
from fake_useragent import UserAgent
ua = UserAgent()

# Picks the individual "product/version" tokens (e.g. "Chrome/70.0.3538.77")
# out of a full user-agent string.
# Raw string: \w, \/ and \s must reach the regex engine untouched; in a normal
# string literal they are invalid escape sequences and trigger a warning.
browser_regex = re.compile(r'((\w+)[\/][^\s]+)')


def custom_agent(ua_string=None):
    """Build a request-header dict holding one randomly chosen browser token.

    Parameters
    ----------
    ua_string : str, optional
        User-agent string to take the token from. When omitted, a fresh value
        is read from the notebook-level ``ua.random``.

    Returns
    -------
    dict
        ``{'User-agent': token}`` where ``token`` is one "product/version"
        match of ``browser_regex`` picked at random. If the string contains no
        such token, the whole string is used instead of raising.
    """
    if ua_string is None:
        ua_string = ua.random

    # findall returns a list of (full_token, product_name) tuples because the
    # pattern has two groups; only the full token is wanted.
    tokens = [match[0] for match in browser_regex.findall(ua_string)]
    if not tokens:
        # Nothing matched: fall back to the unmodified user-agent string.
        return {'User-agent': ua_string}

    return {'User-agent': random.choice(tokens)}

In [None]:
def get_tiles(soup):
    """Collect the product URL linked from every shoe tile on a listing page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page.

    Returns
    -------
    list of str
        One absolute URL per tile, in page order. Tiles that contain no ``<a>``
        element, or whose link has no ``href``, are skipped instead of raising.
    """
    url_list = []

    # NOTE(review): the class string looks auto-generated ("Tile-oazi1d-0 bbbEOC")
    # and may stop matching if the site's markup changes -- confirm when re-running.
    for tile in soup.find_all(class_='tile Tile-oazi1d-0 bbbEOC'):
        anchor = tile.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            url_list.append('http://stockx.com' + href)
    return url_list

# Extract URLs for the most popular shoes

In [None]:
###############PULL SNEAKER URLS###################
url = 'https://stockx.com/sneakers/most-popular?page={}'
urls = []

# 40 shoes per most-popular page. range(1, 14) visits pages 1-13, i.e. up to
# 13*40 = 520 shoes.
# NOTE(review): earlier comments spoke of 14 pages (560 shoes); use
# range(1, 15) if page 14 is actually wanted.
for i in range(1,14):
    # Every shoe on a most-popular page is shown in a tile; get_tiles() grabs
    # one product URL per tile.

    # timer: pause ~30 s (plus up to 1 s of jitter) before each page request
    time.sleep(30+random.random())

    # create url
    url_page = url.format(i)

    # pull response and construct soup; the context manager closes the
    # session's connections once the page text has been read
    with requests.Session() as session:
        response = session.get(url_page, headers=custom_agent())
        print(response.status_code)
        page = response.text
    soup = BeautifulSoup(page, "lxml")

    # append pulled urls to list
    urls.extend(get_tiles(soup))

# check if 520 (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(len(urls))

# save as file to save time (pickle format, despite the .txt extension)
with open("urls.txt", "wb") as fp:   # Pickling
    pickle.dump(urls, fp)

In [None]:
# read in previously saved data
# (the pickled URL list written to "urls.txt" by the URL-collection cell, so
# the slow listing-page crawl does not have to be repeated)
# NOTE: pickle.load can execute arbitrary code -- only load a urls.txt that
# this notebook produced.
with open("urls.txt", "rb") as fp:   # Unpickling
    urls = pickle.load(fp)
len(urls)

In [None]:
# Drop duplicate URLs: dict.fromkeys keeps the first occurrence of each URL
# and preserves the original order.
urls = list(dict.fromkeys(urls))

# Pull functions
Each function below pulls a different group of elements from the page behind each URL.

In [None]:
def pull_gauge_vars(soup):
    """Read every gauge on an already-parsed page into a title -> value mapping.

    For each ``gauge-container`` element the value is taken from its
    ``gauge-value`` child or, failing that, its ``gauge-value-negative`` child,
    and stored under the text of the ``gauge-title`` child. A container with
    neither value element is printed and otherwise ignored.

    Returns a ``defaultdict(str)``.
    """
    gauges = defaultdict(str)
    for container in soup.find_all(class_='gauge-container'):
        # Prefer the regular value; fall back to the negative-value variant.
        value_tag = (container.find(class_='gauge-value')
                     or container.find(class_='gauge-value-negative'))
        if value_tag:
            title_tag = container.find(class_='gauge-title')
            gauges[title_tag.text] = value_tag.text
        else:
            print(container)
    return gauges

In [None]:
def pull_row_list_vars(soup):
    """Read the summary statistics from an already-parsed page.

    Every ``row list-unstyled`` element is searched for three cells; each one
    that is present has its text stored under a fixed key. Later rows
    overwrite earlier ones for the same key.

    Returns a ``defaultdict(str)`` with up to three entries.
    """
    # (output key, CSS class searched for), checked in this order for every row.
    targets = (
        ('52 Week High/Low', 'ft-high-low-col'),
        ('12 Month Trade Range', 'ds-range value-container'),
        ('Volatility', 'volatility-col market-down'),
    )

    stats = defaultdict(str)
    for row in soup.find_all(class_='row list-unstyled'):
        for label, css_class in targets:
            cell = row.find(class_=css_class)
            if cell:
                stats[label] = cell.text
    return stats

In [None]:
def pull_inset_vars(soup):
    """Map each ``inset`` element's title text to its subtitle text.

    Both texts are stripped of surrounding whitespace. As soon as an inset
    without a ``title`` child is encountered, the function stops and returns
    the sentinel ``{'all': 0}`` instead of the entries gathered so far.

    Returns a ``defaultdict(str)`` when every inset has a title.
    """
    entries = defaultdict(str)
    for inset in soup.find_all(class_='inset'):
        title_tag = inset.find(class_='title')
        if not title_tag:
            # Untitled inset: give up on per-title entries and return the sentinel.
            return {'all': 0}
        subtitle_text = inset.find(class_='subtitle').text.strip()
        entries[title_tag.text.strip()] = subtitle_text
    return entries

In [None]:
def pull_pinfo_vars(soup):
    """Read the product-info details of an already-parsed page.

    Within the ``product-info`` element, every ``detail`` element contributes
    one entry: the key is the text of its ``pinfo-container`` child and the
    value is the text of a direct-child ``<span>``. When a detail has several
    direct-child spans the last one wins; a detail with none adds nothing.

    Returns a ``defaultdict(str)``.
    """
    info = defaultdict(str)
    product_info = soup.find(class_='product-info')
    for detail in product_info.find_all(class_='detail'):
        # recursive=False: only spans that are immediate children of the detail.
        for span in detail.find_all('span', recursive=False):
            label = detail.find(class_='pinfo-container').text
            info[label] = span.text
    return info


In [None]:
# Combine several dictionaries into one new dictionary.
def Merge(dictlist):
    """Return a new dict holding the entries of every dict in ``dictlist``.

    Dictionaries are applied in list order, so when a key appears more than
    once the value from the later dictionary wins. The inputs are not
    modified; an empty list yields an empty dict.
    """
    merged = {}
    for mapping in dictlist:
        merged.update(mapping)
    return merged

In [None]:
#shared download step for pull_vars and pull_x_vars
def _fetch_soup(page_url, retry_wait=500):
    """Download ``page_url`` and parse it, retrying once after a non-200 reply.

    On a non-200 status the code is printed, the function sleeps for
    ``retry_wait`` seconds and requests the page a second time. The status of
    that second reply is not checked; whatever it returns is parsed.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when the retry request itself raised a
        ``requests.RequestException``. An exception from the first request is
        not caught and propagates to the caller.
    """
    response = requests.get(page_url, headers=custom_agent())
    if response.status_code != 200:
        print(response.status_code)
        time.sleep(retry_wait)
        try:
            response = requests.get(page_url, headers=custom_agent())
        except requests.RequestException:
            # Narrower than a bare except, so Ctrl-C still interrupts the run.
            return None
    # "lxml" matches the parser used for the listing pages; the generic "html"
    # feature lets bs4 pick whichever parser happens to be installed.
    return BeautifulSoup(response.text, "lxml")


#pull variables by extracting urls for different shoe sizes of the shoe
def pull_vars(url):
    """Scrape one row of data per size listed on a product page.

    Parameters
    ----------
    url : str
        Product page URL.

    Returns
    -------
    list of dict
        One merged dict per size: the fields from ``pull_x_vars`` plus
        ``'Size'`` and ``'Lowest Ask'``. Entries whose key is ``'ALL'`` or
        whose value is ``'BID'`` are skipped. An empty list is returned when
        the page could not be downloaded on retry.
    """
    tile_soup = _fetch_soup(url)
    if tile_soup is None:
        return []

    dict_list = []
    sizes = pull_inset_vars(tile_soup)
    for key in sizes:
        if key != 'ALL' and sizes[key] != 'BID':
            size_info = {'Size': key, 'Lowest Ask': sizes[key]}
            # pause ~30 s (plus jitter) before each per-size request
            time.sleep(30+random.random())
            dict_list.append(Merge([pull_x_vars(url, key), size_info]))

    return dict_list


#pull last sale prices from the url
def pull_x_vars(url, size):
    """Scrape the size-specific page of a product.

    Parameters
    ----------
    url : str
        Product page URL.
    size : str
        Size label, appended to ``url`` as the ``size`` query parameter.

    Returns
    -------
    dict
        ``'Sale Value'`` and ``'Name'`` plus everything returned by
        ``pull_gauge_vars``, ``pull_pinfo_vars`` and ``pull_row_list_vars``,
        and the requested ``'url'``. Only ``{'url': size_url}`` is returned
        when the page could not be downloaded on retry.

    Raises
    ------
    AttributeError
        If the page has no ``sale-value`` or ``name`` element.
    """
    size_url = url+'?size='+size
    tile_soup = _fetch_soup(size_url)
    if tile_soup is None:
        return {"url": size_url}

    y_value = {'Sale Value': tile_soup.find(class_='sale-value').text,
               'Name': tile_soup.find(class_='name').text}

    gauge = pull_gauge_vars(tile_soup)
    pinfo = pull_pinfo_vars(tile_soup)
    row_list = pull_row_list_vars(tile_soup)
    url_dict = {'url': size_url}
    return Merge([y_value, gauge, pinfo, row_list, url_dict])
    

# Actual webscraping process

In [None]:
#data pull
df_list = []
missed_urls = []

# tqdm wraps the list itself (not the enumerate generator) so the progress
# bar knows the total number of URLs.
for ix, url in enumerate(tqdm(urls)):
    try:
        df_list.extend(pull_vars(url))
        # checkpoint: rewrite the temp pickle with all rows gathered so far
        df = pd.DataFrame(df_list)
        df.to_pickle('shoes_temp.pkl')
    except Exception as err:
        # report which URL failed and why, then keep going
        print(ix, url, repr(err))
        # append, not extend: extend() would add the URL one character at a time
        missed_urls.append(url)

df = pd.DataFrame(df_list)
df.to_pickle('shoes.pkl')