# Predicting Sneaker Prices

In [None]:
import datetime
import os
import pickle
import random
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [None]:
# Scrape some data (start off small): fetch a single product page with a plain GET.
# NOTE(review): no headers are passed to requests.get here; the note following this
# cell discusses a 403 status and reports that sending a User-Agent fixes it.

url = 'https://stockx.com/air-jordan-11-retro-playoffs-2019' 

response = requests.get(url)

In [None]:
response.status_code

What status code 403 means per Wikipedia: "The request contained valid data and was understood by the server, but the server is refusing action. This may be due to the user not having the necessary permissions for a resource or needing an account of some sort, or attempting a prohibited action (e.g. creating a duplicate record where only one is allowed)."

Update: sending a User-Agent header with the request works (the scraping cells below do this).

## Scraping

### Generating list of shoe names

In [None]:
# Build `shoes_list`: the product links found on listing pages 1-6 of the
# women's Retro Jordans category.
page_list = list(range(1,7))
shoes_list=[]

# Loop-invariant request settings, defined once instead of on every iteration.
listing_url = 'https://stockx.com/retro-jordans?size_types=women&page={}'
user_agent = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

for page_number in page_list:

    # Fetch the listing page. The timeout prevents an unresponsive server from
    # hanging the kernel, and raise_for_status() stops the loop on an HTTP error
    # (e.g. 403) instead of slicing links out of an error page.
    response = requests.get(listing_url.format(page_number), headers=user_agent, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Every href on the page, in document order (None for anchors without an href).
    current_shoes_list = [link.get('href') for link in soup.find_all('a')]

    # Focus on the shoe names: keep the anchors at positions 26-63.
    # NOTE(review): this positional slice assumes a fixed page layout - confirm it
    # still selects only product links before re-scraping.
    shoes_list.extend(current_shoes_list[26:64])

    # Pause between requests
    time.sleep(3+2*random.random())

In [None]:
shoes_list

In [None]:
# Save as Pickle
with open('shoes.pickle', 'wb') as to_write:
    pickle.dump(shoes_list, to_write)

In [None]:
# Read Pickle and assign once again to variable shoes_list
with open('shoes.pickle','rb') as read_file:
    shoes_list = pickle.load(read_file)

In [None]:
# Keep only the entries at positions 153-227 of the list.
# NOTE(review): presumably this selects the batch scraped in the current session - confirm.
# The cell overwrites shoes_list, so running it twice slices the already-sliced list;
# reload the pickle before re-running.
shoes_list = shoes_list[153:228]

In [None]:
len(shoes_list)

### Iterating through our list of shoe names, scraping data, and adding to dataframe

In [None]:
# Location of the chromedriver binary, also exported through the
# "webdriver.chrome.driver" environment variable.
# NOTE(review): this is a machine-specific absolute path; adjust it per machine.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
# Scrape one summary row per shoe from its StockX product page.
#
# For every relative URL in `shoes_list` the page is downloaded with `requests`,
# parsed with BeautifulSoup and reduced to one flat record. The records are turned
# into a DataFrame once, after the loop, and appended to `df` when a `df` already
# exists in the kernel (e.g. loaded from a previous CSV); otherwise `df` is created
# here, so the cell also works on a fresh kernel.
#
# Only the requests/BeautifulSoup path is used; the Selenium-driven navigation that
# used to be commented out in this cell has been removed.

BASE_URL = 'https://stockx.com'
USER_AGENT = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on an unresponsive server
COLUMNS = ['Name', 'Release Date', 'Retail Price', 'Sale Price', 'Size', 'Sale Date',
           '52wk High|Low', '12mo Trade Range', 'Volatility', '# of Sales',
           'Price Premium', 'Avg Resale Price']


def _text_or_nan(find_element):
    """Return ``find_element().text``, or NaN when the element is missing.

    ``find_element`` is a zero-argument callable so that the lookup itself runs
    inside the guard: an empty ``find_all`` result raises IndexError and a ``None``
    result from ``find`` raises AttributeError; both map to NaN.
    """
    try:
        return find_element().text
    except (IndexError, AttributeError):
        return np.nan


def _labelled_value(soup, label):
    """Return the whitespace-free text of the element following ``label``, or NaN.

    ``label`` is matched against text nodes of the page (e.g. 'Retail Price').
    """
    try:
        return "".join(soup.find(text=label).findNext().text.split())
    except AttributeError:
        return np.nan


records = []

for shoe_type in shoes_list:

    # Get URL into a BeautifulSoup object
    response = requests.get(BASE_URL + shoe_type, headers=USER_AGENT, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "lxml")

    # 12-month rolling data: first three 'gauge-value' elements, padded with NaN
    # when the page exposes fewer of them.
    gauge_values = [gauge.text for gauge in soup.find_all(class_='gauge-value')]
    gauge_values += [np.nan] * (3 - len(gauge_values))
    n_sales, price_premium, avg_price = gauge_values[:3]

    records.append({
        'Name': shoe_type,
        'Release Date': _labelled_value(soup, 'Release Date'),
        'Retail Price': _labelled_value(soup, 'Retail Price'),
        'Sale Price': _text_or_nan(lambda: soup.find('div', class_='sale-value')),
        'Size': _text_or_nan(lambda: soup.find('span', class_='bid-ask-sizes')),
        'Sale Date': np.nan,  # not available on the static page
        '52wk High|Low': _text_or_nan(lambda: soup.find_all(class_='value-container')[0]),
        '12mo Trade Range': _text_or_nan(lambda: soup.find_all(class_='ds-range value-container')[0]),
        # Volatility is read from the last element carrying class 'value'
        'Volatility': _text_or_nan(lambda: soup.find_all(class_='value')[-1]),
        '# of Sales': n_sales,
        'Price Premium': price_premium,
        'Avg Resale Price': avg_price,
    })

    # Pause before the next request. This single pause covers the same 16-24 s range
    # as the four sleeps that were previously interleaved with parsing, so the
    # request cadence seen by the server is unchanged.
    time.sleep(16+8*random.random())

# Build the frame once instead of concatenating inside the loop.
df_scraped = pd.DataFrame(records, columns=COLUMNS)

# Append to an existing `df` if one was loaded earlier; otherwise start from scratch.
previous_df = globals().get('df')
if previous_df is None:
    df = df_scraped
else:
    df = pd.concat([previous_df, df_scraped], ignore_index=True)

In [None]:
# Save 'df' to csv.
# Written relative to the notebook's working directory so that it is the same file
# the following cell reads back with pd.read_csv('df.csv'), and so the notebook
# does not depend on one user's home directory.
df.to_csv('df.csv', index=False)

In [None]:
# Read the CSV and assign to 'df'
df = pd.read_csv('df.csv')
df

### Data Exploration

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
df.dtypes

All objects! :-(

Remove '/'

Add a column for brand names

In [None]:
df['Name'] = df['Name'].str.replace('/','')

In [None]:
df['Brand'] = df['Name']

In [None]:
# Derive 'Brand' from the shoe name (first matching keyword wins, in the order
# jordan -> nike -> adidas; everything else is 'Other').
#
# The brand is always computed from 'Name', never from the current 'Brand' values,
# so the cell can be re-run safely. A single vectorized assignment replaces the
# row-by-row chained assignment (df['Brand'][i] = ...), which triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
names = df['Name']
df['Brand'] = np.select(
    [
        names.str.contains('jordan', regex=False, na=False),
        names.str.contains('nike', regex=False, na=False),
        names.str.contains('adidas', regex=False, na=False),
    ],
    ['Jordan', 'Nike', 'Adidas'],
    default='Other',
)

In [None]:
df.head()

Re-arrange columns

In [None]:
df = df[['Name','Brand','Release Date','Retail Price','Sale Price','Size','Sale Date','52wk High|Low',
         '12mo Trade Range','Volatility','# of Sales','Price Premium','Avg Resale Price']]
df.head()

In [None]:
# Average sale price per brand.
#
# 'Sale Price' is still raw text at this point in the notebook (it is converted in a
# later cell), so a mean over it cannot be computed directly. A numeric version is
# derived here without modifying `df`, using the same cleaning convention as the
# later conversion cell: ',' and '$' removed, '--' treated as 0.
sale_price_text = (
    df['Sale Price']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace('--', '0', regex=False)
)
avg_price_by_brand = (
    df.assign(**{'Sale Price': pd.to_numeric(sale_price_text, errors='coerce')})
    .groupby(['Brand'])[['Sale Price']]
    .mean()
    .reset_index()
)

In [None]:
# Bar chart of the average sale price for each brand, with a title and axis labels
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(avg_price_by_brand['Brand'], avg_price_by_brand['Sale Price'])
ax.set(title='Average sale price by brand', xlabel='Brand', ylabel='Average sale price');

Convert Sale Price to int \
',' and '$' were removed and the '--' were converted to 0

In [None]:
# Convert 'Sale Price' to int: strip ',' and '$', and treat '--' as 0.
# regex=False makes every pattern a literal on all pandas versions ('$' is the
# end-of-string anchor when interpreted as a regular expression). Going through
# astype(str) first lets the cell be re-run after the column is already numeric.
df["Sale Price"] = (
    df['Sale Price']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace('--', '0', regex=False)
    .astype(int)
)

In [None]:
df["Sale Price"].describe()

Fix # of Sales column

Replace '--' with a small number, since a '--' likely means there wasn't much data to extract

In [None]:
# Convert '# of Sales' to int, substituting 100 for the '--' placeholder.
# astype(str) keeps the cell re-runnable once the column is already numeric;
# regex=False matches '--' literally on every pandas version.
df['# of Sales'] = (
    df['# of Sales']
    .astype(str)
    .str.replace('--', '100', regex=False)
    .astype(int)
)

In [None]:
df['# of Sales'].describe()

Fix Volatility column

Remove '%', replace '--' with small volatility since those shoes with '--' likely had little activity

In [None]:
# Convert 'Volatility' to float: drop the '%' sign and substitute 2.0 for '--'.
# astype(str) keeps the cell re-runnable once the column is already numeric;
# regex=False matches both patterns literally on every pandas version.
df['Volatility'] = (
    df['Volatility']
    .astype(str)
    .str.replace('%', '', regex=False)
    .str.replace('--', '2.0', regex=False)
    .astype(float)
)

In [None]:
df['Volatility'].describe()

In [None]:
import matplotlib.pyplot as plt
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline

plt.bar(df['Volatility'], df['Sale Price']);

In [None]:
plt.scatter(df['# of Sales'], df['Sale Price']);

Let's get some datetime columns

In [None]:
df['Release Date'] = pd.to_datetime(df['Release Date'])

There seem to be quite a few NaN values for Sale Date; let's take a closer look

In [None]:
df['Sale Date'].value_counts()

Most of the Sale Date values are for 10/3. In other words, most of the sale date datapoints represent recent purchases

Let's fill the NaN values with current purchases data

In [None]:
# Fill missing sale dates with the scrape date.
# Assigning the result back avoids calling fillna(inplace=True) on a column
# selection, which is deprecated chained mutation and leaves `df` unchanged
# under pandas copy-on-write.
df['Sale Date'] = df['Sale Date'].fillna('Saturday, October 3, 2020')

In [None]:
df['Sale Date'].value_counts()

How many NaNs do we have in this column?

In [None]:
df['Sale Date'].isna().sum()

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
# Save as Pickle
with open('df.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

In [None]:
# Read the pickle back and assign it once again to variable df
# NOTE: pickle.load can execute arbitrary code; only load a df.pickle you created yourself.
with open('df.pickle','rb') as read_file:
    df = pickle.load(read_file)

In [None]:
df.head()

In [None]:
# Working frame with only the columns kept for modelling.
# .copy() makes df_new independent of df, so the column assignments in later cells
# modify df_new itself and do not raise SettingWithCopyWarning.
df_new = df[['Name','Sale Price','Brand','Release Date','Sale Date','# of Sales','Volatility','Price Premium']].copy()

In [None]:
df_new.head()

In [None]:
df_new.dtypes

Price Premium

In [None]:
df_new['Price Premium'].value_counts()

Replace '--' with 1%. After reviewing some shoes with '--' values, it was determined that these shoes do not resell
as often as more mainstream shoes. 

In [None]:
# Convert 'Price Premium' to float: strip ',', '$' and '%', and substitute 1.0 for '--'.
# regex=False makes every pattern a literal on all pandas versions ('$' is the
# end-of-string anchor when interpreted as a regular expression). astype(str) keeps
# the cell re-runnable, and assign() returns a new frame instead of writing into
# what may be a slice of df.
df_new = df_new.assign(**{
    'Price Premium': (
        df_new['Price Premium']
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.replace('%', '', regex=False)
        .str.replace('--', '1.0', regex=False)
        .astype(float)
    )
})

In [None]:
df_new.describe()

Create 'Days from Release' column

'Sale Date' column needs to be converted to datetime

In [None]:
df_new['Sale Date']

In [None]:
df_new['Sale Date'] = pd.to_datetime(df_new['Sale Date']) 

In [None]:
df_new.dtypes

In [None]:
df_new['Days from Release'] = df_new['Sale Date'] - df_new['Release Date']

Rearrange columns

In [None]:
df_new = df_new[['Sale Price','Brand','Release Date','Sale Date','Days from Release','# of Sales','Volatility','Price Premium']]

### STOPPING POINT, encode categorical  variable 'Brand'

In [None]:
df_new.head()