In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import numpy as np

In [2]:
# Initialize web driver for scraping

# NOTE(review): hard-coded, machine-specific path to the chromedriver binary;
# adjust it (or lift it into a config value) before running elsewhere.
chromedriver = "/Applications/chromedriver 2"
# `driver` is read as a module-level global by scrape_evo_jackets.
# NOTE(review): the path is passed positionally, which presumably targets the
# older Selenium constructor signature — confirm against the installed version.
# NOTE(review): no driver.quit() appears in the visible cells, so the browser
# session looks like it stays open after the scrape — confirm.
driver = webdriver.Chrome(chromedriver)

In [3]:
# Create dictionary for scraped ski jacket information

def _text_or_nan(tag, convert=None):
    """Return the text of ``tag``, or ``np.nan`` when it cannot be produced.

    Parameters
    ----------
    tag : object or None
        Result of a ``soup.find(...)`` call; ``None`` when nothing matched.
    convert : callable, optional
        Applied to ``tag.text`` before returning (e.g. ``str.strip``, ``float``).

    Returns
    -------
    The (optionally converted) text, or ``np.nan`` if ``tag`` is ``None`` or
    ``convert`` raises ``ValueError`` on the text.
    """
    if tag is None:
        return np.nan
    text = tag.text
    if convert is None:
        return text
    try:
        return convert(text)
    except ValueError:
        # e.g. float("") on an empty rating element: record it as missing
        # instead of aborting the scrape of every remaining URL.
        return np.nan


def scrape_evo_jackets(link, wait_seconds=3):
    """Scrape one jacket product page and return its details and reviews.

    Loads ``link`` in the module-level Selenium ``driver``, waits
    ``wait_seconds``, then parses ``driver.page_source`` with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the product page.
    wait_seconds : int or float, optional
        Pause after ``driver.get`` before reading the page source (default 3).
        Presumably gives script-rendered review content time to load.

    Returns
    -------
    dict
        Keys ``url``, ``jacket_name``, ``jacket_price``, ``total_rating``,
        ``num_reviews``, ``review_titles``, ``review_ratings`` and
        ``review_text``. Every value is a one-element list: the first five
        hold a scalar (``np.nan`` when the element is missing), the three
        ``review_*`` entries hold a (possibly empty) list of strings.
    """
    driver.get(link)
    time.sleep(wait_seconds)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # Guard the container lookup: chaining .find('h1') directly onto a missing
    # 'buy-grid' div would raise AttributeError before any NaN fallback runs.
    buy_grid = soup.find('div', id = 'buy-grid')
    jacket_name = buy_grid.find('h1') if buy_grid is not None else None
    jacket_price = soup.find('span', id = 'price-display')
    total_rating = soup.find('div', class_ = 'pr-snippet-rating-decimal')
    num_reviews = soup.find('span', class_ = 'pr-snippet-review-count')

    # find_all only yields matched tags (never None), so no per-item fallback
    # is needed; a page without reviews simply produces empty lists.
    review_titles = [headline.text
                     for headline in soup.find_all("h2", {"class": "pr-rd-review-headline"})]

    review_ratings = [rating.text
                      for review in soup.find_all("div", {"class": "pr-review"})
                      for rating in review.find_all("div", class_ = "pr-snippet-rating-decimal")]

    review_text = [paragraph.text
                   for section in soup.find_all("section", {"class": "pr-rd-description pr-rd-content-block"})
                   for paragraph in section.find_all("p", {"class": "pr-rd-description-text"})]

    return {
        "url": [link],
        "jacket_name": [_text_or_nan(jacket_name, str.strip)],
        "jacket_price": [_text_or_nan(jacket_price)],
        "total_rating": [_text_or_nan(total_rating, float)],
        "num_reviews": [_text_or_nan(num_reviews)],
        "review_titles": [review_titles],
        "review_ratings": [review_ratings],
        "review_text": [review_text],
    }

In [4]:
# Read the saved product-page URLs into a one-column ("url") DataFrame

jackets_url_list = pd.read_csv(
    "jackets_url_list.csv",
    header = 0,
    index_col = 0,
    names = ["url"],
)

In [5]:
# Preview the first rows to confirm the URL column loaded as expected
jackets_url_list.head()

Unnamed: 0,url
0,https://www.evo.com/outlet/shell-jackets/thirt...
1,https://www.evo.com/insulated-jackets/l1-fairb...
2,https://www.evo.com/insulated-jackets/686-glcr...
3,https://www.evo.com/outlet/shell-jackets/dakin...
4,https://www.evo.com/outlet/shell-jackets/oakle...


In [6]:
# Scrape every product page in turn; yields one result dict per URL

jackets_list = list(map(scrape_evo_jackets, jackets_url_list["url"]))

In [7]:
# One row per scraped URL. scrape_evo_jackets wraps every value in a
# single-element list, so each cell of this frame holds a list, not a scalar.
df_jackets = pd.DataFrame(jackets_list)

In [8]:
# Persist the scraped results in two formats, written to the working directory.
# CSV is human-readable but stores the list-valued cells as text.
df_jackets.to_csv("jackets_list.csv")
# Pickle keeps the cells as Python objects for reloading with pandas.
df_jackets.to_pickle("jackets_list.pkl")