In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import bs4
import re
from collections import OrderedDict
import time
import random

%matplotlib inline

First page's url is here: https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1&bop=And&Order=RELEASE&PageSize=96

**Sorting parameters within this page:**
* Seller is Newegg
* Brand is Dell, HP, Lenovo
* Sort by Newest to Oldest
* 96 Products per page
* Date is 10/3 at 10:33 PM

## Scraping functions

In [83]:
def get_soup(url, timeout=30):
    '''Downloads a page and parses it into a BeautifulSoup object.
       ====Parameters====
       url: str, address of the page to download
       timeout: seconds to wait for the server before giving up (default 30);
                without it requests would wait indefinitely on a stalled connection
       ====Returns====
       soup: BeautifulSoup object built from the response text with the lxml parser
    '''
    r = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup

In [87]:
def get_components(soup):
    '''Grabs all available PC components from Newegg webpage.
       ====Parameters====
       soup: BeautifulSoup object (use get_soup() function)
       ====Returns====
       this_computer: OrderedDict mapping this computer's variables to values.
                      Empty when the page has no 'plinks' specifications div.
    '''
    this_computer = OrderedDict()

    # Narrow down to specifications part of page
    specs = soup.find('div', class_='plinks')
    if specs is None:
        # No specifications section on this page: nothing to collect
        return this_computer

    # Categories of technical specifications are in dt tags, their details in dd tags
    for cat, det in zip(specs.find_all('dt'), specs.find_all('dd')):
        # An empty tag has no contents[0]; skip the pair instead of crashing
        if not cat.contents or not det.contents:
            continue

        raw_cat = cat.contents[0]
        # in some cases, have to go one tag deeper to get category name
        if isinstance(raw_cat, bs4.element.Tag):
            if not raw_cat.contents:
                continue
            raw_cat = raw_cat.contents[0]

        # raw detail is always the first child of the dd tag
        this_computer[raw_cat] = det.contents[0]
    return this_computer

In [118]:
# Fetch a single product page to try the scraping helpers on
soup = get_soup('https://www.newegg.com/Product/Product.aspx?Item=N82E16883794794&ignorebbr=1')

In [119]:
# Probe for a div carrying this exact class string (the recorded output below is an empty list)
soup.find_all('div', class_='subscription-price student-purchase physical-subscription multiple-selections')

[]

In [105]:
# Probe for a ul carrying this exact class string (the recorded output below is an empty list)
soup.find_all('ul', class_='price price-product-cells price-main-product')

[]

In [120]:
# Parse the specification entries out of the product page held in `soup`
get_components(soup)

OrderedDict([('Brand', 'Lenovo'),
             ('Series', '720-18ASU'),
             ('Model', '90H1000NUS'),
             ('Type', 'Gaming & Entertainment'),
             ('Usage', 'Consumer'),
             ('Colors', 'Silver'),
             ('Processor', 'AMD Ryzen 7 1700 3.0 GHz'),
             ('Processor Main Features', '64 bit 8-Core Processor'),
             ('Cache Per Processor', '16 MB L3 Cache'),
             ('Memory', '16 GB DDR4'),
             ('Storage', '1 TB HDD + 128 GB SSD'),
             ('Optical Drive', 'DVDRW'),
             ('Graphics', 'AMD Radeon RX 570 4 GB'),
             ('Power Supply', '400W'),
             ('Operating System', 'Windows 10 Home 64-Bit'),
             ('CPU Type', 'Ryzen 7'),
             ('CPU Speed', '1700 (3.00 GHz)'),
             ('L3 Cache Per CPU', '16 MB'),
             ('CPU Main Features', '64 bit 8-Core Processor'),
             ('GPU/VGA Type', 'AMD Radeon RX 570'),
             ('Video Memory', '4 GB'),
             ('Memory 

## Cleaning functions

In [91]:
def processor_brand(processor):
    # Everything before the first space of the description (usually the brand)
    leading_word, _, _ = processor.partition(' ')
    return leading_word

In [92]:
def ram_cap(memory):
    # Returns RAM capacity: the text before the first 'GB' (case-insensitive),
    # with surrounding whitespace removed so '16 GB DDR4' gives '16' rather than '16 '
    return memory.upper().split('GB')[0].strip()

In [94]:
def ram_type(memory):
    # Returns type of RAM as a lowercase label: 'ddr2', 'ddr3' or 'ddr4'
    spec = memory.upper()
    # Older generations are checked first, in this order
    for generation in ('DDR2', 'DDR3'):
        if generation in spec:
            return generation.lower()
    # DDR4 is industry standard these days, so it is the fallback
    return 'ddr4'

In [95]:
def disk_cap(storage):
    # Returns the upper-cased text before the first 'B' of the storage description,
    # i.e. the disk capacity plus the first letter of its unit (G of GB, T of TB)
    capacity, _, _ = storage.upper().partition('B')
    return capacity

In [100]:
def ssd_or_hdd(storage):
    # Classify the storage description as 'both', 'ssd' or 'hdd' from its keywords
    storage = storage.upper()
    # Some systems have both SSD and HDD in one description, joined by '+' or 'plus'.
    # The text is upper-cased above, so the keyword must be upper case too.
    if ('+' in storage) or ('PLUS' in storage):
        return 'both'
    if 'SSD' in storage:
        return 'ssd'
    # 'RPM', 'HDD' and anything unrecognized are all treated as a hard disk
    return 'hdd'

In [121]:
def num_cores(processor, processor_main_features):
    '''Best-effort extraction of the number of CPU cores from the spec text.
       ====Parameters====
       processor: str, processor description (e.g. 'AMD Ryzen 7 1700 3.0 GHz')
       processor_main_features: str, feature text (e.g. '64 bit 8-Core Processor')
       ====Returns====
       int number of cores, or None when neither text states a core count
    '''
    core_words = {
        'SINGLE': 1, 'DUAL': 2, 'TRIPLE': 3, 'QUAD': 4,
        'HEXA': 6, 'SIX': 6, 'OCTA': 8, 'EIGHT': 8,
    }
    # A count only qualifies when it sits directly before the word "core",
    # so numbers from the processor speed (e.g. '3.0 GHz') are never picked up
    digit_pattern = re.compile(r'\b(\d+)[\s-]*CORES?\b')
    word_pattern = re.compile(r'\b(' + '|'.join(core_words) + r')[\s-]*CORES?\b')

    # The feature text is the more specific source, so it is consulted first
    for text in (processor_main_features, processor):
        if not text:
            continue
        text = text.upper()
        digit_match = digit_pattern.search(text)
        if digit_match:
            return int(digit_match.group(1))
        word_match = word_pattern.search(text)
        if word_match:
            return core_words[word_match.group(1)]
    return None

## Scaling with Selenium

In [143]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

import os

In [126]:
# Absolute local path of the chromedriver executable (machine-specific)
chromedriver = "/usr/local/bin/chromedriver"
# Publish the same path under the "webdriver.chrome.driver" environment key
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
# Create the Chrome driver, passing it the chromedriver path
driver = webdriver.Chrome(chromedriver)

In [None]:
# Load the Newegg product list page in the controlled browser
driver.get('https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1')

In [170]:
# All 'item-container' elements on the listing page. The By-based locator is used
# because the find_elements_by_* helpers are no longer provided by current Selenium.
product_array = driver.find_elements(By.CLASS_NAME, 'item-container')

In [171]:
def get_html_from_one_page(driver, product_num, product_array):
    '''Clicks into one product, returns that page's raw html, then navigates back.
       ====Parameters====
       driver: Selenium chrome driver
       product_num: int 0-95 selecting a product within the product array on a page
       product_array: sequence of clickable product elements; indexed by product_num
       ====Returns====
       driver.page_source as read right after the click
    '''
    chosen_product = product_array[product_num]
    chosen_product.click()
    # Capture the source before going back, otherwise the listing page would be returned
    product_html = driver.page_source
    driver.back()
    # NOTE(review): elements in product_array may no longer be usable after
    # navigating away and back - confirm before calling this in a loop
    return product_html

### Proof of concept:

In [175]:
# NOTE(review): this cell sits above cells that still use `driver`; when the
# notebook is run top to bottom the driver is closed before they execute
driver.close()

In [172]:
# Fetch the raw html of the product at index 0 of product_array
test_html = get_html_from_one_page(driver, 0, product_array)

In [173]:
# Only the bs4 module is imported in this notebook, so the class is reached through it
test_soup = bs4.BeautifulSoup(test_html, 'lxml')

### Scrape prices

In [237]:
def get_prices(soup):
    '''Grabs the prices shown on the product array page (96 on a full page).
       ====Parameters====
       soup: BeautifulSoup object of the product array page
       ====Returns====
       price_list: list with one entry per 'price-current-label' span, in page order.
                   An entry is the first child of the element following the label,
                   or None when that element is missing or empty, so positions in
                   the list still line up with the labels on the page.
    '''
    price_list = []

    for label in soup.find_all('span', class_='price-current-label'):
        # find_next_sibling is the current name of the deprecated findNextSibling alias
        price_tag = label.find_next_sibling()
        if price_tag is not None and price_tag.contents:
            price_list.append(price_tag.contents[0])
        else:
            price_list.append(None)
    return price_list

## General scraping function

For now we only collect raw HTML: the HTML of the product array page, plus the HTML of each computer's own page linked from that product array. We'll parse this data using BeautifulSoup later.

In [None]:
def lets_scrape(url):
    '''
    Grabs the HTML of the product array page and the 96 computers on that page.
    =====Parameters=====
    Driver: Selenium Chrome driver to control Chrome browser
    =====Returns=====
    pandas DataFrame of two columns: price and html of page for computer of that price
    '''
    price_df = pd.DataFrame(columns=['array_html'])
    
    
    # Instantiate Chrome window controlled by driver
    chromedriver = "/usr/local/bin/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    
    # Open the product page
    driver.get(url)
    
    # All computer links in this "array"
    product_array = driver.find_elements_by_class_name('item-container')
    array_html = driver.page_source
    
    # TODO: time.sleep() with random? For how long?
    # Loop over product_array and grab html from each product using get_html_from_one_page