In [33]:
import pandas as pd
import numpy as np
import requests
import bs4
import re
from collections import OrderedDict
import time
import random

First page's url is here: https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1&bop=And&Order=RELEASE&PageSize=96

**Sorting parameters within this page:**
* Seller is Newegg
* Brand is Dell, HP, Lenovo
* Sort by Newest to Oldest
* 96 Products per page
* Date is 10/3 at 10:33 PM

## Scraping functions

In [34]:
def get_soup(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup

In [35]:
def get_components(html):
    '''Grabs all available PC components from Newegg webpage. 
       ====Parameters====
       soup: BeautifulSoup object (use get_soup() function)
       ====Returns====
       this_computer: OrderedDict mapping this computer's variables to values
    '''
    soup = bs4.BeautifulSoup(html, 'lxml')
    # Narrow down to specifications part of page
    specs = soup.find('div', class_='plinks')
    # All categories of technical specifications in dt tags
    categories = specs.find_all('dt')
    # All details of technical specifications in dd tags
    details = specs.find_all('dd')
    
    this_computer = OrderedDict()
    
    for cat, det in zip(categories, details):
        # raw detail is always located here
        raw_det = det.contents[0]
        
        # in some cases, have to go one tag deeper to get category name
        if type(cat.contents[0]) == bs4.element.Tag:
            raw_cat = cat.contents[0].contents[0]
        else:
            raw_cat = cat.contents[0]
            
        this_computer[raw_cat] = raw_det
    return this_computer

## Cleaning functions

In [36]:
def processor_brand(processor):
    # Return the first word in the processor description (usually the brand)
    return processor.split(' ')[0]

In [37]:
def ram_cap(memory):
    # Returns RAM capacity
    return memory.upper().split('GB')[0]

In [38]:
def ram_type(memory):
    # Returns type of RAM (DDR2, DDR3, or DDR4)
    memory = memory.upper()
    if 'DDR2' in memory:
        return 'ddr2'
    if 'DDR3' in memory:
        return 'ddr3'
    # DDR4 is industry standard these days, we default to it
    else:
        return 'ddr4'

In [39]:
def disk_cap(storage):
    # Returns the disk capacity of a hard disk (will also grab first letter of storage unit GB or TB)
    storage = storage.upper()
    stor_split = storage.split('B')
    return stor_split[0]

In [40]:
def ssd_or_hdd(storage):
    # Check contents of storage for keywords indicating disk type
    storage = storage.upper()
    # Some systems have both SSD and HDD in 
    if ('+' in storage) or ('plus' in storage):
        return 'both'
    elif 'SSD' in storage:
        return 'ssd'
    elif 'RPM' in storage:
        return 'hdd'
    elif 'HDD' in storage:
        return 'hdd'
    else:
        return 'hdd'

In [41]:
def num_cores(processor, processor_main_features):
    # TODO: grab number of cores based on keywords such as 'dual', 'quad', 'six', '6'. 
    # be careful not to grab from the processor speed!

SyntaxError: unexpected EOF while parsing (<ipython-input-41-a73db5ef83d3>, line 3)

## Scaling with Selenium

In [143]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

import os

In [126]:
chromedriver = "/usr/local/bin/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
driver = webdriver.Chrome(chromedriver)

In [None]:
driver.get('https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1')

In [170]:
product_array = driver.find_elements_by_class_name('item-container')

def get_html_from_one_page(driver, product_num, product_array):
    '''Navigates from product main page in to product and returns raw html
       driver: Selenium chrome driver
       product_num: int 0-95 representing a product within the product array on a page
    '''
    product_array[product_num].click()
    html = driver.page_source
    driver.back()
    return html

### Scrape prices

## General scraping function

We're just going to get the HTML for the product array page, and the html for each computer's page on that product array. We'll parse this data using BeautifulSoup later.

def lets_scrape(url):
    '''
    Grabs the HTML of the product array page and the 96 computers on that page.
    =====Parameters=====
    Driver: Selenium Chrome driver to control Chrome browser
    =====Returns=====
    pandas DataFrame of two columns: price and html of page for computer of that price
    '''
    price_df = pd.DataFrame(columns=['array_html'])
    
    
    # Instantiate Chrome window controlled by driver
    chromedriver = "/usr/local/bin/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    
    # Open the product page
    driver.get(url)
    
    # All computer links in this "array"
    product_array = driver.find_elements_by_class_name('item-container')
    array_html = driver.page_source
    
    # TODO: time.sleep() with random? For how long?
    # Loop over product_array and grab html from each product using get_html_from_one_page

## Blacklisting measures
* Set user agent to common browser rather than default
* Sleep for 5-10 seconds between requests
* Grab proxies and rotate between them if necessary
* Access products in random order when downloading their HTML

In [42]:
master_urls = [f'https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1&page={i}&bop=And&PageSize=96&order=RELEASE'
              for i in range(1, 54)]

In [43]:
def get_product_html(url):
    r = requests.get(url)# proxies={'http': next(proxy_pool), 'https': next(proxy_pool)})
    soup = bs4.BeautifulSoup(r.text)
    
    if soup.find('h2').contents[0] == "That's not you, right?":
        raise ValueError('Captcha\'d')
    
    return r.text

In [44]:
def get_prices(html):
    # Grabs all 96 prices from the product array page
    soup = bs4.BeautifulSoup(html, 'lxml')
    price_list = []
    
    price_spans = soup.find_all('span', class_='price-current-label')
    for span in price_spans:
        price_list.append(span.findNextSibling().contents[0])
    return price_list

In [45]:
def get_links(product_array):    
    # Returns a list of 96 product links on a product list page
    prod_links = []

    for prod in product_array:
        prod_links.append(prod.find('a', href=True)['href'])
        
    return prod_links

In [46]:
def one_page_scrape(url):
    from itertools import cycle
    '''
       Scrapes one product array page as well as each product on that page. 96 products plus
       one set of prices for each of these products is returned in the HTML of dataframes.
       ====Parameters====
       url: url of product array page to be scraped
    '''
    prices_df = pd.DataFrame(columns=['price_html'])
    products_df = pd.DataFrame(columns=['component_html'])
    
    #proxies = ['194.61.71.236:32470', '36.37.160.224:23500', '185.91.13.32:31107', '134.236.245.63:21908']
    #proxy_pool = cycle(proxies)
    
    r = requests.get(url)# proxies={'http': next(proxy_pool), 'https': next(proxy_pool)})
    
    array_page_html = pd.DataFrame(data = [r.text], columns=['price_html'])
    prices_df = prices_df.append(array_page_html)
    
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    
    # Grab array holding all 96 products
    product_array = soup.find_all('div', class_='item-container')
    # Get links for each of those products
    prod_links = get_links(product_array)
    
    # Scrape the html of each of those products
    counter = 0
    for prod_url in prod_links:
        counter += 1
        print(f'About to scrape page {counter}')
        time.sleep(3 + 2.5*random.random())
        try:
            html = get_product_html(prod_url)
        except:
            html = np.nan()
        current_page_html = pd.DataFrame(data = [html], columns=['component_html'])
        products_df = products_df.append(current_page_html)
        
    return prices_df, products_df

In [47]:
master_prices = pd.DataFrame(columns=['price_html'])
master_components = pd.DataFrame(columns=['component_html'])

In [48]:
def scrape_all(url_list, master_prices, master_components, start_at):
    # Scrapes all pages in url_list
    for url in url_list[start_at:]:
        products, prices = one_page_scrape(url)
        
        master_prices = master_prices.append(prices, sort=True)
        master_components = master_components.append(products, sort=True)
        
        master_prices.to_csv('master_price_html.csv')
        master_components.to_csv('master_component_html.csv')
        
    return master_prices, master_components

In [49]:
prices_32_to, components_32_to = scrape_all(master_urls, master_prices, master_components, start_at=32)

About to scrape page 1




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


About to scrape page 2
About to scrape page 3
About to scrape page 4
About to scrape page 5
About to scrape page 6
About to scrape page 7
About to scrape page 8
About to scrape page 9
About to scrape page 10
About to scrape page 11
About to scrape page 12
About to scrape page 13
About to scrape page 14
About to scrape page 15
About to scrape page 16
About to scrape page 17
About to scrape page 18
About to scrape page 19
About to scrape page 20
About to scrape page 21
About to scrape page 22
About to scrape page 23
About to scrape page 24
About to scrape page 25
About to scrape page 26
About to scrape page 27
About to scrape page 28
About to scrape page 29
About to scrape page 30
About to scrape page 31
About to scrape page 32
About to scrape page 33
About to scrape page 34
About to scrape page 35
About to scrape page 36
About to scrape page 37
About to scrape page 38
About to scrape page 39
About to scrape page 40
About to scrape page 41
About to scrape page 42
About to scrape page 43


About to scrape page 57
About to scrape page 58
About to scrape page 59
About to scrape page 60
About to scrape page 61
About to scrape page 62
About to scrape page 63
About to scrape page 64
About to scrape page 65
About to scrape page 66
About to scrape page 67
About to scrape page 68
About to scrape page 69
About to scrape page 70
About to scrape page 71
About to scrape page 72
About to scrape page 73
About to scrape page 74
About to scrape page 75
About to scrape page 76
About to scrape page 77
About to scrape page 78
About to scrape page 79
About to scrape page 80
About to scrape page 81
About to scrape page 82
About to scrape page 83
About to scrape page 84
About to scrape page 85
About to scrape page 86
About to scrape page 87
About to scrape page 88
About to scrape page 89
About to scrape page 90
About to scrape page 91
About to scrape page 92
About to scrape page 93
About to scrape page 94
About to scrape page 95
About to scrape page 96
About to scrape page 1
About to scrape p

About to scrape page 16
About to scrape page 17
About to scrape page 18
About to scrape page 19
About to scrape page 20
About to scrape page 21
About to scrape page 22
About to scrape page 23
About to scrape page 24
About to scrape page 25
About to scrape page 26
About to scrape page 27
About to scrape page 28
About to scrape page 29
About to scrape page 30
About to scrape page 31
About to scrape page 32
About to scrape page 33
About to scrape page 34
About to scrape page 35
About to scrape page 36
About to scrape page 37
About to scrape page 38
About to scrape page 39
About to scrape page 40
About to scrape page 41
About to scrape page 42
About to scrape page 43
About to scrape page 44
About to scrape page 45
About to scrape page 46
About to scrape page 47
About to scrape page 48
About to scrape page 49
About to scrape page 50
About to scrape page 51
About to scrape page 52
About to scrape page 53
About to scrape page 54
About to scrape page 55
About to scrape page 56
About to scrape 

About to scrape page 71
About to scrape page 72
About to scrape page 73
About to scrape page 74
About to scrape page 75
About to scrape page 76
About to scrape page 77
About to scrape page 78
About to scrape page 79
About to scrape page 80
About to scrape page 81
About to scrape page 82
About to scrape page 83
About to scrape page 84
About to scrape page 85
About to scrape page 86
About to scrape page 87
About to scrape page 88
About to scrape page 89
About to scrape page 90
About to scrape page 91
About to scrape page 92
About to scrape page 93
About to scrape page 94
About to scrape page 95
About to scrape page 96
About to scrape page 1
About to scrape page 2
About to scrape page 3
About to scrape page 4
About to scrape page 5
About to scrape page 6
About to scrape page 7
About to scrape page 8
About to scrape page 9
About to scrape page 10
About to scrape page 11
About to scrape page 12
About to scrape page 13
About to scrape page 14
About to scrape page 15
About to scrape page 16
A

TypeError: 'float' object is not callable

In [495]:
url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=Property&N=100019096%2050010772%2050001186%2050010418%208000&IsNodeId=1&page=1&bop=And&PageSize=96&order=RELEASE&recaptcha=pass&recaptcha=pass'
r = requests.get(url)
r.text

'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en">\n<head>\n<title>Are you a human?</title>\n<meta http-equiv="X-UA-Compatible" content="IE=EDGE" /><meta charset="UTF-8" />\n<link rel="shortcut icon" type="image/x-icon" href="//c1.neweggimages.com/WebResource/Themes/2005/Nest/Newegg.ico">\n<link rel="stylesheet" type="text/css" href="https://fonts.googleapis.com/css?family=Open+Sans:400,300,300italic,400italic,700,700italic|Open+Sans+Condensed:300,300italic,700">\n<style type="text/css">\np{font-size:16px;color:#4d4d4d;padding:0;margin:0 0 5px}a img,a:hover img,a:visited img{border:0}.button-primary,.button-primary:focus,.button-primary:link,.button-primary:visited{font-family:\'Open Sans Condensed\',\'Arial Narrow\',\'Helvetica Narrow\',arial,helvetica,sans-serif;letter-spacing:1px;font-size:14px;font-weight:700;font-stretch:condensed;text-align:center;text-decoration:none;cursor:pointer;border-radius:4px;border:2px solid #E68626;display:inline-block;padding:9px 1

In [498]:
soup = BeautifulSoup(r.text, 'lxml')

In [502]:
soup.find('h2').contents[0]

"That's not you, right?"

In [451]:
prices_11_to = pd.read_csv('master_component_html.csv')

In [453]:
components_11_to = pd.read_csv('master_price_html.csv')

In [464]:
prices_11_to.head()

Unnamed: 0.1,Unnamed: 0,component_html,price_html
0,0,,"<!DOCTYPE HTML>\r\n<html lang=""en"">\r\n<head>\..."
1,0,,"<!DOCTYPE HTML>\r\n<html lang=""en"">\r\n<head>\..."
2,0,,"<!DOCTYPE HTML>\r\n<html lang=""en"">\r\n<head>\..."


In [465]:
t = prices_11_to.iloc[2, 2]

In [455]:
prices_11_to.to_csv('11_to_14_prices.csv')

In [456]:
components_11_to.to_csv('11_to_14_components.csv')

In [452]:
prices_11_to.shape

(3, 3)

In [426]:
prices.to_csv('first_5_prices_backup.csv')
components.to_csv('first_5_components_backup.csv')

In [444]:
test_prices_2 = pd.read_csv('master_component_html.csv')

In [449]:
test_prices_2.to_csv('6_to_11_component.csv')

In [374]:
test_prices = pd.read_csv('prices.csv')
test_components = pd.read_csv('components.csv')

## Extracting data

In [505]:
!ls

11_to_14_components.csv             prices.csv
11_to_14_prices.csv                 scale-scrape.ipynb
6_to_11_component.csv               scrape-amazon.ipynb
6_to_11_price.csv                   scrape-newegg.ipynb
README.md                           scrape-nutrition.ipynb
components.csv                      scrape-stocks.ipynb
first_5_components_backup.csv       test_components.csv
first_5_prices_backup.csv           test_components.html
master_component_html.csv           test_prices.csv
master_price_html.csv               tibbott-project-luther-proposal.pdf
newegg-utils.py


In [516]:
first_5_p = pd.read_csv('first_5_prices_backup.csv')
first_5_c = pd.read_csv('first_5_components_backup.csv')

In [507]:
six_to_eleven_p = pd.read_csv('6_to_11_price.csv')
six_to_eleven_c = pd.read_csv('6_to_11_component.csv')

In [508]:
eleven_to_fourteen_p = pd.read_csv('11_to_14_prices.csv')
eleven_to_fourteen_c = pd.read_csv('11_to_14_components.csv')

In [517]:
first_5_p.drop(labels=['Unnamed: 0', 'price_html'], inplace=True, axis=1)

In [518]:
first_5_c.drop(labels=['Unnamed: 0', 'component_html'], inplace=True, axis=1)

In [535]:
first_5_p.apply(func=lambda x : get_components(x))

SyntaxError: invalid syntax (<ipython-input-535-ec1b5810e480>, line 1)

In [533]:
get_components(first_5_p.iloc[0, 0])

OrderedDict([('Brand', 'Lenovo'),
             ('Series', 'IdeaCentre'),
             ('Model', '720-18ICB (90HT0005US)'),
             ('Part Number', '90HT0005US'),
             ('Type', 'Gaming & Entertainment'),
             ('Form Factor', 'Tower'),
             ('Usage', 'Consumer'),
             ('Colors', 'Black'),
             ('Processor', 'Intel Core i7-8700 3.20 GHz'),
             ('Processor Main Features', '64 bit 6-Core Processor'),
             ('Cache Per Processor', '12 MB L3 Cache'),
             ('Memory', '16 GB DDR4 2666 + 16 GB Optane Memory'),
             ('Storage', '2 TB 7200 RPM HDD'),
             ('Optical Drive', 'DVDRW'),
             ('Graphics', 'NVIDIA GeForce GTX 1050 Ti 4 GB GDDR5'),
             ('Power Supply', '400W'),
             ('Operating System', 'Windows 10 Home 64-Bit'),
             ('CPU Type', 'Intel Core i7 8th Gen'),
             ('CPU Speed', '8700 (3.20 GHz)'),
             ('L3 Cache Per CPU', '12 MB'),
             ('CPU Main Fe

In [53]:
x = pd.read_csv('df3_final.csv')

FileNotFoundError: File b'df3_final.csv' does not exist