In [15]:
from pprint import pprint
import time
from random import randint
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [3]:
def get_driver():
    """Create a headless Firefox WebDriver configured for scraping.

    The browser is started without a visible window, with image loading
    turned off and the Flash plugin disabled.

    Returns:
        A ``selenium.webdriver.Firefox`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.
    """
    options = Options()
    # Command-line flag instead of the deprecated `options.headless` setter.
    options.add_argument('-headless')

    # 2 = block images (Firefox `permissions.default.image` semantics).
    options.set_preference('permissions.default.image', 2)
    # Must be a real boolean: the string 'false' would be written as a
    # string-typed preference rather than switching the plugin off.
    options.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)

    return webdriver.Firefox(options=options)

In [94]:
def get_proxies(max_proxies=15):
    """Collect proxies flagged as HTTPS-capable from free-proxy-list.net.

    Args:
        max_proxies: Upper bound on the number of proxies returned
            (defaults to 15).

    Returns:
        A set of ``'ip:port'`` strings. The set is empty when the page cannot
        be fetched, answers with a non-2xx status, or contains no table body.
    """
    url = 'https://free-proxy-list.net/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    proxies = set()

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        rq = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(e, url)
        return proxies
    if not 200 <= rq.status_code < 300:
        return proxies

    soup = BeautifulSoup(rq.content, 'lxml')
    table = soup.find('tbody')
    if table is None:
        return proxies

    for row in table.find_all('tr'):
        if len(proxies) >= max_proxies:
            break

        # Only rows whose 'hx' cell reads 'yes' are kept.
        https_cell = row.find('td', {'class': 'hx'})
        if https_cell is None or https_cell.text != 'yes':
            continue

        # The first two text nodes of a row are taken as IP address and port.
        cells = row.findChildren(limit=2, text=True)
        if len(cells) < 2:
            print('Skipping malformed proxy row:', row)
            continue
        ip, port = cells
        proxies.add(':'.join([ip.strip(), port.strip()]))

    return proxies

In [4]:
def parsed_url(page = 1, low_range=250, up_range=5000, build_link = None):
    """Build an absolute pcpartpicker.com URL.

    Args:
        page: Page number of the completed-builds listing.
        low_range: Lower price bound of the listing filter.
        up_range: Upper price bound of the listing filter.
        build_link: Site-relative path of a single build (e.g. an ``href``
            taken from the listing). When given, the listing arguments are
            ignored and the path is appended to the base URL as-is.

    Returns:
        The listing URL for ``page`` when ``build_link`` is None, otherwise
        the URL of that build.
    """
    base_url = 'https://pcpartpicker.com'
    if build_link is None:
        # Both bounds get a literal '00' suffix - presumably the site expects
        # the price filter in cents; confirm against the site if it changes.
        fragment = f'/builds/#B=1&page={page}&X={low_range}00,{up_range}00'
    else:
        fragment = build_link

    return f'{base_url}{fragment}'

In [89]:
def build_scraper(url, browser):
    """Scrape a single build page into a dictionary.

    Args:
        url: Absolute URL of the build page.
        browser: Unused; the page is fetched with ``requests``. Kept so
            existing callers keep working.

    Returns:
        A dict with:
          * ``'Name'``: text of the build title element,
          * one entry per scraped component category, holding
            ``{'Name': ..., 'Price': float or None}`` or, when a category
            occurs more than once, a list of such dicts (newest first),
          * ``'Build Price'``: the page total minus the prices of components
            outside the scraped categories, rounded to 2 decimals.
        An empty dict is returned when the page cannot be fetched, does not
        have the expected layout, or shows a price that is not a plain
        ``$`` amount.
    """
    builds_dict = {}
    build_comps = ['Name','CPU', 'CPU Cooler', 'Motherboard', 'Memory', 'Storage', 'Video Card', 'Case', 'Power Supply', 'Build Price']

    try:
        # A timeout keeps one stalled page from blocking the whole run.
        rq = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(e, url)
        return builds_dict
    if rq.status_code != requests.codes.ok:
        return builds_dict

    soup = BeautifulSoup(rq.content, 'lxml')
    name_el = soup.find('h1', {"class": "build__name"})
    parts_table = soup.find('table', {"class": "partlist partlist--mini"})
    if name_el is None or parts_table is None:
        print('Unexpected build page layout', url)
        return builds_dict

    builds_dict['Name'] = name_el.text
    extra_price = 0

    # Two rows make one component: the first carries the category name,
    # the second the component details.
    row_it = iter(parts_table.find_all('tr'))
    for header_row, detail_row in zip(row_it, row_it):
        # Reset per iteration so the error report below never shows values
        # left over from a previous row (or hits an unbound name).
        name_text = component_el = None
        try:
            name_text = header_row.find('h4').text.strip()
            component_el = detail_row.find('td', {'class':'td__name'})
            price_el = component_el.find(attrs={'class':'td__price'})

            # A missing or blank price cell means "no price" (None).
            comp_price = None
            if price_el is not None:
                price_text = price_el.text.strip()
                if price_text:
                    # Anything that is not a plain '$<amount>' is treated as
                    # a non-USD price and the whole build is rejected.
                    if not price_text.startswith('$'):
                        return {}
                    amount = price_text.replace('$', '').strip()
                    if ' ' in amount:
                        return {}
                    # Tolerate thousands separators such as '1,299.99'.
                    comp_price = float(amount.replace(',', ''))

            if name_text in build_comps:
                comp_name = component_el.findChildren(limit=1, text=True)
                comp_els = {'Name': comp_name, 'Price': comp_price}

                existing = builds_dict.get(name_text)
                if existing is None:
                    builds_dict[name_text] = comp_els
                elif isinstance(existing, list):
                    # Third and later occurrences extend the flat list
                    # instead of nesting it.
                    existing.insert(0, comp_els)
                elif isinstance(existing, dict):
                    builds_dict[name_text] = [comp_els, existing]
            else:
                # Components outside the scraped categories are subtracted
                # from the build total further down.
                extra_price += comp_price if isinstance(comp_price, float) else 0

        except Exception as e:
            print(e, url, name_text, component_el)
            continue

    # The total is read once, after every component has been processed.
    total_table = soup.find('table', {"class": "block partlist partlist--mini partlist--totals"})
    total_el = total_table.find('td', {"class": "td__price"}) if total_table is not None else None
    if total_el is None:
        print('Build total not found', url)
        return {}
    try:
        total_price = float(total_el.text.replace('$', '').replace(',', ''))
    except ValueError as e:
        print(e, url)
        return {}

    builds_dict['Build Price'] = round(total_price - extra_price, 2)
    return builds_dict

In [8]:
def main(n_pages=1):
    """Scrape completed builds from the listing pages.

    Each listing page is rendered in the headless browser, every build link
    found on it is scraped with ``build_scraper``, and random pauses are
    inserted between requests.

    Args:
        n_pages: Number of listing pages to walk, starting at page 1
            (defaults to 1).

    Returns:
        A list with one dict per successfully scraped build; builds for
        which ``build_scraper`` returned an empty dict are left out.
    """
    builds_list = []
    browser = get_driver()
    try:
        for i in range(1, n_pages + 1):
            try:
                url = parsed_url(page=i)
                browser.get(url)
                # Pause before reading the page source, then parse the
                # rendered listing.
                time.sleep(randint(2, 5))
                soup = BeautifulSoup(browser.page_source, 'lxml')
                builds_links = soup.find_all("a", {"class": "logGroup__target"}, href=True)
            except Exception as e:
                print(e)
                continue

            for build in builds_links:
                # Errors are handled per build so one bad page does not
                # discard the remaining builds of the listing.
                try:
                    build_url = parsed_url(build_link=build['href'])
                    build_dict = build_scraper(build_url, browser)
                    if build_dict:
                        builds_list.append(build_dict)
                except Exception as e:
                    print(e)

                # Random pause between build pages.
                time.sleep(randint(2, 10))
    finally:
        # Always shut the browser down, even on Ctrl-C or unexpected errors,
        # so no headless Firefox process is left behind.
        browser.quit()

    return builds_list


In [12]:
# Entry point: run the scraper and keep the resulting list of build dicts in
# the module-level name `builds` for later cells to use.
if __name__ == '__main__':
    builds = main()

[{'Build Price': 922.0,
  'CPU': {'Name': 'Intel Core i5-10600K 4.1 GHz 6-Core', 'Price': 256.97},
  'CPU Cooler': {'Name': 'NZXT Kraken X53 73.11 CFM Liquid', 'Price': 118.45},
  'Case': {'Name': 'NZXT H510i ATX Mid Tower', 'Price': 100.0},
  'Memory': {'Name': 'G.Skill Trident Z Neo 32 GB (2 x 16 GB) DDR4-3600 CL18',
             'Price': 149.99},
  'Motherboard': {'Name': 'Asus TUF GAMING Z490-PLUS (WI-FI) ATX LGA1200',
                  'Price': 181.28},
  'Name': '2nd Build - Upgrading and Waiting for the New RTX 30 Series',
  'Power Supply': {'Name': 'Corsair RMx (2018) 750 W 80+ Gold Certified Fully '
                           'Modular ATX',
                   'Price': 115.31},
  'Storage': [{'Name': 'Western Digital 3 TB 3.5" 5900RPM', 'Price': 0.0},
              {'Name': 'Samsung 970 Evo Plus 250 GB M.2-2280 NVME SSD',
               'Price': 0.0}],
  'Video Card': {'Name': 'Gigabyte GeForce GTX 1070 8 GB Mini ITX OC',
                 'Price': 0.0}},
 {'Build Price': 980.91