In [2]:
from pprint import pprint
import time
import traceback
from random import randint
from itertools import cycle
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [3]:
def get_driver(proxy=None, user_agent=None):
    options = Options()
    options.headless = True

    if proxy:
        options.add_argument(f'--proxy-server={proxy}')
    if user_agent:
        options.add_argument(f'--user_agent={user_agent}')

    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference('permissions.default.image', 2)
    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
    browser = webdriver.Firefox(options=options, firefox_profile=firefox_profile)

    return browser

In [78]:
def test_proxy(proxy):
    try:
        proxies = {
            "http":f'http://{proxy}',
            "https":f'https://{proxy}'
            }
        requests.get('https://httpbin.org/ip',proxies=proxies, verify=False)
        return True
    except Exception as e:
        print(f'Proxy: {proxy} not working')
        return False

In [131]:
def get_proxies(user_agent):
    url = 'https://free-proxy-list.net/'
    proxies = set()
    rq = requests.get(url, headers=user_agent)
    if rq.status_code not in range(200, 299):
        return proxies
    soup = BeautifulSoup(rq.content, 'lxml')
    table = soup.find('tbody')
    for row in table.find_all('tr'):
        try:
            if len(proxies) >= 5:
                return proxies
            # If proxy has https protocol and is elite proxy
            https = row.find('td', {'class':'hx'}).text
            anonymity = row.find_all('td')[4].text
            if https == 'yes' and anonymity == 'elite proxy':
                ip, port = row.findChildren(limit=2, text=True)
                proxy = ':'.join([ip, port])
                if test_proxy(proxy):
                    proxies.add(proxy)
        except Exception as e:
            print(e)
    return proxies

In [5]:
def parsed_url(page = 1, low_range=250, up_range=5000, build_link = None):
    base_url = 'https://pcpartpicker.com'
    if build_link == None:
        fragment = f'/builds/#B=1&page={page}&X={low_range}00,{up_range}00'
    else: 
        fragment = f'{build_link}'

    return f'{base_url}{fragment}'

In [123]:
def clean_price(price):
    if price[0] == '$':
        price = price.replace('$', '').strip()
        if len(price.split(' ')) > 1:
            return False
    else: 
        return False

    return float(price)

In [148]:
def build_scraper(url, proxy, user_agent):
    builds_dict = {}
    build_comps = ['Name','CPU', 'CPU Cooler', 'Motherboard', 'Memory', 'Storage', 'Video Card', 'Case', 'Power Supply', 'Build Price']
    proxies = {
            "http":f'http://{proxy}',
            "https":f'https://{proxy}'
            }
    try:
        rq = requests.get(url, headers=user_agent)
    except Exception as e:
        print(e)
        return builds_dict

    soup = BeautifulSoup(rq.content, 'lxml')
    print(soup)
    builds_dict['Name'] = soup.find('h1', {"class": "build__name"}).text
    comp_table_rows = soup.find('table', {"class": "partlist partlist--mini"}).find_all('tr')
    extra_price = 0

    # Two rows is one component, one for the name of the comp and other for the features
    row_it = iter(comp_table_rows)
    for name, component in zip(row_it, row_it):
        try:
            name_text = name.find('h4').text.strip()
            # Getting the name and price components
            component_el = component.find('td', {'class':'td__name'}).findChildren(text=True)
            component_el = list(filter(lambda el: el != '\n', component_el))

            if len(component_el) == 2:
                comp_name = component_el[0]
                comp_price = clean_price(component_el[1])
                # If price isn't in USD
                if not comp_price: 
                    return {}
            else:
                comp_name, comp_price = *component_el, None

            # If the component are in the selected list for scrape
            if name_text in build_comps:
                comp_els = {'Name': comp_name, 'Price': comp_price}

                if name_text not in builds_dict:
                    builds_dict[name_text] = comp_els
                else:
                    comp_copy = builds_dict[name_text].copy()
                    builds_dict[name_text] = []
                    builds_dict[name_text].extend([comp_els, comp_copy])
            else:
                # Calculate the total of the components not taken into account
                extra_price += comp_price if isinstance(comp_price, float) else 0

            
        except Exception as e:
            print(e, url, name_text, component_el)
            continue

        total_table_row = soup.find('table', {"class": "block partlist partlist--mini partlist--totals"}).find('td', {"class": "td__price"}).text
        builds_dict['Build Price'] = round(float(total_table_row.replace('$', '')) - extra_price, 2)
    
    return builds_dict

In [71]:
def main():
    user_agent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    proxies = cycle(get_proxies(user_agent))
    n_pages = 1
    builds_list = [] 
    for i in range(1, n_pages+1):
        try:
            proxy = next(proxies)
            url = parsed_url(page=i)
            browser = get_driver(proxy, user_agent)
            browser.get(url)
            delay = randint(2, 5)  
            time.sleep(delay)
            soup = BeautifulSoup(browser.page_source, 'lxml')

            # Get the link of all build cards in a single page
            builds_links = soup.find_all("a", {"class": "logGroup__target"}, href=True)
            
            for build in builds_links:
                proxy = next(proxies)
                build_url = parsed_url(build_link=build['href'])
                build_dict = build_scraper(build_url, proxy, user_agent)
                if build_dict:
                    builds_list.append(build_dict)
                
                delay = randint(2, 10)  
                time.sleep(delay)

            browser.close()
        except Exception as e:
            browser.close()
            print(traceback.format_exc())
            continue

    
    return builds_list


In [135]:
if __name__ == '__main__':
    user_agent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'}
    proxies = get_proxies(user_agent)
    # builds = build_scraper('https://pcpartpicker.com/b/VqPnTW', proxy, user_agent)

Proxy: 95.216.138.68:30085 not working
Proxy: 217.172.122.2:8080 not working


In [149]:
proxies = list(proxies)
builds = build_scraper('https://pcpartpicker.com/b/Bstp99', proxies[2], user_agent)

<!DOCTYPE html>
<html class="light-mode">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
<meta content="" name="description"/>
<meta content="PCPartPicker" name="application-name"/>
<meta content="#FFFFFF" name="msapplication-TileColor"/>
<meta content="/mstile-144x144.png" name="msapplication-TileImage"/>
<meta content="telephone=no" name="format-detection"/>
<link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon-precomposed" sizes="57x57"/>
<link href="/apple-touch-icon-72x72.png" rel="apple-touch-icon-precomposed" sizes="72x72"/>
<link href="/apple-touch-icon-114x114.png" rel="apple-touch-icon-precomposed" sizes="114x114"/>
<link href="/apple-touch-icon-120x120.png" rel="apple-touch-icon-precomposed" sizes="120x120"/>
<link href="/apple-touch-icon-144x144.png" rel="apple-touch-icon-precomposed" sizes="144x144"/>
<link href="/apple-touch-icon-152x15

In [150]:
pprint(builds)

{'Build Price': 1421.35,
 'CPU': {'Name': 'Intel Core i7-9700K 3.6 GHz 8-Core', 'Price': 289.99},
 'CPU Cooler': {'Name': 'Corsair iCUE H100i RGB PRO XT 75 CFM Liquid',
                'Price': 119.99},
 'Case': {'Name': 'RIOTORO CR1080 ATX Mid Tower', 'Price': 132.4},
 'Memory': [{'Name': 'Corsair Vengeance LPX 32 GB (2 x 16 GB) DDR4-3200 CL16',
             'Price': 129.99},
            {'Name': 'Corsair Vengeance LPX 32 GB (2 x 16 GB) DDR4-3200 CL16',
             'Price': 129.99}],
 'Motherboard': {'Name': 'EVGA Z390 FTW ATX LGA1151', 'Price': None},
 'Name': 'Tiny HTPC Gaming Rig with CR1080',
 'Power Supply': {'Name': 'Corsair 860 W 80+ Platinum Certified Fully Modular '
                          'ATX',
                  'Price': None},
 'Storage': [{'Name': 'Hitachi Ultrastar 7K4000 3 TB 3.5" 7200RPM',
              'Price': None},
             {'Name': 'Samsung 970 Evo 500 GB M.2-2280 NVME SSD',
              'Price': 89.0}],
 'Video Card': {'Name': 'Asus GeForce RTX 2070 SUPER