In [2]:
import time
import requests
import utilities
import traceback
from itertools import cycle
from random import randint, choice
from bs4 import BeautifulSoup
from builds_links_scraper import get_links

In [3]:
def clean_price(price):
    """Convert a scraped USD price string such as ``'$214.99'`` into a float.

    Parameters
    ----------
    price : str
        Raw price text taken from the page.

    Returns
    -------
    float or None
        The numeric amount, or ``None`` when the text is empty, does not
        start with ``'$'`` (treated as a non-USD price), or still contains
        several space-separated tokens once the ``'$'`` is removed.

    Raises
    ------
    ValueError
        If the text starts with ``'$'`` but the remainder is not a number.
    """
    # startswith() is safe on an empty string, whereas price[0] would raise.
    if not price or not price.startswith('$'):
        return None

    amount = price.replace('$', '').strip()
    if len(amount.split(' ')) > 1:
        return None

    # Drop thousands separators so "$1,299.99" parses instead of raising.
    return float(amount.replace(',', ''))

In [4]:
def build_scraper(url, user_agent, proxy, timeout=30):
    """Scrape one build page and return its components and adjusted price.

    Parameters
    ----------
    url : str
        Address of the build page.
    user_agent : dict
        Passed to ``requests.get`` as ``headers``.
    proxy : dict
        Passed to ``requests.get`` as ``proxies``.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a dead proxy can
        block the request indefinitely.

    Returns
    -------
    dict
        ``'Build Name'``, ``'Build Price'`` (the page total minus the priced
        components that are not in the selected list) and one entry per
        selected component: a ``{'Name', 'Price'}`` dict, or a list of such
        dicts when the build has several parts of the same kind. An empty
        dict is returned when the request fails, the page lacks the expected
        elements, or a component price is not in USD.

    Raises
    ------
    ValueError
        If the total price text on the page is not a number.
    """
    builds_dict = {}
    build_comps = ['Name','CPU', 'CPU Cooler', 'Motherboard', 'Memory', 'Storage', 'Video Card', 'Case', 'Power Supply', 'Build Price']

    try:
        rq = requests.get(url, headers=user_agent, proxies=proxy, timeout=timeout)
    except Exception as e:
        print(e)
        return builds_dict

    # An error response will not contain the build markup; stop early with
    # a readable message instead of failing later on a missing element.
    if not rq.ok:
        print('HTTP', rq.status_code, url)
        return builds_dict

    soup = BeautifulSoup(rq.content, 'html.parser')
    name_el = soup.find('h1', {"class": "build__name"})
    parts_table = soup.find('table', {"class": "partlist partlist--mini"})
    totals_table = soup.find('table', {"class": "block partlist partlist--mini partlist--totals"})
    total_cell = None
    if totals_table is not None:
        total_cell = totals_table.find('td', {"class": "td__price"})

    if name_el is None or parts_table is None or total_cell is None:
        print('Unexpected page layout', url)
        return {}

    builds_dict['Build Name'] = name_el.text
    extra_price = 0

    # Two rows is one component, one for the name of the comp and other for the features
    row_it = iter(parts_table.find_all('tr'))
    for name, component in zip(row_it, row_it):
        # Bind these up front so the except handler can always print them,
        # even when the failure happens before they are assigned.
        name_text = None
        component_el = None
        try:
            name_text = name.find('h4').text.strip()
            # Getting the name and price components
            component_el = component.find('td', {'class':'td__name'}).findChildren(text=True)
            component_el = list(filter(lambda el: el != '\n', component_el))

            if len(component_el) == 2:
                comp_name = component_el[0]
                comp_price = clean_price(component_el[1])
                # If price isn't in USD
                if comp_price is None:
                    return {}
            else:
                # Only a name is present; any other length raises here and
                # the row is skipped by the handler below.
                comp_name, comp_price = *component_el, None

            # If the component is in the selected list for scrape
            if name_text in build_comps:
                comp_els = {'Name': comp_name, 'Price': comp_price}

                if name_text not in builds_dict:
                    builds_dict[name_text] = comp_els
                else:
                    # Repeated component type: keep every part in a list,
                    # newest first.
                    previous = builds_dict[name_text]
                    if isinstance(previous, list):
                        builds_dict[name_text] = [comp_els, *previous]
                    else:
                        builds_dict[name_text] = [comp_els, previous]
            else:
                # Calculate the total of the components not taken into account
                extra_price += comp_price if isinstance(comp_price, float) else 0

        except Exception as e:
            print(e, url, name_text, component_el)

    # Computed once, after every row has been visited, so the result does not
    # depend on which row happened to be processed last.
    total_text = total_cell.text.replace('$', '').replace(',', '')
    builds_dict['Build Price'] = round(float(total_text) - extra_price, 2)

    return builds_dict

In [13]:
def main(max_builds=1):
    """Scrape build pages and collect the successfully parsed builds.

    Each build is fetched through a randomly chosen proxy and user agent,
    with a random 2-10 second pause between requests.

    Parameters
    ----------
    max_builds : int or None, optional
        How many of the links returned by ``get_links()`` to scrape.
        Defaults to 1, the limit that used to be hard-coded; ``None``
        scrapes every link.

    Returns
    -------
    list of dict
        One dict per build for which ``build_scraper`` returned a non-empty
        result.
    """
    builds_links = get_links()
    user_agents = utilities.get_user_agent()
    proxies = utilities.get_proxies()
    builds_list = []

    for build in builds_links[:max_builds]:
        try:
            proxy = choice(proxies)
            user_agent = choice(user_agents)
            build_url = utilities.parse_url(build_link=build['link'])
            build_dict = build_scraper(build_url, user_agent, proxy)
            # If the build was scraped correctly
            if build_dict:
                builds_list.append(build_dict)
        except Exception:
            print(traceback.format_exc())

        # Pause after every attempt, including failed ones, so an error does
        # not make the next request fire immediately.
        time.sleep(randint(2, 10))

    return builds_list

In [12]:
# Run the scrape when this cell/module is executed directly. The result is
# stored in the module-level name `builds` so that later cells can inspect it.
if __name__ == '__main__':
    builds = main()

{'http': 'http://<redacted-user>:<redacted-password>@84.21.191.193:20004/'} {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'}


In [15]:
# Pretty-print the scraped builds so the nested per-component dicts are readable.
# NOTE(review): this import would normally live in the import cell at the top
# of the notebook.
from pprint import pprint
pprint(builds)

[{'Build Name': 'DMTX v1',
  'Build Price': 1018.9,
  'CPU': {'Name': 'AMD Ryzen 5 3600X 3.8 GHz 6-Core', 'Price': 214.99},
  'CPU Cooler': {'Name': 'ID-COOLING SE-234-ARGB 56.5 CFM', 'Price': 39.99},
  'Case': {'Name': 'Deepcool MATREXX 50 ATX Mid Tower', 'Price': 49.99},
  'Memory': {'Name': 'Crucial Ballistix RGB 16 GB (2 x 8 GB) DDR4-3600 CL16',
             'Price': 89.99},
  'Motherboard': {'Name': 'MSI B450-A PRO MAX ATX AM4', 'Price': 99.99},
  'Power Supply': {'Name': 'Thermaltake Smart BX1 650 W 80+ Bronze Certified '
                           'ATX',
                   'Price': 69.99},
  'Storage': [{'Name': 'Seagate Barracuda Compute 2 TB 3.5" 7200RPM',
               'Price': 54.99},
              {'Name': 'Team MP33 512 GB M.2-2280 NVME SSD', 'Price': 53.99},
              {'Name': 'Team GX2 512 GB 2.5" SSD', 'Price': 44.99}],
  'Video Card': {'Name': 'MSI Radeon RX 5600 XT 6 GB GAMING X',
                 'Price': 299.99}}]
