In [1]:
# Install libraries if not installed
# !pip install html5lib
# !pip install splinter
# !pip install webdriver_manager
# !pip install pandas
# !pip install pymongo

In [2]:
# Import Dependencies
import pandas as pd
from tqdm import tqdm
import re
import time
import pymongo
import random

from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Set up Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}

In [4]:
# Get an instance of Browser
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Set up MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [None]:
# Create a db collection
db = client.goodwill
collection = db.items

## Step 1: Open Goodwill site and fill search form

- Open Goodwill site and search for computer laptops. 
- Set 90 days to go back from today. 
- Min price set to 200. 
- Click on 'This website uses cookies' dialog if present.

In [5]:
# Open the advanced search page
browser.visit('https://shopgoodwill.com/search/advancedsearch')

# Find the Categories dropdown
element = browser.find_by_id('ddlCategories').first

# Select Option 7 for Computers and Electronics category
element.select('7') # Computers and Electronics

In [6]:
# Find More Categories 1 dropdown and select Computers
element = browser.find_by_id('ddlCategories1').first
element.select('30') # Computers 

# Find More Categories 2 dropdown and select Laptops
element = browser.find_by_id('ddlCategories2').first
element.select('176') # Laptops

In [7]:
# Select 'Closed Auctions' radiobutton
browser.find_by_name('searchClosedAuctions')[2].check()

# Select days to go back from Today
element = browser.find_by_id('ddlClosedAuctionDaysBack').first
element.select('90') # 90 days

In [8]:
# Set minimum price
browser.find_by_id('txtLowPrice').fill('200')

# Wait 10 seconds before checking
time.sleep(10)

# Click on accept cookies
if browser.is_text_present('This website uses cookies to ensure you get the best experience on our website'):
    browser.links.find_by_text('Got it!').click()

# Click on Search button
element = browser.find_by_css('.button.btn.btn-primary.w-100').first
element.click()

## Step 2: Go through the search results and collect links
- Use BeautifulSoup to extract information
- Page through the results to get all items.

In [9]:
all_items = []
# Loop through pages until next right button is not clickable
while not browser.is_element_present_by_css(".p-paginator-next.p-paginator-element.p-link.p-ripple.p-disabled"):
    html = browser.html
    
    # Get instance of BeautifulSoup
    s = soup(html, 'html.parser')
    
    # Find all the links based on css class attributes
    items_links = s.findAll('a',attrs={'class':'feat-item_name ng-star-inserted'})
    
    # Expand the original list, and add elements from the items_link.
    all_items.extend(items_links)
    
    # Go to next page
    right_click_arrow = browser.find_by_css('.p-paginator-next.p-paginator-element.p-link.p-ripple').first
    right_click_arrow.click()

In [10]:
print(f"Number of records:{len(all_items)}")

Number of records:160


In [11]:
# List the first 5 records
all_items[0:5]

[<a _ngcontent-serverapp-c99="" class="feat-item_name ng-star-inserted" href="/item/162380647" id="162380647" title="ASUS ZenBook 13 UX325 Laptop/i5/8GB/250GB SSD">ASUS ZenBook 13 UX325 Laptop/i5/8GB/250GB SSD</a>,
 <a _ngcontent-serverapp-c99="" class="feat-item_name ng-star-inserted" href="/item/157100881" id="157100881" title='Asus ROG Strix GL703 Scar Edition 17.3" V i7-7 2.8GHz 16GB 256GB Gaming Laptop'>Asus ROG Strix GL703 Scar Edition 17.3" V i7-7 2.8GHz 16GB 256GB Gaming Laptop</a>,
 <a _ngcontent-serverapp-c99="" class="feat-item_name ng-star-inserted" href="/item/157105178" id="157105178" title='Lenovo IdeaPad 700-15ISK 15.6" Laptop PC w/Intel Core i7 CPU'>Lenovo IdeaPad 700-15ISK 15.6" Laptop PC w/Intel Core i7 CPU</a>,
 <a _ngcontent-serverapp-c99="" class="feat-item_name ng-star-inserted" href="/item/157105217" id="157105217" title='Dell Inspiron 17 7786 17.3" Touchscreen Convertible 2 in 1 Laptop PC'>Dell Inspiron 17 7786 17.3" Touchscreen Convertible 2 in 1 Laptop PC</a>

## Step 3: Parse Bid History table
- Write functions to use Regular Expressions to extract brand, screen size, memory capacity, etc from the link title
- Use these functions to scrape information from list of links collected in Step 2.

In [12]:
# Functions to extract information from title of the item
def extract_screen_size(title):
    """Extract screen size if found"""
    p = re.compile(r"""[0-9]*.?[0-9](in|"|”|\s*screen)""")
    matched = p.search(title, re.IGNORECASE)
    if matched:
        # Keep only the size of screen, round number
        return round(float(re.sub(r'("|in|”|\s*screen")',"", matched.group(), re.IGNORECASE)))
    else:
        return None
    
def extract_disk_or_memory_size(title):
    """Extract items describing GB or TB information"""
    p = re.compile(r"""(\w+\s?)(GB|TB)""")
    matched = p.findall(title)
    if matched:        
        return matched
    else:
        return None
    
def create_mem_capacity(matched_group):    
    """Given a list of tuples, determine which one is memory and which one is hard drive"""
    results = {}
    if matched_group and isinstance(matched_group, list):
        for row in matched_group:
            capacity = int(row[0])
            gb_or_tb = row[1]
            if row[1]=="TB":
                # convert to GB
                capacity = capacity * 1024
            if capacity<=64: # most laptpost have low GB in mem
                # Asssume memory
                results['ram'] = capacity
            else:
                # Assume hard drive 
                results['hard_drive'] = capacity
    return results

def extract_windows_version(title):
    """Return version of windows if found"""
    p = re.compile(r"""Windows\s*\d\d""")
    matched = p.search(title, re.IGNORECASE)
    if matched:
        # Keep only the size of screen, round number
        return re.sub(r'Windows\s*',"", matched.group(), re.IGNORECASE)
    else:
        return None
    
def extract_intel_cpu_only(title):
    """Return version if intel cpu"""
    p = re.compile(r"""i\d""")
    matched = p.search(title, re.IGNORECASE)
    if matched:
        # Keep only the size of screen, round number
        return matched.group()
    else:
        return None

def extract_intel_cpu_complete_model(title):
    """Return complete model intel cpu"""
    p = re.compile(r"""i\d-[0-9]+\w?""")
    matched = p.search(title, re.IGNORECASE)
    if matched:
        # Keep only the size of screen, round number
        return matched.group()
    else:
        return None
    
def extract_computer_brand(title):
    """Return computer brand intel cpu"""
    p = re.compile(r"""(Acer|Asus|Apple|Razer|Oryx|Google|Alienware|HP|Dell|Lenovo|ASUS|Macbook|Samsung|Microsoft|MSI)\s-?\s?\w+""")
    matched = p.search(title)
    if matched:
        # Keep only the size of screen, round number
        return matched.group()
    else:
        return None    

### Sample test cases

In [15]:
# Select random item
import random
random_index = random.randrange(0,len(all_items))
sample_item = all_items[random_index].text
sample_item

'Acer Aspire E5-553 (N16Q3) 15.6" AMD FX-9800p 16GB 128GB SSD +1TB HHD Laptop'

In [16]:
 # Extract mem capacity information
create_mem_capacity(extract_disk_or_memory_size(sample_item))

{'ram': 16, 'hard_drive': 1024}

### Scrape data from the item title

In [18]:
# Collect all records
all_records = []

# Ignore titles with 'Case' or 'Lot'

# Using tqdm to show progress
for i in tqdm(all_items, total=len(all_items)):
    lowered_text = i.text.lower()
    item_text = i.text
    item_dict = {}
    item_id = i['id']
    if "lot " not in lowered_text and " case" not in lowered_text and "broken" not in lowered_text:
        item_dict['id'] = item_id
        item_dict['title'] = item_text
        item_dict['brand'] = extract_computer_brand(item_text)
        item_dict['intel_cpu_model'] = extract_intel_cpu_complete_model(item_text)
        item_dict['intel_cpu'] = extract_intel_cpu_only(item_text)
        item_dict['windows_version'] = extract_windows_version(item_text)
        item_dict['screen_size'] = extract_screen_size(item_text)
        
        mem_capacity = create_mem_capacity(extract_disk_or_memory_size(item_text))
        
        item_dict['ram'] = mem_capacity['ram'] if "ram" in mem_capacity.keys() else ''
        item_dict['hard_drive'] = mem_capacity['hard_drive'] if "hard_drive" in mem_capacity.keys() else ''
            
        all_records.append(item_dict)
        # pause for 2 seconds 
        time.sleep(2)

100%|█████████████████████████████████████████| 160/160 [05:08<00:00,  1.93s/it]


In [19]:
all_records

[{'id': '162380647',
  'title': 'ASUS ZenBook 13 UX325 Laptop/i5/8GB/250GB SSD',
  'brand': 'ASUS ZenBook',
  'intel_cpu_model': None,
  'intel_cpu': 'i5',
  'windows_version': None,
  'screen_size': None,
  'ram': 8,
  'hard_drive': 250},
 {'id': '157100881',
  'title': 'Asus ROG Strix GL703 Scar Edition 17.3" V i7-7 2.8GHz 16GB 256GB Gaming Laptop',
  'brand': 'Asus ROG',
  'intel_cpu_model': 'i7-7',
  'intel_cpu': 'i7',
  'windows_version': None,
  'screen_size': 17,
  'ram': 16,
  'hard_drive': 256},
 {'id': '157105178',
  'title': 'Lenovo IdeaPad 700-15ISK 15.6" Laptop PC w/Intel Core i7 CPU',
  'brand': 'Lenovo IdeaPad',
  'intel_cpu_model': None,
  'intel_cpu': 'i7',
  'windows_version': None,
  'screen_size': 16,
  'ram': '',
  'hard_drive': ''},
 {'id': '157105217',
  'title': 'Dell Inspiron 17 7786 17.3" Touchscreen Convertible 2 in 1 Laptop PC',
  'brand': 'Dell Inspiron',
  'intel_cpu_model': None,
  'intel_cpu': None,
  'windows_version': None,
  'screen_size': 17,
  'ram'

In [22]:
browser.quit()