In [19]:
import os
import json
import psycopg2 as pg2
from time import sleep
from bs4 import BeautifulSoup
from src.crawler import Crawler
from pymongo import MongoClient

# Guitar makers of interest.
# NOTE(review): no cell visible in this notebook reads `brands` — confirm it is still needed.
brands = [
    'Gibson', 'Fender', 'Rickenbacker', 'Ibanez', 'Epiphone',
    'Gretsch', 'Paul Reed Smith', 'Reverend', 'Schecter', 'Guild',
]

# Load login credentials (login() reads conf['REVERB_LOGIN'] and conf['REVERB_PASS']).
# The encoding is pinned so parsing does not depend on the platform's default
# text encoding.
with open('src/config.json', 'r', encoding='utf-8') as f:
    conf = json.load(f)

In [12]:
# MongoDB handles used by the scraping cells: guide links go to `links`,
# scraped transactions go to `data`, both in the `reverb` database.
client = MongoClient(host='localhost', port=27017)
db = client['reverb']
link_coll, data_coll = db['links'], db['data']

KeyboardInterrupt: 

In [4]:
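# Disabled setup snippet: connects to the Postgres `reverb` database and creates the `guitars` table.
# NOTE(review): the connection call below embeds a host and password; load them from src/config.json before re-enabling.
# conn = pg2.connect(dbname='reverb', host='192.168.0.209', password='galvanize', user='postgres')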
# cur = conn.cursor()
# conn.autocommit = True
# cur.execute("""CREATE TABLE guitars(
#                id INT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
#                title VARCHAR(225),
#                date_str VARCHAR(12),
#                cond VARCHAR(12),
#                sale_price NUMERIC
#            );""")
# conn.close()

In [25]:
def hot_soup():
    """Parse the global driver's current page source with BeautifulSoup.

    Returns:
        A BeautifulSoup object built from ``driver.page_source`` using the
        ``'html.parser'`` backend.
    """
    html = driver.page_source
    return BeautifulSoup(html, 'html.parser')

def login():
    """Sign in to Reverb.com through the page loaded in the global driver.

    Clicks the header login link, waits one second, then types the
    ``REVERB_LOGIN`` / ``REVERB_PASS`` values from ``conf`` into the form
    fields and clicks the form's submit input.
    """
    # Open the login form from the site header
    header = hot_soup()
    login_link = header.find('a', class_='site-header__nav__link--login')
    driver.bsel(login_link).click()

    # Pause briefly, then re-parse the page to pick up the form
    sleep(1)
    form = hot_soup()

    # Look up both input fields and hand them to the driver
    # (bsel presumably maps a parsed node to the live page element — confirm in src.crawler)
    login_node = form.find(id='user_session_login')
    password_node = form.find(id='user_session_password')
    login_box = driver.bsel(login_node)
    password_box = driver.bsel(password_node)

    # Type the credentials
    login_box.send_keys(conf['REVERB_LOGIN'])
    password_box.send_keys(conf['REVERB_PASS'])

    # Submit the form
    submit_node = form.find('input', attrs={'type': 'submit'})
    driver.bsel(submit_node).click()

def scrape_links(collection, timeout):
    """Collect the title and link of every price guide in the paginated listing.

    Starting from the page currently loaded in the global ``driver``, each
    listing page is parsed, one ``{'title': ..., 'link': ...}`` document per
    guide card is written to ``collection``, and the pagination "next" control
    is clicked until no such control is found.

    Args:
        collection: Target with an ``insert_many(list_of_dicts)`` method
            (a pymongo collection in this notebook).
        timeout: Seconds to sleep after clicking "next" before the page is
            parsed again.
    """
    page = 0
    count = 0
    print('Scraping links...\n')
    while True:
        # Advance the page counter on every iteration so the progress line
        # reports the page that was actually scraped.
        page += 1
        soup = hot_soup()

        # A page without the tiles list contributes no cards rather than
        # failing with an AttributeError on None.
        tiles = soup.find('ul', class_='tiles')
        if tiles is not None:
            cards = tiles.find_all('a', class_='csp-square-card__inner')
        else:
            cards = []

        scraped = [
            {'title': card.find('h3').text, 'link': card['href']}
            for card in cards
        ]
        if scraped:
            # insert_many() rejects an empty document list, so only write
            # when this page produced at least one record.
            collection.insert_many(scraped)
            count += len(scraped)

        print(f'Scraped page {page}, {count} total records gathered.\n')

        nxt = soup.find('li', class_='pagination__page pagination__page--next')
        if nxt is None:
            # All links have been gathered
            break

        # Click next button
        driver.bsel(nxt).click()
        sleep(timeout)

def scrape_transactions(links, collection, timeout):
    """Scrape every transaction row from each price guide and store it.

    For each document in ``links`` the guide page is opened with the global
    ``driver`` and all of its transaction pages are walked. Guides with at
    least one row are written to ``collection`` as
    ``{'title': ..., 'transactions': [(date, condition, price), ...]}``.
    A summary line is printed at the end.

    Args:
        links: Iterable of mappings with ``'title'`` and ``'link'`` keys.
        collection: Target with an ``insert_one(dict)`` method
            (a pymongo collection in this notebook).
        timeout: Seconds to sleep before each page is parsed.
    """
    guide = 0
    count = 0
    for document in links:
        title = document['title']
        link = document['link']
        transaction_agg = []

        driver.get(link)

        # Collect all transactions from a price guide
        while True:
            sleep(timeout)
            soup = hot_soup()
            transactions = soup.find_all('tr', class_='transaction')

            if not transactions:
                # Nothing on this page; rows from earlier pages (if any) are
                # kept and stored after the loop instead of being dropped.
                break

            for transaction in transactions:
                date = transaction.find(class_='date').text
                cond = transaction.find(class_='condition').text
                price = transaction.find(class_='final').text
                transaction_agg.append((date, cond, price))

            nxt = soup.find('button', attrs={'title': 'Next'})

            # A missing "Next" button is treated like a disabled one: there
            # is no further page to visit.
            if nxt is None or 'disabled' in nxt.attrs:
                break

            # Click next button
            driver.bsel(nxt).click()

        if not transaction_agg:
            # There are no transactions in this price guide.
            print(f'{title} has no transactions.\n')
            continue

        # All transactions have been collected
        collection.insert_one({
            'title': title,
            'transactions': transaction_agg
        })

        t = len(transaction_agg)
        count += t
        guide += 1
        print(f'{t} transactions recorded for {title}.\n')

    print(f'{count} transactions scraped from {guide} guides.\n')

In [26]:
# Run the full scrape: log in, gather the price-guide links, then the
# transactions behind each link.
driver = Crawler()
timeout = 1  # seconds passed to the scrapers as the pause between pages
try:
    driver.get('https://reverb.com/price-guide/electric-guitars')
    print('Logging in...\n')
    login()
    sleep(1)
    scrape_links(link_coll, timeout)
    # Read the stored links into a list up front so no server-side cursor has
    # to stay alive for the whole (slow) transaction scrape.
    links = list(link_coll.find({}, {'_id': 0}))
    scrape_transactions(links, data_coll, timeout)
    print('Scraping Complete.')

except Exception as e:
    print(e)

finally:
    # Always run the crawler's cleanup, including when the cell is interrupted
    # (KeyboardInterrupt is not an Exception subclass and bypasses the handler above).
    driver.cleanup()

Logging in...

Scraping links...

[{'title': 'Gibson ES-295 1952 Gold', 'link': 'https://reverb.com/price-guide/guide/3-gibson-es-295-1952-gold'}, {'title': 'Rickenbacker 660-12 Tom Petty Signature 1992', 'link': 'https://reverb.com/price-guide/guide/31-rickenbacker-660-12-tom-petty-signature-1992'}, {'title': 'Rickenbacker 360-12 1967 Fireglo, Mapleglo or Jetglo', 'link': 'https://reverb.com/price-guide/guide/32-rickenbacker-360-12-1967-fireglo-mapleglo-or-jetglo'}, {'title': 'Fender Coronado II 1968 Antigua', 'link': 'https://reverb.com/price-guide/guide/33-fender-coronado-ii-1968-antigua'}, {'title': 'Gibson Humbucker from SG 1963 Chrome', 'link': 'https://reverb.com/price-guide/guide/35-gibson-humbucker-from-sg-1963-chrome'}, {'title': 'Fender Stratocaster 1958 Sunburst', 'link': 'https://reverb.com/price-guide/guide/38-fender-stratocaster-1958-sunburst'}, {'title': 'Fender Stratocaster 1964 Sunburst', 'link': 'https://reverb.com/price-guide/guide/42-fender-stratocaster-1964-sunbur

In [None]:
# NOTE(review): this cell has no execution count and appears to be a leftover
# pagination prototype; it cannot run as written:
#   - scrape_transactions() as defined earlier in this notebook takes
#     (links, collection, timeout) and returns nothing, so this one-argument
#     call does not match it;
#   - `soup` is not assigned at module level in any cell shown here;
#   - `soup` is never re-read inside the loop, so clicking "Next" cannot change
#     the `disabled` check below.
# The same paging logic lives inside scrape_transactions().
# TODO: confirm and remove this cell.
while True:
    transactions = scrape_transactions(soup)
    print(transactions)
    nxt = soup.find('button', attrs={'title': 'Next'})
    if 'disabled' in nxt.attrs:
        # All transactions have been collected
        break
    else:
        # Click next button
        driver.bsel(nxt).click()