In [1]:
import os
import re
import sys
import json
import pandas as pd
import psycopg2 as pg2
from tqdm import tqdm
from time import sleep
from bs4 import BeautifulSoup
from pymongo import MongoClient

from src.run import hot_soup
from src.crawler import Crawler

client = MongoClient('192.168.0.209', 27017)
db = client['reverb']
link_coll = db['links']
sales_coll = db['sales']
data_coll = db['data']

In [2]:
brands = [
    'Airline',
    'American Showster',
    'Ampeg',
    'Aria',
    'B.C. Rich',
    'B3',
    'BC Rich',
    'Baldwin',
    'Bilt',
    'Bogner',
    'Burns',
    'Campbell',
    'Charvel',
    'Collings',
    'Conrad',
    'Conradd',
    'Cort',
    "D'Angelico",
    'Danelectro',
    'DeArmond',
    'Dean',
    'DiPinto',
    'Dobro',
    'Duesenberg',
    'EKO',
    'ESP',
    'EVH',
    'Eastman',
    'Eastwood',
    'Electra',
    'Epiphone',
    'Ernie Ball Music Man',
    'Fano',
    'Fender',
    'Framus',
    'G&L',
    'Gibson',
    'Giffin',
    'Godin',
    'Goya',
    'Gretsch',
    'Grosh',
    'Guild',
    'Guyatone',
    'Hagstrom',
    'Hamer',
    'Harden Engineering',
    'Harmony',
    'Heritage',
    'Hofner',
    'Hohner',
    'Ibanez',
    'Ibanez',
    'Intermark',
    'Jackson',
    'James Trussart',
    'Jerry Jones',
    'Kalamazoo',
    'Kay',
    'Kimberly',
    'Kramer',
    'La Baye',
    'Larrivee',
    'Line 6',
    'MCI',
    'Magnatone',
    'Martin',
    'Memphis',
    'Micro-Frets',
    'Moog',
    'Mosrite',
    'Music Man',
    'Musicraft',
    'Musicvox',
    'Nash',
    'National',
    'Norma',
    'Orville',
    'Ovation',
    'Parker',
    'Paul Reed Smith',
    'Peavey',
    'Premier',
    'Reverend',
    'Rickenbacker',
    'Roland',
    'SWR',
    'Schecter',
    'Silvertone',
    'Sorrento',
    'Squier',
    'Steinberger',
    'Stteinbereger',
    'Suhr',
    'Supro',
    'Taylor',
    'Teisco',
    'The Loar',
    'Tokai',
    'Tom Anderson',
    'Travis',
    'Truetone',
    'Two Rock',
    'Univox',
    'Vox',
    'Washburn',
    'Yamaha'
]

In [3]:
sales_df = pd.DataFrame(list(sales_coll.find({},{'_id':0})))
sales_df.head()

Unnamed: 0,title,date,cond,price
0,Rickenbacker 660-12 Tom Petty Signature 1992,2/23/2021,Excellent,"$6,495"
1,Rickenbacker 660-12 Tom Petty Signature 1992,12/7/2020,Excellent,"$4,325"
2,Rickenbacker 660-12 Tom Petty Signature 1992,10/20/2020,Excellent,"$7,995"
3,Rickenbacker 660-12 Tom Petty Signature 1992,9/28/2020,Excellent,"$5,495"
4,Rickenbacker 660-12 Tom Petty Signature 1992,5/26/2020,Excellent,"$5,800"


In [4]:
links_df = pd.DataFrame(list(link_coll.find({}, {'_id': 0})))
links_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5065 entries, 0 to 5064
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5065 non-null   object
 1   link    5065 non-null   object
 2   html    2994 non-null   object
dtypes: object(3)
memory usage: 118.8+ KB


In [5]:
for link in list(link_coll.find({'html': {'$exists': True}}))[:10]:
    print(link['link'])

https://reverb.com/price-guide/guide/31-rickenbacker-660-12-tom-petty-signature-1992
https://reverb.com/price-guide/guide/33-fender-coronado-ii-1968-antigua
https://reverb.com/price-guide/guide/38-fender-stratocaster-1958-sunburst
https://reverb.com/price-guide/guide/42-fender-stratocaster-1964-sunburst
https://reverb.com/price-guide/guide/124-epiphone-zephyr-1944-sunburst
https://reverb.com/price-guide/guide/677-fender-esquire-1951-butterscotch-blonde
https://reverb.com/price-guide/guide/685-fender-jazzmaster-1960-sunburst
https://reverb.com/price-guide/guide/684-fender-jazzmaster-1959-sunburst-tortoise-pickguard
https://reverb.com/price-guide/guide/683-fender-jazzmaster-1959-sunburst-with-gold-pickguard
https://reverb.com/price-guide/guide/689-fender-jazzmaster-1962-3-tone-sunburst


In [7]:
html = link_coll.find_one({'html': {'$exists': True}})['html']
soup = hot_soup(html)

In [13]:
section = soup.find('section', class_='scaling-pb-2')
for d in section.descendants:
    print(d)
    print('----')



----
<p>Built to the specifications of Tom Petty, this signature Rickenbacker 660-12 was released with an array of deluxe features in a limited run of 1000 from 1991 to 1997. The Tom Petty 660 is actually the first 660 entry in the iconic 600 series, and would be followed by a normal production 660 12-string in 1998. The Tom Petty edition is distinguished from other 600-series 12-strings by two "Toaster Top" humbuckers, a fancier slanted plate tailpiece, as well as Tom's signature on the pickguard. As the first 12-string 660, this guitar is of great interest to Rickenbacker collectors as well as fans of Tom Petty, so the value on this <i>American Girl </i>won't be <i>free fallin' </i>any time soon.  <br/><br/><b>Years of Production:</b> 1991 - 1997 <br/><br/><b>Design Elements: T</b>wo chrome bar "Toaster Top" pickups, a slanted plate tailpiece, and deluxe trim<br/><br/><b>Body Style:</b> Cresting Wave body shape 12-string<br/><br/><b>Wood Composition: </b>Maple body, Maple neck, Ros

In [6]:
'|'.join(brands)

"Airline|American Showster|Ampeg|Aria|B.C. Rich|B3|BC Rich|Baldwin|Bilt|Bogner|Burns|Campbell|Charvel|Collings|Conrad|Conradd|Cort|D'Angelico|Danelectro|DeArmond|Dean|DiPinto|Dobro|Duesenberg|EKO|ESP|EVH|Eastman|Eastwood|Electra|Epiphone|Ernie Ball Music Man|Fano|Fender|Framus|G&L|Gibson|Giffin|Godin|Goya|Gretsch|Grosh|Guild|Guyatone|Hagstrom|Hamer|Harden Engineering|Harmony|Heritage|Hofner|Hohner|Ibanez|Ibanez|Intermark|Jackson|James Trussart|Jerry Jones|Kalamazoo|Kay|Kimberly|Kramer|La Baye|Larrivee|Line 6|MCI|Magnatone|Martin|Memphis|Micro-Frets|Moog|Mosrite|Music Man|Musicraft|Musicvox|Nash|National|Norma|Orville|Ovation|Parker|Paul Reed Smith|Peavey|Premier|Reverend|Rickenbacker|Roland|SWR|Schecter|Silvertone|Sorrento|Squier|Steinberger|Stteinbereger|Suhr|Supro|Taylor|Teisco|The Loar|Tokai|Tom Anderson|Travis|Truetone|Two Rock|Univox|Vox|Washburn|Yamaha"

In [40]:
r1 = ")(?:(?:(.+)((?:Early|Mid|Late)-?\s?'?\d+s"
r2 = "\d{4}\s?-\s?\d{4})(.*))"
r3 = "(?:(.+)(\d{4}s?)(.*))"
r4 = "(?:(.+)(\d{2}s?)(.*)))$"
regex = r"(" + '|'.join(brands) + '|'.join([r1, r2, r3, r4])

for document in link_coll.find({'html': {'$exists': True}}):
    title = document['title']
    match = re.match(regex, title, re.IGNORECASE)
    # TODO: Add these features to database
    #       [    Brand,     Model,     Year,     Color     ]
    feats = [g.strip() for g in match.groups() if g is not None]
    

100%|██████████| 2994/2994 [00:00<00:00, 131563.61it/s]


In [4]:
# conn = pg2.connect(dbname='reverb', host='192.168.0.209', password='galvanize', user='postgres')
# cur = conn.cursor()
# conn.autocommit = True
# cur.execute("""CREATE TABLE guitars(
#                id INT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
#                title VARCHAR(225),
#                date_str VARCHAR(12),
#                cond VARCHAR(12),
#                sale_price NUMERIC
#            );""")
# conn.close()