In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python
def scroll_down(driver):
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [None]:
# function to collect guitar links
def get_guitar_links(links_df):
    c = webdriver.ChromeOptions()
    c.add_argument("--incognito")

    driver = webdriver.Chrome("chromedriver.exe",options=c)
    driver.maximize_window()

    driver.get("https://findmyguitar.com/explorer.php")
    
    scroll_down(driver) # scroll all the way down
    guitars = driver.find_elements(by=By.CLASS_NAME,value='image-cards') #get all the guitars
    
    print(f"There are {len(guitars)} number of guitars.")
    
    links = links_df['Link'].unique() # get all the links from the previous guitars

    for guitar in guitars:
        a_tag = guitar.find_element(by=By.TAG_NAME,value='a')
        link = a_tag.get_attribute("href")
        model = a_tag.find_element(by=By.CLASS_NAME,value='card-title').text
        
        # if link is in the already dataframe, do not add it
        if link in links:
            print(f"{model} is already scraped.({link})")
            continue

        links_df = links_df.append({'Model' : model, 'Link' : link},ignore_index=True)
        print(f"{model} is added.({link})")
    
    return links_df

In [None]:
links_df = pd.DataFrame(columns=['Link'])

get_guitar_links(links_df)

links_df.to_csv("findmyguitars.csv",index=False)

In [5]:
def get_specs(driver,df,idx):
    guitar_specifications = driver.find_element(by=By.CLASS_NAME,value='guitar-specifications') # get the specs table
    specs = guitar_specifications.find_elements(by=By.CLASS_NAME,value='section-row') # get the rows from the table
    
    # loop through every row
    for spec in specs:
        spec = spec.find_elements(by=By.TAG_NAME,value='span')
        attribute = spec[0].text.split(":")[0] # get the attribute
        value = spec[1].text # get the correspoding value for the attribute
        df.at[idx,attribute] = value # write to the dataframe

In [6]:
def get_strengths_and_weaknesses(driver,df,idx):
    strength_table = driver.find_elements(by=By.CLASS_NAME,value='summary-table')[1] # get the strengths and weaknesses table

    strengths = strength_table.find_elements(by=By.CLASS_NAME,value='isset') # get the positive values
    weaknesses = strength_table.find_elements(by=By.CLASS_NAME,value='notset') # get the negative values
    
    # loop through positive values
    for strength in strengths:
        attribute = strength.text
        value = True
        df.at[idx,attribute] = value # write to the dataframe as True
    
    # loop through negative values
    for weakness in weaknesses:
        attribute = weakness.text
        value = False
        df.at[idx,attribute] = value # write to the dataframe as False

In [7]:
def get_materials(driver,df,idx):
    materials = driver.find_element(by=By.CLASS_NAME,value='three-cs')
    
    # get the Body,Neck,Fretboard Material
    body_material = materials.find_element(by=By.ID,value='col1').find_element(by=By.TAG_NAME,value='figcaption').text.split(" Body")[0]
    neck_material = materials.find_element(by=By.ID,value='col2').find_element(by=By.TAG_NAME,value='figcaption').text.split(" Neck")[0]
    fretboard_material = materials.find_element(by=By.ID,value='col3').find_element(by=By.TAG_NAME,value='figcaption').text.split(" Fretboard")[0]
    
    # because specs table also has the material info, append them to the dataframe as 2
    df.at[idx,'Body Material 2'] = body_material
    df.at[idx,'Neck Material 2'] = neck_material
    df.at[idx,'Fretboard Material 2'] = fretboard_material
    

In [8]:
def get_price(driver,df,idx):
    # findmyguitars.com keeps guitar prices from different websites like (amazon.com, reverb.com, thomann.de, sweetwater.com)
    prices = driver.find_element(by=By.ID,value='all-prices-container').find_elements(by=By.CLASS_NAME,value='static.more-results-button')
            
    avg_price = 0 # calculate the average price from all the websites that is shown
    num_of_products = len(prices)
    for price in prices:
        price = price.find_element(by=By.TAG_NAME,value='span').text.split(" ")[0]
        
        # if price is in €, convert it to the dollar by the today's exchange rate.
        if "€" in price:
            price = float(price.split("€")[1].replace(',',''))
            price *= 1.02
            avg_price += price
        elif "$" in price: # if it is dollar, stays same
            price = float(price.split("$")[1].replace(',',''))
            avg_price += price
        else: # sometimes price info is not shown
            print("No price info :",price)
            num_of_products -= 1 # subtract 1 from the number of items to not to effect the mean value
            
    avg_price /= num_of_products # get the mean prcie 
            
    df.at[idx,"Price"] = avg_price # write to dataframe
    print("AVG Price :",avg_price)

In [None]:
def scrape_guitar_info(df):
    c = webdriver.ChromeOptions()
    c.add_argument("--incognito")

    driver = webdriver.Chrome("chromedriver.exe",options=c)
    driver.maximize_window()
    
    for idx,row in df.iterrows():
        url = row['Link']
        
        try: # sometimes selenium throws unknown error
            driver.get(url) # go to guitar specs page
        except:
            print('error nvm')
        time.sleep(1)

        successful_scrape = False

        while not successful_scrape:
            try:
                get_specs(driver,df,idx) # get the information from specs table
                get_strengths_and_weaknesses(driver,df,idx) # get the information from strengths and wekanesses table (contains bool values)
                get_materials(driver,df,idx) # get materials (body wood, fretboard wood, neck wood)

                successful_scrape = True
            except: # if errors occurs while scraping try again
                print('unsuccessfull scrape try it again')
                successful_scrape = False

            try:    
                get_price(driver,df,idx) # get the price information
            except: # not all guitars have price information on the html
                print("no price info. continue with the next item")

        time.sleep(1)

In [None]:
df = pd.read_csv("findmyguitars.csv")

In [None]:
scrape_guitar_info(df)
df.to_csv("findmyguitars.csv",index=False)

<h3> ---------------------------------- </h3>

In [18]:
c = webdriver.ChromeOptions()
c.add_argument("--incognito")

driver = webdriver.Chrome("chromedriver.exe",options=c)
driver.maximize_window()

  driver = webdriver.Chrome("chromedriver.exe",options=c)


In [10]:
print(f"There are {df[pd.isnull(df.Year)].shape[0]} number of items to scrape.")

for idx,row in df[pd.isnull(df.Year)].iterrows():
    url = row['Link']
    print(idx,"-",url)
    try:
        driver.get(url)
    except:
        print('error nvm')
    time.sleep(1)
    
    successful_scrape = False
    
    while not successful_scrape:
        try:
            '''
            guitar_specifications = driver.find_element(by=By.CLASS_NAME,value='guitar-specifications')
            specs = guitar_specifications.find_elements(by=By.CLASS_NAME,value='section-row')
            for spec in specs:
                spec = spec.find_elements(by=By.TAG_NAME,value='span')
                attribute = spec[0].text.split(":")[0]
                value = spec[1].text
                df.at[idx,attribute] = value
            
            successful_scrape = True
            prices = driver.find_element(by=By.ID,value='all-prices-container').find_elements(by=By.CLASS_NAME,value='static.more-results-button')
            
            avg_price = 0
            num_of_products = len(prices)
            for price in prices:
                price = price.find_element(by=By.TAG_NAME,value='span').text.split(" ")[0]

                if "€" in price:
                    price = float(price.split("€")[1].replace(',',''))
                    price *= 1.02
                    avg_price += price
                elif "$" in price:
                    price = float(price.split("$")[1].replace(',',''))
                    avg_price += price
                else:
                    print("Indicator:",price)
                    num_of_products -= 1
            avg_price /= num_of_products
            
            df.at[idx,"Price"] = avg_price
            print("AVG Price :",avg_price) '''
            
            get_specs(driver,df,idx)
            get_strengths_and_weaknesses(driver,df,idx)
            get_materials(driver,df,idx)
            
            successful_scrape = True
        except:
            print('unsuccessfull scrape try it again')
            successful_scrape = False
        
        try:    
            get_price(driver,df,idx)
        except:
            print("no price info. continue with the next item")
                   
    time.sleep(1)

There are 28 number of items to scrape.
1635 - https://findmyguitar.com/reviews/Cort-G300-Pro-review.php
Tremolo True
Locking Tuners True
Stainless Steel Frets True
Compound Radius Fretboard True
Black Tusq XL Nut True
Top Pickup Brand True
Luminescent Inlay True
Neck-Through Build False
Push Knob or Extra Switch Option False
Weight Relief False
Stays in Tune (Evertune) False
21:1 Tuner Ratio False
High-Quality-Standards Country False
Strap Lock False
Expensive Wood False
Body Material 2: Basswood
Neck Material 2: Roasted Maple
Fretboard Material 2: Roasted Maple
no price info. continue with the next item
1636 - https://findmyguitar.com/reviews/Cort-KX700-EverTune-review.php
Stays in Tune (Evertune) True
Locking Tuners True
Stainless Steel Frets True
Expensive Wood True
Top Pickup Brand True
Luminescent Inlay True
Neck-Through Build False
Push Knob or Extra Switch Option False
Weight Relief False
Tremolo False
21:1 Tuner Ratio False
Compound Radius Fretboard False
High-Quality-Standard

Strap Lock False
Expensive Wood False
High-Quality Nut False
Top Pickup Brand False
Luminescent Inlay False
Body Material 2: Basswood
Neck Material 2: Maple
Fretboard Material 2: Laurel
no price info. continue with the next item
1650 - https://findmyguitar.com/reviews/Cort-MBM-2-review.php
Killswitch Pickups True
Locking Tuners True
Compound Radius Fretboard True
Neck-Through Build False
Weight Relief False
Tremolo False
Stays in Tune (Evertune) False
21:1 Tuner Ratio False
Stainless Steel Frets False
High-Quality-Standards Country False
Strap Lock False
Expensive Wood False
High-Quality Nut False
Top Pickup Brand False
Luminescent Inlay False
Body Material 2: Basswood
Neck Material 2: Maple
Fretboard Material 2: Laurel
no price info. continue with the next item
1651 - https://findmyguitar.com/reviews/Cort-CR300-review.php
Expensive Wood True
Top Pickup Brand True
Neck-Through Build False
Push Knob or Extra Switch Option False
Weight Relief False
Tremolo False
Stays in Tune (Evertune) 

In [19]:
print(f"There are {df[pd.isnull(df.Price)].shape[0]} number of items to scrape.")

for idx,row in df[pd.isnull(df.Price)].iterrows():
    url = row['Link']
    print(idx,"-",url)
    try:
        driver.get(url)
    except:
        print('error nvm')
    
    time.sleep(1)
    
    
    try:    
        get_price(driver,df,idx)
    except:
        print("no price info. continue with the next item")
                   
    time.sleep(1)

There are 62 number of items to scrape.
37 - https://findmyguitar.com/reviews/Schecter-Synyster-Gates-FR-QM-USA-Signature-review.php
no price info. continue with the next item
132 - https://findmyguitar.com/reviews/Solar-A1-6Coroner-review.php
error nvm
no price info. continue with the next item
194 - https://findmyguitar.com/reviews/Solar-S1-6PB-27-Baritone-review.php
error nvm
no price info. continue with the next item
198 - https://findmyguitar.com/reviews/ESP-Stef-B-8-review.php
error nvm
no price info. continue with the next item
207 - https://findmyguitar.com/reviews/Jackson-USA-Signature-Corey-Beaulieu-King-V-KV7-review.php
AVG Price : 2995.0
212 - https://findmyguitar.com/reviews/Solar-V1-6FRLB-review.php
no price info. continue with the next item
217 - https://findmyguitar.com/reviews/Solar-A1-6FRFOB-review.php
error nvm
no price info. continue with the next item
221 - https://findmyguitar.com/reviews/Solar-AB1-6TBS-review.php
error nvm
no price info. continue with the next it

In [20]:
driver.close()