## MySlabs Scraper

In [1]:
import requests
import urllib
import numpy as np
import pandas as pd
from scipy import stats
from selenium import webdriver
from bs4 import BeautifulSoup as bs

In [2]:
# Number of archive pages to scrape — presumably the size of the full
# myslabs.com archive when this notebook was written; TODO confirm.
pages = 1_840

In [112]:
def get_price(item):
    """Return the sold price from a listing's feed label as a numeric string.

    The label is the text of the first ``div.slab-feed-label`` found under
    *item*; everything before the first hyphen is treated as the price.

    Args:
        item: A parsed listing node exposing a BeautifulSoup-style ``find``.

    Returns:
        The price text with ``$`` and ``,`` removed and surrounding
        whitespace stripped, or ``None`` when the listing has no feed label.
    """
    label = item.find('div', {'class': 'slab-feed-label'})
    text = getattr(label, 'text', None)
    if text is None:
        # ``find`` returns None for a missing tag; calling ``.strip()`` on
        # that used to raise AttributeError and abort the whole scrape.
        return None

    price, _, _ = text.strip().partition('-')
    return price.replace('$', '').replace(',', '').strip()

def get_sold_date(item):
    """Return the sold-date portion of a listing's feed label.

    The label is the text of the first ``div.slab-feed-label`` found under
    *item*; everything after the first hyphen is treated as the date.

    Args:
        item: A parsed listing node exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped date text, or ``None`` when the label is missing, has
        no hyphen separator, or has nothing after the separator.
    """
    label = item.find('div', {'class': 'slab-feed-label'})
    text = getattr(label, 'text', None)
    if text is None:
        # Missing label: avoid the AttributeError from ``None.strip()``.
        return None

    # Split on the FIRST hyphen only, so a hyphenated date such as
    # "2022-09-06" is kept whole instead of being cut down to "2022".
    _, separator, sold = text.strip().partition('-')
    if not separator:
        # No separator at all: indexing the second field used to raise
        # IndexError here.
        return None
    return sold.strip() or None

In [159]:
class MySlabsScraper:
    """Scrape sold listings from the myslabs.com archive into a DataFrame.

    The scraper downloads ``pages`` archive pages, collects every
    ``div.slab_item.psa`` listing on them, then fetches each listing's own
    detail page for its title, description and view count.

    Attributes:
        pages: Number of archive pages to download, starting from page 1.
    """

    BASE_URL = 'https://myslabs.com'
    # Seconds to wait on each HTTP request, so one stalled connection cannot
    # hang the whole scrape.
    REQUEST_TIMEOUT = 30
    # Column order of the DataFrame built by ``output``.
    COLUMNS = ['title', 'desc', 'views', 'link', 'soldprice', 'solddate']

    def __init__(self, pages) -> None:
        self.pages = pages

    def href_builder(self):
        """Return the archive page URLs for pages ``1..self.pages``."""
        archive_url = self.BASE_URL + '/browse/archive/?page='
        return [archive_url + str(number) for number in range(1, self.pages + 1)]

    def get_data(self):
        """Download every archive page and return the parsed soups in order."""
        return [self._fetch_soup(link) for link in self.href_builder()]

    def _fetch_soup(self, link):
        """Download *link* and parse the response body with ``html.parser``."""
        page = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        return bs(page.text, 'html.parser')

    @staticmethod
    def _clean_desc(desc):
        """Return *desc* unchanged, or ``None`` when it is missing or empty."""
        # Truthiness covers both None and ''. The previous
        # ``(len(desc) > 0) & (desc is not None)`` called len() before the
        # None test and raised TypeError on cards without a description.
        return desc if desc else None

    @staticmethod
    def _parse_views(views):
        """Return the first run of digits in *views*, or ``None`` if absent.

        Commas inside the number are dropped. This replaces ``views[:3]``,
        which failed on ``None`` and cut off counts of four or more digits.
        """
        if views is None:
            return None
        digits = []
        for char in views:
            if char.isdigit():
                digits.append(char)
            elif digits and char != ',':
                # First non-digit after the number started: stop reading.
                break
        return ''.join(digits) or None

    def _get_card_data(self, link):
        """Fetch a listing detail page and return ``(title, desc, views)``.

        Each element is the raw tag text, or ``None`` when the tag is not on
        the page; an empty description is also reported as ``None``.
        """
        card = self._fetch_soup(link)

        title = getattr(card.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
        desc = getattr(card.find('p', {'class': 'overflow-auto'}), 'text', None)
        views = getattr(card.find('div', {'class': 'mr-1 text-medium'}), 'text', None)

        return title, self._clean_desc(desc), views

    def parse(self):
        """Return one dict per archived listing, keyed by ``COLUMNS``.

        Issues one extra HTTP request per listing to read its detail page.
        """
        results = []
        for soup in self.get_data():
            results.extend(soup.find_all('div', {'class': 'slab_item psa'}))

        product_list = []
        for item in results:
            # Build the detail link once; it is both fetched and stored.
            link = self.BASE_URL + item.find('a')['href']
            title, desc, views = self._get_card_data(link)

            product_list.append({
                'title': title,
                'desc': desc,
                'views': self._parse_views(views),
                'link': link,
                'soldprice': get_price(item),
                'solddate': get_sold_date(item),
            })
        return product_list

    def output(self):
        """Return the parsed listings as a typed ``pandas.DataFrame``.

        ``views`` and ``soldprice`` are converted to numbers and ``solddate``
        to datetimes; missing values become NaN/NaT.
        """
        products = self.parse()
        # Passing the column list keeps the conversions below from raising
        # KeyError when no listings were found.
        df = pd.DataFrame(products, columns=self.COLUMNS)
        df['views'] = pd.to_numeric(df['views'])
        df['link'] = df['link'].astype(str)
        df['soldprice'] = pd.to_numeric(df['soldprice'])
        df['solddate'] = pd.to_datetime(df['solddate'])
        return df

In [160]:
# Smoke test: scrape only the first archive page. ``output()`` performs the
# HTTP requests and returns the typed DataFrame kept in ``out``.
slabs_test = MySlabsScraper(1)
out = slabs_test.output()

In [162]:
# Display the scraped DataFrame as the notebook cell's output.
out

Unnamed: 0,title,desc,views,link,soldprice,solddate
0,2019 Optic Zion Williamson Rated Rookie RC Fan...,2019 Optic Zion Williamson Rated Rookie RC Fan...,31,https://myslabs.com/slab/view/583503/,45.45,2022-09-06
1,2013 Panini Prizm Kawhi Leonard Auto Red Prizm...,,31,https://myslabs.com/slab/view/845821/,1161.5,2022-09-06
2,1996 Topps Bulls Commemorative #72 PSA 10 GEM...,,11,https://myslabs.com/slab/view/844012/,56.56,2022-09-06
3,2018 Topps Chrome Update Ronald Acuna Refracto...,,25,https://myslabs.com/slab/view/843856/,883.75,2022-09-06
4,2018 Prizm Mosaic Michael Porter Jr RC #73 PSA...,,51,https://myslabs.com/slab/view/553260/,60.6,2022-09-06
5,1959 Topps #150 Stan Musial CSG 4,,21,https://myslabs.com/slab/view/845602/,63.63,2022-09-06
6,2013 Bowman Chrome Ref Aaron Judge psa 9,2013 Bowman Chrome ref Aaron Judge,47,https://myslabs.com/slab/view/820889/,252.5,2022-09-06
7,Lot of 4 Chipper Jones #55 Rookies PSA 8 and 7...,2 x PSA 8\n2 x PSA 7,50,https://myslabs.com/slab/view/798284/,50.5,2022-09-06
8,2010 Bowman Chrome 1st Christian Yelich BGS 9 ...,,24,https://myslabs.com/slab/view/749381/,13.64,2022-09-06
9,2020 TOPPS COMPLETE SET LUIS ROBERT SGC 10,,33,https://myslabs.com/slab/view/829029/,12.12,2022-09-06


In [135]:
def get_card_data(i):
    """Fetch the detail page for row *i* of ``out`` and return its text fields.

    Args:
        i: Positional row index into the module-level ``out`` DataFrame,
            whose ``link`` column holds the detail-page URL.

    Returns:
        A ``(title, desc)`` tuple. Each element is the tag's text, or
        ``None`` when the tag is not on the page; an empty description is
        also reported as ``None``.
    """
    link = out.iloc[i]['link']
    # Bound the wait so a stalled connection cannot hang the notebook cell.
    page = requests.get(link, timeout=30)
    card = bs(page.text, 'html.parser')

    title = getattr(card.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
    desc = getattr(card.find('p', {'class': 'overflow-auto'}), 'text', None)

    # ``desc`` is None when the description paragraph is missing, and
    # ``len(None)`` used to raise TypeError; truthiness handles both the
    # None and the empty-string case.
    return title, desc or None


In [136]:
# Spot-check one listing: fetch the detail page for the row at position 5 of ``out``.
get_card_data(5)

('2020 TOPPS COMPLETE SET LUIS ROBERT SGC 10', None)