## MySlabs Scraper

In [1]:
import requests
import urllib
import numpy as np
import pandas as pd
from scipy import stats
from selenium import webdriver
from bs4 import BeautifulSoup as bs

In [2]:
# Page count for the archive listing. NOTE(review): presumably the total
# number of archive pages available; the visible cells below do not read it
# (the scraper is instantiated with an explicit page count) — confirm usage.
pages = 1840

In [112]:
def get_price(item):
    """Return the sold price of a listing as a bare numeric string.

    The price is read from the listing's ``slab-feed-label`` div, taking the
    text before the first ``-`` and removing the ``$`` sign, thousands
    separators and surrounding whitespace.

    Args:
        item: A parsed listing element exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        The cleaned price text (e.g. ``'1039.29'``), or ``None`` when the
        label element is not present in the listing.
    """
    label = item.find('div', {'class': 'slab-feed-label'})
    text = getattr(label, 'text', None)
    if text is None:
        # find() returns None for a missing element; report "no price"
        # instead of crashing on None.strip().
        return None

    price_part = str(text).partition('-')[0]
    return price_part.replace('$', '').replace(',', '').strip()

def get_sold_date(item):
    """Return the sold-date text of a listing.

    The date is the second ``-``-separated segment of the listing's
    ``slab-feed-label`` div, with surrounding whitespace removed.

    Args:
        item: A parsed listing element exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        The date text, or ``None`` when the label element is missing or its
        text contains no ``-`` separator.
    """
    label = item.find('div', {'class': 'slab-feed-label'})
    text = getattr(label, 'text', None)
    if text is None:
        # Missing label: nothing to parse.
        return None

    segments = str(text).strip().split('-')
    if len(segments) < 2:
        # No separator means there is no date segment; indexing [1] here
        # would raise IndexError.
        return None
    return segments[1].strip()

In [150]:
class MySlabsScraper:
    """Scrape listings from the myslabs.com archive into a DataFrame.

    The scraper downloads ``pages`` archive pages, collects every listing
    element on them, fetches each listing's detail page for its title,
    description and view count, and assembles the result with pandas.
    """

    BASE_URL = 'https://myslabs.com'
    ARCHIVE_URL = BASE_URL + '/browse/archive/?page='
    # Seconds to wait on any single HTTP request; without a timeout a stalled
    # connection would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30
    # Column order of the frame returned by output().
    COLUMNS = ['title', 'desc', 'views', 'link', 'soldprice', 'solddate']

    def __init__(self, pages) -> None:
        """Store the number of archive pages to scrape."""
        self.pages = pages

    def href_builder(self):
        """Return the archive page URLs for pages ``1..self.pages``."""
        return [
            self.ARCHIVE_URL + str(page_number)
            for page_number in range(1, self.pages + 1)
        ]

    def get_data(self):
        """Download every archive page and return the parsed documents.

        Returns:
            A list with one BeautifulSoup document per archive page, in page
            order.
        """
        soups = []
        for link in self.href_builder():
            page = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            soups.append(bs(page.text, 'html.parser'))
        return soups

    def _get_card_data(self, link):
        """Fetch one listing's detail page and extract its text fields.

        Args:
            link: Absolute URL of the listing's detail page.

        Returns:
            A ``(title, desc, views)`` tuple. Each entry is ``None`` when the
            corresponding element is missing; ``desc`` is also ``None`` when
            the description is empty.
        """
        page = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        detail = bs(page.text, 'html.parser')

        title = getattr(detail.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
        desc = getattr(detail.find('p', {'class': 'overflow-auto'}), 'text', None)
        views = getattr(detail.find('div', {'class': 'mr-1 text-medium'}), 'text', None)

        # `desc or None` covers both a missing element (None) and an empty
        # description (''); calling len() on None would raise TypeError.
        return title, desc or None, views

    def parse(self):
        """Collect one record per listing found on the archive pages.

        Returns:
            A list of dicts keyed by ``title``, ``desc``, ``views``,
            ``link``, ``soldprice`` and ``solddate``.
        """
        results = []
        for soup in self.get_data():
            results.extend(soup.find_all('div', {'class': 'slab_item psa'}))

        product_list = []
        for item in results:
            link = self.BASE_URL + item.find('a')['href']
            title, desc, views = self._get_card_data(link)

            product_list.append({
                'title': title,
                'desc': desc,
                # NOTE(review): only the first three characters of the views
                # text are kept, so counts of four or more digits would be
                # truncated — confirm against the page markup.
                'views': views[:3] if views is not None else None,
                'link': link,
                'soldprice': get_price(item),
                'solddate': get_sold_date(item),
            })
        return product_list

    @classmethod
    def _to_frame(cls, products):
        """Build the typed result DataFrame from parsed listing records.

        Passing the column list explicitly keeps the frame well-formed even
        when ``products`` is empty, so the conversions below never hit a
        missing column.
        """
        df = pd.DataFrame(products, columns=cls.COLUMNS)
        df['views'] = pd.to_numeric(df['views'])
        df['link'] = df['link'].astype(str)
        df['soldprice'] = pd.to_numeric(df['soldprice'])
        df['solddate'] = pd.to_datetime(df['solddate'])
        return df

    def output(self):
        """Scrape the archive and return the listings as a DataFrame.

        Returns:
            A DataFrame with numeric ``views`` and ``soldprice`` columns, a
            string ``link`` column and a datetime ``solddate`` column.
        """
        return self._to_frame(self.parse())

In [151]:
# Trial run: scrape a single archive page and keep the resulting DataFrame
# in `out` for inspection by the following cells.
slabs_test = MySlabsScraper(1)
out = slabs_test.output()

In [152]:
# Display the scraped DataFrame (rendered as the table below).
out

Unnamed: 0,title,desc,views,link,soldprice,solddate
0,2021 Bowman Inception JULIO RODRIGUEZ Silver S...,POP 3 at PSA as of 6/29/22,68,https://myslabs.com/slab/view/766397/,1039.29,2022-09-06
1,2019 Topps Chrome Pete Alonso Blue Wave Auto /...,,115,https://myslabs.com/slab/view/176069/,858.5,2022-09-06
2,1959 Topps #150 Stan Musial CSG 4,,17,https://myslabs.com/slab/view/845602/,63.63,2022-09-06
3,2008-09 Fleer Michael Jordan Base #68 PSA 10 G...,,21,https://myslabs.com/slab/view/845520/,64.64,2022-09-06
4,Lot of 4 Chipper Jones #55 Rookies PSA 8 and 7...,2 x PSA 8\n2 x PSA 7,45,https://myslabs.com/slab/view/798284/,50.5,2022-09-06
5,2020 Topps Chrome Eloy Jimenez Prism Refractor...,,73,https://myslabs.com/slab/view/468373/,13.13,2022-09-06
6,2020 TOPPS COMPLETE SET LUIS ROBERT SGC 10,,29,https://myslabs.com/slab/view/829029/,12.12,2022-09-06
7,1992 Topps Wade Boggs GOLD #10 CSG 8 NM/MT,,81,https://myslabs.com/slab/view/450431/,12.12,2022-09-06
8,Davis Mills RC - 2021 Panini Prizm Blue Wave P...,#'d 1/199. PSA 10 Gem Mint | POP 2. Brand new ...,32,https://myslabs.com/slab/view/839379/,454.5,2022-09-06
9,Anthony Davis 2012 Panini Marquee RC 462 PSA 9,Anthony Davis 2012-13 Panini Marquee RC #462 P...,12,https://myslabs.com/slab/view/845803/,20.19,2022-09-06


In [135]:
def get_card_data(i):
    """Fetch the title and description for row ``i`` of the ``out`` frame.

    The listing's detail page is downloaded from the row's ``link`` value
    and parsed for its title and description paragraphs.

    Args:
        i: Positional row index into the module-level ``out`` DataFrame.

    Returns:
        A ``(title, desc)`` tuple. ``title`` is ``None`` when the title
        element is missing; ``desc`` is ``None`` when the description element
        is missing or empty.
    """
    link = out.iloc[i]['link']
    # A timeout keeps a stalled connection from hanging the notebook cell.
    page = requests.get(link, timeout=30)
    item = bs(page.text, 'html.parser')

    title = getattr(item.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
    desc = getattr(item.find('p', {'class': 'overflow-auto'}), 'text', None)

    # `desc or None` handles both a missing element and an empty string;
    # len(None) would raise TypeError.
    return title, desc or None


In [136]:
# Spot-check the detail-page lookup for the row at position 5 of `out`.
get_card_data(5)

('2020 TOPPS COMPLETE SET LUIS ROBERT SGC 10', None)