## MySlabs Scraper

In [164]:
import requests
import time
import urllib
import numpy as np
import pandas as pd
from scipy import stats
from selenium import webdriver
from bs4 import BeautifulSoup as bs

In [2]:
pages = 1840

In [196]:
def get_price(item):
    text = getattr(item.find('div', {'class': 'slab-feed-label'}), 'text', None).strip()
    text_list = str(text).split('-')
    return text_list[0].replace("$", '').replace(',', '').strip()

def get_sold_date(item):
    text = getattr(item.find('div', {'class': 'slab-feed-label'}), 'text', None).strip()
    text_list = str(text).split('-')
    return text_list[1].strip() if text_list[1] is not None else None

def get_grade(title):
    index = ''

    grading_companies = ['PSA', 'psa', 'BGS', 'bgs', 'HGA', 'hga', 'SGC', 'sgc']
    for company in grading_companies:
        index = title.find(company)
        if (index != -1):
            return title[index:index + 7]
        
    return None

In [197]:
class MySlabsScraper:
    def __init__(self, pages) -> None:
        self.pages = pages

    def __href_builder(self):
        url = 'https://myslabs.com/browse/archive/?page='
        links = []

        for i in range(self.pages):
            links.append(url + str(i + 1))
        return links

    def __get_data(self):

        soups = []
        links = self.__href_builder()

        for link in links:
            page = requests.get(link)
            soup = bs(page.text, 'html.parser')
            soups.append(soup)

        return soups

    def __get_grade(title):

        index = ''

        grading_companies = ['PSA', 'psa', 'BGS', 'bgs', 'HGA', 'hga', 'SGC', 'sgc']
        for company in grading_companies:
            index = title.find(company)
            if (index != -1):
                return title[index:index + 7]
        
        return None

    def __parse(self):

        soups = self.__get_data()
        results = []
        product_list = []

        for soup in soups:
            results.extend(soup.find_all('div', {'class': 'slab_item psa'}))

        for item in results:
            
            def get_card_data(item):
                link = 'https://myslabs.com' + item.find('a')['href']
                page = requests.get(link)
                item = bs(page.text, 'html.parser')
                
                title = getattr(item.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
                desc = getattr(item.find('p', {'class': 'overflow-auto'}), 'text', None)
                views = getattr(item.find('div', {'class': 'mr-1 text-medium'}), 'text', None)

                if desc is not None:
                    desc = desc if len(desc) > 0 else None
                else:
                    desc = None

                return title, desc, views

            title, desc, views = get_card_data(item)

            products = {
                'title': title,
                'desc': desc,
                'grade': get_grade(title),
                'views': views.split(' ')[0] if views is not None else None,
                'link': 'https://myslabs.com' + item.find('a')['href'],
                'soldprice': get_price(item),
                'solddate': get_sold_date(item),
            }
            product_list.append(products)

        return product_list

    def output(self):
        products = self.__parse()
        df = pd.DataFrame(products)
        df['views'] = pd.to_numeric(df['views'])
        df['link'] = df['link'].astype(str)
        df['soldprice'] = pd.to_numeric(df['soldprice'])
        df['solddate'] = pd.to_datetime(df['solddate'])
        return df

In [198]:
slabs_test = MySlabsScraper(1)
out = slabs_test.output()

In [199]:
out

Unnamed: 0,title,desc,grade,views,link,soldprice,solddate
0,Ja Morant 2019-20 Mosaic RC PSA 10,,PSA 10,27,https://myslabs.com/slab/view/839936/,65.65,2022-09-08
1,2016 Panini Donruss Christian Pulisic Rookie P...,PSA 9 of Christian Pulisic rookie holo.,PSA 9,65,https://myslabs.com/slab/view/720724/,207.05,2022-09-08
2,x3 RC (all = PSA 10) 2020 Justin Herbert Absol...,x3 RC (all = PSA 10),PSA 10),14,https://myslabs.com/slab/view/847803/,156.55,2022-09-08
3,2019 Topps UCL Living Set #1 Lionel Messi PSA ...,2019 Topps UCL Living Set #1 Lionel Messi PSA ...,PSA 9 M,21,https://myslabs.com/slab/view/672950/,85.85,2022-09-08
4,2021 Panini Mosaic Orange Reactive Evan McPher...,The grade says it all for this color match sho...,SGC 10,17,https://myslabs.com/slab/view/847763/,101.0,2022-09-08
5,Kevin Porter Jr. 2019-20 Panini Select Concour...,,PSA 10,125,https://myslabs.com/slab/view/460947/,656.5,2022-09-08
6,Trea Turner 2016 Topps Chrome Refractor RC PSA...,,PSA 10,6,https://myslabs.com/slab/view/847778/,85.85,2022-09-08
7,Donovan Mitchell 5 card lot high End see all p...,Donovan Mitchell 5 cars lot\nPrizm fast break ...,,27,https://myslabs.com/slab/view/847335/,453.49,2022-09-07
8,Trea Turner 2016 Topps Chrome Refractor PRISM ...,,PSA 10,9,https://myslabs.com/slab/view/847724/,101.0,2022-09-07
9,2018-19 Prizm Mikal Bridges Red White Blue Pri...,,PSA 10,15,https://myslabs.com/slab/view/818968/,27.27,2022-09-07


In [135]:
def get_card_data(i):
    link = out.iloc[i]['link']
    page = requests.get(link)
    item = bs(page.text, 'html.parser')
    
    title = getattr(item.find('p', {'class': 'h4 font-weight-bold'}), 'text', None)
    desc = getattr(item.find('p', {'class': 'overflow-auto'}), 'text', None)

    return title, desc if len(desc) > 0 else None


In [163]:
get_card_data(5)

('1959 Topps #150 Stan Musial CSG 4', None)