# Different Modes of Scraping

In [1]:
import json
import os
import re
from urllib.parse import quote

import requests
import requests_html as rh
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests import Response
from requests.auth import HTTPProxyAuth
from requests_html import HTMLSession, AsyncHTMLSession

## Config

In [6]:
# Target pages and echo endpoints used by the cells in this notebook.
baskref_url = 'https://www.basketball-reference.com/boxscores/?month=10&day=20&year=2021'
url_headers = "https://httpbin.org/headers"  # responds with the request headers it received
url_ip = "https://httpbin.org/ip"  # responds with the caller's origin IP

demo_game_url = "https://www.basketball-reference.com/boxscores/202201070BRK.html"
# Single slash after the host: the previous value contained ".com//leagues".
demo_year_url = "https://www.basketball-reference.com/leagues/NBA_2006_games.html"
demo_month_url = "https://www.basketball-reference.com/leagues/NBA_2006_games-january.html"

# Baseline for the user-agent experiments: the headers `requests` sends
# when the caller supplies none.
default_headers = requests.utils.default_headers()
print(default_headers)

{'User-Agent': 'python-requests/2.28.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


## Parsing

### Beautiful Soup

In [None]:
# Fetch the demo month schedule page and parse it with Python's built-in HTML parser.
# `html` holds the `requests.Response`; `soup` holds the parsed document.
html = requests.get(demo_month_url, timeout=10)  # bound the wait so a stalled connection cannot hang the kernel
html.raise_for_status()  # surface 4xx/5xx responses instead of silently parsing an error page
soup = BeautifulSoup(html.text, 'html.parser')

## Scraping

### Using a Proxy

In [10]:
# Baseline: the origin IP httpbin reports when no proxy is configured.
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
print(requests.get(url_ip, timeout=10).text)

{
  "origin": "151.227.227.159"
}



In [24]:
# Proxy credentials are read from the environment so they never appear in the
# notebook source or its saved outputs. Export SMARTPROXY_USERNAME and
# SMARTPROXY_PASSWORD before starting the kernel; a missing variable raises KeyError.
username = os.environ["SMARTPROXY_USERNAME"]
password = os.environ["SMARTPROXY_PASSWORD"]

# Percent-encode the credentials so characters such as "@" or ":" cannot
# corrupt the URL. The same gateway is used for both schemes.
proxy_url = f"http://user-{quote(username, safe='')}:{quote(password, safe='')}@gb.smartproxy.com:30000"
proxies = {
    "https": proxy_url,
    "http": proxy_url,
}

# With the proxy in place, httpbin should report the proxy's exit IP as the origin.
print(requests.get(url_ip, proxies=proxies, timeout=10).text)

{
  "origin": "213.104.126.212"
}



In [26]:
# Show the proxy mapping with the userinfo part masked, so credentials are not
# written into the saved notebook output.
{scheme: re.sub(r"//[^/@]*@", "//***:***@", proxy) for scheme, proxy in proxies.items()}

{'https': 'http://***:***@gb.smartproxy.com:30000',
 'http': 'http://***:***@gb.smartproxy.com:30000'}

### Using a Random User Agent

In [20]:
# Headers httpbin receives when none are supplied; note the library's own User-Agent.
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
print(requests.get(url_headers, timeout=10).text)

{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.28.1", 
    "X-Amzn-Trace-Id": "Root=1-63646611-54cf00c70a84bf50093600df"
  }
}



In [21]:
# Same request, but with a User-Agent string picked at random by fake_useragent.
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
print(requests.get(url_headers, headers={'User-Agent': UserAgent().random}, timeout=10).text)

{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)", 
    "X-Amzn-Trace-Id": "Root=1-63646614-5deeec36665dda821ab03530"
  }
}



In [22]:
# Passing headers=None still sends the library defaults (see the User-Agent in the output).
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
print(requests.get(url_headers, headers=None, timeout=10).text)

{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.28.1", 
    "X-Amzn-Trace-Id": "Root=1-6364663e-0e8dd95f678ca77b633f13ed"
  }
}



### Using Browser Automation

#### Selenium

In [231]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By 

# Let webdriver_manager resolve the chromedriver binary, then hand the value
# it returns to the Selenium Service wrapper.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)

In [None]:
chrome_options = Options()
# Optional Chrome flags, left disabled for this interactive demo:
# chrome_options.add_argument("--disable-extensions")
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--no-sandbox")  # linux only
# chrome_options.add_argument("--enable-javascript")
# chrome_options.add_argument("--headless")

driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
driver.get(baskref_url)

# Selenium 4.3 removed the find_element_by_* helpers, so locate with By.XPATH.
# Poll for the button instead of clicking immediately, because it may be
# rendered after the initial page load; a TimeoutException is raised if it
# never appears.
# NOTE(review): the absolute XPath presumably targets a consent/overlay button — confirm.
button_xpath = '/html/body/div[2]/div/div/div/div[2]/div/button[3]'
WebDriverWait(driver, timeout=10).until(
    lambda d: d.find_element(By.XPATH, button_xpath)
).click()

# el = WebDriverWait(driver, timeout=3).until(
#     lambda d: d.find_element(By.CLASS_NAME, "game_summaries"))

In [None]:
# Keep the full rendered HTML in a variable and preview only its beginning;
# echoing the whole document would bloat the saved notebook.
page_source = driver.page_source
page_source[:2000]

#### Pyppeteer

### TLS Fingerprinting

In [216]:
import ssl
import requests

from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util.ssl_ import create_urllib3_context

# Colon-separated cipher list passed to the SSL context built by TlsAdapter.
# Names follow OpenSSL's convention; see the "openssl ciphers" command.
CIPHERS = ":".join(
    (
        "ECDHE-RSA-AES256-GCM-SHA384",
        "ECDHE-ECDSA-AES256-GCM-SHA384",
    )
)


class TlsAdapter(HTTPAdapter):
    """Transport adapter whose connection pool uses a custom SSL context.

    The context is limited to the cipher list in ``CIPHERS``, requires
    certificate verification, and is created with the supplied ``ssl``
    option flags.

    Args:
        ssl_options: Bitmask of ``ssl.OP_*`` flags handed to the context.
        **kwargs: Forwarded unchanged to ``HTTPAdapter``.
    """

    def __init__(self, ssl_options=0, **kwargs):
        # Must be assigned before the base initialiser runs: that call invokes
        # init_poolmanager(), which reads this attribute.
        self.ssl_options = ssl_options
        super().__init__(**kwargs)

    def init_poolmanager(self, *pool_args, **pool_kwargs):
        """Build the pool manager with this adapter's SSL context attached."""
        ctx = create_urllib3_context(
            ciphers=CIPHERS,
            cert_reqs=ssl.CERT_REQUIRED,
            options=self.ssl_options,
        )
        # Delegate to HTTPAdapter so the positional (connections, maxsize)
        # arguments are mapped onto PoolManager's num_pools/maxsize keywords.
        # Forwarding them positionally to PoolManager would bind maxsize to
        # its `headers` parameter and leave the pool size at its default.
        pool_kwargs["ssl_context"] = ctx
        super().init_poolmanager(*pool_args, **pool_kwargs)


# OP_NO_TLSv1 / OP_NO_TLSv1_1 only switch off TLS 1.0 and 1.1; they do not
# pin the handshake to TLS 1.2, so newer protocol versions remain available.
adapter = TlsAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
session = requests.Session()  # requests.session() is a deprecated alias for this
session.mount("https://", adapter)  # send every https:// request through the custom adapter
res = session.get(baskref_url, headers={'User-Agent': UserAgent().random}, timeout=10)

print(res.status_code)
# NOTE(review): "Detroit" is presumably a team expected on this date's page,
# used as a quick check that real content came back — confirm.
detroit = "Detroit" in res.text
print(f"Detroit in text? {detroit}")

403
Detroit in text? False


- fix baskref with requests & bs4
- without using a user agent | and with using a user agent
- see if the default user agent can get past at scale
- with an option to add proxies
- if not, try to implement a browser-based solution (selenium, puppeteer, playwright)