In [1]:
from bs4 import BeautifulSoup
import requests
import time
import os
from urllib.request import Request, urlopen

In [2]:
# Configure the output directories
data_dir = "datanba"
standings_dir = os.path.join(data_dir, "standings")
scores_dir = os.path.join(data_dir, "scores")

# Create the directories up front: later cells list standings_dir and write
# files into both, which fails on a fresh checkout when they do not exist.
os.makedirs(standings_dir, exist_ok=True)
os.makedirs(scores_dir, exist_ok=True)

In [3]:
# Seasons to process. range() excludes its stop value, so this yields
# 2014 through 2023 inclusive.
seasons = list(range(2014, 2024))
seasons

[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [4]:
# Site root; the relative links found in scraped pages are appended to it.
base_url = "https://basketball-reference.com"

## 1. Descargar el HTML de una URL y devolverlo

In [5]:
def get_html(url, sleep=2, retries=3):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` document.

    At most ``retries`` attempts are made. Before attempt ``i`` (1-based) the
    function sleeps ``sleep * i`` seconds, so every request is throttled and
    the wait grows after each failure.

    Args:
        url: Address of the page to download.
        sleep: Base delay in seconds, multiplied by the attempt number.
        retries: Maximum number of attempts.

    Returns:
        The parsed page from the first successful attempt, or ``None`` when
        every attempt fails (network error, timeout or HTTP error status).
    """
    for attempt in range(1, retries + 1):
        time.sleep(sleep * attempt)

        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=30)
            # Treat 4xx/5xx answers as failures so an error page is never
            # parsed and saved as if it were real data.
            response.raise_for_status()
        except requests.exceptions.RequestException as error:
            print(f"Attempt {attempt}/{retries} failed for {url}: {error}")
            continue

        # Stop at the first good response instead of downloading the same
        # page `retries` times.
        return BeautifulSoup(response.text, 'html.parser')

    return None

## 2. Obtener la clasificación de una temporada y guardarla en un archivo html

In [6]:
def scrape_season(season):
    """Download the pages linked from a season's games index into ``standings_dir``.

    The index page ``/leagues/NBA_{season}_games.html`` is fetched first.
    Every link inside its ``div.filter`` element (presumably one per month)
    is then downloaded and saved under the last path component of its URL.
    Pages whose file already exists are skipped, so the function can be
    re-run to resume an interrupted scrape.

    Args:
        season: Season identifier interpolated into the index URL (e.g. 2014).

    Returns:
        None. Files are written as a side effect; a season whose index page
        cannot be used is reported and skipped.
    """
    index_url = f"{base_url}/leagues/NBA_{season}_games.html"
    index_page = get_html(index_url)

    # A failed download or an error page has no filter block; report it
    # instead of failing with an AttributeError on None.
    page_filter = None
    if index_page is not None:
        page_filter = index_page.find('div', class_='filter')
    if page_filter is None:
        print(f"Skipping season {season}: no page links found at {index_url}")
        return

    # href=True ignores anchors without a target, which would raise KeyError.
    hrefs = [anchor['href'] for anchor in page_filter.find_all('a', href=True)]
    standings_pages = [f"{base_url}{href}" for href in hrefs]

    for page_url in standings_pages:
        save_path = os.path.join(standings_dir, page_url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        page = get_html(page_url)
        if page is None:
            # Do not write a placeholder file: it would make the next run
            # believe this page was already downloaded.
            print(f"Could not download {page_url}")
            continue

        with open(save_path, "w", encoding='utf-8') as out_file:
            out_file.write(str(page))

In [None]:
# Run the scraper for every configured season. scrape_season skips files that
# already exist on disk, so this cell can be re-run after an interruption.
for season in seasons:
    scrape_season(season)

In [7]:
# Names (not full paths) of the entries currently in standings_dir.
standings_files = os.listdir(standings_dir)

## 3. Obtener el boxscore de cada partido para cada mes y temporada


In [8]:
def boxscore_games(standings_file):
    """Download every box-score page linked from a saved page.

    The file is parsed and every anchor whose ``href`` contains both
    ``"boxscore"`` and ``".html"`` is treated as a box-score link. Each one is
    downloaded and written to ``scores_dir`` under the last path component of
    its URL. Existing files are skipped, so the function can be re-run.

    Args:
        standings_file: Path to an HTML file previously saved to disk.

    Returns:
        None. Files are written as a side effect; failed downloads are
        reported and skipped.
    """
    with open(standings_file, 'r', encoding='utf-8') as page_file:
        soup = BeautifulSoup(page_file.read(), 'html.parser')

    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    box_score_urls = [
        f"{base_url}{href}"
        for href in hrefs
        if href and "boxscore" in href and ".html" in href
    ]

    for url in box_score_urls:
        save_path = os.path.join(scores_dir, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        page = get_html(url)
        # Check for failure before serialising, so a missing page is never
        # turned into text and written to disk.
        if not page:
            print(f"Could not download {url}")
            continue

        with open(save_path, "w", encoding='utf-8') as out_file:
            out_file.write(str(page))

In [13]:
# Fetch the box scores linked from every saved page of each configured season.
for season in seasons:
    season_tag = str(season)
    for page_name in standings_files:
        # A saved page belongs to the season when its name contains the year.
        if season_tag in page_name:
            boxscore_games(os.path.join(standings_dir, page_name))

## 4. Abrir archivos con datos corruptos para volver a hacer el web scraping

In [None]:
# Load the list of corrupted entries, one per line, so they can be scraped
# again (presumably box-score file names; confirm against corruptedbox.txt).
file = "corruptedbox.txt"

corrupted_box = []

with open(file, "r") as f:
    # str.strip drops the trailing newline and any surrounding whitespace.
    corrupted_box.extend(map(str.strip, f))

Cambiar la variable `seasons` para incluir menos temporadas; así el scraping tardará menos.

In [1]:
# Reduced range for a faster re-run. range() excludes its stop value, so this
# covers only 2014 and 2015.
seasons = list(range(2014, 2016))
seasons

[2014, 2015]

In [5]:
def get_html(url, sleep=2, retries=3):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` document.

    At most ``retries`` attempts are made. Before attempt ``i`` (1-based) the
    function sleeps ``sleep * i`` seconds, so every request is throttled and
    the wait grows after each failure.

    Args:
        url: Address of the page to download.
        sleep: Base delay in seconds, multiplied by the attempt number.
        retries: Maximum number of attempts.

    Returns:
        The parsed page from the first successful attempt, or ``None`` when
        every attempt fails (network error, timeout or HTTP error status).
    """
    for attempt in range(1, retries + 1):
        time.sleep(sleep * attempt)

        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=30)
            # Treat 4xx/5xx answers as failures so an error page is never
            # parsed and saved as if it were real data.
            response.raise_for_status()
        except requests.exceptions.RequestException as error:
            print(f"Attempt {attempt}/{retries} failed for {url}: {error}")
            continue

        # Stop at the first good response instead of downloading the same
        # page `retries` times.
        return BeautifulSoup(response.text, 'html.parser')

    return None

In [6]:
def scrape_season(season):
    """Download the pages linked from a season's games index into ``standings_dir``.

    The index page ``/leagues/NBA_{season}_games.html`` is fetched first.
    Every link inside its ``div.filter`` element (presumably one per month)
    is then downloaded and saved under the last path component of its URL.
    Pages whose file already exists are skipped, so the function can be
    re-run to resume an interrupted scrape.

    Args:
        season: Season identifier interpolated into the index URL (e.g. 2014).

    Returns:
        None. Files are written as a side effect; a season whose index page
        cannot be used is reported and skipped.
    """
    index_url = f"{base_url}/leagues/NBA_{season}_games.html"
    index_page = get_html(index_url)

    # A failed download or an error page has no filter block; report it
    # instead of failing with an AttributeError on None.
    page_filter = None
    if index_page is not None:
        page_filter = index_page.find('div', class_='filter')
    if page_filter is None:
        print(f"Skipping season {season}: no page links found at {index_url}")
        return

    # href=True ignores anchors without a target, which would raise KeyError.
    hrefs = [anchor['href'] for anchor in page_filter.find_all('a', href=True)]
    standings_pages = [f"{base_url}{href}" for href in hrefs]

    for page_url in standings_pages:
        save_path = os.path.join(standings_dir, page_url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        page = get_html(page_url)
        if page is None:
            # Do not write a placeholder file: it would make the next run
            # believe this page was already downloaded.
            print(f"Could not download {page_url}")
            continue

        with open(save_path, "w", encoding='utf-8') as out_file:
            out_file.write(str(page))

In [None]:
# Run the scraper for every configured season. scrape_season skips files that
# already exist on disk, so this cell can be re-run after an interruption.
for season in seasons:
    scrape_season(season)

In [7]:
# Names (not full paths) of the entries currently in standings_dir.
standings_files = os.listdir(standings_dir)

In [8]:
def boxscore_games(standings_file):
    """Download every box-score page linked from a saved page.

    The file is parsed and every anchor whose ``href`` contains both
    ``"boxscore"`` and ``".html"`` is treated as a box-score link. Each one is
    downloaded and written to ``scores_dir`` under the last path component of
    its URL. Existing files are skipped, so the function can be re-run.

    Args:
        standings_file: Path to an HTML file previously saved to disk.

    Returns:
        None. Files are written as a side effect; failed downloads are
        reported and skipped.
    """
    with open(standings_file, 'r', encoding='utf-8') as page_file:
        soup = BeautifulSoup(page_file.read(), 'html.parser')

    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    box_score_urls = [
        f"{base_url}{href}"
        for href in hrefs
        if href and "boxscore" in href and ".html" in href
    ]

    for url in box_score_urls:
        save_path = os.path.join(scores_dir, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        page = get_html(url)
        # Check for failure before serialising, so a missing page is never
        # turned into text and written to disk.
        if not page:
            print(f"Could not download {url}")
            continue

        with open(save_path, "w", encoding='utf-8') as out_file:
            out_file.write(str(page))

In [13]:
# Fetch the box scores linked from every saved page of each configured season.
for season in seasons:
    season_tag = str(season)
    for page_name in standings_files:
        # A saved page belongs to the season when its name contains the year.
        if season_tag in page_name:
            boxscore_games(os.path.join(standings_dir, page_name))