# Web Scraping de archivos corruptos

Al ejecutar el notebook ``1_web_scraping_3_extraer_tablas.ipynb`` se obtienen archivos corruptos por distintos motivos:
- La página de donde se obtienen los datos detecta que se está haciendo un raspado masivo y se descarga un html de error en vez del html de la página donde aparecen los datos.
- Faltan variables y por eso el código interpreta que la tabla no coincide con lo que se está buscando. Esto se debe a que falta la variable BPM (Box Plus/Minus) en la tabla Advanced Box Score Stats.

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import os
from urllib.request import Request, urlopen

In [37]:
# Configure the local directories: schedule pages are saved under
# standings_dir and per-game box score pages under scores_dir.
data_dir = "datanba"
standings_dir = os.path.join(data_dir, "standings")
scores_dir = os.path.join(data_dir, "scores")

In [3]:
# Seasons to process; each year fills the {season} slot of the
# NBA_{season}_games.html schedule URL.
seasons = list(range(2014, 2024))
seasons

[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [4]:
# Site root that relative hrefs are appended to in order to build full URLs.
base_url = "https://basketball-reference.com"

## 1. Obtener HTML de la url y devolver HTML

In [5]:
def get_html(url, sleep=2, retries=3, timeout=30):
    """Download *url* and return it parsed, retrying on failure.

    Before attempt number ``i`` (1-based) the function waits ``sleep * i``
    seconds, so the pause grows with every retry. The first response with
    HTTP status 200 is parsed and returned immediately; later attempts only
    happen when the previous one failed.

    Args:
        url: Address of the page to fetch.
        sleep: Base delay in seconds, multiplied by the attempt number.
        retries: Maximum number of attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser``, or ``None`` when
        no attempt produced a 200 response.
    """
    for attempt in range(1, retries + 1):
        time.sleep(sleep * attempt)

        try:
            result = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            print(f"Request error on {url}: {err}")
            continue

        if result.status_code != 200:
            # A non-200 response is not the requested page; never parse or
            # return it, otherwise the caller saves an error page as data.
            print(f"HTTP {result.status_code} on {url} (attempt {attempt} of {retries})")
            continue

        return BeautifulSoup(result.text, 'html.parser')

    return None

## 2. Obtener la clasificación de una temporada y guardarla en un archivo html

In [6]:
def scrape_season(season):
    """Save the schedule pages linked from a season's main page.

    The season page ``NBA_{season}_games.html`` is downloaded with
    ``get_html``; every link inside its ``div.filter`` block is then
    downloaded and written to ``standings_dir`` under the last segment of
    its URL. Pages that already exist on disk are skipped.

    Args:
        season: Year placed in the ``NBA_{season}_games.html`` URL.
    """
    url = f"{base_url}/leagues/NBA_{season}_games.html"
    html = get_html(url)

    # A failed download or an error page has no filter block; report it and
    # stop instead of failing with an AttributeError on None.
    month_filter = html.find('div', class_='filter') if html is not None else None
    if month_filter is None:
        print(f"No schedule links found for season {season}: {url}")
        return

    standings_pages = [
        f"{base_url}{link['href']}" for link in month_filter.find_all('a')
    ]

    for url in standings_pages:
        save_path = os.path.join(standings_dir, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = get_html(url)
        if html is None:
            # Never write a failed download to disk.
            continue

        with open(save_path, "w", encoding='utf-8') as x:
            x.write(str(html))

In [None]:
# Download and save the schedule pages of every season in the list.
for season in seasons:
    scrape_season(season)

In [7]:
# Names of the entries currently present in standings_dir.
standings_files = os.listdir(standings_dir)

## 3. Obtener el boxscore de cada partido para cada mes y temporada


In [8]:
def boxscore_games(standings_file):
    """Download the box score pages linked from a saved schedule page.

    Every ``href`` in the file that contains both ``"boxscore"`` and
    ``".html"`` is turned into a full URL, downloaded with ``get_html`` and
    written to ``scores_dir`` under the last segment of its URL. Files that
    already exist are skipped, as are downloads for which ``get_html``
    returned nothing.

    Args:
        standings_file: Path of the saved schedule page to read.
    """
    with open(standings_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    hrefs = (link.get("href") for link in soup.find_all("a"))
    box_scores = [
        f"{base_url}{h}" for h in hrefs if h and "boxscore" in h and ".html" in h
    ]

    for url in box_scores:
        save_path = os.path.join(scores_dir, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = get_html(url)
        if not html:
            # Check before serialising: str(None) would be the text "None".
            continue

        with open(save_path, "w+", encoding='utf-8') as f:
            f.write(str(html))

In [13]:
# Walk the saved schedule pages season by season and fetch their box scores.
for season in seasons:
    season_tag = str(season)
    for filename in standings_files:
        if season_tag in filename:
            boxscore_games(os.path.join(standings_dir, filename))

# 4. Abrir archivos con datos corruptos, para volver a hacer el web scraping

In [2]:
from bs4 import BeautifulSoup
import requests
import time
import os
from urllib.request import Request, urlopen

In [3]:
# Configure the local directories: schedule pages live under standings_dir
# and per-game box score pages under scores_dir.
data_dir = "datanba"
standings_dir = os.path.join(data_dir, "standings")
scores_dir = os.path.join(data_dir, "scores")

# Names of the entries currently present in standings_dir.
standings_files = os.listdir(standings_dir)

In [4]:
# Site root that relative hrefs are appended to in order to build full URLs.
base_url = "https://basketball-reference.com"

In [5]:
file = "corruptedbox.txt"

# One entry per line of the text file, stripped of surrounding whitespace.
# Blank lines are dropped so they cannot turn into an empty file name.
with open(file, "r", encoding="utf-8") as f:
    lista = [name for name in (line.strip() for line in f) if name]

In [47]:
# Delete the listed files from the local scores folder

import os

# Use the same directory the download functions test with os.path.exists,
# so every file removed here is fetched again on the next run.
folder = scores_dir

# Set membership replaces a scan over the whole list for every file.
corrupted_names = set(lista)

for root, dirs, files in os.walk(folder):
    print(files)
    for name in files:
        if name in corrupted_names:
            os.remove(os.path.join(root, name))


['201310290IND.html', '201310290LAL.html', '201310290MIA.html', '201310300CLE.html', '201310300DAL.html', '201310300DET.html', '201310300GSW.html', '201310300HOU.html', '201310300MIN.html', '201310300NOP.html', '201310300NYK.html', '201310300PHI.html', '201310300PHO.html', '201310300SAC.html', '201310300SAS.html', '201310300TOR.html', '201310300UTA.html', '201310310CHI.html', '201310310LAC.html', '201311010ATL.html', '201311010BOS.html', '201311010BRK.html', '201311010CHA.html', '201311010DEN.html', '201311010HOU.html', '201311010LAL.html', '201311010MEM.html', '201311010MIN.html', '201311010ORL.html', '201311010PHO.html', '201311010SAC.html', '201311010WAS.html', '201311020DAL.html', '201311020GSW.html', '201311020IND.html', '201311020MIL.html', '201311020NOP.html', '201311020PHI.html', '201311020POR.html', '201311020UTA.html', '201311030DET.html', '201311030LAL.html', '201311030MIA.html', '201311030NYK.html', '201311030OKC.html', '201311030ORL.html', '201311040CLE.html', '201311040LA

In [48]:
# Prefix each file name with /boxscores/ so it can be looked up among hrefs

new_name = [f"/boxscores/{nfile}" for nfile in lista]
corrupted_box = list(new_name)

In [49]:
# Seasons to revisit; each year is matched against the names in
# standings_files to select that season's saved pages.
seasons = list(range(2014, 2024))
seasons

[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [50]:
# Display the directory listing that the season loop filters by year.
standings_files

['NBA_2014_games-april.html',
 'NBA_2014_games-december.html',
 'NBA_2014_games-february.html',
 'NBA_2014_games-january.html',
 'NBA_2014_games-june.html',
 'NBA_2014_games-march.html',
 'NBA_2014_games-may.html',
 'NBA_2014_games-november.html',
 'NBA_2014_games-october.html',
 'NBA_2015_games-april.html',
 'NBA_2015_games-december.html',
 'NBA_2015_games-february.html',
 'NBA_2015_games-january.html',
 'NBA_2015_games-june.html',
 'NBA_2015_games-march.html',
 'NBA_2015_games-may.html',
 'NBA_2015_games-november.html',
 'NBA_2015_games-october.html',
 'NBA_2016_games-april.html',
 'NBA_2016_games-december.html',
 'NBA_2016_games-february.html',
 'NBA_2016_games-january.html',
 'NBA_2016_games-june.html',
 'NBA_2016_games-march.html',
 'NBA_2016_games-may.html',
 'NBA_2016_games-november.html',
 'NBA_2016_games-october.html',
 'NBA_2017_games-april.html',
 'NBA_2017_games-december.html',
 'NBA_2017_games-february.html',
 'NBA_2017_games-january.html',
 'NBA_2017_games-june.html',
 'N

In [55]:
# 1

def get_html(url, sleep=2, retries=3, timeout=30):
    """Download *url* and return it parsed, retrying on failure.

    Before attempt number ``i`` (1-based) the function waits ``sleep * i``
    seconds, so the pause grows with every retry. The first response with
    HTTP status 200 is parsed and returned immediately; later attempts only
    happen when the previous one failed.

    Args:
        url: Address of the page to fetch.
        sleep: Base delay in seconds, multiplied by the attempt number.
        retries: Maximum number of attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser``, or ``None`` when
        no attempt produced a 200 response.
    """
    for attempt in range(1, retries + 1):
        time.sleep(sleep * attempt)

        try:
            result = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            print(f"Request error on {url}: {err}")
            continue

        if result.status_code != 200:
            # A non-200 response is not the requested page; never parse or
            # return it, otherwise the caller saves an error page as data.
            print(f"HTTP {result.status_code} on {url} (attempt {attempt} of {retries})")
            continue

        return BeautifulSoup(result.text, 'html.parser')

    return None

In [70]:
# 2
# boxscore_games adapted to fetch only the corrupted entries

def corrupted_boxscore_games(standings_file):
    """Download again the listed box scores linked from a schedule page.

    Each entry of ``corrupted_box`` that appears as an ``href`` in the saved
    page is turned into a full URL and its target path inside ``scores_dir``
    is printed. The page is then downloaded with ``get_html`` and written to
    that path, unless the file already exists or the download returned
    nothing.

    Args:
        standings_file: Path of the saved schedule page to read.
    """
    with open(standings_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # A set gives constant-time membership tests instead of rescanning the
    # full list of links for every entry.
    hrefs = {link.get("href") for link in soup.find_all("a")}

    for elemento in corrupted_box:
        if elemento not in hrefs:
            continue

        box_score = f"{base_url}{elemento}"
        save_path = os.path.join(scores_dir, box_score.split("/")[-1])
        print(save_path)

        if os.path.exists(save_path):
            continue

        html = get_html(box_score)
        if not html:
            # Check before serialising: str(None) would be the text "None".
            continue

        with open(save_path, "w+", encoding='utf-8') as f:
            f.write(str(html))
                    

In [71]:
# Revisit the saved schedule pages of each season, printing the season first.
for season in seasons:
    print(season)
    season_tag = str(season)
    for filename in standings_files:
        if season_tag in filename:
            corrupted_boxscore_games(os.path.join(standings_dir, filename))

2014
datanba\scores\201312150PHO.html
datanba\scores\201312160ATL.html
datanba\scores\201312160BOS.html
datanba\scores\201312160BRK.html
datanba\scores\201312160CHI.html
datanba\scores\201312160IND.html
datanba\scores\201312160LAC.html
datanba\scores\201312160MIA.html
datanba\scores\201312160NYK.html
datanba\scores\201312170CHA.html
datanba\scores\201312170CLE.html
datanba\scores\201312170DEN.html
datanba\scores\201312170GSW.html
datanba\scores\201312170MEM.html
datanba\scores\201312180ATL.html
datanba\scores\201312180BOS.html
datanba\scores\201312180BRK.html
datanba\scores\201312180DAL.html
datanba\scores\201312180MIA.html
datanba\scores\201312180MIL.html
datanba\scores\201312180MIN.html
datanba\scores\201312180ORL.html
datanba\scores\201312180PHO.html
datanba\scores\201312180TOR.html
datanba\scores\201402030BRK.html
datanba\scores\201402030DAL.html
datanba\scores\201402030DEN.html
datanba\scores\201402030MIA.html
datanba\scores\201402030MIL.html
datanba\scores\201402030NOP.html
datan