In [1]:
from urllib import robotparser
import pandas as pd
import time
import numpy as np
import random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ChromeOptions, ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [22]:
class NBAScraper():

    def __init__(self, user_agent: str):
        self.rp = robotparser.RobotFileParser()
        self.user_agent = user_agent
        self.base_url = "https://www.nba.com/"
        
        self.output = None

        # TODO: Implementar proxy para que la pagina no me bloquee cuando hago muchas ejecuciones

        # https://developer.chrome.com/docs/chromedriver/capabilities?hl=es-419
        webdriver_options = ChromeOptions()
        # webdriver_options.add_argument("--headless")
        webdriver_options.add_argument(f"--user-agent={user_agent}")
        # evitar deteccion como bot (https://stackoverflow.com/questions/71885891/urllib3-exceptions-maxretryerror-httpconnectionpoolhost-localhost-port-5958)
        webdriver_options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = webdriver.Chrome(options=webdriver_options)

        # Configuración de timeouts del navegador
        self.driver.set_page_load_timeout(60)  # Máximo 20 segundos para cargar una página
        self.driver.implicitly_wait(5)  # Espera máxima para encontrar elementos antes de lanzar error
        

    # Comprobar accesibilidad al sitio web
    def check_accessibility(self, url_robots: str, url: str) -> bool:
        # https://docs.python.org/3/library/urllib.robotparser.html

        # Establece en el parser la url a robots.txt
        self.rp.set_url(url=url_robots)
        # Parsea robots.txt
        self.rp.read()
        # Examina que user_agent pueda acceder en base al robots.txt parseado
        return self.rp.can_fetch(useragent=self.user_agent, url=url)
    

    
    def extract_teams_shooting_ds(self) -> None:
        url_robots = f"{self.base_url}robots.txt"
        url_stats = f"{self.base_url}stats/teams/shooting"

        def get_table_contents(html, season, conference, position):
            # Creamos objeto BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")

            # Extraemos la tabla de estadisticas
            teams_table = soup.find("table", class_="Crom_table__p1iZz")

            # Obtenemos las columnas de cada estadistica
            table_head = teams_table.find("thead")

            # Crom_headers__mzI_m -> Cabecera perteneciente a FGM, FGA, FG%
            # el apartado field contiene adicionalmente a que rango de tiro pertenece
            categories = [col.get("field") for col in table_head.find("tr", class_="Crom_headers__mzI_m").find_all("th")[1:]]
            df = pd.DataFrame(columns=["Team", "Season", "Conference", "Position"] + categories)
            table_body = teams_table.find("tbody").find_all("tr")
            for tr in table_body:
                team_info = tr.find_all("td")
                team_name = team_info[0].get_text().strip()
                team_stats = [float(stats.get_text()) for stats in team_info[1:]]
                
                # Categorias y estadisticas deben tener la misma longitud, deberia cumplirse siempre
                assert len(categories) == len(team_stats)
                df = pd.concat([df, pd.DataFrame(data=np.array([[team_name, season, conference, position] + team_stats]), columns=df.columns)])
            
            return df
        

        final_df = None


        # Comprobamos accesibilidad a la pagina
        if self.check_accessibility(url_robots=url_robots, url=url_stats):
            print(f"{url_stats} visited!")
            self.driver.get(url_stats)
            wait = WebDriverWait(self.driver, 20)
           
            try:
                # https://stackoverflow.com/questions/64032271/handling-accept-cookies-popup-with-selenium-in-python
                print("Waiting...")
                wait.until(
                    EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
                ).click()
                print("Cookies accepted!")
            except:
                print("Wait failed!")
                self.quit_driver()
                return
            
            # Esperamos para que la accion de aceptar las cookies no se solape con empezar la extraccion de las tablas, o el programa peta
            time.sleep(5)

            # Cargamos los filtros avanzados para poder hacer búsqueda por conferencias
            try:
                print("Intentando abrir Advanced Filters...")
                toggle_button = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.StatsAdvancedFiltersPanel_safArrow__EqRgu")
                ))
                toggle_button.click()
                time.sleep(3)  # Pequeña espera para que los filtros se desplieguen correctamente
                print("Advanced Filters abiertos.")
            except Exception as e:
                print("No se pudo abrir el panel de filtros avanzados.")
                print(str(e))
                self.quit_driver()
                return

            # Obtenemos el boton para aplicar los filtros
            get_stats_button = None
            try:
                for button in self.driver.find_elements(by=By.CLASS_NAME, value="Button_button__L2wUb"):
                    if button.text.lower() == "get stats":
                        get_stats_button = button
            except:
                print("No se pudo acceder a los botones.")
                self.quit_driver()
                return
            
            # Nos aseguramos de tener boton para aplicar los filtros
            assert get_stats_button != None
            
            # Obtenemos el elemento que contiene todos los filtros
            overall_filters = self.driver.find_element(by=By.CLASS_NAME, value="nba-stats-primary-split-block")
            # Obtenemos los filtros que aparecen al inicio
            initial_filters = overall_filters.find_elements(by=By.CLASS_NAME, value="DropDown_label__lttfI")

            # Obtenemos todas las temporadas
            season_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "SEASON":
                    season_options = filt.find_elements(by=By.TAG_NAME, value="option")
                    break
            
            # Obtenemos todas las conferencias
            conference_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "CONFERENCE":
                        conference_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["East", "West"]
                        ]
                        break
            
            # Obtenemos todas las posiciones de jugador
            positions_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "POSITION":
                        positions_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["Center", "Guard", "Forward"]
                        ]
                        break
   

            # Iteramos por todas las combinaciones de filtros
            for season_op in season_options:
                season_text = season_op.text
                try:
                    wait.until(EC.element_to_be_clickable(season_op)).click()
                    time.sleep(1)
                except:
                    print("Wait failed!")
                    self.quit_driver()
                
                for conf_op in conference_options:
                    conf_text = conf_op.text
                    try:
                        wait.until(EC.element_to_be_clickable(conf_op)).click()
                        time.sleep(1)
                    except:
                        print("Wait failed!")
                        self.quit_driver()
                    
                    for pos_op in positions_options:
                        pos_text = pos_op.text
                        time.sleep(1)
                        try:
                            wait.until(EC.element_to_be_clickable(pos_op)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()
                        
                        # Aplicamos los filtros
                        try:
                            wait.until(EC.element_to_be_clickable(get_stats_button)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()

                        time.sleep(1)
                        # Esperamos a que la tabla sea visible
                        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "Crom_table__p1iZz")))
                        # Extraemos el dataframe inicial
                        html = self.driver.page_source

                        curr_df = get_table_contents(html=html, season=season_text, conference=conf_text, position=pos_text)
                        if final_df is None:
                            final_df = curr_df
                        else:
                            final_df = pd.concat([final_df, curr_df])

                        # Añadimos espaciado de peticiones HTTP:
                        # Introducimos un retraso un retraso aleatorio, así simulamos comportamiento más humano
                        response_delay = random.uniform(1.0, 1.5)
                        print(f"[INFO] Esperando {response_delay:.2f} segundos antes de la siguiente petición...")
                        time.sleep(response_delay)
        
        return final_df

    

    def extract_teams_contested_shoots(self, check_accept_cookies=False):
        url_robots = f"{self.base_url}robots.txt"
        url_stats = f"{self.base_url}stats/teams/hustle"

        def get_table_contents(html, season, conference, position):
            # Creamos objeto BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")

            # Extraemos la tabla de estadisticas
            teams_table = soup.find("table", class_="Crom_table__p1iZz")

            # Obtenemos las columnas de cada estadistica
            table_head = teams_table.find("thead")

            # Categorias que nos interesan
            cats_of_interest = ["contested_shots_2pt", "contested_shots_3pt"]

            # Crom_headers__mzI_m -> Cabecera de la tabla
            categories = [col.get("field").lower() for col in table_head.find("tr", class_="Crom_headers__mzI_m").find_all("th")[1:]]

            # Indices de las categorias de interes dentro de todas las categoria 
            categories_idx = [categories.index(c) for c in cats_of_interest]

            df = pd.DataFrame(columns=["Team", "Season", "Conference", "Position"] + cats_of_interest)
            table_body = teams_table.find("tbody").find_all("tr")
            for tr in table_body:
                team_info = tr.find_all("td")
                team_name = team_info[0].get_text().strip()
                # Cogemos unicamente las estadisticas de interes
                team_stats = [float(team_info[1:][i].get_text()) for i in categories_idx]
                
                # Categorias y estadisticas deben tener la misma longitud, deberia cumplirse siempre
                assert len(categories_idx) == len(team_stats)
                print(team_stats)
                df = pd.concat([df, pd.DataFrame(data=np.array([[team_name, season, conference, position] + team_stats]), columns=df.columns)])
            
            return df


        final_df = None
        
        # Comprobamos accesibilidad a la pagina
        if self.check_accessibility(url_robots=url_robots, url=url_stats):
            print(f"{url_stats} visited!")
            self.driver.get(url_stats)
            wait = WebDriverWait(self.driver, 20)

            # No aparecerá el boton de aceptar cookies si ya se ha aceptado en el link anterior
            if check_accept_cookies:
                try:
                    # https://stackoverflow.com/questions/64032271/handling-accept-cookies-popup-with-selenium-in-python
                    print("Waiting...")
                    wait.until(
                        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
                    ).click()
                    print("Cookies accepted!")
                except:
                    print("Wait failed!")
                    self.quit_driver()
                    return
            
            # Esperamos para que la accion de aceptar las cookies no se solape con empezar la extraccion de las tablas, o el programa peta
            time.sleep(5)

            # Cargamos los filtros avanzados para poder hacer búsqueda por conferencias
            try:
                print("Intentando abrir Advanced Filters...")
                toggle_button = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.StatsAdvancedFiltersPanel_safArrow__EqRgu")
                ))
                toggle_button.click()
                time.sleep(3)  # Pequeña espera para que los filtros se desplieguen correctamente
                print("Advanced Filters abiertos.")
            except Exception as e:
                print("No se pudo abrir el panel de filtros avanzados.")
                print(str(e))
                self.quit_driver()
                return

            # Obtenemos el boton para aplicar los filtros
            get_stats_button = None
            try:
                for button in self.driver.find_elements(by=By.CLASS_NAME, value="Button_button__L2wUb"):
                    if button.text.lower() == "get stats":
                        get_stats_button = button
            except:
                print("No se pudo acceder a los botones.")
                self.quit_driver()
                return
            
            # Nos aseguramos de tener boton para aplicar los filtros
            assert get_stats_button != None
            
            # Obtenemos el elemento que contiene todos los filtros
            overall_filters = self.driver.find_element(by=By.CLASS_NAME, value="nba-stats-primary-split-block")
            # Obtenemos los filtros que aparecen al inicio
            initial_filters = overall_filters.find_elements(by=By.CLASS_NAME, value="DropDown_label__lttfI")

            # Obtenemos todas las temporadas
            season_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "SEASON":
                    season_options = filt.find_elements(by=By.TAG_NAME, value="option")
                    break
            
            # Obtenemos todas las conferencias
            conference_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "CONFERENCE":
                        conference_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["East", "West"]
                        ]
                        break
            
            # Obtenemos todas las posiciones de jugador
            positions_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "POSITION":
                        positions_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["Center", "Guard", "Forward"]
                        ]
                        break
            

            # Iteramos por todas las combinaciones de filtros
            for season_op in season_options:
                season_text = season_op.text
                try:
                    wait.until(EC.element_to_be_clickable(season_op)).click()
                    time.sleep(1)
                except:
                    print("Wait failed!")
                    self.quit_driver()
                
                for conf_op in conference_options:
                    conf_text = conf_op.text
                    try:
                        wait.until(EC.element_to_be_clickable(conf_op)).click()
                        time.sleep(1)
                    except:
                        print("Wait failed!")
                        self.quit_driver()
                    
                    for pos_op in positions_options:
                        pos_text = pos_op.text
                        time.sleep(1)
                        try:
                            wait.until(EC.element_to_be_clickable(pos_op)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()
                        
                        # Aplicamos los filtros
                        try:
                            wait.until(EC.element_to_be_clickable(get_stats_button)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()

                        time.sleep(1)
                        # Esperamos a que la tabla sea visible
                        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "Crom_table__p1iZz")))
                        # Extraemos el dataframe inicial
                        html = self.driver.page_source

                        curr_df = get_table_contents(html=html, season=season_text, conference=conf_text, position=pos_text)
                        if final_df is None:
                            final_df = curr_df
                        else:
                            final_df = pd.concat([final_df, curr_df])

                        # Añadimos espaciado de peticiones HTTP:
                        # Introducimos un retraso un retraso aleatorio, así simulamos comportamiento más humano
                        response_delay = random.uniform(1.0, 1.5)
                        print(f"[INFO] Esperando {response_delay:.2f} segundos antes de la siguiente petición...")
                        time.sleep(response_delay)
            

        return final_df   


    def extract_teams_boxouts(self, check_accept_cookies=False) -> pd.DataFrame:
        url_robots = f"{self.base_url}robots.txt"
        url_stats = f"{self.base_url}stats/teams/box-outs"

        def get_table_contents(html, season, conference, position):
            # Creamos objeto BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")

            # Extraemos la tabla de estadisticas
            teams_table = soup.find("table", class_="Crom_table__p1iZz")

            # Obtenemos las columnas de cada estadistica
            table_head = teams_table.find("thead")

            # Categorias que nos interesan
            cats_of_interest = ["off_boxouts", "def_boxouts"]

            # Crom_headers__mzI_m -> Cabecera de la tabla
            categories = [col.get("field").lower() for col in table_head.find("tr", class_="Crom_headers__mzI_m").find_all("th")[1:]]
            print(categories)

            # Indices de las categorias de interes dentro de todas las categoria 
            categories_idx = [categories.index(c) for c in cats_of_interest]

            df = pd.DataFrame(columns=["Team", "Season", "Conference", "Position"] + cats_of_interest)
            table_body = teams_table.find("tbody").find_all("tr")
            for tr in table_body:
                team_info = tr.find_all("td")
                team_name = team_info[0].get_text().strip()
                # Cogemos unicamente las estadisticas de interes
                team_stats = [float(team_info[1:][i].get_text()) for i in categories_idx]
                
                # Categorias y estadisticas deben tener la misma longitud, deberia cumplirse siempre
                assert len(categories_idx) == len(team_stats)
                print(team_stats)
                df = pd.concat([df, pd.DataFrame(data=np.array([[team_name, season, conference, position] + team_stats]), columns=df.columns)])
            
            return df


        final_df = None
        
        # Comprobamos accesibilidad a la pagina
        if self.check_accessibility(url_robots=url_robots, url=url_stats):
            print(f"{url_stats} visited!")
            self.driver.get(url_stats)
            wait = WebDriverWait(self.driver, 20)

            # No aparecerá el boton de aceptar cookies si ya se ha aceptado en el link anterior
            if check_accept_cookies:
                try:
                    # https://stackoverflow.com/questions/64032271/handling-accept-cookies-popup-with-selenium-in-python
                    print("Waiting...")
                    wait.until(
                        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
                    ).click()
                    print("Cookies accepted!")
                except:
                    print("Wait failed!")
                    self.quit_driver()
                    return
            
            # Esperamos para que la accion de aceptar las cookies no se solape con empezar la extraccion de las tablas, o el programa peta
            time.sleep(5)

            # Cargamos los filtros avanzados para poder hacer búsqueda por conferencias
            try:
                print("Intentando abrir Advanced Filters...")
                toggle_button = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.StatsAdvancedFiltersPanel_safArrow__EqRgu")
                ))
                toggle_button.click()
                time.sleep(3)  # Pequeña espera para que los filtros se desplieguen correctamente
                print("Advanced Filters abiertos.")
            except Exception as e:
                print("No se pudo abrir el panel de filtros avanzados.")
                print(str(e))
                self.quit_driver()
                return

            # Obtenemos el boton para aplicar los filtros
            get_stats_button = None
            try:
                for button in self.driver.find_elements(by=By.CLASS_NAME, value="Button_button__L2wUb"):
                    if button.text.lower() == "get stats":
                        get_stats_button = button
            except:
                print("No se pudo acceder a los botones.")
                self.quit_driver()
                return
            
            # Nos aseguramos de tener boton para aplicar los filtros
            assert get_stats_button != None
            
            # Obtenemos el elemento que contiene todos los filtros
            overall_filters = self.driver.find_element(by=By.CLASS_NAME, value="nba-stats-primary-split-block")
            # Obtenemos los filtros que aparecen al inicio
            initial_filters = overall_filters.find_elements(by=By.CLASS_NAME, value="DropDown_label__lttfI")

            # Obtenemos todas las temporadas
            season_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "SEASON":
                    season_options = filt.find_elements(by=By.TAG_NAME, value="option")
                    break
            
            # Obtenemos todas las conferencias
            conference_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "CONFERENCE":
                        conference_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["East", "West"]
                        ]
                        break
            
            # Obtenemos todas las posiciones de jugador
            positions_options = []
            for filt in initial_filters:
                tag = filt.find_element(by=By.TAG_NAME, value="p").text
                if tag.upper() == "POSITION":
                        positions_options = [
                            opt for opt in filt.find_elements(by=By.TAG_NAME, value="option")
                            if opt.text in ["Center", "Guard", "Forward"]
                        ]
                        break
            

            # Iteramos por todas las combinaciones de filtros
            for season_op in season_options:
                season_text = season_op.text
                try:
                    wait.until(EC.element_to_be_clickable(season_op)).click()
                    time.sleep(1)
                except:
                    print("Wait failed!")
                    self.quit_driver()
                
                for conf_op in conference_options:
                    conf_text = conf_op.text
                    try:
                        wait.until(EC.element_to_be_clickable(conf_op)).click()
                        time.sleep(1)
                    except:
                        print("Wait failed!")
                        self.quit_driver()
                    
                    for pos_op in positions_options:
                        pos_text = pos_op.text
                        time.sleep(1)
                        try:
                            wait.until(EC.element_to_be_clickable(pos_op)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()
                        
                        # Aplicamos los filtros
                        try:
                            wait.until(EC.element_to_be_clickable(get_stats_button)).click()
                        except:
                            print("Wait failed!")
                            self.quit_driver()

                        time.sleep(1)
                        # Esperamos a que la tabla sea visible
                        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "Crom_table__p1iZz")))
                        # Extraemos el dataframe inicial
                        html = self.driver.page_source

                        curr_df = get_table_contents(html=html, season=season_text, conference=conf_text, position=pos_text)
                        if final_df is None:
                            final_df = curr_df
                        else:
                            final_df = pd.concat([final_df, curr_df])

                        # Añadimos espaciado de peticiones HTTP:
                        # Introducimos un retraso un retraso aleatorio, así simulamos comportamiento más humano
                        response_delay = random.uniform(1.0, 1.5)
                        print(f"[INFO] Esperando {response_delay:.2f} segundos antes de la siguiente petición...")
                        time.sleep(response_delay)
            

        return final_df   



    def execute_scraping(self):
        df1 = self.extract_teams_shooting_ds()
        df2 = self.extract_teams_contested_shoots(check_accept_cookies=False)
        df1 = df1.reset_index(drop=True)
        df2 = df2.reset_index(drop=True)
        df3 = pd.merge(df1, df2, on=["Team", "Season", "Conference", "Position"], how="left")

        df4 = self.extract_teams_boxouts(check_accept_cookies=False)
        df4 = df4.reset_index(drop=True)
        
        for df in [df3, df4]:
            for col in ["Team", "Season", "Conference", "Position"]:
                df[col] = df[col].astype(str)
        
        print(df3)
        print(df4)
        self.output = pd.merge(df3, df4, on=["Team", "Season", "Conference", "Position"], how="left")     
        print(self.output) 



    # Cerrar driver
    def quit_driver(self):
        self.driver.quit()


    # Guardar resultados en ambos formatos de csv
    def get_csv(self):
        self.output.to_csv("..\\..\\dataset\\nba_test_dataset.csv", sep=",", index=False)
        self.output.to_csv("..\\..\\dataset\\nba_test_dataset_excel.csv", sep=";", index=False)
        

In [23]:
user_agent_windows = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
scraper = NBAScraper(user_agent=user_agent_windows)

df1 = scraper.extract_teams_shooting_ds()
df2 = scraper.extract_teams_contested_shoots(check_accept_cookies=False)
df4 = scraper.extract_teams_boxouts(check_accept_cookies=False)

scraper.quit_driver()

https://www.nba.com/stats/teams/shooting visited!
Waiting...
Cookies accepted!
Intentando abrir Advanced Filters...
Advanced Filters abiertos.
[INFO] Esperando 1.30 segundos antes de la siguiente petición...
[INFO] Esperando 1.46 segundos antes de la siguiente petición...
[INFO] Esperando 1.11 segundos antes de la siguiente petición...
[INFO] Esperando 1.09 segundos antes de la siguiente petición...
[INFO] Esperando 1.31 segundos antes de la siguiente petición...
[INFO] Esperando 1.46 segundos antes de la siguiente petición...
[INFO] Esperando 1.45 segundos antes de la siguiente petición...
[INFO] Esperando 1.33 segundos antes de la siguiente petición...
[INFO] Esperando 1.45 segundos antes de la siguiente petición...
[INFO] Esperando 1.04 segundos antes de la siguiente petición...
[INFO] Esperando 1.32 segundos antes de la siguiente petición...
[INFO] Esperando 1.31 segundos antes de la siguiente petición...
[INFO] Esperando 1.04 segundos antes de la siguiente petición...
[INFO] Esper

In [24]:
df1.describe()

Unnamed: 0,Team,Season,Conference,Position,Less Than 5 ft. FGM,Less Than 5 ft. FGA,Less Than 5 ft. FG PCT,5-9 ft. FGM,5-9 ft. FGA,5-9 ft. FG PCT,...,10-14 ft. FG PCT,15-19 ft. FGM,15-19 ft. FGA,15-19 ft. FG PCT,20-24 ft. FGM,20-24 ft. FGA,20-24 ft. FG PCT,25-29 ft. FGM,25-29 ft. FGA,25-29 ft. FG PCT
count,2585,2585,2585,2585,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,...,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0
unique,38,29,2,3,157.0,230.0,263.0,45.0,93.0,292.0,...,307.0,72.0,154.0,274.0,69.0,158.0,275.0,81.0,200.0,254.0
top,Atlanta Hawks,2024-25,East,Forward,6.6,9.0,58.9,1.0,2.5,40.5,...,39.5,0.4,0.2,39.6,0.0,0.1,0.0,0.0,0.0,0.0
freq,87,90,1305,862,53.0,36.0,33.0,164.0,78.0,31.0,...,36.0,86.0,42.0,40.0,227.0,108.0,74.0,495.0,339.0,305.0


In [25]:
df2.describe()

Unnamed: 0,Team,Season,Conference,Position,contested_shots_2pt,contested_shots_3pt
count,854,854,854,854,854.0,854.0
unique,30,10,2,3,222.0,162.0
top,Atlanta Hawks,2024-25,West,Forward,17.1,11.5
freq,30,90,428,285,11.0,15.0


In [26]:
df4.describe()

Unnamed: 0,Team,Season,Conference,Position,off_boxouts,def_boxouts
count,719,719,719,719,719.0,719.0
unique,30,8,2,3,46.0,148.0
top,Atlanta Hawks,2024-25,East,Forward,0.1,2.7
freq,24,90,360,240,101.0,20.0


In [27]:
df3 = pd.merge(df1, df2, on=["Team", "Season", "Conference", "Position"], how="left")

In [28]:
df3.describe()

Unnamed: 0,Team,Season,Conference,Position,Less Than 5 ft. FGM,Less Than 5 ft. FGA,Less Than 5 ft. FG PCT,5-9 ft. FGM,5-9 ft. FGA,5-9 ft. FG PCT,...,15-19 ft. FGA,15-19 ft. FG PCT,20-24 ft. FGM,20-24 ft. FGA,20-24 ft. FG PCT,25-29 ft. FGM,25-29 ft. FGA,25-29 ft. FG PCT,contested_shots_2pt,contested_shots_3pt
count,2585,2585,2585,2585,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,...,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,2585.0,854.0,854.0
unique,38,29,2,3,157.0,230.0,263.0,45.0,93.0,292.0,...,154.0,274.0,69.0,158.0,275.0,81.0,200.0,254.0,222.0,162.0
top,Atlanta Hawks,2024-25,East,Forward,6.6,9.0,58.9,1.0,2.5,40.5,...,0.2,39.6,0.0,0.1,0.0,0.0,0.0,0.0,17.1,11.5
freq,87,90,1305,862,53.0,36.0,33.0,164.0,78.0,31.0,...,42.0,40.0,227.0,108.0,74.0,495.0,339.0,305.0,11.0,15.0


In [29]:
df3.iloc[0]["Season"] == df4.iloc[0]["Season"] and \
df3.iloc[0]["Team"] == df4.iloc[0]["Team"] and \
df3.iloc[0]["Conference"] == df4.iloc[0]["Conference"] and \
df3.iloc[0]["Position"] == df4.iloc[0]["Position"]

True

In [30]:
print(df3.iloc[0]["Team"])
print(df4.iloc[0]["Team"])

Atlanta Hawks
Atlanta Hawks


In [31]:
output = pd.merge(df3, df4, on=["Team", "Season", "Conference", "Position"], how="left")
output.head(n=5)

Unnamed: 0,Team,Season,Conference,Position,Less Than 5 ft. FGM,Less Than 5 ft. FGA,Less Than 5 ft. FG PCT,5-9 ft. FGM,5-9 ft. FGA,5-9 ft. FG PCT,...,20-24 ft. FGM,20-24 ft. FGA,20-24 ft. FG PCT,25-29 ft. FGM,25-29 ft. FGA,25-29 ft. FG PCT,contested_shots_2pt,contested_shots_3pt,off_boxouts,def_boxouts
0,Atlanta Hawks,2024-25,East,Forward,11.0,16.7,65.7,2.6,5.9,44.5,...,3.4,9.1,37.2,2.9,8.1,35.5,13.9,8.3,0.8,2.7
1,Boston Celtics,2024-25,East,Forward,10.9,16.1,67.4,2.3,5.1,45.2,...,2.9,7.2,39.7,6.6,18.7,35.5,17.1,10.6,1.0,3.8
2,Brooklyn Nets,2024-25,East,Forward,8.1,13.3,61.0,1.9,5.3,36.6,...,3.7,10.5,35.5,5.0,14.0,35.7,11.0,10.1,0.4,2.6
3,Charlotte Hornets,2024-25,East,Forward,7.6,12.4,61.2,2.0,4.9,40.5,...,2.0,5.7,35.5,3.3,10.4,32.1,11.4,8.1,1.2,2.9
4,Chicago Bulls,2024-25,East,Forward,6.3,10.4,60.0,1.3,2.8,47.0,...,2.4,6.6,37.0,2.9,9.0,32.6,11.5,7.0,0.5,2.6


In [32]:
output.to_csv("..\\..\\dataset\\nba_test_dataset.csv", sep=",", index=False)
output.to_csv("..\\..\\dataset\\nba_test_dataset_excel.csv", sep=";", index=False)