In [2]:
import pandas as pd
import random
import requests
import pathlib
from bs4 import BeautifulSoup, Tag, NavigableString
import os
import time

In [3]:
HOME_DIR = pathlib.Path.home()
NOTEBOOK_DIR = pathlib.Path('.').resolve()
BBREF_DIR = HOME_DIR.joinpath('data') / 'bbref'

In [4]:
NBA_ADVANCED_DIR = BBREF_DIR /'nba' / 'player_stats' / 'advanced'
NBA_PER_GAME_DIR = BBREF_DIR /'nba' / 'player_stats' / 'per_game'
NBA_TOTALS_DIR = BBREF_DIR / 'nba' / 'player_stats' / 'totals'

ABA_ADVANCED_DIR = BBREF_DIR /'aba' / 'player_stats' / 'advanced'
ABA_PER_GAME_DIR = BBREF_DIR /'aba' / 'player_stats' / 'per_game'
ABA_TOTALS_DIR = BBREF_DIR / 'aba' / 'player_stats' / 'totals'

In [5]:
directory_list = [NBA_ADVANCED_DIR, NBA_PER_GAME_DIR, NBA_TOTALS_DIR,
                  ABA_ADVANCED_DIR, ABA_PER_GAME_DIR, ABA_TOTALS_DIR]

In [6]:
for directory in directory_list:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory created: {directory}")

Directory created: /Users/datawonk/data/bbref/nba/player_stats/advanced
Directory created: /Users/datawonk/data/bbref/nba/player_stats/per_game
Directory created: /Users/datawonk/data/bbref/nba/player_stats/totals
Directory created: /Users/datawonk/data/bbref/aba/player_stats/advanced
Directory created: /Users/datawonk/data/bbref/aba/player_stats/per_game
Directory created: /Users/datawonk/data/bbref/aba/player_stats/totals


### Get and Set Header - User-Agent

In [8]:
headers = {'User-Agent': (f"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
                          f" (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")}
headers

{'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

In [42]:
TOTALS_HEADER = ['player', 'pos', 'age', 'team', 'g', 'gs', 'mp', 
                 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fga2', 'fg2_pct',
                 'efg_pct', 'ft', 'fta', 'ft_pct', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

ADVANCED_HEADER = ['player', 'pos', 'age', 'team', 'g','mp', 'per', 'ts_pct', 
                   'fg3a_rate', 'fta_rate', 'oreb_pct', 'dreb_pct', 'treb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
                   'tov_pct','usage_pct', 'blank1', 'ows', 'dws', 'ws', 'ws_48', 'blank2', 'obpm', 'dbpm', 'bpm',
                   'vorp']

PER_GAME_HEADER = ['player', 'pos', 'age', 'team', 'g', 'gs', 'mp', 
                 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fga2', 'fg2_pct',
                 'efg_pct', 'ft', 'fta', 'ft_pct', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

# PER100_HEADER = ['player', 'pos', 'age', 'team', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_pct', 
#                  'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fga2', 'fg2_pct', 'ft', 'fta', 'ft_pct', 
#                  'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'blank', 'ortg', 'drtg']

BIOS_HEADER = ['bbref_player_link', 'player_name', 'rookie_year', 'last_year', 'pos', 'height', 
               'weight', 'birthdate', 'colleges']

### Pull Player Data: NBA/BAA

In [11]:
# https://www.basketball-reference.com/leagues/ABA_1968_totals.html

In [49]:
BASE_URL = "https://www.basketball-reference.com/leagues/"
NBA_years = [year for year in range(2020, 1949, -1)]
stats_type_list = ['advanced', 'per_game', 'totals']

for year in NBA_years:
    for stat_type in stats_type_list:

        if stat_type == 'advanced':
            DIR = NBA_ADVANCED_DIR
            HEADER = ADVANCED_HEADER
        elif stat_type == 'per_game':
            DIR = NBA_PER_GAME_DIR
            HEADER = PER_GAME_HEADER
        elif stat_type == 'totals':
            DIR = NBA_TOTALS_DIR
            HEADER = TOTALS_HEADER
            
        print(year)
        NBA_URL = f"NBA_{year}_{stat_type}.html"
        print(BASE_URL + NBA_URL)
        r = requests.get(BASE_URL + NBA_URL, headers=headers)

        soup = BeautifulSoup(r.text, 'lxml')
        # pull out table body
        body = soup.find('tbody')

        player_links = []
        for i in body.find_all('tr'):
            for k in i.find_all('td', {"data-stat":"player"}):
                if k.find('a') is not None:
                    player_links.append(k.find('a')['href'])
                # print(k.text)


        player_stats = []
        for i in body.find_all('tr'):
            row_stats = []
            for j in i.find_all('td'):
                row_stats.append(j.text)
            if row_stats:
                player_stats.append(row_stats)

        team_links = []
        for i in body.find_all('tr'):
            for k in i.find_all('td', {"data-stat":"team_id"}):
                if k.find('a') is not None:
                    team_links.append(k.find('a')['href'])
                else:
                    team_links.append('TOT')

        season_id = str(year-1) + '-' + str(year)[-2:]
        df = pd.DataFrame(player_stats, columns=HEADER)
        df.insert(0, "bbref_player_link", player_links)
        df.insert(0, "bbref_team_link", team_links)
        df.insert(0, "season_year", year)
        df.insert(0, 'season_id', season_id)

        filename = f"player_stats_{stat_type}_{year}.csv"
        file_path = DIR + filename
        df.to_csv(file_path, index=False)
        print(file_path)

        timer = random.uniform(1.5, 3)
        time.sleep(timer)
        print(f"slept: {timer}\n")


2020
https://www.basketball-reference.com/leagues/NBA_2020_advanced.html
data/nba/player_stats/advanced/player_stats_advanced_2020.csv
slept: 1.5232239978724693

2020
https://www.basketball-reference.com/leagues/NBA_2020_per_game.html
data/nba/player_stats/per_game/player_stats_per_game_2020.csv
slept: 1.920437824691217

2020
https://www.basketball-reference.com/leagues/NBA_2020_totals.html
data/nba/player_stats/totals/player_stats_totals_2020.csv
slept: 2.0881441214753194



### Pull Player Data: ABA

In [48]:
# https://www.basketball-reference.com/leagues/ABA_1968_totals.html

BASE_URL = "https://www.basketball-reference.com/leagues/"
ABA_years = [year for year in range(1976, 1967, -1)]
stats_type_list = ['advanced', 'per_game', 'totals']


for year in ABA_years:
    for stat_type in stats_type_list:

        if stat_type == 'advanced':
            DIR = ABA_ADVANCED_DIR
            HEADER = ADVANCED_HEADER
        elif stat_type == 'per_game':
            DIR = ABA_PER_GAME_DIR
            HEADER = PER_GAME_HEADER
        elif stat_type == 'totals':
            DIR = ABA_TOTALS_DIR
            HEADER = TOTALS_HEADER
        
        print(year)
        NBA_URL = f"ABA_{year}_{stat_type}.html"
        print(BASE_URL + NBA_URL)
        r = requests.get(BASE_URL + NBA_URL, headers=headers)

        soup = BeautifulSoup(r.text, 'lxml')
        # pull out table body
        body = soup.find('tbody')

        player_links = []
        for i in body.find_all('tr'):
            for k in i.find_all('td', {"data-stat":"player"}):
                if k.find('a') is not None:
                    player_links.append(k.find('a')['href'])
                # print(k.text)


        player_stats = []
        for i in body.find_all('tr'):
            row_stats = []
            for j in i.find_all('td'):
                row_stats.append(j.text)
            if row_stats:
                player_stats.append(row_stats)

        team_links = []
        for i in body.find_all('tr'):
            for k in i.find_all('td', {"data-stat":"team_id"}):
                if k.find('a') is not None:
                    team_links.append(k.find('a')['href'])
                else:
                    team_links.append('TOT')

        season_id = str(year-1) + '-' + str(year)[-2:]
        df = pd.DataFrame(player_stats, columns=HEADER)
        df.insert(0, "bbref_player_link", player_links)
        df.insert(0, "bbref_team_link", team_links)
        df.insert(0, "season_year", year)
        df.insert(0, 'season_id', season_id)

        filename = f"player_stats_{stat_type}_{year}.csv"
        file_path = DIR + filename
        df.to_csv(file_path, index=False)
        print(file_path)

        timer = random.uniform(1.5, 3)
        time.sleep(timer)
        print(f"slept: {timer}\n")

1976
https://www.basketball-reference.com/leagues/ABA_1976_advanced.html
data/aba/player_stats/advanced/player_stats_advanced_1976.csv
slept: 1.521762778619463

1976
https://www.basketball-reference.com/leagues/ABA_1976_per_game.html
data/aba/player_stats/per_game/player_stats_per_game_1976.csv
slept: 2.7343880294027882

1976
https://www.basketball-reference.com/leagues/ABA_1976_totals.html
data/aba/player_stats/totals/player_stats_totals_1976.csv
slept: 1.999381522529545

