In [241]:
import copy
import json
import pickle
import re
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [242]:
import requests
from bs4 import BeautifulSoup

# Exploratory fetch: download one game's play-by-play page to inspect the
# JSON payload that the site embeds in its HTML.
url = "https://www.nba.com/game/bos-vs-dal-0042300404/play-by-play?latest=0&period=All"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of trying to parse an error page below.
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# The page data is read from the <script id="__NEXT_DATA__"> tag as JSON.
plays = soup.find('script', {"id": '__NEXT_DATA__'})
if plays is None or not plays.string:
    # Without this guard a missing tag surfaces as an opaque AttributeError.
    raise ValueError(f"no __NEXT_DATA__ payload found at {url}")
d = json.loads(plays.string)
play_by_play = d['props']['pageProps']['playByPlay']['actions']

In [243]:
def get_game_links(date):
    """Return the game-page URLs listed on nba.com for one date.

    Args:
        date: string formatted as yyyy-mm-dd; it is inserted verbatim into the
            ``https://www.nba.com/games?date=...`` query string.

    Returns:
        A list with one absolute URL per card in the first module of the
        page's ``gameCardFeed`` (``'https://www.nba.com'`` followed by the
        ``resourceUrl`` of the card's last action), or ``None`` when the feed
        has no modules.

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no ``__NEXT_DATA__`` script tag to read.
        KeyError: the embedded JSON lacks one of the expected keys.
    """
    url = f"https://www.nba.com/games?date={date}"
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here rather than parsing an error page below.
    response.raise_for_status()

    # Parse HTML with bs4
    soup = BeautifulSoup(response.content, "html.parser")
    data = soup.find('script', {"id": '__NEXT_DATA__'})
    if data is None or not data.string:
        # Without this guard a missing tag surfaces as an opaque AttributeError.
        raise ValueError(f"no __NEXT_DATA__ payload found at {url}")
    d = json.loads(data.string)  # page payload decoded into a python dict

    modules = d['props']['pageProps']['gameCardFeed']['modules']
    if not modules:
        # Existing contract: an empty feed yields None, not an empty list.
        return None

    return [
        'https://www.nba.com'
        + card['cardData']['actions'][-1]['resourceLocator']['resourceUrl']
        for card in modules[0]['cards']
    ]

In [244]:
def _fetch_next_data(url, timeout=30):
    """Download ``url`` and return its embedded ``__NEXT_DATA__`` JSON as a dict.

    Raises the ``requests`` HTTP error on an error status and ``ValueError``
    when the page has no ``__NEXT_DATA__`` script tag.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    tag = soup.find('script', {"id": '__NEXT_DATA__'})
    if tag is None or not tag.string:
        raise ValueError(f"no __NEXT_DATA__ payload found at {url}")
    return json.loads(tag.string)


def parse_game(ref_foul_data, link, quiet=True):
    """Add one game's officials and foul calls to ``ref_foul_data``.

    Two pages are downloaded: the game page at ``link`` (for the list of
    officials) and its play-by-play page (for the fouls). Every official gets
    ``num_games`` incremented, and every foul whose description ends with a
    parenthesised official name gets counted under that official, keyed by
    the foul's ``subType``.

    Both pages are fetched and scanned before ``ref_foul_data`` is touched, so
    a failed download or an unexpected payload leaves the dict unchanged.

    Args:
        ref_foul_data: dict mapping official name -> dict of counters
            (``'num_games'`` plus one key per foul sub-type). Mutated in place.
        link: URL of the game page; a trailing ``/`` is optional.
        quiet: when False, print a line for each foul that could not be
            credited to an official.

    Returns:
        The list of play-by-play actions read from the page.

    Raises:
        requests.HTTPError, ValueError, KeyError: see ``_fetch_next_data``;
        ``KeyError`` also when the payload lacks an expected key.
    """
    game_page = _fetch_next_data(link)
    officials = game_page['props']['pageProps']['game']['officials']
    official_names = [official['nameI'] for official in officials]

    # Normalise the slash so the path segment is always '/play-by-play',
    # whether or not the incoming link already ends with '/'.
    pbp_url = link.rstrip('/') + '/play-by-play?latest=0&period=All'
    pbp_page = _fetch_next_data(pbp_url)
    play_by_play = pbp_page['props']['pageProps']['playByPlay']['actions']

    pattern = r'\(([^)]+)\)$'  # to extract ref names from event descs
    # Names a foul may be credited to: officials already tracked plus the
    # ones working this game (who are only registered further down).
    known_refs = set(ref_foul_data).union(official_names)

    calls = []  # (official name, foul sub-type) pairs to apply afterwards
    for play in play_by_play:
        if play['actionType'] != 'Foul':
            continue

        desc = play.get('description') or ''
        regex_match = re.search(pattern, desc)
        if not regex_match:
            # No trailing "(name)": skip instead of reusing the previous
            # foul's official, which would credit the wrong person.
            if not quiet:
                print('skipping foul without an official in its description:', repr(desc))
            continue

        # "K.Cutler" -> "K. Cutler"; the split/join collapses any doubled or
        # trailing space that the replace can introduce.
        ref = ' '.join(regex_match.group(1).replace('.', '. ').split())
        if ref not in known_refs:
            # The trailing parenthetical is not a known official; skip rather
            # than raise KeyError halfway through the update.
            if not quiet:
                print('skipping foul credited to unknown official:', repr(ref))
            continue

        calls.append((ref, play['subType']))

    # Everything is parsed; only now mutate the shared dictionary.
    for name in official_names:
        ref_foul_data.setdefault(name, {'num_games': 0})
        ref_foul_data[name]['num_games'] += 1

    for ref, foul_type in calls:
        tallies = ref_foul_data[ref]
        tallies[foul_type] = tallies.get(foul_type, 0) + 1

    return play_by_play

In [249]:
# Date window for the scrape: the loop walks forward one calendar day at a
# time, starting from season_start, until it passes present_day.
delta = timedelta(days=1)
season_start = datetime(year=2023, month=10, day=24).date()
present_day = datetime.now().date()
scrape_date = season_start  # cursor advanced by the scraping loop

In [252]:
### CHECKPOINT VARS ###
scrape_date = datetime(2024, 1, 26).date()
with open('data.pkl', 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load checkpoints that this notebook wrote itself.
    ref_foul_data = pickle.load(f)
#######################

MAX_RETRIES = 5  # consecutive failures tolerated for a single date

# MAIN SCRAPING LOOP
#
# Each date is parsed into a scratch copy of the tallies, and the copy replaces
# ref_foul_data only after the whole day has been parsed and saved. parse_game
# mutates the dict it is given, so parsing straight into ref_foul_data would
# leave a crashed day's partial counts in place, and the retry of that same
# date would then add them a second time.
failures = 0
while scrape_date <= present_day:
    print('scraping', scrape_date)
    date_str = scrape_date.strftime('%Y-%m-%d')
    day_data = copy.deepcopy(ref_foul_data)

    try:
        links = get_game_links(date_str)

        # get_game_links returns None when the date has no game feed.
        for link in links or []:
            parse_game(ref_foul_data=day_data,
                       link=link,
                       quiet=False)

        with open('data.pkl', 'wb') as f:
            pickle.dump(day_data, f)

    except Exception as e:
        failures += 1
        # Show what went wrong instead of silently swallowing it.
        print('crashed, retrying...', repr(e))
        if failures >= MAX_RETRIES:
            # A date that keeps failing is not a transient network error;
            # stop rather than loop forever.
            raise
        time.sleep(2 ** failures)  # back off before retrying the same date
        continue

    # The day was parsed and saved in full: commit it and move on.
    ref_foul_data = day_data
    print('data saved on date', scrape_date)
    failures = 0
    scrape_date += delta

    

scraping 2024-01-26
data saved on date 2024-01-26
scraping 2024-01-27
data saved on date 2024-01-27
scraping 2024-01-28
data saved on date 2024-01-28
scraping 2024-01-29
data saved on date 2024-01-29
scraping 2024-01-30
data saved on date 2024-01-30
scraping 2024-01-31
data saved on date 2024-01-31
scraping 2024-02-01
data saved on date 2024-02-01
scraping 2024-02-02
data saved on date 2024-02-02
scraping 2024-02-03
data saved on date 2024-02-03
scraping 2024-02-04
data saved on date 2024-02-04
scraping 2024-02-05
data saved on date 2024-02-05
scraping 2024-02-06
data saved on date 2024-02-06
scraping 2024-02-07
data saved on date 2024-02-07
scraping 2024-02-08
crashed, retrying...
scraping 2024-02-08
data saved on date 2024-02-08
scraping 2024-02-09
crashed, retrying...
scraping 2024-02-09
data saved on date 2024-02-09
scraping 2024-02-10
data saved on date 2024-02-10
scraping 2024-02-11
data saved on date 2024-02-11
scraping 2024-02-12
data saved on date 2024-02-12
scraping 2024-02-1

In [None]:
# Reload the per-official tallies from the checkpoint file written by the
# scraping loop.
with open('data.pkl', mode='rb') as f:
    ref_foul_data = pickle.loads(f.read())

In [253]:
# Bare last expression: display the per-official tallies as the cell output.
ref_foul_data

{'K. Cutler': {'num_games': 59,
  'Shooting': 363,
  'Loose Ball': 49,
  'Personal': 213,
  'Offensive': 41,
  'Technical': 9,
  'Personal Take': 9,
  'Offensive Charge': 13,
  'Defense 3 Second': 14,
  'Clear Path': 2,
  'Flagrant Type 1': 1,
  'Delay Technical': 1},
 'S. Twardoski': {'num_games': 61,
  'Personal': 228,
  'Shooting': 391,
  'Offensive Charge': 13,
  'Offensive': 50,
  'Loose Ball': 33,
  'Technical': 8,
  'Defense 3 Second': 22,
  'Personal Take': 10,
  'Away From Play': 3,
  'Clear Path': 1,
  'Flagrant Type 1': 1},
 'J. Williams': {'num_games': 75,
  'Personal': 278,
  'Shooting': 428,
  'Delay Technical': 2,
  'Technical': 24,
  'Loose Ball': 79,
  'Flagrant Type 1': 6,
  'Offensive': 61,
  'Defense 3 Second': 25,
  'Personal Take': 14,
  'Hanging Technical': 1,
  'Double Technical': 2,
  'Clear Path': 2,
  'Offensive Charge': 27,
  'Away From Play': 4,
  'Flagrant Type 2': 2},
 'J. Tiven': {'num_games': 79,
  'Shooting': 432,
  'Personal': 306,
  'Loose Ball': 60,

In [254]:
# One row per official (the dict keys), one column per counter name.
# A counter an official never recorded is absent from that official's dict and
# would show up as NaN, so those gaps are filled with 0.
df = (
    pd.DataFrame.from_dict(data=ref_foul_data, orient='index')
    .fillna(value=0)
)

In [257]:
# Column-wise mean over the rows of df: the average of each counter across
# officials (raw totals, not divided by num_games).
df.mean()

num_games                         51.936709
Shooting                         330.658228
Loose Ball                        38.189873
Personal                         195.075949
Offensive                         39.658228
Technical                         10.379747
Personal Take                     12.696203
Offensive Charge                  13.658228
Defense 3 Second                   7.658228
Clear Path                         0.316456
Flagrant Type 1                    1.607595
Delay Technical                    0.443038
Away From Play                     1.341772
Hanging Technical                  0.443038
Double Technical                   1.392405
Flagrant Type 2                    0.240506
Double Personal                    0.151899
Excess Timeout Technical           0.025316
Too Many Players Technical         0.012658
Non-Unsportsmanlike Technical      0.012658
dtype: float64