In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
team_codes_df = pd.read_csv('team_codes.csv')
games_attended_df = pd.read_csv('sports_attendance.csv')

games_attended_df["URL_Date"] = " "
games_attended_df["Year"] = " "

games_attended_df['Year'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['Year'] = games_attended_df['Year'].dt.strftime('%Y')
games_attended_df['URL_Date'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['URL_Date'] = games_attended_df['URL_Date'].dt.strftime('%Y%m%d0')

nba_attendance_df = games_attended_df[games_attended_df.Sport == ('NBA')].copy()
mlb_attendance_df = games_attended_df[games_attended_df.Sport == ('MLB')].copy()
nfl_attendance_df = games_attended_df[games_attended_df.Sport == ('NFL')].copy()

nba_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Home', right_on='NBA_Teams')
nba_df = nba_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['URL_Variable'] = nba_df['URL_Date'] + nba_df['NBA_Codes']

away_code_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Visitor', right_on='NBA_Teams')
away_code_df = away_code_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['Away_Code'] = away_code_df['NBA_Codes']
nba_df.rename(columns = {'NBA_Codes':'Home_Code'}, inplace = True)
nba_df


Unnamed: 0,Date,Home,Visitor,Sport,URL_Date,Year,Home_Code,URL_Variable,Away_Code
0,2/11/2014,Los Angeles Lakers,Utah Jazz,NBA,201402110,2014,LAL,201402110LAL,UTA
1,1/2/2015,Utah Jazz,Atlanta Hawks,NBA,201501020,2015,UTA,201501020UTA,ATL
2,4/3/2016,Orlando Magic,Memphis Grizzlies,NBA,201604030,2016,ORL,201604030ORL,MEM
3,12/27/2017,Golden State Warriors,Utah Jazz,NBA,201712270,2017,GSW,201712270GSW,UTA
4,11/17/2018,Orlando Magic,Los Angeles Lakers,NBA,201811170,2018,ORL,201811170ORL,LAL
5,11/18/2018,Orlando Magic,New York Knicks,NBA,201811180,2018,ORL,201811180ORL,NYK
6,12/2/2018,Miami Heat,Utah Jazz,NBA,201812020,2018,MIA,201812020MIA,UTA
7,11/8/2019,Atlanta Hawks,Sacramento Kings,NBA,201911080,2019,ATL,201911080ATL,SAC
8,11/3/2021,Sacramento Kings,New Orleans Pelicans,NBA,202111030,2021,SAC,202111030SAC,NOP
9,11/7/2021,Sacramento Kings,Indiana Pacers,NBA,202111070,2021,SAC,202111070SAC,IND


In [3]:
URL_list = nba_df["URL_Variable"].values.tolist()
Home_code_list = nba_df["Home_Code"].values.tolist()
Away_code_list = nba_df["Away_Code"].values.tolist()
Year_list = nba_df["Year"].values.tolist()
Date_list = nba_df["URL_Date"].values.tolist()

In [4]:
# getting length of list
length = len(URL_list)

dataframe_dictionary = {}

for i in range(length):
    
    # Create empty lists to hold table data to be scraped
    away_box_score = []
    home_box_score = []
    box_score = []
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/boxscores/{URL_list[i]}.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has away team statistics
    away_table = soup.select_one(f'#div_box-{Away_code_list[i]}-game-basic') 

    # Grab data from table and put it into the list created above
    for tr in away_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        away_box_score.append(tds)

    # Grab the table element that has home team statistics
    home_table = soup.select_one(f'#div_box-{Home_code_list[i]}-game-basic') 

    # Grab data from table and put it into the list created above
    for tr in home_table.select('tr:has(td)'):
        tds2 = [td.get_text(strip=True) for td in tr.select('td')]
        home_box_score.append(tds2)

    # Grab the table element that has game box score
    box_score_table = soup.select_one('#all_line_score')

    # Game box score is commented out in html, so this will grab it out of the comments
    for comment in box_score_table.find_all(text=lambda text: isinstance(text, Comment)):
        if comment.find("<table ") > 0:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find("table")

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds3 = [td.get_text(strip=True) for td in tr.select('td')]
        box_score.append(tds3)
       
    # Create dataframe for away team statistics
    dataframe_dictionary[Away_code_list[i] + Date_list[i]] = pd.DataFrame(away_box_score)

    # Create an empty list to store away team statistics header information
    away_header_list = []

    # Grab the table header information to use as column headers in our away team statistics dataframe
    for tr in away_table.select('tr:has(th)'):
        ths = [th.get_text(strip=True) for th in tr.select('th')]
        away_header_list.append(ths)

    # Create dataframe for home team statistics
    dataframe_dictionary[Home_code_list[i] + Date_list[i]] = pd.DataFrame(home_box_score)

    # Create an empty list to store home team statistics header information
    home_header_list = []

    # Grab the table header information to use as column headers in our home team statistics dataframe
    for tr in home_table.select('tr:has(th)'):
        ths2 = [th.get_text(strip=True) for th in tr.select('th')]
        home_header_list.append(ths2)    
        
    # Create dataframe for game box score
    dataframe_dictionary['Box' + Date_list[i]] = pd.DataFrame(box_score)

    # Create an empty list to store game box score dataframe header information
    box_score_list = ['1Q', '2Q', '3Q', '4Q', 'T']

    # If a game goes into overtime(s), we need to add to the box score list for each overtime period
    OT_counter = 1
    while len(box_score[0]) > len(box_score_list):
        box_score_list.insert(-1, f'{OT_counter}OT')
        OT_counter = OT_counter + 1
        
    # Update the column and row labels
    dataframe_dictionary['Box' + Date_list[i]] = dataframe_dictionary['Box' + Date_list[i]].set_axis(box_score_list, axis=1, inplace=False)
    dataframe_dictionary['Box' + Date_list[i]] = dataframe_dictionary['Box' + Date_list[i]].set_axis([f'{Away_code_list[i]}', f'{Home_code_list[i]}'])

    # Pull the away team player data and combine the lists into one list
    away_starters = away_header_list[2:7]
    away_bench = away_header_list[8:-1]
    away_players = [away_starters,away_bench]

    # Pull the home team player data and combine the lists into one list
    home_starters = home_header_list[2:7]
    home_bench = home_header_list[8:-1]
    home_players = [home_starters,home_bench]

    # Consolodate the away team list of lists into one list
    away_starters_list = [item for sublist in away_starters for item in sublist]
    away_bench_list = [item for sublist in away_bench for item in sublist]
    away_roster = away_starters_list + away_bench_list

    # Consolodate the home team list of lists into one list
    home_starters_list = [item for sublist in home_starters for item in sublist]
    home_bench_list = [item for sublist in home_bench for item in sublist]
    home_roster = home_starters_list + home_bench_list

    # Add Team Totals to the end of the away team list
    away_roster.append('Team Totals')
    dataframe_dictionary[Away_code_list[i] + Date_list[i]].index = away_roster

    # Add Team Totals to the end of the home team list
    home_roster.append('Team Totals')
    dataframe_dictionary[Home_code_list[i] + Date_list[i]].index = home_roster

    # Set the home and away dataframe column labels
    column_headers = away_header_list[1]
    column_headers.remove("Starters")
    dataframe_dictionary[Away_code_list[i] + Date_list[i]].columns = column_headers
    dataframe_dictionary[Home_code_list[i] + Date_list[i]].columns = column_headers

    # Pull logistical information about the game: attendance, length of game, and venue
    text = soup.get_text()
    attendance_string = re.findall(r"Attendance:\s+\d*,\d*", text)
    
    # A failsafe, as one of the games has no attendance listed causing the loop to break
    if(len(attendance_string) != 0):
        
        # Clean up the attendance string that is pulled 
        new_attendance_string = attendance_string[0].strip('([^a-z]xa)')
        attendance_list = list(new_attendance_string)
        attendance_list.pop(11)
    else:
        attendance_string = "No Attendance Available"
    
    # Clean up the game length string that is pulled 
    game_time_string = re.findall(r"Time of Game:\s+\d:\d*", text)    
    new_game_time_string = game_time_string[0].strip('([^a-z]xa)')
    game_time_list = list(new_game_time_string)
    game_time_list.pop(13)

    # join the lists to create final string
    final_attendance_string = "".join(attendance_list)
    final_game_time_sting = "".join(game_time_list)

    # Create throwaway variables to hold split strings so we can pick out what we want later
    x = final_attendance_string.split(":")
    y = final_game_time_sting.split(":",1)

    # Pick out the strings before the : that we split above
    dataframe_dictionary['Attendance' + Date_list[i]] = pd.DataFrame(x, columns = [x[0]])
    dataframe_dictionary['Attendance' + Date_list[i]] = dataframe_dictionary['Attendance' + Date_list[i]].drop([0])
    dataframe_dictionary['Game_Time' + Date_list[i]] = pd.DataFrame(y, columns = [y[0]])
    dataframe_dictionary['Game_Time' + Date_list[i]] = dataframe_dictionary['Game_Time' + Date_list[i]].drop([0])

    # Join the attendance and game time dataframes into one dataframe
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = pd.concat([dataframe_dictionary['Attendance' + Date_list[i]], dataframe_dictionary['Game_Time' + Date_list[i]]], ignore_index=True, sort=False)
    dataframe_dictionary['Game_Logistics' + Date_list[i]]['Time of Game'] = dataframe_dictionary['Game_Logistics' + Date_list[i]]['Time of Game'].shift(-1)
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = dataframe_dictionary['Game_Logistics' + Date_list[i]].dropna()

    # Scrape the name of the vanue and location and add that to the logistics dataframe
    divparent = soup.find('div', attrs={'class':'scorebox_meta'})
    divparent.text
    text = divparent.text
    result = re.split(r'\n', text)
    almost_done = result[1]
    final_result = almost_done.partition(f'{Year_list[i]}')[2]
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = dataframe_dictionary['Game_Logistics' + Date_list[i]].assign(Venue=[final_result])


In [5]:
########## GUIDE TO ACCESSING DATAFRAMES ##########

## Pull Home Stats --> dataframe_dictionary['(Home Code)(Date)']

## Pull Away Stats --> dataframe_dictionary['(Away Code)(Date)']

## Pull Box Score --> dataframe_dictionary['Box(Date)']

## Pull Game Logistics --> dataframe_dictionary['Game_Logistics(Date)']

In [6]:
#Test
dataframe_dictionary['Game_Logistics201812020']

Unnamed: 0,Attendance,Time of Game,Venue
0,19600,2:14,"AmericanAirlines Arena, Miami, Florida"


In [7]:
#Test
dataframe_dictionary['UTA201812020']

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
Donovan Mitchell,37:40,8.0,24.0,0.333,1.0,7.0,0.143,1.0,1.0,1.0,0.0,5.0,5.0,3.0,1.0,0.0,2.0,2.0,18.0,-5.0
Rudy Gobert,37:15,5.0,9.0,0.556,0.0,0.0,,2.0,5.0,0.4,4.0,14.0,18.0,4.0,1.0,3.0,2.0,2.0,12.0,19.0
Ricky Rubio,36:05,9.0,13.0,0.692,2.0,4.0,0.5,3.0,3.0,1.0,0.0,5.0,5.0,6.0,1.0,0.0,3.0,3.0,23.0,14.0
Jae Crowder,34:09,6.0,15.0,0.4,3.0,7.0,0.429,4.0,4.0,1.0,2.0,4.0,6.0,2.0,1.0,0.0,4.0,3.0,19.0,1.0
Joe Ingles,29:10,5.0,13.0,0.385,1.0,4.0,0.25,0.0,0.0,,0.0,4.0,4.0,4.0,2.0,0.0,0.0,4.0,11.0,7.0
Royce O'Neale,18:43,2.0,2.0,1.0,2.0,2.0,1.0,0.0,0.0,,0.0,2.0,2.0,1.0,0.0,1.0,1.0,1.0,6.0,4.0
Derrick Favors,17:53,2.0,4.0,0.5,1.0,2.0,0.5,0.0,0.0,,3.0,4.0,7.0,0.0,2.0,2.0,0.0,2.0,5.0,-19.0
Kyle Korver,17:23,1.0,6.0,0.167,1.0,5.0,0.2,0.0,0.0,,1.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,3.0,-18.0
Thabo Sefolosha,6:39,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0
Raul Neto,3:07,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,-4.0


In [8]:
#Test
dataframe_dictionary['MIA201812020']

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
Josh Richardson,39:10,4.0,16.0,0.25,2.0,6.0,0.333,6.0,6.0,1.0,1.0,2.0,3.0,4.0,0.0,0.0,4.0,2.0,16.0,-13.0
Wayne Ellington,37:12,4.0,13.0,0.308,4.0,9.0,0.444,0.0,0.0,,2.0,5.0,7.0,1.0,1.0,0.0,0.0,4.0,12.0,-2.0
Hassan Whiteside,34:02,11.0,17.0,0.647,0.0,0.0,,1.0,6.0,0.167,7.0,13.0,20.0,2.0,1.0,3.0,1.0,4.0,23.0,-8.0
James Johnson,18:46,4.0,10.0,0.4,1.0,2.0,0.5,0.0,0.0,,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,9.0,-16.0
Rodney McGruder,5:07,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.0
Justise Winslow,32:05,4.0,9.0,0.444,1.0,2.0,0.5,0.0,0.0,,0.0,5.0,5.0,1.0,2.0,1.0,4.0,3.0,9.0,14.0
Dwyane Wade,30:26,4.0,15.0,0.267,2.0,6.0,0.333,5.0,5.0,1.0,0.0,3.0,3.0,8.0,1.0,1.0,1.0,0.0,15.0,21.0
Kelly Olynyk,29:10,4.0,8.0,0.5,3.0,6.0,0.5,5.0,7.0,0.714,2.0,6.0,8.0,1.0,1.0,1.0,1.0,0.0,16.0,18.0
Bam Adebayo,14:01,1.0,2.0,0.5,0.0,0.0,,0.0,0.0,,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,10.0
Tyler Johnson,Did Not Dress,,,,,,,,,,,,,,,,,,,


In [9]:
#Test
dataframe_dictionary['Box201812020']

Unnamed: 0,1Q,2Q,3Q,4Q,T
UTA,35,20,22,23,100
MIA,27,26,27,22,102
