In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
team_codes_df = pd.read_csv('team_codes.csv')
games_attended_df = pd.read_csv('sports_attendance.csv')

games_attended_df["URL_Date"] = " "
games_attended_df["Year"] = " "

games_attended_df['Year'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['Year'] = games_attended_df['Year'].dt.strftime('%Y')
games_attended_df['URL_Date'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['URL_Date'] = games_attended_df['URL_Date'].dt.strftime('%Y%m%d0')

nba_attendance_df = games_attended_df[games_attended_df.Sport == ('NBA')].copy()
mlb_attendance_df = games_attended_df[games_attended_df.Sport == ('MLB')].copy()
nfl_attendance_df = games_attended_df[games_attended_df.Sport == ('NFL')].copy()

nba_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Home', right_on='NBA_Teams')
nba_df = nba_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['URL_Variable'] = nba_df['URL_Date'] + nba_df['NBA_Codes']

away_code_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Visitor', right_on='NBA_Teams')
away_code_df = away_code_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['Away_Code'] = away_code_df['NBA_Codes']
nba_df.rename(columns = {'NBA_Codes':'Home_Code'}, inplace = True)
nba_df


Unnamed: 0,Date,Home,Visitor,Sport,URL_Date,Year,Home_Code,URL_Variable,Away_Code
0,2/11/2014,Los Angeles Lakers,Utah Jazz,NBA,201402110,2014,LAL,201402110LAL,UTA
1,1/2/2015,Utah Jazz,Atlanta Hawks,NBA,201501020,2015,UTA,201501020UTA,ATL
2,4/3/2016,Orlando Magic,Memphis Grizzlies,NBA,201604030,2016,ORL,201604030ORL,MEM
3,12/27/2017,Golden State Warriors,Utah Jazz,NBA,201712270,2017,GSW,201712270GSW,UTA
4,11/17/2018,Orlando Magic,Los Angeles Lakers,NBA,201811170,2018,ORL,201811170ORL,LAL
5,11/18/2018,Orlando Magic,New York Knicks,NBA,201811180,2018,ORL,201811180ORL,NYK
6,12/2/2018,Miami Heat,Utah Jazz,NBA,201812020,2018,MIA,201812020MIA,UTA
7,11/8/2019,Atlanta Hawks,Sacramento Kings,NBA,201911080,2019,ATL,201911080ATL,SAC
8,11/3/2021,Sacramento Kings,New Orleans Pelicans,NBA,202111030,2021,SAC,202111030SAC,NOP
9,11/7/2021,Sacramento Kings,Indiana Pacers,NBA,202111070,2021,SAC,202111070SAC,IND


In [3]:
URL_list = nba_df["URL_Variable"].values.tolist()
Home_code_list = nba_df["Home_Code"].values.tolist()
Away_code_list = nba_df["Away_Code"].values.tolist()
Year_list = nba_df["Year"].values.tolist()

In [4]:
# Create empty lists to hold table data to be scraped
away_box_score = []
home_box_score = []
box_score = []

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.basketball-reference.com/boxscores/{URL_list[1]}.html'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has away team statistics
away_table = soup.select_one(f'#div_box-{Away_code_list[1]}-game-basic') 

# Grab data from table and put it into the list created above
for tr in away_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    away_box_score.append(tds)

# Grab the table element that has home team statistics
home_table = soup.select_one(f'#div_box-{Home_code_list[1]}-game-basic') 

# Grab data from table and put it into the list created above
for tr in home_table.select('tr:has(td)'):
    tds2 = [td.get_text(strip=True) for td in tr.select('td')]
    home_box_score.append(tds2)

# Grab the table element that has game box score
box_score_table = soup.select_one('#all_line_score')

# Game box score is commented out in html, so this will grab it out of the comments
for comment in box_score_table.find_all(text=lambda text: isinstance(text, Comment)):
    if comment.find("<table ") > 0:
        comment_soup = BeautifulSoup(comment, 'html.parser')
        table = comment_soup.find("table")

# Grab data from table and put it into the list created above
for tr in table.select('tr:has(td)'):
    tds3 = [td.get_text(strip=True) for td in tr.select('td')]
    box_score.append(tds3)


In [5]:
# Create dataframe for away team statistics
away_team_df = pd.DataFrame(away_box_score)

# Create an empty list to store away team statistics header information
away_header_list = []

# Grab the table header information to use as column headers in our away team statistics dataframe
for tr in away_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    away_header_list.append(ths)
    
# Create dataframe for home team statistics
home_team_df = pd.DataFrame(home_box_score)

# Create an empty list to store home team statistics header information
home_header_list = []

# Grab the table header information to use as column headers in our home team statistics dataframe
for tr in home_table.select('tr:has(th)'):
    ths2 = [th.get_text(strip=True) for th in tr.select('th')]
    home_header_list.append(ths2)
    
# Create dataframe for game box score
box_score_df = pd.DataFrame(box_score)

# Create an empty list to store game box score dataframe header information
box_score_list = []

# Grab the table header information to use as column headers in our game box score dataframe
for tr in box_score_table.select('tr:has(th)'):
    ths3 = [th.get_text(strip=True) for th in tr.select('th')]
    box_score_list.append(ths3)

In [6]:
# Update the column and row labels
box_score_df = box_score_df.set_axis(['1Q', '2Q', '3Q', '4Q', 'T'], axis=1, inplace=False)
box_score_df = box_score_df.set_axis([f'{Away_code_list[1]}', f'{Home_code_list[1]}'])

In [7]:
# Pull the away team player data and combine the lists into one list
away_starters = away_header_list[2:7]
away_bench = away_header_list[8:-1]
away_players = [away_starters,away_bench]

# Pull the home team player data and combine the lists into one list
home_starters = home_header_list[2:7]
home_bench = home_header_list[8:-1]
home_players = [home_starters,home_bench]

In [8]:
# Create a dataframe from away team stats
away_stats_df = pd.DataFrame(away_box_score)

# Create a dataframe from home team stats
home_stats_df = pd.DataFrame(home_box_score)

In [9]:
# Consolodate the away team list of lists into one list
away_starters_list = [item for sublist in away_starters for item in sublist]
away_bench_list = [item for sublist in away_bench for item in sublist]
away_roster = away_starters_list + away_bench_list

# Consolodate the home team list of lists into one list
home_starters_list = [item for sublist in home_starters for item in sublist]
home_bench_list = [item for sublist in home_bench for item in sublist]
home_roster = home_starters_list + home_bench_list

In [10]:
# Add Team Totals to the end of the away team list
away_roster.append('Team Totals')
away_stats_df.index = away_roster

# Add Team Totals to the end of the home team list
home_roster.append('Team Totals')
home_stats_df.index = home_roster

In [11]:
# Set the home and away dataframe column labels
column_headers = away_header_list[1]
column_headers.remove("Starters")
away_stats_df.columns = column_headers
home_stats_df.columns = column_headers

In [12]:
away_stats_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
Jeff Teague,35:41,9.0,17.0,0.529,2.0,4.0,0.5,6.0,7.0,0.857,0.0,4.0,4.0,8.0,3.0,0.0,0.0,0.0,26.0,24.0
DeMarre Carroll,33:53,3.0,9.0,0.333,0.0,4.0,0.0,4.0,5.0,0.8,1.0,6.0,7.0,3.0,1.0,0.0,2.0,2.0,10.0,1.0
Paul Millsap,33:42,2.0,11.0,0.182,1.0,3.0,0.333,10.0,10.0,1.0,3.0,8.0,11.0,4.0,0.0,0.0,2.0,4.0,15.0,22.0
Kyle Korver,32:00,3.0,5.0,0.6,1.0,3.0,0.333,2.0,2.0,1.0,2.0,6.0,8.0,2.0,1.0,0.0,1.0,6.0,9.0,28.0
Al Horford,29:13,6.0,13.0,0.462,0.0,0.0,,1.0,2.0,0.5,1.0,4.0,5.0,0.0,1.0,2.0,2.0,0.0,13.0,0.0
Elton Brand,18:47,3.0,5.0,0.6,0.0,0.0,,0.0,0.0,,1.0,4.0,5.0,0.0,0.0,1.0,0.0,0.0,6.0,6.0
Thabo Sefolosha,17:45,2.0,5.0,0.4,0.0,2.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,4.0,3.0,0.0,1.0,1.0,6.0,-9.0
Mike Scott,14:18,3.0,8.0,0.375,1.0,4.0,0.25,1.0,2.0,0.5,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,8.0,-16.0
Kent Bazemore,12:22,2.0,6.0,0.333,1.0,3.0,0.333,0.0,0.0,,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.0,5.0,-8.0
Dennis Schröder,12:19,0.0,2.0,0.0,0.0,0.0,,0.0,0.0,,1.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,0.0,-18.0


In [13]:
home_stats_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
Trey Burke,38:47,2.0,19.0,0.105,0.0,11.0,0.0,4.0,4.0,1.0,1.0,2.0,3.0,5.0,2.0,1.0,1.0,4.0,8.0,-19.0
Gordon Hayward,35:25,6.0,14.0,0.429,2.0,3.0,0.667,4.0,5.0,0.8,1.0,2.0,3.0,3.0,0.0,0.0,0.0,4.0,18.0,-15.0
Derrick Favors,30:21,2.0,7.0,0.286,0.0,0.0,,4.0,8.0,0.5,2.0,9.0,11.0,1.0,0.0,2.0,2.0,1.0,8.0,-13.0
Enes Freedom,22:03,5.0,12.0,0.417,0.0,0.0,,1.0,1.0,1.0,3.0,3.0,6.0,0.0,0.0,0.0,2.0,1.0,11.0,-15.0
Patrick Christopher,5:27,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.0
Rudy Gobert,24:06,3.0,3.0,1.0,0.0,0.0,,5.0,6.0,0.833,2.0,5.0,7.0,1.0,0.0,4.0,0.0,2.0,11.0,14.0
Dante Exum,22:28,5.0,9.0,0.556,3.0,7.0,0.429,0.0,0.0,,2.0,0.0,2.0,2.0,3.0,0.0,1.0,3.0,13.0,17.0
Ian Clark,21:18,2.0,8.0,0.25,1.0,5.0,0.2,0.0,0.0,,1.0,1.0,2.0,4.0,0.0,0.0,1.0,1.0,5.0,-5.0
Joe Ingles,20:35,3.0,4.0,0.75,2.0,2.0,1.0,0.0,0.0,,0.0,3.0,3.0,1.0,1.0,2.0,4.0,1.0,8.0,15.0
Trevor Booker,19:13,4.0,9.0,0.444,2.0,3.0,0.667,0.0,0.0,,4.0,5.0,9.0,1.0,0.0,0.0,2.0,1.0,10.0,3.0


In [14]:
box_score_df

Unnamed: 0,1Q,2Q,3Q,4Q,T
ATL,31,26,25,16,98
UTA,21,24,23,24,92


In [15]:
text = soup.get_text()
attendance_string = re.findall(r"Attendance:\s+\d*,\d*", text)
game_time_string = re.findall(r"Time of Game:\s+\d:\d*", text)
new_attendance_string = attendance_string[0].strip('([^a-z]xa)')
new_game_time_string = game_time_string[0].strip('([^a-z]xa)')
attendance_list = list(new_attendance_string)
game_time_list = list(new_game_time_string)
attendance_list.pop(11)
game_time_list.pop(13)

'\xa0'

In [16]:
final_attendance_string = "".join(attendance_list)
final_game_time_sting = "".join(game_time_list)

In [17]:
x = final_attendance_string.split(":")
y = final_game_time_sting.split(":",1)

In [18]:
attendance_df = pd.DataFrame(x, columns = [x[0]])
attendance_df = attendance_df.drop([0])

In [19]:
game_time_df = pd.DataFrame(y, columns = [y[0]])
game_time_df = game_time_df.drop([0])

In [20]:
game_logistics_df = pd.concat([attendance_df, game_time_df], ignore_index=True, sort=False)
game_logistics_df['Time of Game'] = game_logistics_df['Time of Game'].shift(-1)
game_logistics_df = game_logistics_df.dropna()

In [21]:
divparent = soup.find('div', attrs={'class':'scorebox_meta'})
divparent.text
text = divparent.text
result = re.split(r'\n', text)
almost_done = result[1]
final_result = almost_done.partition(f'{Year_list[1]}')[2]
game_logistics_df = game_logistics_df.assign(Venue=[final_result])
game_logistics_df

Unnamed: 0,Attendance,Time of Game,Venue
0,19029,2:09,"EnergySolutions Arena, Salt Lake City, Utah"
