In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
import time
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
# Load the team-code lookup table and the personal game-attendance log
team_codes_df = pd.read_csv('team_codes.csv')
games_attended_df = pd.read_csv('badal_sports_attendance.csv')

# Parse the game dates once and derive both helper columns from the parsed values:
#   URL_Date -> YYYYMMDD followed by a literal 0 (used to build box score URLs)
#   Year     -> four-digit year as a string
parsed_dates = pd.to_datetime(games_attended_df['Date'])
games_attended_df['URL_Date'] = parsed_dates.dt.strftime('%Y%m%d0')
games_attended_df['Year'] = parsed_dates.dt.strftime('%Y')

# One independent copy of the log per sport
nba_attendance_df = games_attended_df.loc[games_attended_df['Sport'] == 'NBA'].copy()
mlb_attendance_df = games_attended_df.loc[games_attended_df['Sport'] == 'MLB'].copy()
nfl_attendance_df = games_attended_df.loc[games_attended_df['Sport'] == 'NFL'].copy()

# Lookup-table columns that are not needed once the NBA code has been joined in
unused_code_columns = ['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes']

# Attach the home team's code, then build the identifier used in the box score URL
nba_df = (
    nba_attendance_df
    .merge(team_codes_df, how='left', left_on='Home', right_on='NBA_Teams')
    .drop(columns=unused_code_columns)
)
nba_df['URL_Variable'] = nba_df['URL_Date'] + nba_df['NBA_Codes']

# Same lookup keyed on the visiting team; both merges start from the same left
# frame, so the rows line up by position with nba_df
away_code_df = (
    nba_attendance_df
    .merge(team_codes_df, how='left', left_on='Visitor', right_on='NBA_Teams')
    .drop(columns=unused_code_columns)
)

nba_df['Away_Code'] = away_code_df['NBA_Codes']
nba_df = nba_df.rename(columns={'NBA_Codes': 'Home_Code'})
nba_df


Unnamed: 0,Date,Home,Visitor,Sport,URL_Date,Year,Home_Code,URL_Variable,Away_Code
0,2/1/20,Portland Trailblazers,Utah Jazz,NBA,202002010,2020,POR,202002010POR,UTA
1,11/15/17,Los Angeles Lakers,Philadelphia 76ers,NBA,201711150,2017,LAL,201711150LAL,PHI
2,12/27/17,Golden State Warriors,Utah Jazz,NBA,201712270,2017,GSW,201712270GSW,UTA
3,11/3/21,Sacramento Kings,New Orleans Pelicans,NBA,202111030,2021,SAC,202111030SAC,NOP
4,4/21/22,Utah Jazz,Dallas Mavericks,NBA,202204210,2022,UTA,202204210UTA,DAL
5,2/7/23,Los Angeles Lakers,Oklahoma City Thunder,NBA,202302070,2023,LAL,202302070LAL,OKC
6,12/28/23,Portland Trailblazers,San Antonio Spurs,NBA,202312280,2023,POR,202312280POR,SAS
7,12/29/23,Portland Trailblazers,San Antonio Spurs,NBA,202312290,2023,POR,202312290POR,SAS
8,12/22/23,Sacramento Kings,Phoenix Suns,NBA,202312220,2023,SAC,202312220SAC,PHO
9,1/15/24,Atlanta Hawks,San Antonio Spurs,NBA,202401150,2024,ATL,202401150ATL,SAS


In [3]:
# Pull the columns the scraping loop needs out of nba_df as plain Python lists
# (one entry per attended game, all in the same row order)
(
    URL_list,
    Home_code_list,
    Away_code_list,
    Year_list,
    Date_list,
    Actual_date_list,
) = (
    nba_df[column_name].tolist()
    for column_name in ("URL_Variable", "Home_Code", "Away_Code", "Year", "URL_Date", "Date")
)

In [4]:
# Scrape the basketball-reference box score page of every attended NBA game.
# For each game this collects the player stats of both teams, the line score
# (points per period) and the game logistics (attendance, duration, venue).
# Every frame is stored in dataframe_dictionary and also appended to one of
# the *_df_list lists so the games can be concatenated afterwards.

# Number of games to scrape
length = len(URL_list)

dataframe_dictionary = {}

box_score_df_list = []
home_stats_df_list = []
away_stats_df_list = []
game_logistics_df_list = []

# Pause between requests and maximum time to wait for a response (seconds)
REQUEST_DELAY_SECONDS = 2
REQUEST_TIMEOUT_SECONDS = 30

# Placeholders stored when the page does not list the corresponding value
NO_ATTENDANCE = "No Attendance Available"
NO_GAME_TIME = "No Game Time Available"


def _cell_text_rows(table, cell_tag):
    """Return the stripped text of all `cell_tag` cells, one list per table row.

    Only rows containing at least one `cell_tag` element ('td' or 'th') are included.
    """
    return [
        [cell.get_text(strip=True) for cell in row.select(cell_tag)]
        for row in table.select(f'tr:has({cell_tag})')
    ]


def _commented_table(container):
    """Return the <table> hidden inside the HTML comments of `container`.

    The last comment containing a table wins. Returns None when `container`
    is None or none of its comments holds a table.
    """
    if container is None:
        return None
    found = None
    for comment in container.find_all(string=lambda node: isinstance(node, Comment)):
        if "<table " in comment:
            found = BeautifulSoup(comment, 'html.parser').find("table")
    return found


def _team_stats_frame(stat_rows, header_rows, column_headers, team_code, game_date):
    """Build one team's player-stats frame, indexed by player name.

    stat_rows      -- text of the <td> cells, one list per table row
    header_rows    -- text of the <th> cells, one list per table row
    column_headers -- labels for the stat columns
    team_code      -- value written to the 'Team' column
    game_date      -- value written to the 'Date' column
    """
    # Header rows 2-6 are the starters and rows 8 up to (not including) the last
    # one are the bench; row 7 is skipped (presumably the "Reserves" sub-header
    # -- TODO confirm against the page layout)
    player_rows = header_rows[2:7] + header_rows[8:-1]
    roster = [name for row in player_rows for name in row]
    roster.append('Team Totals')

    frame = pd.DataFrame(stat_rows)
    frame.index = roster
    frame.columns = column_headers
    frame['Date'] = game_date
    frame['Team'] = team_code
    return frame


for i in range(length):
    time.sleep(REQUEST_DELAY_SECONDS)

    away_code = Away_code_list[i]
    home_code = Home_code_list[i]
    date_key = Date_list[i]
    game_date = Actual_date_list[i]

    # Download and parse the box score page; fail loudly on HTTP errors
    # (e.g. rate limiting) instead of failing later on a missing table
    url = f'https://www.basketball-reference.com/boxscores/{URL_list[i]}.html'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the basic stats table of each team
    away_table = soup.select_one(f'#div_box-{away_code}-game-basic')
    home_table = soup.select_one(f'#div_box-{home_code}-game-basic')
    if away_table is None or home_table is None:
        raise ValueError(f'Team stats table not found at {url}')

    # The line score is commented out in the html, so it is pulled out of the
    # comments. It is looked up fresh for every game so that a page without it
    # can never fall back to the previous game's table.
    line_score_table = _commented_table(soup.select_one('#all_line_score'))
    if line_score_table is None:
        raise ValueError(f'Line score table not found at {url}')

    # ----- Line score ("Box") -----
    box_score = _cell_text_rows(line_score_table, 'td')

    # Four quarters and the total, plus one column per overtime period played
    overtime_periods = len(box_score[0]) - 5
    box_score_list = (
        ['1Q', '2Q', '3Q', '4Q']
        + [f'{period}OT' for period in range(1, overtime_periods + 1)]
        + ['T']
    )

    # Labels are assigned through the attributes because the `inplace`
    # argument of set_axis is not available in every pandas version
    box_df = pd.DataFrame(box_score)
    box_df.columns = box_score_list
    box_df.index = [f'{away_code}', f'{home_code}']
    box_df['Date'] = game_date

    dataframe_dictionary['Box' + date_key] = box_df
    box_score_df_list.append(box_df)

    # ----- Player stats -----
    away_header_list = _cell_text_rows(away_table, 'th')
    home_header_list = _cell_text_rows(home_table, 'th')

    # Both teams share the stat labels found in the away table's second header
    # row; the leading "Starters" label belongs to the row index, not a column
    column_headers = away_header_list[1]
    column_headers.remove("Starters")

    away_df = _team_stats_frame(
        _cell_text_rows(away_table, 'td'), away_header_list, column_headers, away_code, game_date
    )
    home_df = _team_stats_frame(
        _cell_text_rows(home_table, 'td'), home_header_list, column_headers, home_code, game_date
    )

    dataframe_dictionary[away_code + date_key] = away_df
    dataframe_dictionary[home_code + date_key] = home_df
    home_stats_df_list.append(home_df)
    away_stats_df_list.append(away_df)

    # ----- Game logistics: attendance, length of game, and venue -----
    page_text = soup.get_text()

    # Each value is taken from this page only; when the page does not list it
    # a placeholder is stored rather than a value left over from another game
    attendance_match = re.search(r"Attendance:\s+(\d*,\d*)", page_text)
    attendance = attendance_match.group(1) if attendance_match else NO_ATTENDANCE

    game_time_match = re.search(r"Time of Game:\s+(\d:\d*)", page_text)
    game_time = game_time_match.group(1) if game_time_match else NO_GAME_TIME

    # The first line of the scorebox meta block holds "<date> <year><venue>";
    # everything after the year is the venue name and location
    scorebox_meta = soup.find('div', attrs={'class': 'scorebox_meta'})
    first_meta_line = scorebox_meta.text.split('\n')[1]
    venue = first_meta_line.partition(f'{Year_list[i]}')[2]

    # Single-value frames kept under their own keys for direct lookup
    dataframe_dictionary['Attendance' + date_key] = pd.DataFrame({'Attendance': [attendance]}, index=[1])
    dataframe_dictionary['Game_Time' + date_key] = pd.DataFrame({'Time of Game': [game_time]}, index=[1])

    # One-row summary of the game
    logistics_df = pd.DataFrame({
        'Attendance': [attendance],
        'Time of Game': [game_time],
        'Venue': [venue],
        'Date': [game_date],
    })
    dataframe_dictionary['Game_Logistics' + date_key] = logistics_df
    game_logistics_df_list.append(logistics_df)
    

In [5]:
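########## GUIDE TO ACCESSING DATAFRAMES ##########

## (Date) is the game's URL_Date string: YYYYMMDD followed by a trailing 0, e.g. 202002010

## Pull Home Stats --> dataframe_dictionary['(Home Code)(Date)'], e.g. dataframe_dictionary['POR202002010']

## Pull Away Stats --> dataframe_dictionary['(Away Code)(Date)'], e.g. dataframe_dictionary['UTA202002010']

## Pull Box Score --> dataframe_dictionary['Box(Date)'], e.g. dataframe_dictionary['Box202002010']

## Pull Game Logistics --> dataframe_dictionary['Game_Logistics(Date)'], e.g. dataframe_dictionary['Game_Logistics202002010']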

In [6]:
#Test
# dataframe_dictionary['Game_Logistics201812020']

In [7]:
#Test
# dataframe_dictionary['UTA201812020']

In [8]:
#Test
# dataframe_dictionary['MIA201812020']

In [9]:
#Test
# dataframe_dictionary['Box201812020']

In [10]:
# Stack the per-game line scores on top of each other (two rows per game)
total_box_score_df = pd.concat(box_score_df_list, axis=0)

total_box_score_df

Unnamed: 0,1Q,2Q,3Q,4Q,T,Date
UTA,34,19,30,24,107,2/1/20
POR,33,30,31,30,124,2/1/20
PHI,29,29,25,32,115,11/15/17
LAL,26,28,30,25,109,11/15/17
UTA,24,23,22,32,101,12/27/17
GSW,23,25,42,36,126,12/27/17
NOP,26,26,35,12,99,11/3/21
SAC,27,29,36,20,112,11/3/21
DAL,27,41,29,29,126,4/21/22
UTA,20,31,40,27,118,4/21/22


In [11]:
# Combine the home-team player stats of every game into one table
total_home_stats_df = pd.concat(home_stats_df_list)

# Move the Team column to the front: take it out, then re-insert it at position 0
team_col = total_home_stats_df["Team"]
total_home_stats_df = total_home_stats_df.drop(columns="Team")
total_home_stats_df.insert(0, "Team", team_col)

total_home_stats_df

Unnamed: 0,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
Damian Lillard,POR,36:19,17,29,.586,9,15,.600,8,8,...,2,2,12,1,0,2,2,51,+17,2/1/20
Hassan Whiteside,POR,34:47,7,10,.700,0,0,,3,6,...,15,21,0,1,3,2,2,17,+20,2/1/20
CJ McCollum,POR,34:00,6,14,.429,2,4,.500,0,0,...,3,3,2,1,1,3,4,14,+14,2/1/20
Carmelo Anthony,POR,32:01,5,15,.333,1,3,.333,4,4,...,2,5,1,0,0,0,1,15,+9,2/1/20
Trevor Ariza,POR,26:17,3,6,.500,0,1,.000,0,0,...,5,7,2,0,0,1,4,6,+20,2/1/20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bruno Fernando,ATL,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24
AJ Griffin,ATL,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24
Seth Lundy,ATL,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24
Patty Mills,ATL,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24


In [12]:
# Combine the away-team player stats of every game into one table
total_away_stats_df = pd.concat(away_stats_df_list)

# Move the Team column to the front: take it out, then re-insert it at position 0
team_col = total_away_stats_df["Team"]
total_away_stats_df = total_away_stats_df.drop(columns="Team")
total_away_stats_df.insert(0, "Team", team_col)

total_away_stats_df

Unnamed: 0,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
Donovan Mitchell,UTA,32:24,11,21,.524,3,6,.500,0,0,...,2,2,4,1,0,2,2,25,-9,2/1/20
Rudy Gobert,UTA,32:04,1,4,.250,0,0,,4,4,...,7,11,2,1,3,1,4,6,-14,2/1/20
Mike Conley,UTA,30:01,9,20,.450,4,6,.667,0,0,...,3,4,4,0,0,1,0,22,-4,2/1/20
Bojan Bogdanović,UTA,28:57,6,11,.545,5,9,.556,5,5,...,1,2,1,1,0,1,3,22,-10,2/1/20
Joe Ingles,UTA,28:02,1,2,.500,1,2,.500,0,0,...,1,1,5,1,0,1,2,3,-6,2/1/20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Malaki Branham,SAS,9:51,2,4,.500,0,1,.000,0,0,...,0,0,0,0,0,0,1,4,-21,1/15/24
Sandro Mamukelashvili,SAS,1:24,0,1,.000,0,1,.000,0,0,...,0,0,0,0,0,0,0,0,-4,1/15/24
Mamadi Diakite,SAS,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24
Devonte' Graham,SAS,Did Not Play,,,,,,,,,...,,,,,,,,,,1/15/24


In [13]:
# Stack the one-row logistics frames (attendance, duration, venue, date) of every game
total_game_logistics_df = pd.concat(game_logistics_df_list, axis=0)

total_game_logistics_df

Unnamed: 0,Attendance,Time of Game,Venue,Date
0,19603,2:08,"Moda Center, Portland, Oregon",2/1/20
0,18997,2:19,"STAPLES Center, Los Angeles, California",11/15/17
0,19596,2:11,"Oracle Arena, Oakland, California",12/27/17
0,19596,2:10,"Golden 1 Center, Sacramento, California",11/3/21
0,18306,2:37,"Vivint Smart Home Arena, Salt Lake City, Utah",4/21/22
0,18997,2:32,"Crypto.com Arena, Los Angeles, California",2/7/23
0,19335,2:15,"Moda Center, Portland, Oregon",12/28/23
0,18861,2:13,"Moda Center, Portland, Oregon",12/29/23
0,17794,2:18,"Golden 1 Center, Sacramento, California",12/22/23
0,17447,2:17,"State Farm Arena, Atlanta, Georgia",1/15/24


In [14]:
# The index of the combined line score holds the team codes
index_list = list(total_box_score_df.index)

# Rows alternate per game: even positions are labelled 'Away', odd positions 'Home'
home_away_list = [
    'Home' if position % 2 else 'Away'
    for position in range(len(index_list))
]

# Re-index by game date and expose side and team code as the leading columns
total_box_score_df = total_box_score_df.set_index('Date')
total_box_score_df.insert(0, 'Team', index_list)
total_box_score_df.insert(0, 'Home_Away', home_away_list)
total_box_score_df

Unnamed: 0_level_0,Home_Away,Team,1Q,2Q,3Q,4Q,T
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2/1/20,Away,UTA,34,19,30,24,107
2/1/20,Home,POR,33,30,31,30,124
11/15/17,Away,PHI,29,29,25,32,115
11/15/17,Home,LAL,26,28,30,25,109
12/27/17,Away,UTA,24,23,22,32,101
12/27/17,Home,GSW,23,25,42,36,126
11/3/21,Away,NOP,26,26,35,12,99
11/3/21,Home,SAC,27,29,36,20,112
4/21/22,Away,DAL,27,41,29,29,126
4/21/22,Home,UTA,20,31,40,27,118


In [15]:
# Write each combined table to its own CSV file in the working directory
for csv_path, frame in (
    ('total_game_logistics.csv', total_game_logistics_df),
    ('total_away_stats.csv', total_away_stats_df),
    ('total_home_stats.csv', total_home_stats_df),
    ('total_box_score_df.csv', total_box_score_df),
):
    frame.to_csv(csv_path)
# hof_df.to_csv('nba_hof.csv')
# mvp_df.to_csv('nba_mvp.csv')
# all_league_df.to_csv('all_league_players.csv')
# allstar_df.to_csv('allstars.csv')
# agg_dpoy_seen_df.to_csv('dpoy.csv')