In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
import time
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
team_codes_df = pd.read_csv('team_codes.csv')
games_attended_df = pd.read_csv('sports_attendance.csv')

games_attended_df["URL_Date"] = " "
games_attended_df["Year"] = " "

games_attended_df['Year'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['Year'] = games_attended_df['Year'].dt.strftime('%Y')
games_attended_df['URL_Date'] = pd.to_datetime(games_attended_df.Date)
games_attended_df['URL_Date'] = games_attended_df['URL_Date'].dt.strftime('%Y%m%d0')

nba_attendance_df = games_attended_df[games_attended_df.Sport == ('NBA')].copy()
mlb_attendance_df = games_attended_df[games_attended_df.Sport == ('MLB')].copy()
nfl_attendance_df = games_attended_df[games_attended_df.Sport == ('NFL')].copy()

nba_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Home', right_on='NBA_Teams')
nba_df = nba_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['URL_Variable'] = nba_df['URL_Date'] + nba_df['NBA_Codes']

away_code_df = nba_attendance_df.merge(team_codes_df, how='left', left_on='Visitor', right_on='NBA_Teams')
away_code_df = away_code_df.drop(columns=['MLB_Teams', 'MLB_Codes', 'NBA_Teams', 'NFL_Teams', 'NFL_Codes', 'NHL_Teams', 'NHL_Codes', 'MLS_Codes'])

nba_df['Away_Code'] = away_code_df['NBA_Codes']
nba_df.rename(columns = {'NBA_Codes':'Home_Code'}, inplace = True)
nba_df


Unnamed: 0,Date,Home,Visitor,Sport,URL_Date,Year,Home_Code,URL_Variable,Away_Code
0,2/11/2014,Los Angeles Lakers,Utah Jazz,NBA,201402110,2014,LAL,201402110LAL,UTA
1,1/2/2015,Utah Jazz,Atlanta Hawks,NBA,201501020,2015,UTA,201501020UTA,ATL
2,4/3/2016,Orlando Magic,Memphis Grizzlies,NBA,201604030,2016,ORL,201604030ORL,MEM
3,12/27/2017,Golden State Warriors,Utah Jazz,NBA,201712270,2017,GSW,201712270GSW,UTA
4,11/17/2018,Orlando Magic,Los Angeles Lakers,NBA,201811170,2018,ORL,201811170ORL,LAL
5,11/18/2018,Orlando Magic,New York Knicks,NBA,201811180,2018,ORL,201811180ORL,NYK
6,12/2/2018,Miami Heat,Utah Jazz,NBA,201812020,2018,MIA,201812020MIA,UTA
7,11/8/2019,Atlanta Hawks,Sacramento Kings,NBA,201911080,2019,ATL,201911080ATL,SAC
8,11/3/2021,Sacramento Kings,New Orleans Pelicans,NBA,202111030,2021,SAC,202111030SAC,NOP
9,11/7/2021,Sacramento Kings,Indiana Pacers,NBA,202111070,2021,SAC,202111070SAC,IND


In [3]:
URL_list = nba_df["URL_Variable"].values.tolist()
Home_code_list = nba_df["Home_Code"].values.tolist()
Away_code_list = nba_df["Away_Code"].values.tolist()
Year_list = nba_df["Year"].values.tolist()
Date_list = nba_df["URL_Date"].values.tolist()
Actual_date_list = nba_df["Date"].values.tolist()

In [4]:
# getting length of list
length = len(URL_list)

dataframe_dictionary = {}

box_score_df_list = []
home_stats_df_list = []
away_stats_df_list = []
game_logistics_df_list = []

for i in range(length):
    time.sleep(2)
    
    # Create empty lists to hold table data to be scraped
    away_box_score = []
    home_box_score = []
    box_score = []
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/boxscores/{URL_list[i]}.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has away team statistics
    away_table = soup.select_one(f'#div_box-{Away_code_list[i]}-game-basic') 

    # Grab data from table and put it into the list created above
    for tr in away_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        away_box_score.append(tds)

    # Grab the table element that has home team statistics
    home_table = soup.select_one(f'#div_box-{Home_code_list[i]}-game-basic') 

    # Grab data from table and put it into the list created above
    for tr in home_table.select('tr:has(td)'):
        tds2 = [td.get_text(strip=True) for td in tr.select('td')]
        home_box_score.append(tds2)

    # Grab the table element that has game box score
    box_score_table = soup.select_one('#all_line_score')

    # Game box score is commented out in html, so this will grab it out of the comments
    for comment in box_score_table.find_all(text=lambda text: isinstance(text, Comment)):
        if comment.find("<table ") > 0:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find("table")

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds3 = [td.get_text(strip=True) for td in tr.select('td')]
        box_score.append(tds3)
       
    # Create dataframe for away team statistics
    dataframe_dictionary[Away_code_list[i] + Date_list[i]] = pd.DataFrame(away_box_score)

    # Create an empty list to store away team statistics header information
    away_header_list = []

    # Grab the table header information to use as column headers in our away team statistics dataframe
    for tr in away_table.select('tr:has(th)'):
        ths = [th.get_text(strip=True) for th in tr.select('th')]
        away_header_list.append(ths)

    # Create dataframe for home team statistics
    dataframe_dictionary[Home_code_list[i] + Date_list[i]] = pd.DataFrame(home_box_score)

    # Create an empty list to store home team statistics header information
    home_header_list = []

    # Grab the table header information to use as column headers in our home team statistics dataframe
    for tr in home_table.select('tr:has(th)'):
        ths2 = [th.get_text(strip=True) for th in tr.select('th')]
        home_header_list.append(ths2)    
        
    # Create dataframe for game box score
    dataframe_dictionary['Box' + Date_list[i]] = pd.DataFrame(box_score)

    # Create an list to store game box score dataframe header information
    box_score_list = ['1Q', '2Q', '3Q', '4Q', 'T']

    # If a game goes into overtime(s), we need to add to the box score list for each overtime period
    OT_counter = 1
    while len(box_score[0]) > len(box_score_list):
        box_score_list.insert(-1, f'{OT_counter}OT')
        OT_counter = OT_counter + 1
        
    # Update the column and row labels
    dataframe_dictionary['Box' + Date_list[i]] = dataframe_dictionary['Box' + Date_list[i]].set_axis(box_score_list, axis=1, inplace=False)
    dataframe_dictionary['Box' + Date_list[i]] = dataframe_dictionary['Box' + Date_list[i]].set_axis([f'{Away_code_list[i]}', f'{Home_code_list[i]}'])
    
    dataframe_dictionary['Box' + Date_list[i]]['Date'] = Actual_date_list[i]
    box_score_df_list.append(dataframe_dictionary['Box' + Date_list[i]])
    
    # Pull the away team player data and combine the lists into one list
    away_starters = away_header_list[2:7]
    away_bench = away_header_list[8:-1]
    away_players = [away_starters,away_bench]

    # Pull the home team player data and combine the lists into one list
    home_starters = home_header_list[2:7]
    home_bench = home_header_list[8:-1]
    home_players = [home_starters,home_bench]

    # Consolodate the away team list of lists into one list
    away_starters_list = [item for sublist in away_starters for item in sublist]
    away_bench_list = [item for sublist in away_bench for item in sublist]
    away_roster = away_starters_list + away_bench_list

    # Consolodate the home team list of lists into one list
    home_starters_list = [item for sublist in home_starters for item in sublist]
    home_bench_list = [item for sublist in home_bench for item in sublist]
    home_roster = home_starters_list + home_bench_list

    # Add Team Totals to the end of the away team list
    away_roster.append('Team Totals')
    dataframe_dictionary[Away_code_list[i] + Date_list[i]].index = away_roster

    # Add Team Totals to the end of the home team list
    home_roster.append('Team Totals')
    dataframe_dictionary[Home_code_list[i] + Date_list[i]].index = home_roster

    # Set the home and away dataframe column labels
    column_headers = away_header_list[1]
    column_headers.remove("Starters")
    dataframe_dictionary[Away_code_list[i] + Date_list[i]].columns = column_headers
    dataframe_dictionary[Home_code_list[i] + Date_list[i]].columns = column_headers

    dataframe_dictionary[Away_code_list[i] + Date_list[i]]['Date'] = Actual_date_list[i]
    dataframe_dictionary[Home_code_list[i] + Date_list[i]]['Date'] = Actual_date_list[i]
    
    dataframe_dictionary[Away_code_list[i] + Date_list[i]]['Team'] = Away_code_list[i]
    dataframe_dictionary[Home_code_list[i] + Date_list[i]]['Team'] = Home_code_list[i]
    
    home_stats_df_list.append(dataframe_dictionary[Home_code_list[i] + Date_list[i]])
    away_stats_df_list.append(dataframe_dictionary[Away_code_list[i] + Date_list[i]])
    
    # Pull logistical information about the game: attendance, length of game, and venue
    text = soup.get_text()
    attendance_string = re.findall(r"Attendance:\s+\d*,\d*", text)
    
    # A failsafe, as one of the games has no attendance listed causing the loop to break
    if(len(attendance_string) != 0):
        
        # Clean up the attendance string that is pulled 
        new_attendance_string = attendance_string[0].strip('([^a-z]xa)')
        attendance_list = list(new_attendance_string)
        attendance_list.pop(11)
    else:
        attendance_string = "No Attendance Available"
    
    # Clean up the game length string that is pulled 
    game_time_string = re.findall(r"Time of Game:\s+\d:\d*", text)    
    new_game_time_string = game_time_string[0].strip('([^a-z]xa)')
    game_time_list = list(new_game_time_string)
    game_time_list.pop(13)

    # join the lists to create final string
    final_attendance_string = "".join(attendance_list)
    final_game_time_sting = "".join(game_time_list)

    # Create throwaway variables to hold split strings so we can pick out what we want later
    x = final_attendance_string.split(":")
    y = final_game_time_sting.split(":",1)

    # Pick out the strings before the : that we split above
    dataframe_dictionary['Attendance' + Date_list[i]] = pd.DataFrame(x, columns = [x[0]])
    dataframe_dictionary['Attendance' + Date_list[i]] = dataframe_dictionary['Attendance' + Date_list[i]].drop([0])
    dataframe_dictionary['Game_Time' + Date_list[i]] = pd.DataFrame(y, columns = [y[0]])
    dataframe_dictionary['Game_Time' + Date_list[i]] = dataframe_dictionary['Game_Time' + Date_list[i]].drop([0])

    # Join the attendance and game time dataframes into one dataframe
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = pd.concat([dataframe_dictionary['Attendance' + Date_list[i]], dataframe_dictionary['Game_Time' + Date_list[i]]], ignore_index=True, sort=False)
    dataframe_dictionary['Game_Logistics' + Date_list[i]]['Time of Game'] = dataframe_dictionary['Game_Logistics' + Date_list[i]]['Time of Game'].shift(-1)
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = dataframe_dictionary['Game_Logistics' + Date_list[i]].dropna()

    # Scrape the name of the vanue and location and add that to the logistics dataframe
    divparent = soup.find('div', attrs={'class':'scorebox_meta'})
    divparent.text
    text = divparent.text
    result = re.split(r'\n', text)
    almost_done = result[1]
    final_result = almost_done.partition(f'{Year_list[i]}')[2]
    dataframe_dictionary['Game_Logistics' + Date_list[i]] = dataframe_dictionary['Game_Logistics' + Date_list[i]].assign(Venue=[final_result])

    dataframe_dictionary['Game_Logistics' + Date_list[i]]['Date'] = Actual_date_list[i]
    game_logistics_df_list.append(dataframe_dictionary['Game_Logistics' + Date_list[i]])
    

In [5]:
soup


<!DOCTYPE html>

<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
<link href="https://cdn.ssref.net/req/202303231" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAME = '__tcfapiL

In [6]:
hof_table_list = []
hof_table_header_list = []

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.basketball-reference.com/awards/hof.html'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has NBA player statistics
hof_table = soup.select_one('#div_hof')
        
# Grab data from table and put it into the list created above
for tr in hof_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    hof_table_list.append(tds)

for tr in hof_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    hof_table_header_list.append(ths)
    
final_hof_table_header_list = hof_table_header_list[1]
final_hof_table_header_list.pop(0)

hof_df = pd.DataFrame(hof_table_list, columns=final_hof_table_header_list)

hof_names = hof_df['Name'].to_list()

cleaning_list = ['^(.*?)WNBA', '^(.*?)CBBplayer', '^(.*?)Coach', '^(.*?)CBBcoach', '^(.*?)Exec', '^(.*?)Ref', '^(.*?)Player', '^(.*?)Oly']
temp_hof_name_list = []

for name in hof_names:
#     temp_name_holder = 'a'
    cleaned_name = re.findall(r'^(.*?)\/', name)
    if cleaned_name == []:
        temp_hof_name_list.append(name)
    else:
        temp_hof_name_list.append(cleaned_name)

new_temp_name_list = []

for temp_name in temp_hof_name_list:
    if type(temp_name) == list:
        new_temp_name = str(temp_name[0])
        new_temp_name_list.append(new_temp_name)
    else:
        new_temp_name_list.append(temp_name)

cleaned_hof_name_list = []   

for temp_name in new_temp_name_list:
    name_holder = temp_name
    counter = 0 
    
    for string in cleaning_list:
        new_cleaned_name = re.findall(string, temp_name)

        if new_cleaned_name != []:
            cleaned_hof_name_list.append(new_cleaned_name)
            break
        elif (new_cleaned_name == [] and counter == len(cleaning_list)):
            cleaned_hof_name_list.append(name_holder)
        else:
            counter = counter + 1
            continue

            
final_temp_name_list = []

for final_name in cleaned_hof_name_list:
    if type(final_name) == list:
        final_name_string = str(final_name[0])
        final_temp_name_list.append(final_name_string)
    else:
        final_temp_name_list.append(final_name)

hof_df = pd.DataFrame(final_temp_name_list, columns = ["Player"])
hof_df


Unnamed: 0,Player
0,Swin Cash
1,Larry Costello
2,Hugh Evans
3,Manu Ginobili
4,Tim Hardaway
...,...
310,Walter Meanwell
311,George Mikan
312,James Naismith
313,Harold Olsen


In [7]:
mvp_table_list = []
mvp_table_header_list = []

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.basketball-reference.com/awards/mvp.html'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has NBA player statistics
mvp_table = soup.select_one('#div_mvp_NBA')
        
# Grab data from table and put it into the list created above
for tr in mvp_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    mvp_table_list.append(tds)

for tr in mvp_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    mvp_table_header_list.append(ths)
    
today = date.today()

# dd/mm/YY
if int(today.strftime("%M")) > 6:
    current_season = int(today.strftime("%Y"))
else:
    current_season = (int(today.strftime("%Y")) - 1)

mvp_player_list = []
mvp_year_list = []


for item in mvp_table_list:
    mvp_player_list.append(item[1])
    mvp_year_list.append(current_season)
    current_season = current_season - 1
    
mvp_df = pd.DataFrame(
    {'Year': mvp_year_list,
     'Player': mvp_player_list
    })

mvp_df

Unnamed: 0,Year,Player
0,2022,Nikola Jokić
1,2021,Nikola Jokić
2,2020,Giannis Antetokounmpo
3,2019,Giannis Antetokounmpo
4,2018,James Harden
...,...,...
62,1960,Wilt Chamberlain
63,1959,Bob Pettit
64,1958,Bill Russell
65,1957,Bob Cousy


In [8]:
all_league_table_list = []
all_league_table_header_list = []

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.basketball-reference.com/awards/all_league.html'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has NBA player statistics
all_league_table = soup.select_one('#div_awards_all_league')
        
# Grab data from table and put it into the list created above
for tr in all_league_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    all_league_table_list.append(tds)

for tr in all_league_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    all_league_table_header_list.append(ths)


today = date.today()

# dd/mm/YY
if int(today.strftime("%m")) > 6:
    current_season = int(today.strftime("%Y"))
else:
    current_season = (int(today.strftime("%Y")) - 1)

all_league_player_list = []
all_league_team_list = []
league_list = []


for item in all_league_table_list:
    counter = 3
    list_navigator = 3
    while list_navigator < 8:
        all_league_player_list.append(item[list_navigator])
        all_league_team_list.append(item[1])
        list_navigator = list_navigator + 1
        league_list.append(item[0])


cleaned_all_league_player_list = []
cleaned_all_league_team_list = []
cleaned_league_list = []
all_league_year_list = []

for item in all_league_player_list:
    if item == '':
        continue
    else:
        cleaned_all_league_player_list.append(item)

for item in all_league_team_list:
    if item == '':
        continue
    else:
        cleaned_all_league_team_list.append(item)

for item in league_list:
    if item == '':
        continue
    else:
        cleaned_league_list.append(item)

        
current_season = current_season + 1
        
for i in range(len(cleaned_all_league_team_list)):      
    if ((cleaned_all_league_team_list[i] == '1st') and (cleaned_all_league_team_list[i - 1] != '1st') and (cleaned_league_list[i] != 'ABA')):
        current_season = current_season - 1
        all_league_year_list.append(current_season)
    else:
        all_league_year_list.append(current_season)

        
all_league_df = pd.DataFrame(
    {'Year': all_league_year_list,
     'Player': cleaned_all_league_player_list,
     'Team': cleaned_all_league_team_list
    })

all_league_df


Unnamed: 0,Year,Player,Team
0,2022,Nikola JokićC,1st
1,2022,Giannis AntetokounmpoF,1st
2,2022,Jayson TatumF,1st
3,2022,Luka DončićG,1st
4,2022,Devin BookerG,1st
...,...,...,...
1015,1947,Frankie Baumholtz,2nd
1016,1947,Ernie Calverley,2nd
1017,1947,Chick Halbert,2nd
1018,1947,John Logan,2nd


In [9]:
time.sleep(75)

allstar_table_list = []

today = date.today()

# dd/mm/YY
allstar_year = int(today.strftime("%Y"))
if int(today.strftime("%m")) < 3:
    allstar_year = allstar_year - 1

start_year = 1951

while start_year <= allstar_year:   
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{start_year}.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    allstar_table = soup.select_one('#all_all_star_game_rosters')

    comment = allstar_table.find(text=lambda text: isinstance(text, Comment))
    if comment.find("table") > 0:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find_all("table")

    # Grab data from table and put it into the list created above
    for tr in table[0].select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        allstar_table_list.append(tds)
        
    for tr in table[1].select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        allstar_table_list.append(tds)
        
    allstar_table_list.append(start_year)

    if start_year == 1998:
        start_year = start_year + 2
    else:
        start_year = start_year + 1
    
    time.sleep(3)


allstar_teams_list = []
i = 0

while i <= (len(allstar_table_list)-1):
    allstar_teams_list.append(((allstar_table_list[i] + allstar_table_list[(i+1)]),allstar_table_list[(i+2)]))
    i = i + 3

allstar_df = pd.DataFrame(allstar_teams_list, columns = ('All_Stars','Year'))
allstar_df      

Unnamed: 0,All_Stars,Year
0,[Paul Arizin*Vince BorylaBob Cousy*Joe Fulks*H...,1951
1,[Paul Arizin*Bob Cousy*Joe Fulks*Harry Gallati...,1952
2,[Don Barksdale*Carl Braun*Bob Cousy*Bill Gabor...,1953
3,[Carl Braun*Bob Cousy*Ray FelixHarry Gallatin*...,1954
4,[Paul Arizin*Carl Braun*Bob Cousy*Harry Gallat...,1955
...,...,...
67,[Giannis AntetokounmpoStephen CurryJoel Embiid...,2019
68,[Bam AdebayoGiannis AntetokounmpoJimmy ButlerJ...,2020
69,[Bradley BealDevin BookerMike ConleyAnthony Da...,2021
70,[LaMelo BallDevin BookerKevin DurantJoel Embii...,2022


In [10]:
dpoy_table_list = []

today = date.today()

# input URL and use BeautifulSoup to parse through the page
url = f'https://www.basketball-reference.com/awards/dpoy.html'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has NBA player statistics
dpoy_table = soup.select_one('#div_dpoy_NBA')

# Grab data from table and put it into the list created above
for tr in dpoy_table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    dpoy_table_list.append(tds)

dpoy_df = pd.DataFrame(dpoy_table_list)
dpoy_list = dpoy_df[1].to_list()

# dd/mm/YY
dpoy_year = int(today.strftime("%Y"))
if int(today.strftime("%m")) < 7:
    dpoy_year = dpoy_year - 1

dpoy_seen_df_list = []
for item in dpoy_list:
    dpoy_seen_df_list.append([item,dpoy_year])
    dpoy_year = dpoy_year - 1

dpoy_seen_df = pd.DataFrame(dpoy_seen_df_list, columns = ['Player', 'Year'])
agg_dpoy_seen_years_list = []
agg_dpoy_seen_list = []
player_dpoy_count_list = []

for player in dpoy_seen_df['Player'].unique():
    agg_dpoy_seen_list.append(player)
    agg_dpoy_seen_years_list.append(dpoy_seen_df.loc[dpoy_seen_df['Player'] == player]['Year'].to_list())
    player_dpoy_count_list.append(dpoy_seen_df.loc[dpoy_seen_df['Player'] == player]['Year'].count())
    
agg_dpoy_seen_df = pd.DataFrame(columns = ['Player','X_DPOY','Years'])
agg_dpoy_seen_df['Player'] = agg_dpoy_seen_list
agg_dpoy_seen_df['Years'] = agg_dpoy_seen_years_list
agg_dpoy_seen_df['X_DPOY'] = player_dpoy_count_list

agg_dpoy_seen_df

Unnamed: 0,Player,X_DPOY,Years
0,Marcus Smart,1,[2022]
1,Rudy Gobert,3,"[2021, 2019, 2018]"
2,Giannis Antetokounmpo,1,[2020]
3,Draymond Green,1,[2017]
4,Kawhi Leonard,2,"[2016, 2015]"
5,Joakim Noah,1,[2014]
6,Marc Gasol,1,[2013]
7,Tyson Chandler,1,[2012]
8,Dwight Howard,3,"[2011, 2010, 2009]"
9,Kevin Garnett,1,[2008]


In [11]:
########## GUIDE TO ACCESSING DATAFRAMES ##########

## Pull Home Stats --> dataframe_dictionary['(Home Code)(Date)']

## Pull Away Stats --> dataframe_dictionary['(Away Code)(Date)']

## Pull Box Score --> dataframe_dictionary['Box(Date)']

## Pull Game Logistics --> dataframe_dictionary['Game_Logistics(Date)']

In [12]:
#Test
dataframe_dictionary['Game_Logistics201812020']

Unnamed: 0,Attendance,Time of Game,Venue,Date
0,19600,2:14,"AmericanAirlines Arena, Miami, Florida",12/2/2018


In [13]:
#Test
dataframe_dictionary['UTA201812020']

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date,Team
Donovan Mitchell,37:40,8.0,24.0,0.333,1.0,7.0,0.143,1.0,1.0,1.0,...,5.0,3.0,1.0,0.0,2.0,2.0,18.0,-5.0,12/2/2018,UTA
Rudy Gobert,37:15,5.0,9.0,0.556,0.0,0.0,,2.0,5.0,0.4,...,18.0,4.0,1.0,3.0,2.0,2.0,12.0,19.0,12/2/2018,UTA
Ricky Rubio,36:05,9.0,13.0,0.692,2.0,4.0,0.5,3.0,3.0,1.0,...,5.0,6.0,1.0,0.0,3.0,3.0,23.0,14.0,12/2/2018,UTA
Jae Crowder,34:09,6.0,15.0,0.4,3.0,7.0,0.429,4.0,4.0,1.0,...,6.0,2.0,1.0,0.0,4.0,3.0,19.0,1.0,12/2/2018,UTA
Joe Ingles,29:10,5.0,13.0,0.385,1.0,4.0,0.25,0.0,0.0,,...,4.0,4.0,2.0,0.0,0.0,4.0,11.0,7.0,12/2/2018,UTA
Royce O'Neale,18:43,2.0,2.0,1.0,2.0,2.0,1.0,0.0,0.0,,...,2.0,1.0,0.0,1.0,1.0,1.0,6.0,4.0,12/2/2018,UTA
Derrick Favors,17:53,2.0,4.0,0.5,1.0,2.0,0.5,0.0,0.0,,...,7.0,0.0,2.0,2.0,0.0,2.0,5.0,-19.0,12/2/2018,UTA
Kyle Korver,17:23,1.0,6.0,0.167,1.0,5.0,0.2,0.0,0.0,,...,2.0,0.0,0.0,0.0,1.0,0.0,3.0,-18.0,12/2/2018,UTA
Thabo Sefolosha,6:39,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,12/2/2018,UTA
Raul Neto,3:07,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,,...,0.0,1.0,0.0,0.0,0.0,0.0,2.0,-4.0,12/2/2018,UTA


In [14]:
#Test
dataframe_dictionary['MIA201812020']

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date,Team
Josh Richardson,39:10,4.0,16.0,0.25,2.0,6.0,0.333,6.0,6.0,1.0,...,3.0,4.0,0.0,0.0,4.0,2.0,16.0,-13.0,12/2/2018,MIA
Wayne Ellington,37:12,4.0,13.0,0.308,4.0,9.0,0.444,0.0,0.0,,...,7.0,1.0,1.0,0.0,0.0,4.0,12.0,-2.0,12/2/2018,MIA
Hassan Whiteside,34:02,11.0,17.0,0.647,0.0,0.0,,1.0,6.0,0.167,...,20.0,2.0,1.0,3.0,1.0,4.0,23.0,-8.0,12/2/2018,MIA
James Johnson,18:46,4.0,10.0,0.4,1.0,2.0,0.5,0.0,0.0,,...,2.0,2.0,1.0,1.0,2.0,2.0,9.0,-16.0,12/2/2018,MIA
Rodney McGruder,5:07,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.0,12/2/2018,MIA
Justise Winslow,32:05,4.0,9.0,0.444,1.0,2.0,0.5,0.0,0.0,,...,5.0,1.0,2.0,1.0,4.0,3.0,9.0,14.0,12/2/2018,MIA
Dwyane Wade,30:26,4.0,15.0,0.267,2.0,6.0,0.333,5.0,5.0,1.0,...,3.0,8.0,1.0,1.0,1.0,0.0,15.0,21.0,12/2/2018,MIA
Kelly Olynyk,29:10,4.0,8.0,0.5,3.0,6.0,0.5,5.0,7.0,0.714,...,8.0,1.0,1.0,1.0,1.0,0.0,16.0,18.0,12/2/2018,MIA
Bam Adebayo,14:01,1.0,2.0,0.5,0.0,0.0,,0.0,0.0,,...,2.0,1.0,1.0,1.0,0.0,1.0,2.0,10.0,12/2/2018,MIA
Tyler Johnson,Did Not Dress,,,,,,,,,,...,,,,,,,,,12/2/2018,MIA


In [15]:
#Test
dataframe_dictionary['Box201812020']

Unnamed: 0,1Q,2Q,3Q,4Q,T,Date
UTA,35,20,22,23,100,12/2/2018
MIA,27,26,27,22,102,12/2/2018


In [16]:
total_box_score_df = pd.concat(box_score_df_list)

total_box_score_df

Unnamed: 0,1Q,2Q,3Q,4Q,T,Date,1OT
UTA,16,32,20,28,96,2/11/2014,
LAL,27,10,23,19,79,2/11/2014,
ATL,31,26,25,16,98,1/2/2015,
UTA,21,24,23,24,92,1/2/2015,
MEM,30,26,29,22,107,4/3/2016,
ORL,29,38,30,22,119,4/3/2016,
UTA,24,23,22,32,101,12/27/2017,
GSW,23,25,42,36,126,12/27/2017,
LAL,31,22,28,36,117,11/17/2018,
ORL,25,37,38,30,130,11/17/2018,


In [17]:
total_home_stats_df = pd.concat(home_stats_df_list)

team_col = total_home_stats_df.pop("Team")
total_home_stats_df.insert(0, "Team", team_col)

total_home_stats_df

Unnamed: 0,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
Wesley Johnson,LAL,37:21,7,14,.500,1,3,.333,0,0,...,6,6,0,0,0,2,3,15,-14,2/11/2014
Chris Kaman,LAL,36:44,11,24,.458,0,0,,3,3,...,10,14,4,1,3,3,3,25,-5,2/11/2014
Steve Blake,LAL,36:25,2,11,.182,1,4,.250,0,0,...,3,5,8,2,0,1,2,5,-3,2/11/2014
Shawne Williams,LAL,36:13,4,8,.500,2,4,.500,1,2,...,5,6,0,0,2,0,3,11,-10,2/11/2014
Steve Nash,LAL,16:35,1,4,.250,0,2,.000,0,0,...,1,1,2,0,0,0,0,2,-6,2/11/2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Terence Davis,SAC,Did Not Play,,,,,,,,,...,,,,,,,,,,3/25/2023
PJ Dozier,SAC,Did Not Play,,,,,,,,,...,,,,,,,,,,3/25/2023
Keon Ellis,SAC,Did Not Play,,,,,,,,,...,,,,,,,,,,3/25/2023
Alex Len,SAC,Did Not Play,,,,,,,,,...,,,,,,,,,,3/25/2023


In [18]:
total_away_stats_df = pd.concat(away_stats_df_list)

team_col = total_away_stats_df.pop("Team")
total_away_stats_df.insert(0, "Team", team_col)

total_away_stats_df

Unnamed: 0,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
Gordon Hayward,UTA,34:46,5,13,.385,2,4,.500,3,4,...,8,8,7,2,0,4,2,15,+12,2/11/2014
Trey Burke,UTA,28:49,3,11,.273,1,3,.333,1,1,...,3,4,8,1,0,1,1,8,+2,2/11/2014
Derrick Favors,UTA,27:03,4,9,.444,0,0,,3,7,...,8,10,1,1,3,1,3,11,-4,2/11/2014
Marvin Williams,UTA,22:29,4,8,.500,1,2,.500,0,0,...,4,7,1,0,0,1,0,9,-4,2/11/2014
Richard Jefferson,UTA,19:53,0,4,.000,0,2,.000,2,2,...,1,1,2,0,0,0,0,2,-13,2/11/2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Udoka Azubuike,UTA,10:22,1,2,.500,0,0,,0,0,...,3,6,1,0,0,1,0,2,-4,3/25/2023
Johnny Juzang,UTA,9:33,1,5,.200,0,3,.000,0,0,...,1,2,0,0,0,0,0,2,-4,3/25/2023
Damian Jones,UTA,7:23,0,0,,0,0,,0,0,...,1,2,1,1,1,0,0,0,+4,3/25/2023
Micah Potter,UTA,Did Not Play,,,,,,,,,...,,,,,,,,,,3/25/2023


In [19]:
total_game_logistics_df = pd.concat(game_logistics_df_list)
total_game_logistics_df

Unnamed: 0,Attendance,Time of Game,Venue,Date
0,18209,1:59,"STAPLES Center, Los Angeles, California",2/11/2014
0,19029,2:09,"EnergySolutions Arena, Salt Lake City, Utah",1/2/2015
0,17741,2:08,"Amway Center, Orlando, Florida",4/3/2016
0,19596,2:11,"Oracle Arena, Oakland, California",12/27/2017
0,19249,2:10,"Amway Center, Orlando, Florida",11/17/2018
0,15898,2:19,"Amway Center, Orlando, Florida",11/18/2018
0,19600,2:14,"AmericanAirlines Arena, Miami, Florida",12/2/2018
0,16447,2:06,"State Farm Arena, Atlanta, Georgia",11/8/2019
0,16447,2:10,"Golden 1 Center, Sacramento, California",11/3/2021
0,12993,2:15,"Golden 1 Center, Sacramento, California",11/7/2021


In [20]:
index_list = total_box_score_df.index.tolist()
home_away_list = []


for i in range(len(index_list)):
    if (i % 2) == 0:
        home_away_list.append('Away')
    else:
        home_away_list.append('Home')


total_box_score_df = total_box_score_df.set_index('Date')
total_box_score_df.insert(0, 'Team', index_list)
total_box_score_df.insert(0, 'Home_Away', home_away_list)
total_box_score_df

Unnamed: 0_level_0,Home_Away,Team,1Q,2Q,3Q,4Q,T,1OT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2/11/2014,Away,UTA,16,32,20,28,96,
2/11/2014,Home,LAL,27,10,23,19,79,
1/2/2015,Away,ATL,31,26,25,16,98,
1/2/2015,Home,UTA,21,24,23,24,92,
4/3/2016,Away,MEM,30,26,29,22,107,
4/3/2016,Home,ORL,29,38,30,22,119,
12/27/2017,Away,UTA,24,23,22,32,101,
12/27/2017,Home,GSW,23,25,42,36,126,
11/17/2018,Away,LAL,31,22,28,36,117,
11/17/2018,Home,ORL,25,37,38,30,130,


In [21]:
total_game_logistics_df.to_csv('total_game_logistics.csv')
total_away_stats_df.to_csv('total_away_stats.csv')
total_home_stats_df.to_csv('total_home_stats.csv')
total_box_score_df.to_csv('total_box_score_df.csv')
hof_df.to_csv('nba_hof.csv')
mvp_df.to_csv('nba_mvp.csv')
all_league_df.to_csv('all_league_players.csv')
allstar_df.to_csv('allstars.csv')
agg_dpoy_seen_df.to_csv('dpoy.csv')