In [2]:
import nba_api
import pandas as pd
import sklearn
import os
import numpy as np
import requests
import time
from sklearn.model_selection import KFold
import datetime as dt
import seaborn as sns
from nba_api.stats.static import players
import matplotlib.pyplot as plt
import math
from datetime import datetime, timedelta
from nba_api.stats.library.parameters import SeasonAllNullable
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
from tqdm import tqdm
from scipy.stats import norm
from nba_api.stats import endpoints
from nba_api.stats.static import teams
from sklearn.ensemble import RandomForestClassifier
import ipynb
import ipynb.fs
from itertools import compress


## Also read in ELO functions and tracked data / modeling functions

ELO_track = pd.read_csv('/Users/nickbachelder/Buckets Model/ELO_track', index_col=0)
ELO_track['Date'] = pd.to_datetime(ELO_track['Date'])

from ipynb.fs.defs.ELO_Tracking import get_pre_elo


## Also read in cleaned NBA data
NBAgamesC = pd.read_csv('/Users/nickbachelder/Buckets Model/NBAgamesC', index_col=0)
NBAgamesC['GAME_DATE_H'] = pd.to_datetime(NBAgamesC['GAME_DATE_H'])
nba_team = pd.read_csv('/Users/nickbachelder/Buckets Model/nba_team', index_col=0)
NBAgames = pd.read_csv('/Users/nickbachelder/Buckets Model/NBAgames', index_col=0)
NBAgames['GAME_DATE'] = pd.to_datetime(NBAgames['GAME_DATE'])
NBAgamesC['GAME_ID_H'] = NBAgamesC['GAME_ID_H'].astype(int)

In [92]:
# get all box scores since 2015

from nba_api.stats.library.parameters import SeasonType
from nba_api.stats.library.parameters import SeasonTypePlayoffs
from nba_api.stats.library.parameters import  SeasonNullable

### To avoid pulling from api, lets pull all player box scores since 2015
def get_all_player_box(seasons = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
                                  '2010', '2011', '2012', '2013', '2014',
                                  '2015', '2016', '2017', '2018', '2019', '2020']):
    all_boxes = pd.DataFrame()
    for season in tqdm(seasons):
        season = '{}-{}'.format(int(season), str(int(season) + 1)[2:])
        regular_boxes = endpoints.PlayerGameLogs(season_type_nullable = SeasonType.regular, 
                                              season_nullable = season).get_data_frames()[0]
        
        post_boxes = endpoints.PlayerGameLogs(season_type_nullable = SeasonTypePlayoffs.playoffs, 
                                              season_nullable = season).get_data_frames()[0]


        post_boxes['post'] = 1
        regular_boxes['post'] = 0
        boxes_year = pd.concat([regular_boxes, post_boxes])
        all_boxes = pd.concat([all_boxes, boxes_year])
    all_boxes['GAME_DATE'] = pd.to_datetime(all_boxes['GAME_DATE'].str[:10])
    all_boxes['GAME_ID'] = all_boxes['GAME_ID'].astype(int)
    all_boxes = all_boxes.loc[all_boxes['MIN'] != 0] 
    
    all_boxes = all_boxes[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
                           'GAME_ID', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 
                           'FTM', 'FTA', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK', 'PTS']]
    return(all_boxes)
        
        
all_boxes = get_all_player_box()

all_boxes

100%|██████████| 21/21 [05:57<00:00, 17.03s/it]


Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GAME_ID,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS
0,2000-01,458,Howard Eisley,1610612742,DAL,20001185,25.318333,3,4,0.750,0,1,0.000,0,0,0.000,2,4,1,0,6
1,2000-01,2042,Courtney Alexander,1610612764,WAS,20001182,47.238333,12,23,0.522,4,6,0.667,5,6,0.833,1,1,1,0,33
2,2000-01,956,Erick Dampier,1610612744,GSW,20001188,30.533333,5,12,0.417,0,0,0.000,2,2,1.000,12,1,0,5,12
3,2000-01,915,Rodney Rogers,1610612756,PHX,20001187,25.250000,3,8,0.375,1,1,1.000,7,8,0.875,2,1,2,0,14
4,2000-01,1630,Mikki Moore,1610612765,DET,20001180,16.250000,2,5,0.400,0,0,0.000,2,2,1.000,8,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1859,2020-21,1629014,Anfernee Simons,1610612757,POR,42000161,23.283333,5,6,0.833,4,5,0.800,0,0,0.000,4,2,0,0,14
1860,2020-21,203932,Aaron Gordon,1610612743,DEN,42000161,28.316667,7,16,0.438,2,4,0.500,0,2,0.000,8,1,0,1,16
1861,2020-21,2546,Carmelo Anthony,1610612757,POR,42000161,21.583333,6,12,0.500,4,8,0.500,2,2,1.000,3,1,1,0,18
1862,2020-21,203468,CJ McCollum,1610612757,POR,42000161,36.483333,8,20,0.400,3,7,0.429,2,2,1.000,6,3,0,1,21


In [93]:
def min_to_int(minutes_w_seconds):
    minutes = int(minutes_w_seconds.partition(':')[0])
    seconds = int(minutes_w_seconds.partition(':')[2])/60
    time_total = round((minutes + seconds), 2)
    return(time_total)

In [94]:
def all_box_score_fixer(all_boxes, NBAgamesC): ## for some games, the nba_api skips box scores (play in games, etc) 
    unique_boxes_games = list(all_boxes['GAME_ID'].unique())
    unique_NBAgamesC_games = list(NBAgamesC['GAME_ID_H'].unique())
    
    missing = list(set(unique_NBAgamesC_games) - set(unique_boxes_games))
    missing = list(map(str, missing))
    missing = [('0' * (10 - len(x))) + x for x in missing]
    
    missing_boxes = pd.DataFrame()
    for code in tqdm(missing):
        add = endpoints.BoxScoreTraditionalV2(game_id = code).get_data_frames()[0]
        time.sleep(0.2)
        season = NBAgamesC[NBAgamesC['GAME_ID_H'] == int(code)]['SEASON_ID_H'].iloc[0]
        season_id = '{}-{}'.format(season, (season+1) - 2000)
        add = add[['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
                           'GAME_ID', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 
                           'FTM', 'FTA', 'FT_PCT', 'REB', 'AST', 'STL', 'BLK', 'PTS']]
        add['SEASON_YEAR'] = season_id
        add = add[~add['MIN'].isna()]
        add['MIN'] = [min_to_int(sub) for sub in add['MIN']]
        
        
        missing_boxes = pd.concat([missing_boxes, add])
    all_boxes_fixed = pd.concat([missing_boxes, all_boxes])
    all_boxes_fixed = all_boxes_fixed[all_boxes_fixed['GAME_ID'].isin(unique_NBAgamesC_games)].reset_index(drop = 1)
    return(all_boxes_fixed)

all_boxes = all_box_score_fixer(all_boxes, NBAgamesC)

100%|██████████| 901/901 [14:44<00:00,  1.02it/s]


In [95]:
all_boxes = all_boxes.reset_index(drop = 1)
all_boxes

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GAME_ID,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,SEASON_YEAR
0,458,Howard Eisley,1610612742,DAL,20001185,25.318333,3.0,4.0,0.750,0.0,1.0,0.000,0.0,0.0,0.000,2.0,4.0,1.0,0.0,6.0,2000-01
1,2042,Courtney Alexander,1610612764,WAS,20001182,47.238333,12.0,23.0,0.522,4.0,6.0,0.667,5.0,6.0,0.833,1.0,1.0,1.0,0.0,33.0,2000-01
2,956,Erick Dampier,1610612744,GSW,20001188,30.533333,5.0,12.0,0.417,0.0,0.0,0.000,2.0,2.0,1.000,12.0,1.0,0.0,5.0,12.0,2000-01
3,915,Rodney Rogers,1610612756,PHX,20001187,25.250000,3.0,8.0,0.375,1.0,1.0,1.000,7.0,8.0,0.875,2.0,1.0,2.0,0.0,14.0,2000-01
4,1630,Mikki Moore,1610612765,DET,20001180,16.250000,2.0,5.0,0.400,0.0,0.0,0.000,2.0,2.0,1.000,8.0,0.0,0.0,0.0,6.0,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553772,1629014,Anfernee Simons,1610612757,POR,42000161,23.283333,5.0,6.0,0.833,4.0,5.0,0.800,0.0,0.0,0.000,4.0,2.0,0.0,0.0,14.0,2020-21
553773,203932,Aaron Gordon,1610612743,DEN,42000161,28.316667,7.0,16.0,0.438,2.0,4.0,0.500,0.0,2.0,0.000,8.0,1.0,0.0,1.0,16.0,2020-21
553774,2546,Carmelo Anthony,1610612757,POR,42000161,21.583333,6.0,12.0,0.500,4.0,8.0,0.500,2.0,2.0,1.000,3.0,1.0,1.0,0.0,18.0,2020-21
553775,203468,CJ McCollum,1610612757,POR,42000161,36.483333,8.0,20.0,0.400,3.0,7.0,0.429,2.0,2.0,1.000,6.0,3.0,0.0,1.0,21.0,2020-21


In [96]:
pd.DataFrame.to_csv(all_boxes, '/Users/nickbachelder/Buckets Model/all_boxes')

In [97]:
## Create function to find current injuries

from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.library.parameters import SeasonAll
player_dict = players.get_players()

def get_curr_injuries():
    all_injuries = []
    recent_injuries = pd.read_html('https://www.espn.com/nba/injuries')
    for team in range(len(recent_injuries)):
        all_injuries.extend(list(recent_injuries[team]['NAME']))
    return all_injuries

get_curr_injuries()

["De'Andre Hunter",
 'Onyeka Okongwu',
 'Jaylen Brown',
 'Kyrie Irving',
 'Nate Darling',
 'Gordon Hayward',
 'Coby White',
 'Dylan Windler',
 'Jamal Murray',
 'Isaiah Stewart',
 'Isaiah Livers',
 'Klay Thompson',
 'James Wiseman',
 'Jalen Green',
 'David Nwaba',
 'Goga Bitadze',
 'Myles Turner',
 'T.J. Warren',
 'Kawhi Leonard',
 'Serge Ibaka',
 'Ivica Zubac',
 'Jarrett Culver',
 'Victor Oladipo',
 'Donte DiVincenzo',
 'Patrick Beverley',
 'Taurean Prince',
 'Mitchell Robinson',
 'Shai Gilgeous-Alexander',
 'Markelle Fultz',
 'Jonathan Isaac',
 'Michael Carter-Williams',
 'Jalen Suggs',
 'Chris Paul',
 'Dario Saric',
 'Luka Samanic',
 'Pascal Siakam',
 'Deni Avdija',
 'Thomas Bryant']

In [98]:
def get_player_games(player1, all_boxes):
    player = [player for player in player_dict if player['full_name'] == player1][0]
    playerid = player['id']
    
    player_gamelog = all_boxes[all_boxes['PLAYER_NAME'] == player1].reset_index(drop = 1)['GAME_ID']
    
    return(player_gamelog)

get_player_games('LeBron James', all_boxes = all_boxes)  ## These are all games Lebron has played since 2015

0       20301179
1       20301162
2       20301151
3       20301137
4       20301128
          ...   
1571    42000155
1572    42000154
1573    42000153
1574    42000152
1575    42000151
Name: GAME_ID, Length: 1576, dtype: object

In [13]:
## want to create a column of NBAgamesC that contains all injured played in roster for that year

## create roster function. then create rosters for each used year (15 - 20)

def make_rosters(seasonid): ## takes in te
    d = {}
    for teamid in tqdm(nba_team['id']):
        roster = endpoints.commonteamroster.CommonTeamRoster(team_id = teamid, season = seasonid).get_data_frames()[0]
        d.update({teamid : roster})
        time.sleep(.5)
    return d



rosters_2021 = make_rosters(2021)

100%|██████████| 30/30 [00:23<00:00,  1.27it/s]


In [14]:
rosters_2019 = make_rosters(2019)
rosters_2020 = make_rosters(2020)

100%|██████████| 30/30 [00:23<00:00,  1.25it/s]
100%|██████████| 30/30 [00:22<00:00,  1.31it/s]


In [15]:
rosters_2018 = make_rosters(2018)


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


In [16]:
rosters_2017 = make_rosters(2017)


100%|██████████| 30/30 [00:22<00:00,  1.32it/s]


In [17]:
rosters_2016 = make_rosters(2016)

100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


In [18]:
rosters_2015 = make_rosters(2015)

100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


In [19]:
rosters_2014 = make_rosters(2014)

100%|██████████| 30/30 [00:25<00:00,  1.19it/s]


In [20]:
rosters_2013 = make_rosters(2013)

100%|██████████| 30/30 [00:26<00:00,  1.12it/s]


In [21]:
rosters_2012 = make_rosters(2012)

100%|██████████| 30/30 [00:26<00:00,  1.15it/s]


In [22]:
rosters_2011 = make_rosters(2011)

100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


In [23]:
rosters_2010 = make_rosters(2010)
rosters_2009 = make_rosters(2009)
rosters_2008 = make_rosters(2008)
rosters_2007 = make_rosters(2007)
rosters_2006 = make_rosters(2006)
rosters_2005 = make_rosters(2005)
rosters_2004 = make_rosters(2004)
rosters_2003 = make_rosters(2003)
rosters_2002 = make_rosters(2002)
rosters_2001 = make_rosters(2001)
rosters_2000 = make_rosters(2000)

100%|██████████| 30/30 [00:26<00:00,  1.13it/s]
100%|██████████| 30/30 [00:23<00:00,  1.28it/s]
100%|██████████| 30/30 [00:23<00:00,  1.30it/s]
100%|██████████| 30/30 [00:22<00:00,  1.31it/s]
100%|██████████| 30/30 [00:23<00:00,  1.28it/s]
100%|██████████| 30/30 [00:23<00:00,  1.27it/s]
100%|██████████| 30/30 [00:26<00:00,  1.13it/s]
100%|██████████| 30/30 [00:23<00:00,  1.26it/s]
100%|██████████| 30/30 [00:23<00:00,  1.29it/s]
100%|██████████| 30/30 [00:23<00:00,  1.30it/s]
100%|██████████| 30/30 [00:25<00:00,  1.19it/s]


In [40]:
## make nested dictionary for each year roster

nested_rosters = {2021: rosters_2021, 2020: rosters_2020, 2019: rosters_2019,
                  2018: rosters_2018,2017: rosters_2017,
                  2016: rosters_2016, 2015: rosters_2015,
                 2014: rosters_2014, 2013: rosters_2013,
                 2012: rosters_2012, 2011: rosters_2011,
                 2010: rosters_2010,
                 2009: rosters_2009, 2008: rosters_2008,
                 2007: rosters_2007, 2006: rosters_2006,
                 2005: rosters_2005, 2004: rosters_2004,
                 2003: rosters_2003, 2002: rosters_2002,
                 2001: rosters_2001,  2000: rosters_2000}

nested_rosters.get(2021)

{1610612737:         TeamID SEASON LeagueID                   PLAYER  NICKNAME  \
 0   1610612737   2021       00            Jalen Johnson     Jalen   
 1   1610612737   2021       00           Sharife Cooper   Sharife   
 2   1610612737   2021       00                AJ Lawson        AJ   
 3   1610612737   2021       00          Brandon Goodwin   Brandon   
 4   1610612737   2021       00            Kevin Huerter     Kevin   
 5   1610612737   2021       00              Skylar Mays    Skylar   
 6   1610612737   2021       00             Lou Williams       Lou   
 7   1610612737   2021       00             Gorgui Dieng    Gorgui   
 8   1610612737   2021       00         Danilo Gallinari    Danilo   
 9   1610612737   2021       00  Timothe Luwawu-Cabarrot   Timothe   
 10  1610612737   2021       00               Trae Young      Trae   
 11  1610612737   2021       00          De'Andre Hunter  De'Andre   
 12  1610612737   2021       00        Bogdan Bogdanovic    Bogdan   
 13  161

In [111]:
## This function creates a matrix for each player in each game whether they were missing. Then it concats
## to list each team game with each player injury

def create_team_injury_matrix(team_id, year): 
    rosters = nested_rosters.get(year)
    players = rosters.get(team_id)['PLAYER']
    played = {}
    season_games = NBAgamesC.loc[NBAgamesC['SEASON_ID_H'] == year]
    team_games = season_games[(season_games['TEAM_ID_H'] == team_id) | (season_games['TEAM_ID_A'] == team_id)]
    cols = ['game']
    cols.extend(team_games['GAME_ID_H'].unique())
    played_df = pd.DataFrame(columns = cols)
    for player in players:
        played = [player]
        try:
            all_played = [ int(x) for x in list(get_player_games(player, all_boxes))]
        except:
            played.extend(list(np.repeat(False, len(played_df.columns) - 1)))
            played = pd.DataFrame(played).T
            played.columns = cols
            played_df = pd.concat([played_df, played], ignore_index=True)
            continue
        played = [player]
        for game in list(team_games['GAME_ID_H'].unique()):
            game_check = int(game)
            played.append((game_check in all_played))
        played = pd.DataFrame(played).T
        played.columns = cols
        played_df = pd.concat([played_df, played])
    played_df = played_df.T
    played_df.columns = played_df.iloc[0]
    played_df = played_df.iloc[1:,]
    
    out = []
    
    for row in range(len(played_df.index)):
        out_add = list(compress(list(played_df.iloc[row,].index), list(played_df.iloc[row,] == False)))
        out.append(out_add)
        
    played_df['out'] = out
    
    return(played_df[['out']])
            

        
create_team_injury_matrix(team_id = 1610612749, year = 2010)

game,out
21001225,"[Andrew Bogut, Chris Douglas-Roberts, Jon Broc..."
21001207,"[Andrew Bogut, Jon Brockman]"
21001190,"[Andrew Bogut, Earl Boykins, Chris Douglas-Rob..."
21001181,"[Andrew Bogut, Chris Douglas-Roberts]"
21001164,"[Ersan Ilyasova, Earl Boykins, Chris Douglas-R..."
...,...
1521000053,"[Drew Gooden, Brandon Jennings, Corey Maggette..."
1521000045,"[Drew Gooden, Brandon Jennings, Corey Maggette..."
1521000037,"[Drew Gooden, Brandon Jennings, Corey Maggette..."
1521000023,"[Drew Gooden, Brandon Jennings, Corey Maggette..."


In [112]:
def injuries_by_team_dict(year):
    d = {}
    for teamid in tqdm(nba_team['id']):
        out_track = create_team_injury_matrix(teamid, year)
        d.update({teamid : out_track})
    return d

out_2020 = injuries_by_team_dict(2020)

100%|██████████| 30/30 [00:26<00:00,  1.14it/s]


In [113]:
out_2019 = injuries_by_team_dict(2019)

100%|██████████| 30/30 [00:31<00:00,  1.04s/it]


In [114]:
out_2018 = injuries_by_team_dict(2018)

100%|██████████| 30/30 [00:30<00:00,  1.03s/it]


In [115]:
out_2017 = injuries_by_team_dict(2017)

100%|██████████| 30/30 [00:31<00:00,  1.04s/it]


In [116]:
out_2016 = injuries_by_team_dict(2016)

100%|██████████| 30/30 [00:28<00:00,  1.06it/s]


In [117]:
out_2015 = injuries_by_team_dict(2015)

100%|██████████| 30/30 [00:28<00:00,  1.06it/s]


In [118]:
out_2014 = injuries_by_team_dict(2014)

100%|██████████| 30/30 [00:28<00:00,  1.06it/s]


In [119]:
out_2013 = injuries_by_team_dict(2013)

100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


In [120]:
out_2012 = injuries_by_team_dict(2012)

100%|██████████| 30/30 [00:28<00:00,  1.07it/s]


In [121]:
out_2011 = injuries_by_team_dict(2011)

100%|██████████| 30/30 [00:27<00:00,  1.09it/s]


In [122]:
out_2010 = injuries_by_team_dict(2010)
out_2009 = injuries_by_team_dict(2009)
out_2008 = injuries_by_team_dict(2008)
out_2007 = injuries_by_team_dict(2007)
out_2006 = injuries_by_team_dict(2006)
out_2005 = injuries_by_team_dict(2005)
out_2004 = injuries_by_team_dict(2004)
out_2003 = injuries_by_team_dict(2003)
out_2002 = injuries_by_team_dict(2002)
out_2001 = injuries_by_team_dict(2001)
out_2000 = injuries_by_team_dict(2000)

100%|██████████| 30/30 [00:28<00:00,  1.07it/s]
100%|██████████| 30/30 [00:27<00:00,  1.11it/s]
100%|██████████| 30/30 [00:27<00:00,  1.08it/s]
100%|██████████| 30/30 [00:26<00:00,  1.12it/s]
100%|██████████| 30/30 [00:27<00:00,  1.11it/s]
100%|██████████| 30/30 [00:26<00:00,  1.15it/s]
100%|██████████| 30/30 [00:23<00:00,  1.27it/s]
100%|██████████| 30/30 [00:25<00:00,  1.16it/s]
100%|██████████| 30/30 [00:29<00:00,  1.03it/s]
100%|██████████| 30/30 [00:22<00:00,  1.31it/s]
100%|██████████| 30/30 [00:21<00:00,  1.38it/s]


In [26]:
## make nested dictionary for each year out

nested_out = {2020: out_2020, 2019: out_2019,
                  2018: out_2018,2017: out_2017,
                  2016: out_2016, 2015: out_2015,
              2014: out_2014, 2013: out_2013,
                  2012: out_2012, 2011: out_2011,
             2010: out_2010, 2009: out_2009, 2008: out_2008,
                  2007: out_2007,2006: out_2006,
                  2005: out_2005, 2004: out_2004,
              2003: out_2003, 2002: out_2002,
                  2001: out_2001, 2000: out_2000}

NameError: name 'out_2020' is not defined

In [41]:
def nested_dict_unfolder(nested): ## formats nested df so it can be saved
    unfolded = pd.DataFrame()
    for year in tqdm([2000, 2001, 2002, 2003,2004,2005,2006,2007,2008,2009, 2010, 2011, 2012, 2013, 2014,
                                  2015, 2016, 2017, 2018, 2019, 2020, 2021]):
        nested_y = nested.get(year)
        for teamid in nba_team['id']:
            team_df = pd.DataFrame(nested_y.get(teamid))
            team_df['game_id'] = team_df.index
            team_df['team_id'] = teamid
            team_df['year'] = year
            unfolded = pd.concat([unfolded, team_df])
    return(unfolded)
            
unfolded_out = nested_dict_unfolder(nested_out).reset_index(drop = 1)

In [45]:
## Also unfold the roster data

unfolded_rosters = nested_dict_unfolder(nested_rosters).reset_index(drop = 1)
unfolded_rosters



100%|██████████| 22/22 [00:01<00:00, 12.37it/s]


Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,game_id,team_id,year
0,1610612737,2000,00,DerMarr Johnson,DerMarr,dermarr-johnson,1,F,6-9,201,"MAY 05, 1980",21.0,R,Cincinnati,2035,0,1610612737,2000
1,1610612737,2000,00,Nazr Mohammed,Nazr,nazr-mohammed,2,C,6-10,240,"SEP 05, 1977",23.0,2,Kentucky,1737,1,1610612737,2000
2,1610612737,2000,00,Cal Bowdler,Cal,cal-bowdler,3,F,6-10,245,"MAR 31, 1977",24.0,1,Old Dominion,1898,2,1610612737,2000
3,1610612737,2000,00,Chris Crawford,Chris,chris-crawford,4,F,6-9,235,"MAY 13, 1975",26.0,3,Marquette,1544,3,1610612737,2000
4,1610612737,2000,00,Dion Glover,Dion,dion-glover,5,G,6-5,228,"OCT 22, 1978",22.0,1,Georgia Tech,1901,4,1610612737,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9840,1610612766,2021,00,Ish Smith,Ish,ish-smith,14,G,6-0,175,"JUL 05, 1988",33.0,11,Wake Forest,202397,14,1610612766,2021
9841,1610612766,2021,00,Gordon Hayward,Gordon,gordon-hayward,20,F,6-7,225,"MAR 23, 1990",31.0,11,Butler,202330,15,1610612766,2021
9842,1610612766,2021,00,Vernon Carey Jr.,Vernon,vernon-carey-jr,22,F-C,6-9,270,"FEB 25, 2001",20.0,1,Duke,1630176,16,1610612766,2021
9843,1610612766,2021,00,Mason Plumlee,Mason,mason-plumlee,24,F-C,6-11,254,"MAR 05, 1990",31.0,8,Duke,203486,17,1610612766,2021


In [51]:
# Save both


pd.DataFrame.to_csv(unfolded_rosters, '/Users/nickbachelder/Buckets Model/unfolded_roster')

In [52]:
nested_out = nested_dict_folder(unfolded_out, nba_team)

def nested_dict_folder(unnested, nba_team): ## unfolds 
    folded = {}
    for year in tqdm([2000, 2001, 2002, 2003,2004,2005,2006,2007,2008,2009, 2010, 2011, 2012, 2013, 2014,
                                  2015, 2016, 2017, 2018, 2019, 2020, 2021]):
        unnested_y = unnested.loc[unnested['year'] == year]
        inner = {}
        for teamid in nba_team['id']:
            team_df = unnested_y.loc[unnested_y['team_id'] == teamid] ## drop added columns
            inner.update({teamid: team_df})
        folded.update({year:inner})
    return(folded)

nested_out = nested_dict_folder(unfolded_out, nba_team)

NameError: name 'unfolded_out' is not defined

In [53]:
unfolded_roster = pd.read_csv('/Users/nickbachelder/Buckets Model/unfolded_roster', index_col=0)
folded_roster = nested_dict_folder(unfolded_roster, nba_team)
folded_roster.keys()

100%|██████████| 22/22 [00:00<00:00, 136.04it/s]


dict_keys([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [54]:
unfolded_rosters

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,game_id,team_id,year
0,1610612737,2000,00,DerMarr Johnson,DerMarr,dermarr-johnson,1,F,6-9,201,"MAY 05, 1980",21.0,R,Cincinnati,2035,0,1610612737,2000
1,1610612737,2000,00,Nazr Mohammed,Nazr,nazr-mohammed,2,C,6-10,240,"SEP 05, 1977",23.0,2,Kentucky,1737,1,1610612737,2000
2,1610612737,2000,00,Cal Bowdler,Cal,cal-bowdler,3,F,6-10,245,"MAR 31, 1977",24.0,1,Old Dominion,1898,2,1610612737,2000
3,1610612737,2000,00,Chris Crawford,Chris,chris-crawford,4,F,6-9,235,"MAY 13, 1975",26.0,3,Marquette,1544,3,1610612737,2000
4,1610612737,2000,00,Dion Glover,Dion,dion-glover,5,G,6-5,228,"OCT 22, 1978",22.0,1,Georgia Tech,1901,4,1610612737,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9840,1610612766,2021,00,Ish Smith,Ish,ish-smith,14,G,6-0,175,"JUL 05, 1988",33.0,11,Wake Forest,202397,14,1610612766,2021
9841,1610612766,2021,00,Gordon Hayward,Gordon,gordon-hayward,20,F,6-7,225,"MAR 23, 1990",31.0,11,Butler,202330,15,1610612766,2021
9842,1610612766,2021,00,Vernon Carey Jr.,Vernon,vernon-carey-jr,22,F-C,6-9,270,"FEB 25, 2001",20.0,1,Duke,1630176,16,1610612766,2021
9843,1610612766,2021,00,Mason Plumlee,Mason,mason-plumlee,24,F-C,6-11,254,"MAR 05, 1990",31.0,8,Duke,203486,17,1610612766,2021
