In [28]:
import os
import shap
import pandas as pd
import numpy as np
import time
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
# Display settings: no limit on the number of columns shown, at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Seed reused by the sampling and RandomState calls in the synthetic-max section.
SEED = 42

# Data

In [2]:
# Unpack the players archive only when its target folder is not there yet.
if not os.path.exists('../../../data/players'):
    !7z x ../../../data/players.7z -o../../../data/

# Unpack both season archives when the box-score folder is missing.
# NOTE(review): the second call adds -aoa, which looks like 7z's
# "overwrite all existing files without prompting" mode - confirm that files
# from the first archive are meant to be replaced on overlap.
if not os.path.exists('../../../data/boxscoretraditionalv2'):
    !7z x ../../../data/season_21-22.7z -o../../../data/
    !7z x ../../../data/season_2020-21.7z -aoa -o../../../data/

In [3]:
def get_game_ids(team_id=None, seasons=('2020-21', '2021-22')):
    """Download regular-season game rows through ``LeagueGameFinder``.

    One request is issued per season and the first data frame of each
    response is kept.

    Parameters
    ----------
    team_id : optional
        Passed through as ``team_id_nullable``. The notebook uses ``None``,
        which in the recorded run returned rows for every team.
    seasons : iterable of str, optional
        Season labels forwarded as ``season_nullable``. Defaults to the two
        seasons this notebook was written for, so existing calls behave as
        before.

    Returns
    -------
    pandas.DataFrame
        The per-season frames stacked row-wise in ``seasons`` order. Each
        frame keeps its own index, so index labels repeat across seasons.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``seasons`` is empty.
    """
    season_frames = [
        leaguegamefinder.LeagueGameFinder(
            team_id_nullable=team_id,
            season_nullable=season,
            season_type_nullable=leaguegamefinder.SeasonTypeNullable.regular,
            league_id_nullable=leaguegamefinder.LeagueIDNullable.nba,
        ).get_data_frames()[0]
        for season in seasons
    ]
    return pd.concat(season_frames, axis=0)

# None keeps every team; to restrict the notebook to one franchise use e.g.
# TEAM_ID = teams.find_team_by_abbreviation('DAL')['id']
TEAM_ID = None

game_ids = get_game_ids(team_id=TEAM_ID)
print(game_ids.shape)

# GAME_ID -> GAME_DATE / SEASON_ID lookups used when loading box scores.
# A GAME_ID can appear on several rows; as with Series.to_dict, the last row wins.
GAME_ID_TO_DATE_MAP = dict(zip(game_ids['GAME_ID'], game_ids['GAME_DATE']))
GAME_ID_TO_SEASON_ID_MAP = dict(zip(game_ids['GAME_ID'], game_ids['SEASON_ID']))
game_ids.head()

(4620, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22020,1610612760,OKC,Oklahoma City Thunder,22001074,2021-05-16,OKC vs. LAC,W,239,117,50,94,0.532,8,26,0.308,9,16,0.563,14,40,54,20,1,12,15,11,5.0
1,22020,1610612750,MIN,Minnesota Timberwolves,22001071,2021-05-16,MIN vs. DAL,W,241,136,49,90,0.544,17,40,0.425,21,25,0.84,11,27,38,35,11,4,9,22,15.0
2,22020,1610612749,MIL,Milwaukee Bucks,22001068,2021-05-16,MIL @ CHI,L,241,112,38,90,0.422,12,37,0.324,24,32,0.75,10,31,41,14,13,5,11,16,-6.0
3,22020,1610612743,DEN,Denver Nuggets,22001076,2021-05-16,DEN @ POR,L,240,116,44,98,0.449,14,37,0.378,14,15,0.933,10,26,36,20,8,2,6,20,-16.0
4,22020,1610612765,DET,Detroit Pistons,22001069,2021-05-16,DET vs. MIA,L,240,107,40,86,0.465,11,35,0.314,16,22,0.727,7,26,33,26,4,4,16,21,-13.0


In [4]:
# Chronological split: games on or before SPLIT_DATE go to train, later games
# to test. GAME_DATE holds 'YYYY-MM-DD' strings (see the frame preview above),
# so plain string comparison orders rows by date.
# The cut-off lives in one constant so the two halves cannot drift apart.
SPLIT_DATE = "2022-02-01"
games_train = game_ids.query('GAME_DATE <= @SPLIT_DATE').copy()
games_test = game_ids.query('GAME_DATE > @SPLIT_DATE').copy()
print(f'Test = {len(games_test)} ({len(games_test) / len(game_ids):.2%}) Train = {len(games_train)} ({len(games_train) / len(game_ids):.2%})')

Test = 930 (20.13%) Train = 3690 (79.87%)


In [5]:
def load_boxscores(games_df, team_id=None, object_dtypes = {col: object for col in ['GAME_ID', 'TEAM_ID', 'PLAYER_ID']}):
    """Load per-player box-score rows for the games listed in ``games_df``.

    Parameters
    ----------
    games_df : pandas.DataFrame
        Must expose a ``GAME_ID`` column; one CSV named
        ``boxscoretraditionalv2_0_<GAME_ID>.csv`` is read per distinct id.
    team_id : optional
        When truthy, only rows whose ``TEAM_ID`` equals ``str(team_id)`` are
        kept (``TEAM_ID`` is read as a string column by default).
    object_dtypes : dict, optional
        Forwarded to ``pd.read_csv(dtype=...)``. The default dict is created
        once at definition time and only read here, never mutated.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-null ``MIN``, plus the helper columns ``_min``,
        ``_sec``, ``min_sec`` (minutes as a float), ``GAME_DATE`` and
        ``SEASON_ID`` (both mapped from the module-level GAME_ID lookups),
        with one row per ``(GAME_ID, PLAYER_ID)``.

    Notes
    -----
    ``games_df`` holds one row per team per game when no team filter is used,
    so iterating over it directly read every CSV twice and relied on the final
    ``drop_duplicates`` to discard the second copy. Ids are now de-duplicated
    first (first-appearance order kept). Row content and order are unchanged;
    only the index labels differ, because no duplicate rows are removed from
    the middle of the frame any more.
    """
    unique_game_ids = games_df.GAME_ID.unique()
    df = (
        pd.concat([pd.read_csv(f'../../../data/boxscoretraditionalv2/boxscoretraditionalv2_0_{game_id}.csv', dtype=object_dtypes)
                   for game_id in unique_game_ids])
        .dropna(subset=['MIN'])
        .reset_index(drop=True)
    )

    if team_id:
        df = df.query(f'TEAM_ID == "{team_id}"').copy()

    # MIN looks like "29.000000:55" -> minutes part and seconds part.
    df[['_min', '_sec']] = df['MIN'].str.split(':', expand=True)
    df['min_sec'] = df._min.astype(float) + df._sec.astype(int) / 60
    df['GAME_DATE'] = df.GAME_ID.map(GAME_ID_TO_DATE_MAP)
    df['SEASON_ID'] = df.GAME_ID.map(GAME_ID_TO_SEASON_ID_MAP)
    # Still guard against a player appearing twice within one game file.
    df = df.drop_duplicates(['GAME_ID', 'PLAYER_ID'])
    return df

# Box scores for each side of the chronological split (train first, then test).
df_train, df_test = (
    load_boxscores(split_games, team_id=TEAM_ID)
    for split_games in (games_train, games_test)
)
print(df_train.shape, df_test.shape)
df_train.head()

(39448, 34) (9645, 34)


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,_min,_sec,min_sec,GAME_DATE,SEASON_ID
0,22001074,1610612746,LAC,LA,1628379,Luke Kennard,Luke,F,,29.000000:55,6.0,14.0,0.429,1.0,7.0,0.143,0.0,0.0,0.0,0.0,3.0,3.0,1.0,3.0,0.0,0.0,2.0,13.0,-3.0,29.0,55,29.916667,2021-05-16,22020
1,22001074,1610612746,LAC,LA,202335,Patrick Patterson,Patrick,F,,42.000000:06,4.0,11.0,0.364,2.0,8.0,0.25,0.0,0.0,0.0,3.0,3.0,6.0,4.0,1.0,0.0,0.0,0.0,10.0,-10.0,42.0,6,42.1,2021-05-16,22020
2,22001074,1610612746,LAC,LA,1627826,Ivica Zubac,Ivica,C,,0.000000:06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,0.1,2021-05-16,22020
3,22001074,1610612746,LAC,LA,201976,Patrick Beverley,Patrick,G,,18.000000:52,5.0,8.0,0.625,0.0,2.0,0.0,1.0,2.0,0.5,0.0,0.0,0.0,4.0,0.0,1.0,0.0,1.0,11.0,-1.0,18.0,52,18.866667,2021-05-16,22020
4,22001074,1610612746,LAC,LA,202704,Reggie Jackson,Reggie,G,,6.000000:52,2.0,5.0,0.4,2.0,4.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,-13.0,6.0,52,6.866667,2021-05-16,22020


# Historical maxes

In [6]:
# Derived column name -> expression over the raw box-score columns.
_PAIR_EXPRESSIONS = {
    'pts_plus_reb': 'PTS + REB',
    'pts_plus_ast': 'PTS + AST',
    'ast_plus_reb': 'AST + REB',
    'ast_minus_to': 'AST - TO',
    'str_plus_blk': 'STL + BLK',
}

# Add the combined stats to both frames in place, in the order listed above.
for frame in (df_train, df_test):
    for pair_col, expression in _PAIR_EXPRESSIONS.items():
        frame[pair_col] = frame.eval(expression)

In [7]:
# Raw per-game box-score columns (plus min_sec, minutes as a float) tracked for records.
BASIC_COLS = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'min_sec', 'FGM', 'FG3M', 'FTM', 'TO', 'PLUS_MINUS']
# Combined columns created in the previous cell; names must match those columns exactly.
# NOTE(review): 'str_plus_blk' holds STL + BLK - the 'str' spelling looks like a typo
# for 'stl', but the name is used consistently downstream, so rename everywhere or not at all.
PAIR_COLS = ['pts_plus_reb', 'pts_plus_ast', 'ast_plus_reb', 'ast_minus_to', 'str_plus_blk']
# Every column for which historical maxes are computed.
COLS = BASIC_COLS + PAIR_COLS

In [8]:
# Re-join both splits and order rows chronologically (then by team and player)
# so the cumulative maxes below see games in date order.
df = (
    pd.concat([df_train, df_test])
    .sort_values(by=['GAME_DATE', 'TEAM_ID', 'PLAYER_ID'])
)
print(df.shape)

(49093, 39)


In [9]:
# Best single-game value of every tracked column on each date.
daily_best_league = df.groupby('GAME_DATE')[COLS].max()

# Running record through each date, pushed down one row so a date only sees
# records set on earlier dates; the first date has no history and gets 0.
df_max_league = daily_best_league.cummax().shift().fillna(0)

print(df_max_league.shape)
df_max_league.head()

(305, 16)


Unnamed: 0_level_0,PTS,REB,AST,STL,BLK,min_sec,FGM,FG3M,FTM,TO,PLUS_MINUS,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-12-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-23,33.0,12.0,10.0,3.0,2.0,35.766667,13.0,5.0,7.0,5.0,32.0,39.0,36.0,20.0,7.0,4.0
2020-12-25,44.0,17.0,15.0,7.0,8.0,41.95,18.0,10.0,14.0,8.0,40.0,48.0,53.0,29.0,9.0,8.0
2020-12-26,44.0,17.0,15.0,7.0,8.0,41.95,18.0,10.0,14.0,8.0,40.0,48.0,53.0,29.0,9.0,8.0
2020-12-27,44.0,17.0,17.0,7.0,8.0,45.233333,18.0,10.0,15.0,8.0,40.0,48.0,61.0,29.0,12.0,8.0


In [10]:
def agg_season_stats(sub_df):
    """Return the record held *before* each row of ``sub_df`` for every COLS column.

    ``sub_df`` is expected to be ordered chronologically and to contain
    ``GAME_DATE`` plus every column named in the module-level ``COLS``.

    The result has a fresh 0..n-1 index, ``GAME_DATE`` as its first column and
    then one column per entry of ``COLS`` holding the running maximum over the
    preceding rows. The first row has no history and is filled with 0.
    """
    # Running max including the current row, then shifted down by one row so
    # row i only reflects rows 0..i-1.
    prior_max = sub_df[COLS].cummax().shift().fillna(0).reset_index(drop=True)
    prior_max.insert(0, 'GAME_DATE', sub_df['GAME_DATE'].reset_index(drop=True))
    return prior_max

# Best single-game value per (date, season), flattened back to columns.
daily_best_season = (
    df
    .groupby(['GAME_DATE', 'SEASON_ID'])
    [COLS].max()
    .reset_index()
)

# Within each season, turn the daily bests into "record before this date".
season_records = (
    daily_best_season
    .groupby(['SEASON_ID'], group_keys=True)
    .apply(agg_season_stats)
)

# The apply result is indexed by (SEASON_ID, row position); keep SEASON_ID as
# a column, drop the positional level and index the frame by date.
df_max_season = (
    season_records
    .reset_index()
    .drop(columns=['level_1'])
    .set_index(['GAME_DATE'])
)

print(df_max_season.shape)
df_max_season.head()

(305, 17)


Unnamed: 0_level_0,SEASON_ID,PTS,REB,AST,STL,BLK,min_sec,FGM,FG3M,FTM,TO,PLUS_MINUS,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-12-22,22020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-23,22020,33.0,12.0,10.0,3.0,2.0,35.766667,13.0,5.0,7.0,5.0,32.0,39.0,36.0,20.0,7.0,4.0
2020-12-25,22020,44.0,17.0,15.0,7.0,8.0,41.95,18.0,10.0,14.0,8.0,40.0,48.0,53.0,29.0,9.0,8.0
2020-12-26,22020,44.0,17.0,15.0,7.0,8.0,41.95,18.0,10.0,14.0,8.0,40.0,48.0,53.0,29.0,9.0,8.0
2020-12-27,22020,44.0,17.0,17.0,7.0,8.0,45.233333,18.0,10.0,15.0,8.0,40.0,48.0,61.0,29.0,12.0,8.0


In [11]:
def agg_player_stats(sub_df):
    """Return, for each game row of one player, that player's previous bests.

    ``sub_df`` is expected to hold one player's games in chronological order,
    with ``GAME_DATE`` and every column named in the module-level ``COLS``.

    The output keeps ``GAME_DATE`` as the first column, followed by the COLS
    columns replaced by the running maximum over the player's earlier games
    (0 for the first game, which has no history), on a fresh 0..n-1 index.
    """
    history = sub_df.loc[:, ['GAME_DATE'] + COLS].reset_index(drop=True)
    # cummax includes the current game; shifting by one row excludes it again.
    history[COLS] = history[COLS].cummax().shift().fillna(0)
    return history
# Per-player "best before this game" rows; apply() yields a
# (PLAYER_ID, row position) index.
player_records = df.groupby('PLAYER_ID', group_keys=True).apply(agg_player_stats)

# Keep PLAYER_ID, drop the positional level, and key the frame by player and date.
df_max_player = (
    player_records
    .reset_index()
    .drop(columns=['level_1'])
    .set_index(['PLAYER_ID', 'GAME_DATE'])
)

print(df_max_player.shape)
df_max_player.head()

(49093, 16)


Unnamed: 0_level_0,Unnamed: 1_level_0,PTS,REB,AST,STL,BLK,min_sec,FGM,FG3M,FTM,TO,PLUS_MINUS,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk
PLAYER_ID,GAME_DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
101108,2020-12-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101108,2020-12-26,8.0,4.0,5.0,2.0,0.0,27.75,3.0,0.0,2.0,3.0,-10.0,12.0,13.0,9.0,2.0,2.0
101108,2020-12-27,22.0,5.0,12.0,2.0,1.0,32.783333,8.0,2.0,4.0,3.0,-1.0,27.0,34.0,17.0,12.0,3.0
101108,2020-12-29,22.0,5.0,12.0,2.0,1.0,32.783333,8.0,2.0,4.0,3.0,-1.0,27.0,34.0,17.0,12.0,3.0
101108,2020-12-31,22.0,5.0,12.0,2.0,1.0,32.783333,8.0,2.0,4.0,3.0,19.0,27.0,34.0,17.0,12.0,3.0


## merge

In [12]:
# Step 1: attach the league-wide records in force on each game date.
_with_league = df.merge(
    df_max_league,
    how='left', left_on='GAME_DATE', right_index=True, suffixes=('', '_league'),
)

# Step 2: attach the season records (SEASON_ID already exists on the left side).
_with_season = _with_league.merge(
    df_max_season.drop(columns=['SEASON_ID']),
    how='left', left_on='GAME_DATE', right_index=True, suffixes=('', '_season'),
)

# Step 3: attach each player's personal records before that game.
df_res = _with_season.merge(
    df_max_player.reset_index(),
    how='left', left_on=['PLAYER_ID', 'GAME_DATE'], right_on=['PLAYER_ID', 'GAME_DATE'], suffixes=('', '_player'),
)

print(df_res.shape)
df_res.head()

(49093, 87)


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,_min,_sec,min_sec,GAME_DATE,SEASON_ID,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk,PTS_league,REB_league,AST_league,STL_league,BLK_league,min_sec_league,FGM_league,FG3M_league,FTM_league,TO_league,PLUS_MINUS_league,pts_plus_reb_league,pts_plus_ast_league,ast_plus_reb_league,ast_minus_to_league,str_plus_blk_league,PTS_season,REB_season,AST_season,STL_season,BLK_season,min_sec_season,FGM_season,FG3M_season,FTM_season,TO_season,PLUS_MINUS_season,pts_plus_reb_season,pts_plus_ast_season,ast_plus_reb_season,ast_minus_to_season,str_plus_blk_season,PTS_player,REB_player,AST_player,STL_player,BLK_player,min_sec_player,FGM_player,FG3M_player,FTM_player,TO_player,PLUS_MINUS_player,pts_plus_reb_player,pts_plus_ast_player,ast_plus_reb_player,ast_minus_to_player,str_plus_blk_player
0,22000001,1610612744,GSW,Golden State,1626162,Kelly Oubre Jr.,Kelly,G,,25.000000:39,3.0,14.0,0.214,0.0,6.0,0.0,0.0,0.0,0.0,4.0,3.0,7.0,2.0,1.0,2.0,3.0,1.0,6.0,-28.0,25.0,39,25.65,2020-12-22,22020,13.0,8.0,9.0,-1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,22000001,1610612744,GSW,Golden State,1626172,Kevon Looney,Kevon,,,11.000000:17,2.0,4.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,3.0,4.0,-10.0,11.0,17,11.283333,2020-12-22,22020,6.0,4.0,2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22000001,1610612744,GSW,Golden State,1627737,Marquese Chriss,Marquese,,,12.000000:26,4.0,10.0,0.4,1.0,3.0,0.333,0.0,0.0,0.0,3.0,5.0,8.0,1.0,0.0,0.0,0.0,2.0,9.0,-6.0,12.0,26,12.433333,2020-12-22,22020,17.0,10.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22000001,1610612744,GSW,Golden State,1627814,Damion Lee,Damion,,,12.000000:14,1.0,2.0,0.5,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,7.0,2.0,0.0,0.0,0.0,0.0,3.0,8.0,12.0,14,12.233333,2020-12-22,22020,10.0,5.0,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22000001,1610612744,GSW,Golden State,1628539,Mychal Mulder,Mychal,,,6.000000:36,3.0,3.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,9.0,6.0,36,6.6,2020-12-22,22020,9.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Inspect the last rows, where the *_league / *_season / *_player record columns are populated.
df_res.tail()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,_min,_sec,min_sec,GAME_DATE,SEASON_ID,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk,PTS_league,REB_league,AST_league,STL_league,BLK_league,min_sec_league,FGM_league,FG3M_league,FTM_league,TO_league,PLUS_MINUS_league,pts_plus_reb_league,pts_plus_ast_league,ast_plus_reb_league,ast_minus_to_league,str_plus_blk_league,PTS_season,REB_season,AST_season,STL_season,BLK_season,min_sec_season,FGM_season,FG3M_season,FTM_season,TO_season,PLUS_MINUS_season,pts_plus_reb_season,pts_plus_ast_season,ast_plus_reb_season,ast_minus_to_season,str_plus_blk_season,PTS_player,REB_player,AST_player,STL_player,BLK_player,min_sec_player,FGM_player,FG3M_player,FTM_player,TO_player,PLUS_MINUS_player,pts_plus_reb_player,pts_plus_ast_player,ast_plus_reb_player,ast_minus_to_player,str_plus_blk_player
49088,22101217,1610612766,CHA,Charlotte,1630539,Kai Jones,Kai,,,2.000000:35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,-4.0,2.0,35,2.583333,2022-04-10,22021,0.0,0.0,0.0,0.0,2.0,62.0,30.0,24.0,10.0,10.0,56.516667,22.0,11.0,23.0,10.0,54.0,77.0,70.0,45.0,19.0,11.0,60.0,25.0,19.0,8.0,8.0,56.516667,22.0,11.0,23.0,10.0,52.0,77.0,70.0,35.0,17.0,9.0,4.0,2.0,1.0,1.0,0.0,7.316667,1.0,1.0,2.0,2.0,7.0,6.0,4.0,2.0,1.0,1.0
49089,22101217,1610612766,CHA,Charlotte,1630547,James Bouknight,James,,,2.000000:35,1.0,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,-4.0,2.0,35,2.583333,2022-04-10,22021,3.0,2.0,1.0,0.0,0.0,62.0,30.0,24.0,10.0,10.0,56.516667,22.0,11.0,23.0,10.0,54.0,77.0,70.0,45.0,19.0,11.0,60.0,25.0,19.0,8.0,8.0,56.516667,22.0,11.0,23.0,10.0,52.0,77.0,70.0,35.0,17.0,9.0,24.0,6.0,3.0,2.0,1.0,28.816667,9.0,6.0,5.0,2.0,19.0,30.0,25.0,7.0,2.0,2.0
49090,22101217,1610612766,CHA,Charlotte,1630550,JT Thor,JT,,,2.000000:35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,-4.0,2.0,35,2.583333,2022-04-10,22021,1.0,0.0,1.0,-1.0,0.0,62.0,30.0,24.0,10.0,10.0,56.516667,22.0,11.0,23.0,10.0,54.0,77.0,70.0,45.0,19.0,11.0,60.0,25.0,19.0,8.0,8.0,56.516667,22.0,11.0,23.0,10.0,52.0,77.0,70.0,35.0,17.0,9.0,8.0,4.0,3.0,2.0,3.0,27.85,3.0,2.0,2.0,1.0,17.0,11.0,10.0,7.0,2.0,3.0
49091,22101217,1610612766,CHA,Charlotte,202738,Isaiah Thomas,Isaiah,,,12.000000:31,5.0,8.0,0.625,2.0,4.0,0.5,2.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,14.0,-1.0,12.0,31,12.516667,2022-04-10,22021,15.0,15.0,2.0,1.0,1.0,62.0,30.0,24.0,10.0,10.0,56.516667,22.0,11.0,23.0,10.0,54.0,77.0,70.0,45.0,19.0,11.0,60.0,25.0,19.0,8.0,8.0,56.516667,22.0,11.0,23.0,10.0,52.0,77.0,70.0,35.0,17.0,9.0,19.0,5.0,4.0,2.0,1.0,32.383333,5.0,4.0,7.0,4.0,15.0,21.0,20.0,8.0,4.0,2.0
49092,22101217,1610612766,CHA,Charlotte,203486,Mason Plumlee,Mason,C,,21.000000:08,4.0,6.0,0.667,0.0,0.0,0.0,0.0,2.0,0.0,1.0,7.0,8.0,2.0,0.0,2.0,2.0,3.0,8.0,-9.0,21.0,8,21.133333,2022-04-10,22021,16.0,10.0,10.0,0.0,2.0,62.0,30.0,24.0,10.0,10.0,56.516667,22.0,11.0,23.0,10.0,54.0,77.0,70.0,45.0,19.0,11.0,60.0,25.0,19.0,8.0,8.0,56.516667,22.0,11.0,23.0,10.0,52.0,77.0,70.0,35.0,17.0,9.0,21.0,21.0,10.0,4.0,4.0,47.466667,9.0,0.0,8.0,8.0,31.0,38.0,28.0,26.0,9.0,6.0


# Count real maxes

In [15]:
# Start from the row identifiers so the flag frame can be joined back by key.
_gt_real = {c: df_res[c] for c in ['GAME_ID', 'PLAYER_ID']}
GT_COLS = []
# The index from enumerate() was never used and only leaked a stray global `i`.
for col in COLS:
    for suff in ['league', 'season', 'player']:
        _gt_col = f"{col}_gt_{suff}"
        # True when this game's value beats the prior record for the scope.
        # The min_sec_{suff} > 0 guard skips rows whose prior-record columns are
        # still the 0 placeholder written by fillna(0) (no earlier game in scope).
        _gt_real[_gt_col] = df_res.eval(f"({col} > {col}_{suff}) and (min_sec_{suff} > 0)")
        GT_COLS.append(_gt_col)

df_gt = pd.DataFrame(_gt_real)
print(df_gt.shape)
df_gt.head()

(49093, 50)


Unnamed: 0,GAME_ID,PLAYER_ID,PTS_gt_league,PTS_gt_season,PTS_gt_player,REB_gt_league,REB_gt_season,REB_gt_player,AST_gt_league,AST_gt_season,AST_gt_player,STL_gt_league,STL_gt_season,STL_gt_player,BLK_gt_league,BLK_gt_season,BLK_gt_player,min_sec_gt_league,min_sec_gt_season,min_sec_gt_player,FGM_gt_league,FGM_gt_season,FGM_gt_player,FG3M_gt_league,FG3M_gt_season,FG3M_gt_player,FTM_gt_league,FTM_gt_season,FTM_gt_player,TO_gt_league,TO_gt_season,TO_gt_player,PLUS_MINUS_gt_league,PLUS_MINUS_gt_season,PLUS_MINUS_gt_player,pts_plus_reb_gt_league,pts_plus_reb_gt_season,pts_plus_reb_gt_player,pts_plus_ast_gt_league,pts_plus_ast_gt_season,pts_plus_ast_gt_player,ast_plus_reb_gt_league,ast_plus_reb_gt_season,ast_plus_reb_gt_player,ast_minus_to_gt_league,ast_minus_to_gt_season,ast_minus_to_gt_player,str_plus_blk_gt_league,str_plus_blk_gt_season,str_plus_blk_gt_player
0,22000001,1626162,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,22000001,1626172,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,22000001,1627737,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,22000001,1627814,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,22000001,1628539,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [16]:
# Share of rows that set a new record, per stat and scope (mean of boolean flags).
df_gt[GT_COLS].mean()

PTS_gt_league             0.000102
PTS_gt_season             0.000265
PTS_gt_player             0.050048
REB_gt_league             0.000326
REB_gt_season             0.000489
REB_gt_player             0.042022
AST_gt_league             0.000163
AST_gt_season             0.000306
AST_gt_player             0.035158
STL_gt_league             0.000122
STL_gt_season             0.000346
STL_gt_player             0.026216
BLK_gt_league             0.000143
BLK_gt_season             0.000285
BLK_gt_player             0.021856
min_sec_gt_league         0.000998
min_sec_gt_season         0.001304
min_sec_gt_player         0.062249
FGM_gt_league             0.000102
FGM_gt_season             0.000183
FGM_gt_player             0.041208
FG3M_gt_league            0.000081
FG3M_gt_season            0.000183
FG3M_gt_player            0.028517
FTM_gt_league             0.000244
FTM_gt_season             0.000346
FTM_gt_player             0.034262
TO_gt_league              0.000224
TO_gt_season        

In [17]:
# One summary flag per scope: True when the row broke at least one record
# of that scope (row-wise max over the boolean flag columns).
REAL_MAX_COLS = []
for scope in ('league', 'season', 'player'):
    scope_flags = [flag for flag in GT_COLS if flag.endswith(scope)]
    summary_col = f'real_max_{scope}'
    df_gt[summary_col] = df_gt[scope_flags].max(axis=1)
    REAL_MAX_COLS.append(summary_col)
df_gt[REAL_MAX_COLS].mean()

real_max_league    0.002302
real_max_season    0.004115
real_max_player    0.226550
dtype: float64

# Add synthetic maxes

In [18]:
# Work on a copy so the synthetic edits below never touch df_res.
df_mod = df_res.copy()

In [19]:
# Fraction of eligible rows that receive a synthetic record.
SYNTH_PCT = 0.005

# Eligible rows: at least one scope already has a prior record (min_sec_* > 0).
_has_prior_record = (
    df_mod[['min_sec_league', 'min_sec_season', 'min_sec_player']]
    .gt(0)
    .any(axis=1)
)
sample_idx = df_mod[_has_prior_record].sample(frac=SYNTH_PCT, random_state=SEED).index
print(sample_idx.shape)

(245,)


In [20]:
# Inject synthetic records into the sampled rows.
# The three choice() calls consume the same RandomState in sequence, so their
# order determines the result - do not reorder them.
_rs = np.random.RandomState(seed=SEED)
# Which basic stat to overwrite for each sampled row.
col_choice = _rs.choice(BASIC_COLS, size=len(sample_idx))
# Which record to beat; only league and season are targeted, never player.
suff_choice = _rs.choice(['league', 'season'], size=len(sample_idx))
# How far above the record to go; smaller margins are more likely.
add_to_max_choice = _rs.choice([1, 2, 3, 4], size=len(sample_idx), p=[0.4, 0.3, 0.2, 0.1])
# 'modified' records which <stat>_<scope> record a row was pushed above (None = untouched).
df_mod['modified'] = None
for _idx, _col, _suff, _add in zip(sample_idx, col_choice, suff_choice, add_to_max_choice):
    mod_col = f'{_col}_{_suff}'
    # New value = current record + margin; the margin is scaled by 10 for min_sec.
    new_val = df_mod.loc[_idx, mod_col] + _add * (10 if _col == 'min_sec' else 1)
    # NOTE(review): only the basic stat is overwritten; the derived PAIR_COLS
    # (e.g. pts_plus_reb) are not recomputed here, so they keep the pre-edit sums - confirm intended.
    # NOTE(review): a row sampled only because min_sec_player > 0 may have a 0 placeholder
    # as its league/season record, in which case new_val is just the margin - confirm intended.
    df_mod.loc[_idx, f'{_col}'] = new_val
    df_mod.loc[_idx, 'modified'] = mod_col

df_mod.modified.value_counts(dropna=False)

None                 48848
FG3M_season             15
FGM_league              15
PTS_season              15
STL_season              14
FTM_season              14
BLK_season              13
FGM_season              13
AST_league              13
REB_season              13
PLUS_MINUS_league       12
PTS_league              11
FG3M_league             10
AST_season              10
FTM_league              10
TO_league               10
TO_season               10
PLUS_MINUS_season       10
min_sec_league           9
STL_league               8
min_sec_season           8
BLK_league               7
REB_league               5
Name: modified, dtype: int64

In [21]:
# Recompute the record flags on the frame with the synthetic edits applied.
_gt_synth = {c: df_mod[c] for c in ['GAME_ID', 'PLAYER_ID']}
GT_COLS_SYNTH = []
# The index from enumerate() was never used and only leaked a stray global `i`.
for col in COLS:
    # Synthetic records only target league and season, so player flags are skipped.
    for suff in ['league', 'season']:
        _gt_col = f"{col}_gt_{suff}"
        # True when the (possibly edited) value beats the prior record; the
        # min_sec_{suff} > 0 guard skips rows with no earlier game in scope.
        _gt_synth[_gt_col] = df_mod.eval(f"({col} > {col}_{suff}) and (min_sec_{suff} > 0)")
        GT_COLS_SYNTH.append(_gt_col)

df_gt_synth = pd.DataFrame(_gt_synth)
print(df_gt_synth.shape)
df_gt_synth.head()

(49093, 34)


Unnamed: 0,GAME_ID,PLAYER_ID,PTS_gt_league,PTS_gt_season,REB_gt_league,REB_gt_season,AST_gt_league,AST_gt_season,STL_gt_league,STL_gt_season,BLK_gt_league,BLK_gt_season,min_sec_gt_league,min_sec_gt_season,FGM_gt_league,FGM_gt_season,FG3M_gt_league,FG3M_gt_season,FTM_gt_league,FTM_gt_season,TO_gt_league,TO_gt_season,PLUS_MINUS_gt_league,PLUS_MINUS_gt_season,pts_plus_reb_gt_league,pts_plus_reb_gt_season,pts_plus_ast_gt_league,pts_plus_ast_gt_season,ast_plus_reb_gt_league,ast_plus_reb_gt_season,ast_minus_to_gt_league,ast_minus_to_gt_season,str_plus_blk_gt_league,str_plus_blk_gt_season
0,22000001,1626162,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,22000001,1626172,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,22000001,1627737,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,22000001,1627814,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,22000001,1628539,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
# Share of rows flagged as a record after the synthetic edits, per stat and scope.
df_gt_synth[GT_COLS_SYNTH].mean()

PTS_gt_league             0.000530
PTS_gt_season             0.000794
REB_gt_league             0.000550
REB_gt_season             0.000856
AST_gt_league             0.000509
AST_gt_season             0.000774
STL_gt_league             0.000509
STL_gt_season             0.000794
BLK_gt_league             0.000407
BLK_gt_season             0.000693
min_sec_gt_league         0.001344
min_sec_gt_season         0.001650
FGM_gt_league             0.000591
FGM_gt_season             0.000754
FG3M_gt_league            0.000489
FG3M_gt_season            0.000693
FTM_gt_league             0.000733
FTM_gt_season             0.000835
TO_gt_league              0.000611
TO_gt_season              0.000733
PLUS_MINUS_gt_league      0.000530
PLUS_MINUS_gt_season      0.000835
pts_plus_reb_gt_league    0.000244
pts_plus_reb_gt_season    0.000448
pts_plus_ast_gt_league    0.000244
pts_plus_ast_gt_season    0.000326
ast_plus_reb_gt_league    0.000163
ast_plus_reb_gt_season    0.000265
ast_minus_to_gt_leag

In [23]:
# One summary flag per scope: True when the row beats at least one record of
# that scope after the synthetic edits.
SYNTH_MAX_COLS = []
for suff in ['league', 'season']:
    _col = f'synth_max_{suff}'
    # Filter GT_COLS_SYNTH, the list built together with df_gt_synth. The previous
    # code filtered GT_COLS (the real-flag list, which also holds *_player names)
    # and only worked because the suffix filter happened to drop those.
    df_gt_synth[_col] = df_gt_synth[[c for c in GT_COLS_SYNTH if c.endswith(suff)]].max(axis=1)
    SYNTH_MAX_COLS.append(_col)
df_gt_synth[SYNTH_MAX_COLS].mean()

synth_max_league    0.006457
synth_max_season    0.009064
dtype: float64

In [24]:
# Preview the flag frame, now including the synth_max_* summary columns.
df_gt_synth.head()

Unnamed: 0,GAME_ID,PLAYER_ID,PTS_gt_league,PTS_gt_season,REB_gt_league,REB_gt_season,AST_gt_league,AST_gt_season,STL_gt_league,STL_gt_season,BLK_gt_league,BLK_gt_season,min_sec_gt_league,min_sec_gt_season,FGM_gt_league,FGM_gt_season,FG3M_gt_league,FG3M_gt_season,FTM_gt_league,FTM_gt_season,TO_gt_league,TO_gt_season,PLUS_MINUS_gt_league,PLUS_MINUS_gt_season,pts_plus_reb_gt_league,pts_plus_reb_gt_season,pts_plus_ast_gt_league,pts_plus_ast_gt_season,ast_plus_reb_gt_league,ast_plus_reb_gt_season,ast_minus_to_gt_league,ast_minus_to_gt_season,str_plus_blk_gt_league,str_plus_blk_gt_season,synth_max_league,synth_max_season
0,22000001,1626162,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,22000001,1626172,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,22000001,1627737,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,22000001,1627814,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,22000001,1628539,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


## save df

In [30]:
# Identifier / descriptive columns carried into the saved data set.
META_COLS = ['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID',
       'PLAYER_NAME', 'NICKNAME', 'START_POSITION', 'COMMENT', 'GAME_DATE', 'SEASON_ID']
# Prior-record columns: every tracked stat suffixed with its scope, all league
# columns first and then all season columns. Derived from COLS so the list
# cannot drift from the tracked columns; it yields the same 32 names, in the
# same order, as the previous hand-written list.
CURRENT_RECORD_COLS = [f'{col}_{suff}' for suff in ('league', 'season') for col in COLS]

In [31]:
# df_mod.columns

In [32]:
# Columns of the modified frame that go into the saved data set.
_synth_base = df_mod[META_COLS + COLS + CURRENT_RECORD_COLS + ['modified']]

# First add the synthetic summary flags...
_with_synth_flags = _synth_base.merge(
    df_gt_synth[['GAME_ID', 'PLAYER_ID', 'synth_max_league', 'synth_max_season']],
    how='left', on=['GAME_ID', 'PLAYER_ID'],
)

# ...then the real (pre-modification) summary flags for comparison.
df_synth = _with_synth_flags.merge(
    df_gt[['GAME_ID', 'PLAYER_ID', 'real_max_league', 'real_max_season']],
    how='left', on=['GAME_ID', 'PLAYER_ID'],
)
print(df_synth.shape)
df_synth.head()

(49093, 64)


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,GAME_DATE,SEASON_ID,PTS,REB,AST,STL,BLK,min_sec,FGM,FG3M,FTM,TO,PLUS_MINUS,pts_plus_reb,pts_plus_ast,ast_plus_reb,ast_minus_to,str_plus_blk,PTS_league,REB_league,AST_league,STL_league,BLK_league,min_sec_league,FGM_league,FG3M_league,FTM_league,TO_league,PLUS_MINUS_league,pts_plus_reb_league,pts_plus_ast_league,ast_plus_reb_league,ast_minus_to_league,str_plus_blk_league,PTS_season,REB_season,AST_season,STL_season,BLK_season,min_sec_season,FGM_season,FG3M_season,FTM_season,TO_season,PLUS_MINUS_season,pts_plus_reb_season,pts_plus_ast_season,ast_plus_reb_season,ast_minus_to_season,str_plus_blk_season,modified,synth_max_league,synth_max_season,real_max_league,real_max_season
0,22000001,1610612744,GSW,Golden State,1626162,Kelly Oubre Jr.,Kelly,G,,2020-12-22,22020,6.0,7.0,2.0,1.0,2.0,25.65,3.0,0.0,0.0,3.0,-28.0,13.0,8.0,9.0,-1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,False,False,False,False
1,22000001,1610612744,GSW,Golden State,1626172,Kevon Looney,Kevon,,,2020-12-22,22020,4.0,2.0,0.0,0.0,0.0,11.283333,2.0,0.0,0.0,2.0,-10.0,6.0,4.0,2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,False,False,False,False
2,22000001,1610612744,GSW,Golden State,1627737,Marquese Chriss,Marquese,,,2020-12-22,22020,9.0,8.0,1.0,0.0,0.0,12.433333,4.0,1.0,0.0,0.0,-6.0,17.0,10.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,False,False,False,False
3,22000001,1610612744,GSW,Golden State,1627814,Damion Lee,Damion,,,2020-12-22,22020,3.0,7.0,2.0,0.0,0.0,12.233333,1.0,1.0,0.0,0.0,8.0,10.0,5.0,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,False,False,False,False
4,22000001,1610612744,GSW,Golden State,1628539,Mychal Mulder,Mychal,,,2020-12-22,22020,8.0,1.0,0.0,0.0,0.0,6.6,3.0,2.0,0.0,0.0,9.0,9.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,False,False,False,False


In [33]:
# Persist the synthetic data set as a zip-compressed CSV in the working directory.
# NOTE(review): `index` is left at its to_csv default, so the row index is presumably
# written as an extra unnamed first column - confirm that is wanted by readers of the file.
df_synth.to_csv('synth_2020_22.csv.zip', compression='zip')