In [1]:
import os
import shap
import pandas as pd
import numpy as np
import time
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import sqlite3
import joblib

# Notebook-wide display settings: show every column, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Constants shared by the cells below.
DB_PATH = '../data/basnya.db'  # SQLite file; the relative path is resolved from the working directory
SEED = 42  # random_state used for sampling and for the model

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# Check the database contents

In [2]:
def get_df_from_db(sql_query, db_path=None):
    """Run a SQL query against the SQLite database and return the result.

    Parameters
    ----------
    sql_query : str
        SQL statement handed to ``pandas.read_sql``.
    db_path : str, optional
        Path of the SQLite file to query. When omitted, the notebook-level
        ``DB_PATH`` constant is used.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``sql_query``.
    """
    connection = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        # The connection's own context manager only commits / rolls back the
        # transaction; it does not close the connection.
        with connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # Close explicitly so repeated calls do not leak open database handles.
        connection.close()

# Inventory of the tables listed in sqlite_schema (names matching 'sqlite_%' are skipped).
_meta = get_df_from_db("""
SELECT 
    sch.*
FROM 
    sqlite_schema as sch
WHERE 
    sch.type ='table' AND 
    sch.name NOT LIKE 'sqlite_%';""")


def _count_rows(table_name):
    """Return the number of rows in ``table_name``.

    The name is wrapped in double quotes (embedded quotes doubled) so that
    table names containing spaces, hyphens or SQL keywords still form a valid
    identifier instead of breaking the query.
    """
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    return get_df_from_db(f"SELECT count(*) FROM {quoted_name}").iloc[0, 0]


# One count query per table; fine for the handful of tables in this database.
_meta['cnt'] = _meta.tbl_name.map(_count_rows)
_meta

Unnamed: 0,type,name,tbl_name,rootpage,sql,cnt
0,table,player_1,player_1,2,"CREATE TABLE ""player_1"" (\n ""ind...",4864
1,table,player_2,player_2,3,"CREATE TABLE ""player_2"" (\n ""ind...",49162
2,table,teams,teams,4,"CREATE TABLE ""teams"" (\n ""index""...",31
3,table,games,games,5,"CREATE TABLE ""games"" (\n ""index""...",7906
4,table,player_0,player_0,6,"CREATE TABLE ""player_0"" (\n ""in...",4900
5,table,boxscoresummaryv2_0,boxscoresummaryv2_0,7,"CREATE TABLE ""boxscoresummaryv2_0"" (\n ...",7906
6,table,boxscoresummaryv2_1,boxscoresummaryv2_1,8,"CREATE TABLE ""boxscoresummaryv2_1"" (\n ...",15722
7,table,boxscoresummaryv2_2,boxscoresummaryv2_2,11,"CREATE TABLE ""boxscoresummaryv2_2"" (\n ...",23763
8,table,boxscoresummaryv2_3,boxscoresummaryv2_3,12,"CREATE TABLE ""boxscoresummaryv2_3"" (\n ...",51614
9,table,boxscoresummaryv2_4,boxscoresummaryv2_4,13,"CREATE TABLE ""boxscoresummaryv2_4"" (\n ...",7860


In [34]:
# Read the whole teams table, report (rows, columns), then preview both ends of it.
df_team = get_df_from_db("select * from teams;")
print(df_team.shape)
pd.concat((df_team.head(5), df_team.tail(5)))

(31, 8)


Unnamed: 0,index,id,full_name,abbreviation,nickname,city,state,year_founded
0,1,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,2,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
2,3,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
3,4,1610612751,Brooklyn Nets,BKN,Nets,Brooklyn,New York,1976
4,5,1610612766,Charlotte Hornets,CHA,Hornets,Charlotte,North Carolina,1988
26,27,1610612758,Sacramento Kings,SAC,Kings,Sacramento,California,1948
27,28,1610612759,San Antonio Spurs,SAS,Spurs,San Antonio,Texas,1976
28,29,1610612761,Toronto Raptors,TOR,Raptors,Toronto,Ontario,1995
29,30,1610612762,Utah Jazz,UTA,Jazz,Utah,Utah,1974
30,31,1610612764,Washington Wizards,WAS,Wizards,Washington,District of Columbia,1961


In [11]:
# Read all games ordered by GAME_DATE_EST, report (rows, columns), then preview both ends.
df_game = get_df_from_db("select * from games order by GAME_DATE_EST;")
print(df_game.shape)
pd.concat((df_game.head(5), df_game.tail(5)))

(7906, 16)


Unnamed: 0,index,GAME_DATE_EST,GAME_SEQUENCE,GAME_ID,GAME_STATUS_ID,GAME_STATUS_TEXT,GAMECODE,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,LIVE_PERIOD,LIVE_PC_TIME,NATL_TV_BROADCASTER_ABBREVIATION,LIVE_PERIOD_TIME_BCAST,WH_STATUS,GAME_ID_STR
0,3496,2018-09-28T00:00:00,1.0,11800001,3.0,Final,20180928/MELPHI,1610612755,15016,2018,4.0,,,Q4 -,0,11800001
1,3497,2018-09-28T00:00:00,2.0,11800002,3.0,Final,20180928/BOSCHA,1610612766,1610612738,2018,4.0,,NBA TV,Q4 - NBA TV,1,11800002
2,3498,2018-09-29T00:00:00,1.0,11800003,3.0,Final,20180929/PORTOR,1610612761,1610612757,2018,4.0,,,Q4 -,0,11800003
3,3499,2018-09-29T00:00:00,2.0,11800004,3.0,Final,20180929/BJDDAL,1610612742,15021,2018,4.0,,,Q4 -,0,11800004
4,3500,2018-09-29T00:00:00,3.0,11800005,3.0,Final,20180929/MINGSW,1610612744,1610612750,2018,4.0,,NBA TV,Q4 - NBA TV,0,11800005
7901,7902,2023-12-02T00:00:00,8.0,22300287,3.0,Final,20231202/OKCDAL,1610612742,1610612760,2023,4.0,,,Q4 -,1,22300287
7902,7903,2023-12-02T00:00:00,9.0,22300288,3.0,Final,20231202/MEMPHX,1610612756,1610612763,2023,4.0,,,Q4 -,1,22300288
7903,7904,2023-12-02T00:00:00,10.0,22300289,3.0,Final,20231202/PORUTA,1610612762,1610612757,2023,5.0,,,Q5 -,1,22300289
7904,7905,2023-12-02T00:00:00,11.0,22300290,3.0,Final,20231202/DENSAC,1610612758,1610612743,2023,4.0,,NBA TV,Q4 - NBA TV,1,22300290
7905,7906,2023-12-02T00:00:00,12.0,22300291,3.0,Final,20231202/HOULAL,1610612747,1610612745,2023,4.0,,,Q4 -,1,22300291


In [25]:
# Read the player_0 table, report (rows, columns), then preview both ends of it.
df_player = get_df_from_db("select * from player_0;")
print(df_player.shape)
pd.concat((df_player.head(5), df_player.tail(5)))

(4815, 34)


Unnamed: 0,index,PERSON_ID,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,LAST_AFFILIATION,HEIGHT,WEIGHT,SEASON_EXP,JERSEY,POSITION,ROSTERSTATUS,GAMES_PLAYED_CURRENT_SEASON_FLAG,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CODE,TEAM_CITY,PLAYERCODE,FROM_YEAR,TO_YEAR,DLEAGUE_FLAG,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GREATEST_75_FLAG
0,1,100,Tim,Legler,Tim Legler,"Legler, Tim",T. Legler,tim-legler,1966-12-26T00:00:00,La Salle,USA,La Salle/USA,6-4,200.0,10,,Guard,Inactive,N,0,,,,,tim_legler,1989,1999,N,Y,Y,Undrafted,Undrafted,Undrafted,N
1,2,1000,Shandon,Anderson,Shandon Anderson,"Anderson, Shandon",S. Anderson,shandon-anderson,1973-12-31T00:00:00,Georgia,USA,Georgia/USA,6-6,215.0,10,49.0,Guard-Forward,Inactive,N,1610612752,Knicks,NYK,knicks,New York,shandon_anderson,1996,2005,N,Y,Y,1996,2,54,N
2,3,1002,Reggie,Geary,Reggie Geary,"Geary, Reggie",R. Geary,reggie-geary,1973-08-31T00:00:00,Arizona,USA,Arizona/USA,6-2,187.0,1,4.0,Guard,Inactive,N,1610612759,Spurs,SAS,spurs,San Antonio,reggie_geary,1996,1997,N,Y,Y,1996,2,56,N
3,4,100263,Bill,Laimbeer,Bill Laimbeer,"Laimbeer, Bill",B. Laimbeer,bill-laimbeer,1957-05-19T00:00:00,,USA,Notre Dame/USA,6-11,245.0,14,40.0,Center,Inactive,N,1610612765,Pistons,DET,pistons,Detroit,bill_laimbeer,1980,1993,N,Y,Y,1979,3,65,N
4,5,1003,Drew,Barry,Drew Barry,"Barry, Drew",D. Barry,drew-barry,1973-02-17T00:00:00,Georgia Tech,USA,Georgia Tech/USA,6-5,191.0,3,10.0,Guard,Inactive,N,1610612737,Hawks,ATL,hawks,Atlanta,drew_barry,1997,1999,N,Y,Y,1996,2,57,N
4810,4811,992,Marcus,Brown,Marcus Brown,"Brown, Marcus",M. Brown,marcus-brown,1974-04-03T00:00:00,Murray State,USA,Murray State/USA,,,2,,,Inactive,N,0,,,,,marcus_brown,1996,2007,N,Y,Y,1996,2,46,N
4811,4812,994,Jamie,Feick,Jamie Feick,"Feick, Jamie",J. Feick,jamie-feick,1974-07-03T00:00:00,Michigan State,USA,Michigan State/USA,6-8,255.0,6,14.0,Forward,Inactive,N,1610612751,Nets,NJN,nets,New Jersey,jamie_feick,1996,2002,N,Y,Y,1996,2,48,N
4812,4813,997,Chris,Robinson,Chris Robinson,"Robinson, Chris",C. Robinson,chris-robinson,1974-04-02T00:00:00,Western Kentucky,USA,Western Kentucky/USA,6-5,200.0,2,5.0,Guard,Inactive,N,1610612763,Grizzlies,VAN,grizzlies,Vancouver,chris_robinson,1996,1997,N,Y,Y,1996,2,51,N
4813,4814,998,Mark,Pope,Mark Pope,"Pope, Mark",M. Pope,mark-pope,1972-09-11T00:00:00,Kentucky,USA,Kentucky/USA,6-10,235.0,7,41.0,Center-Forward,Inactive,N,1610612749,Bucks,MIL,bucks,Milwaukee,mark_pope,1997,2004,N,Y,Y,1996,2,52,N
4814,4815,999,Jeff,Nordgaard,Jeff Nordgaard,"Nordgaard, Jeff",J. Nordgaard,jeff-nordgaard,1973-02-23T00:00:00,Wisconsin-Green Bay,USA,Wisconsin-Green Bay/USA,6-7,225.0,1,,Forward,Inactive,N,0,,,,,jeff_nordgaard,1997,1997,N,Y,Y,1996,2,53,N


In [15]:
# Read the raw box-score table, report (rows, columns), then preview both ends of it.
df = get_df_from_db("select * from boxscoretraditionalv2_0;")
print(df.shape)
pd.concat((df.head(5), df.tail(5)))

(205400, 31)


Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR
0,1,12100001,1610612751,BKN,Brooklyn,1627761,DeAndre' Bembry,DeAndre',F,,24.000000:05,2.0,3.0,0.667,0.0,1.0,0.0,4.0,8.0,0.5,2.0,3.0,5.0,2.0,0.0,0.0,0.0,1.0,8.0,8.0,12100001
1,2,12100001,1610612751,BKN,Brooklyn,200794,Paul Millsap,Paul,F,,18.000000:27,4.0,10.0,0.4,2.0,4.0,0.5,0.0,0.0,0.0,4.0,6.0,10.0,3.0,1.0,2.0,2.0,2.0,10.0,13.0,12100001
2,3,12100001,1610612751,BKN,Brooklyn,200746,LaMarcus Aldridge,LaMarcus,C,,14.000000:43,2.0,6.0,0.333,0.0,2.0,0.0,2.0,2.0,1.0,0.0,4.0,4.0,3.0,0.0,1.0,0.0,3.0,6.0,7.0,12100001
3,4,12100001,1610612751,BKN,Brooklyn,1628971,Bruce Brown,Bruce,G,,25.000000:02,5.0,9.0,0.556,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,1.0,0.0,0.0,5.0,12.0,3.0,12100001
4,5,12100001,1610612751,BKN,Brooklyn,1628975,Jevon Carter,Jevon,G,,25.000000:37,3.0,9.0,0.333,2.0,4.0,0.5,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,3.0,4.0,8.0,5.0,12100001
205395,205396,22300291,1610612747,LAL,Los Angeles,1630559,Austin Reaves,Austin,,,22.000000:09,6.0,11.0,0.545,1.0,4.0,0.25,5.0,6.0,0.833,1.0,3.0,4.0,1.0,3.0,0.0,1.0,2.0,18.0,15.0,22300291
205396,205397,22300291,1610612747,LAL,Los Angeles,1629637,Jaxson Hayes,Jaxson,,,6.000000:06,1.0,2.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,-6.0,22300291
205397,205398,22300291,1610612747,LAL,Los Angeles,1641720,Jalen Hood-Schifino,Jalen,,,3.000000:05,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.0,22300291
205398,205399,22300291,1610612747,LAL,Los Angeles,1641721,Maxwell Lewis,Maxwell,,,2.000000:54,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-7.0,22300291
205399,205400,22300291,1610612747,LAL,Los Angeles,1629060,Rui Hachimura,Rui,,DNP - Coach's Decision,,,,,,,,,,,,,,,,,,,,,22300291


In [41]:
# Compare the player ids found in the box scores with those in the player table.
boxscore_player_ids = set(df.PLAYER_ID)
known_player_ids = set(df_player.PERSON_ID)
unknown_player_ids = boxscore_player_ids - known_player_ids
# First number: ids only in the box scores; second: ids only in the player table.
print(len(unknown_player_ids), len(known_player_ids - boxscore_player_ids))
# Random sample of box-score rows whose player is missing from the player table.
df[df.PLAYER_ID.isin(unknown_player_ids)].sample(15, random_state=SEED)

274 3790


Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE
177250,196468,12300042,1610612743,DEN,Denver,1629618,Jalen Pickett,Jalen,,,21.000000:43,3.0,4.0,0.75,1.0,2.0,0.5,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,0.0,1.0,4.0,7.0,4.0,12300042,2023-10-15
181864,201142,22300130,1610612762,UTA,Utah,1641729,Brice Sensabaugh,Brice,,DNP - Coach's Decision,,,,,,,,,,,,,,,,,,,,,22300130,2023-11-02
2139,2140,12100057,1610612743,DEN,Denver,1630793,Giorgi Bezhanishvili,Giorgi,,DNP - Coach's Decision,,,,,,,,,,,,,,,,,,,,,12100057,2021-10-14
178799,198077,22300018,1610612765,DET,Detroit,1631204,Marcus Sasser,Marcus,,,8.000000:25,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,22300018,2023-11-14
177257,196475,12300042,1610612743,DEN,Denver,1631421,Au'Diese Toney,Au'Diese,,,0.000000:56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12300042,2023-10-15
184640,203918,22300235,1610612746,LAC,LA,1641738,Kobe Brown,Kobe,,DNP - Coach's Decision,,,,,,,,,,,,,,,,,,,,,22300235,2023-11-22
176531,195722,12300022,1610612757,POR,Portland,1642022,George Conditt IV,George,,DNP - Coach's Decision,,,,,,,,,,,,,,,,,,,,,12300022,2023-10-10
175942,195096,12300006,1610612755,PHI,Philadelphia,1641741,Ricky Council IV,Ricky,,,14.000000:06,1.0,4.0,0.25,0.0,0.0,0.0,1.0,2.0,0.5,2.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,3.0,-5.0,12300006,2023-10-08
184404,203682,22300226,1610612743,DEN,Denver,1629618,Jalen Pickett,Jalen,,,8.000000:09,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,22300226,2023-11-22
177913,197179,12300062,1610612765,DET,Detroit,1631204,Marcus Sasser,Marcus,,,26.000000:14,5.0,12.0,0.417,2.0,7.0,0.286,5.0,6.0,0.833,1.0,3.0,4.0,6.0,0.0,0.0,4.0,0.0,17.0,-6.0,12300062,2023-10-19


In [33]:
# Compare the game ids found in the box scores with those in the games table.
boxscore_game_ids = set(df.GAME_ID)
known_game_ids = set(df_game.GAME_ID)
unknown_game_ids = boxscore_game_ids - known_game_ids
# First number: ids only in the box scores; second: ids only in the games table.
print(len(unknown_game_ids), len(known_game_ids - boxscore_game_ids))
# Random sample of box-score rows whose game is missing from the games table.
df[df.GAME_ID.isin(unknown_game_ids)].sample(5, random_state=SEED)

5 0


Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR
33854,33855,32100003,1610616839,NIQ,Team,196293907,Tiffany Haddish,Tiffany,,,,,,,,,,,,,,,,,,,,,,,32100003
52262,52263,2072100001,1612709911,IWA,Iowa,1630816,Artur Labinowicz,Artur,,,11.000000:33,3.0,5.0,0.6,2.0,4.0,0.5,0.0,0.0,0.0,2.0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,8.0,0.0,2072100001
52256,52257,2072100001,1612709911,IWA,Iowa,1630665,Matt Lewis,Matt,G,,23.000000:00,6.0,8.0,0.75,5.0,6.0,0.833,1.0,1.0,1.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,20.0,10.0,2072100001
52536,52537,2072100013,1612709930,GLI,G League,1629080,Malik Pope,Malik,F,,22.000000:13,3.0,5.0,0.6,0.0,2.0,0.0,0.0,1.0,0.0,2.0,3.0,5.0,1.0,0.0,0.0,0.0,1.0,6.0,-10.0,2072100013
52273,52274,2072100002,1612709930,GLI,G League,1628614,Dakarai Allen,Dakarai,G,,24.000000:48,2.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,2.0,0.0,2.0,2.0,5.0,6.0,-18.0,2072100002


# Train the model

In [3]:
# Box-score rows joined to their game date, restricted to teams and players
# that are present in the teams / player_0 tables.
df = get_df_from_db("""
select bs.*, g.GAME_DATE_EST as GAME_DATE from boxscoretraditionalv2_0 as bs
left join games as g on bs.GAME_ID = g.GAME_ID
where (bs.TEAM_ID in (select distinct t.id from teams as t)) and (bs.PLAYER_ID in (select distinct p.PERSON_ID from player_0 as p))
;
""")
print(df.shape)

# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()
print(df.shape)

# Keep only the rows that carry a MIN value.
df = df.loc[df['MIN'].notna()].copy()
print(df.shape)

# Reduce the game timestamp to a plain calendar date.
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE']).dt.date

# MIN looks like "24.000000:05": split at the colon and combine into fractional minutes.
df[['_min', '_sec']] = df['MIN'].str.split(':', expand=True)
df['min_sec'] = df['_min'].astype(float) + df['_sec'].astype(int) / 60
df.head()

(185472, 32)
(184148, 32)
(150671, 32)


Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE,_min,_sec,min_sec
0,1,12100001,1610612751,BKN,Brooklyn,1627761,DeAndre' Bembry,DeAndre',F,,24.000000:05,2.0,3.0,0.667,0.0,1.0,0.0,4.0,8.0,0.5,2.0,3.0,5.0,2.0,0.0,0.0,0.0,1.0,8.0,8.0,12100001,2021-10-03,24.0,5,24.083333
1,2,12100001,1610612751,BKN,Brooklyn,200794,Paul Millsap,Paul,F,,18.000000:27,4.0,10.0,0.4,2.0,4.0,0.5,0.0,0.0,0.0,4.0,6.0,10.0,3.0,1.0,2.0,2.0,2.0,10.0,13.0,12100001,2021-10-03,18.0,27,18.45
2,3,12100001,1610612751,BKN,Brooklyn,200746,LaMarcus Aldridge,LaMarcus,C,,14.000000:43,2.0,6.0,0.333,0.0,2.0,0.0,2.0,2.0,1.0,0.0,4.0,4.0,3.0,0.0,1.0,0.0,3.0,6.0,7.0,12100001,2021-10-03,14.0,43,14.716667
3,4,12100001,1610612751,BKN,Brooklyn,1628971,Bruce Brown,Bruce,G,,25.000000:02,5.0,9.0,0.556,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,1.0,0.0,0.0,5.0,12.0,3.0,12100001,2021-10-03,25.0,2,25.033333
4,5,12100001,1610612751,BKN,Brooklyn,1628975,Jevon Carter,Jevon,G,,25.000000:37,3.0,9.0,0.333,2.0,4.0,0.5,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,3.0,4.0,8.0,5.0,12100001,2021-10-03,25.0,37,25.616667


In [69]:
# Count rows by the leading digits of GAME_ID (everything above the lowest six digits).
df['GAME_ID'].floordiv(1_000_000).value_counts()

22    80660
21    48494
12     6528
42     5483
11     4756
41     3455
52      372
51       18
Name: GAME_ID, dtype: int64

In [67]:
# Bucket GAME_ID by its rounded base-10 logarithm and count the rows per bucket.
# Note: rounding (rather than flooring) puts the boundary near 10**7.5, not at 10**8.
np.log10(df['GAME_ID']).round().value_counts()

7.0    140438
8.0      9328
Name: GAME_ID, dtype: int64

In [68]:
# Rows whose GAME_ID has a base-10 logarithm that rounds to 8.
game_id_log_bucket = np.log10(df['GAME_ID']).round()
df[game_id_log_bucket == 8]

Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE,_min,_sec,min_sec
33740,33899,42100101,1610612737,ATL,Atlanta,1629631,De'Andre Hunter,De'Andre,F,,34.000000:50,6.0,8.0,0.750,2.0,2.0,1.000,0.0,0.0,0.000,0.0,3.0,3.0,0.0,1.0,0.0,2.0,3.0,14.0,-31.0,0042100101,2022-04-17,34.000000,50,34.833333
33741,33900,42100101,1610612737,ATL,Atlanta,201568,Danilo Gallinari,Danilo,F,,29.000000:12,5.0,12.0,0.417,1.0,3.0,0.333,6.0,6.0,1.000,0.0,5.0,5.0,0.0,0.0,0.0,1.0,2.0,17.0,-17.0,0042100101,2022-04-17,29.000000,12,29.200000
33742,33901,42100101,1610612737,ATL,Atlanta,1630168,Onyeka Okongwu,Onyeka,C,,20.000000:33,1.0,3.0,0.333,0.0,0.0,0.000,1.0,2.0,0.500,3.0,4.0,7.0,1.0,0.0,1.0,0.0,4.0,3.0,-29.0,0042100101,2022-04-17,20.000000,33,20.550000
33743,33902,42100101,1610612737,ATL,Atlanta,1628989,Kevin Huerter,Kevin,G,,23.000000:13,3.0,8.0,0.375,2.0,6.0,0.333,0.0,0.0,0.000,1.0,0.0,1.0,2.0,1.0,1.0,2.0,2.0,8.0,-21.0,0042100101,2022-04-17,23.000000,13,23.216667
33744,33903,42100101,1610612737,ATL,Atlanta,1629027,Trae Young,Trae,G,,28.000000:27,1.0,12.0,0.083,0.0,7.0,0.000,6.0,7.0,0.857,1.0,5.0,6.0,4.0,2.0,0.0,6.0,1.0,8.0,-22.0,0042100101,2022-04-17,28.000000,27,28.450000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175178,194878,52200211,1610612750,MIN,Minnesota,1629162,Jordan McLaughlin,Jordan,,,6.000000:58,0.0,1.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0052200211,2023-04-14,6.000000,58,6.966667
175179,194879,52200211,1610612750,MIN,Minnesota,1631169,Josh Minott,Josh,,,3.000000:53,2.0,2.0,1.000,1.0,1.0,1.000,2.0,2.0,1.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2.0,0052200211,2023-04-14,3.000000,53,3.883333
175180,194880,52200211,1610612750,MIN,Minnesota,1630233,Nathan Knight,Nathan,,,3.000000:53,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,2.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,0052200211,2023-04-14,3.000000,53,3.883333
175181,194881,52200211,1610612750,MIN,Minnesota,1631111,Wendell Moore Jr.,Wendell,,,3.000000:00,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0052200211,2023-04-14,3.000000,00,3.000000


In [63]:
# Rows whose GAME_ID lies between 0 and 20000000 (Series.between includes both bounds by default).
low_game_id_mask = df['GAME_ID'].between(0, 20_000_000)
df.loc[low_game_id_mask]

Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE,_min,_sec,min_sec
0,1,12100001,1610612751,BKN,Brooklyn,1627761,DeAndre' Bembry,DeAndre',F,,24.000000:05,2.0,3.0,0.667,0.0,1.0,0.000,4.0,8.0,0.5,2.0,3.0,5.0,2.0,0.0,0.0,0.0,1.0,8.0,8.0,0012100001,2021-10-03,24.000000,05,24.083333
1,2,12100001,1610612751,BKN,Brooklyn,200794,Paul Millsap,Paul,F,,18.000000:27,4.0,10.0,0.400,2.0,4.0,0.500,0.0,0.0,0.0,4.0,6.0,10.0,3.0,1.0,2.0,2.0,2.0,10.0,13.0,0012100001,2021-10-03,18.000000,27,18.450000
2,3,12100001,1610612751,BKN,Brooklyn,200746,LaMarcus Aldridge,LaMarcus,C,,14.000000:43,2.0,6.0,0.333,0.0,2.0,0.000,2.0,2.0,1.0,0.0,4.0,4.0,3.0,0.0,1.0,0.0,3.0,6.0,7.0,0012100001,2021-10-03,14.000000,43,14.716667
3,4,12100001,1610612751,BKN,Brooklyn,1628971,Bruce Brown,Bruce,G,,25.000000:02,5.0,9.0,0.556,2.0,2.0,1.000,0.0,0.0,0.0,0.0,3.0,3.0,2.0,1.0,0.0,0.0,5.0,12.0,3.0,0012100001,2021-10-03,25.000000,02,25.033333
4,5,12100001,1610612751,BKN,Brooklyn,1628975,Jevon Carter,Jevon,G,,25.000000:37,3.0,9.0,0.333,2.0,4.0,0.500,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,3.0,4.0,8.0,5.0,0012100001,2021-10-03,25.000000,37,25.616667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178317,197595,12300073,1610612744,GSW,Golden State,1627780,Gary Payton II,Gary,,,13.000000:45,2.0,4.0,0.500,1.0,3.0,0.333,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,5.0,1.0,0012300073,2023-10-20,13.000000,45,13.750000
178318,197596,12300073,1610612744,GSW,Golden State,1630541,Moses Moody,Moses,,,16.000000:58,7.0,11.0,0.636,4.0,6.0,0.667,0.0,0.0,0.0,3.0,5.0,8.0,0.0,1.0,1.0,2.0,3.0,18.0,14.0,0012300073,2023-10-20,16.000000,58,16.966667
178319,197597,12300073,1610612744,GSW,Golden State,1631218,Trayce Jackson-Davis,Trayce,,,12.000000:00,3.0,3.0,1.000,0.0,0.0,0.000,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,2.0,2.0,1.0,6.0,2.0,0012300073,2023-10-20,12.000000,00,12.000000
178320,197598,12300073,1610612744,GSW,Golden State,1641764,Brandin Podziemski,Brandin,,,12.000000:00,1.0,4.0,0.250,0.0,2.0,0.000,0.0,2.0,0.0,2.0,3.0,5.0,4.0,2.0,0.0,0.0,1.0,2.0,2.0,0012300073,2023-10-20,12.000000,00,12.000000


In [72]:
# Keep only rows with GAME_ID in [20000000, 30000000].
# NOTE(review): presumably this selects one type of game (e.g. regular season) — confirm.
game_id_in_range = df['GAME_ID'].between(20000000, 30000000, inclusive='both')
print(game_id_in_range.mean())  # share of rows that survive the filter
print(df.shape)
df = df[game_id_in_range].copy()
print(df.shape)

0.8623719669350854
(149766, 35)
(129154, 35)


In [4]:
# Time-based split: train on [START_OF_TRAIN_DATA, START_OF_TEST_DATA),
# test on everything from START_OF_TEST_DATA onwards.
START_OF_TRAIN_DATA = '2021-10-19'
START_OF_TEST_DATA = '2023-10-23'
train_start = pd.to_datetime(START_OF_TRAIN_DATA).date()
test_start = pd.to_datetime(START_OF_TEST_DATA).date()

# inclusive='left' keeps train_start and excludes test_start.
df_train = df[df.GAME_DATE.between(train_start, test_start, inclusive='left')].copy()
# `>=` rather than `>`: the train window already excludes test_start, so a strict
# comparison would drop rows dated exactly START_OF_TEST_DATA from both sets.
df_test = df[df.GAME_DATE >= test_start].copy()
print(df_train.shape, df_test.shape)
# Sanity check: no game may appear on both sides of the split (expected: empty set).
print(set(df_train.GAME_ID) & set(df_test.GAME_ID))

(59543, 35) (6262, 35)
set()


In [5]:
# Columns fed to the model: playing time followed by the box-score counting stats.
COLS = [
    'min_sec',
    'FGM', 'FGA',
    'FG3M', 'FG3A',
    'FTM', 'FTA',
    'REB', 'AST', 'STL', 'BLK', 'TO', 'PF',
    'PTS', 'PLUS_MINUS',
]

In [7]:
def cast_to_0_1(preds):
    """Convert -1/1 predictions into 0/1 outlier flags.

    Input uses -1 for outliers and 1 for inliers; the result uses
    1 for outliers and 0 for inliers.
    """
    outlier_mask = preds == -1
    return outlier_mask.astype(int)
    
# Feature matrices for the two sides of the split.
X_train = df_train[COLS]
X_test = df_test[COLS]

# Share of samples the isolation forest is told to treat as outliers.
CONTAMINATION = 0.01

# Standardise the features, then fit the isolation forest on the training set.
scaler_step = ('scaler', StandardScaler())
forest_step = (
    'isolation_forest',
    IsolationForest(contamination=CONTAMINATION, random_state=SEED),
)
clf = Pipeline(steps=[scaler_step, forest_step])
clf.fit(X_train)

# Outlier flags (1 = outlier) and raw outlier scores for the test set.
_preds = cast_to_0_1(clf.predict(X_test))
_scores = clf.decision_function(X_test)
print(_preds.sum())  # number of test rows flagged as outliers

67


In [8]:
# Flag outliers on the training set itself: print the array shape and the flagged count.
train_preds = cast_to_0_1(clf.predict(X_train))
print(train_preds.shape, train_preds.sum())

(59543,) 596


In [9]:
# Test rows flagged as outliers; show the first ten.
flagged_mask = _preds == 1
df_rep = df_test[flagged_mask].copy()
df_rep.head(10)

Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE,_min,_sec,min_sec
177844,197773,22300007,1610612763,MEM,Memphis,1630217,Desmond Bane,Desmond,G,,44.000000:47,13.0,26.0,0.5,3.0,9.0,0.333,4.0,5.0,0.8,1.0,7.0,8.0,7.0,4.0,3.0,3.0,5.0,33.0,11.0,22300007,2023-11-03,44.0,47,44.783333
177869,197798,22300008,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,35.000000:48,8.0,21.0,0.381,1.0,4.0,0.25,16.0,19.0,0.842,7.0,9.0,16.0,3.0,0.0,2.0,5.0,3.0,33.0,1.0,22300008,2023-11-10,35.0,48,35.8
178157,198086,22300019,1610612754,IND,Indiana,1630169,Tyrese Haliburton,Tyrese,G,,38.000000:26,11.0,18.0,0.611,7.0,12.0,0.583,4.0,5.0,0.8,1.0,6.0,7.0,15.0,2.0,1.0,0.0,1.0,33.0,19.0,22300019,2023-11-14,38.0,26,38.433333
178169,198098,22300019,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,37.000000:40,11.0,22.0,0.5,0.0,4.0,0.0,17.0,17.0,1.0,3.0,9.0,12.0,6.0,3.0,0.0,5.0,4.0,39.0,-9.0,22300019,2023-11-14,37.0,40,37.666667
178378,198307,22300027,1610612766,CHA,Charlotte,1630163,LaMelo Ball,LaMelo,G,,34.000000:59,12.0,28.0,0.429,2.0,9.0,0.222,11.0,12.0,0.917,0.0,5.0,5.0,5.0,1.0,0.0,6.0,2.0,37.0,-18.0,22300027,2023-11-17,34.0,59,34.983333
178495,198424,22300032,1610612758,SAC,Sacramento,1628368,De'Aaron Fox,De'Aaron,G,,40.000000:43,14.0,24.0,0.583,5.0,11.0,0.455,10.0,13.0,0.769,0.0,8.0,8.0,7.0,1.0,1.0,1.0,2.0,43.0,3.0,22300032,2023-11-17,40.0,43,40.716667
178603,198532,22300036,1610612747,LAL,Los Angeles,203076,Anthony Davis,Anthony,C,,34.000000:47,8.0,19.0,0.421,0.0,1.0,0.0,0.0,1.0,0.0,4.0,10.0,14.0,2.0,3.0,5.0,6.0,5.0,16.0,1.0,22300036,2023-11-17,34.0,47,34.783333
178683,198612,22300039,1610612754,IND,Indiana,1630169,Tyrese Haliburton,Tyrese,G,,38.000000:31,11.0,18.0,0.611,9.0,15.0,0.6,6.0,7.0,0.857,1.0,4.0,5.0,16.0,3.0,1.0,6.0,0.0,37.0,16.0,22300039,2023-11-21,38.0,31,38.516667
178720,198649,22300040,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,41.000000:02,9.0,21.0,0.429,1.0,3.0,0.333,13.0,16.0,0.813,3.0,10.0,13.0,5.0,2.0,5.0,6.0,6.0,32.0,3.0,22300040,2023-11-21,41.0,2,41.033333
178813,198742,22300044,1610612756,PHX,Phoenix,1626164,Devin Booker,Devin,G,,33.000000:22,15.0,21.0,0.714,3.0,4.0,0.75,7.0,9.0,0.778,1.0,4.0,5.0,4.0,4.0,1.0,5.0,3.0,40.0,28.0,22300044,2023-11-24,33.0,22,33.366667


## Save, load and re-test the model

In [10]:
joblib.dump(clf, f'isolation_forest_model_{START_OF_TRAIN_DATA}_{START_OF_TEST_DATA}.joblib')
!du -hs isolation_forest_model_*.joblib

852K	isolation_forest_model_2021-10-19_2023-10-23.joblib


In [11]:
# Reload the pipeline from disk. The file name is rebuilt from the same date
# constants used when saving, so changing the dates cannot silently load a stale model.
# NOTE: joblib.load unpickles the file — only load model files you trust.
restored_if = joblib.load(f'isolation_forest_model_{START_OF_TRAIN_DATA}_{START_OF_TEST_DATA}.joblib')

In [12]:
# Re-score the test set with the model restored from disk.
# (Predicting with `clf` here would never exercise the reloaded model.)
_preds = cast_to_0_1(restored_if.predict(X_test))
_scores = restored_if.decision_function(X_test)  # raw outlier scores
# The save/load round trip must not change any prediction.
assert np.array_equal(_preds, cast_to_0_1(clf.predict(X_test))), \
    "restored model disagrees with the in-memory model"
print(np.sum(_preds))
df_rep = df_test.loc[_preds == 1].copy()
df_rep.head(10)

67


Unnamed: 0,index,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_ID_STR,GAME_DATE,_min,_sec,min_sec
177844,197773,22300007,1610612763,MEM,Memphis,1630217,Desmond Bane,Desmond,G,,44.000000:47,13.0,26.0,0.5,3.0,9.0,0.333,4.0,5.0,0.8,1.0,7.0,8.0,7.0,4.0,3.0,3.0,5.0,33.0,11.0,22300007,2023-11-03,44.0,47,44.783333
177869,197798,22300008,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,35.000000:48,8.0,21.0,0.381,1.0,4.0,0.25,16.0,19.0,0.842,7.0,9.0,16.0,3.0,0.0,2.0,5.0,3.0,33.0,1.0,22300008,2023-11-10,35.0,48,35.8
178157,198086,22300019,1610612754,IND,Indiana,1630169,Tyrese Haliburton,Tyrese,G,,38.000000:26,11.0,18.0,0.611,7.0,12.0,0.583,4.0,5.0,0.8,1.0,6.0,7.0,15.0,2.0,1.0,0.0,1.0,33.0,19.0,22300019,2023-11-14,38.0,26,38.433333
178169,198098,22300019,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,37.000000:40,11.0,22.0,0.5,0.0,4.0,0.0,17.0,17.0,1.0,3.0,9.0,12.0,6.0,3.0,0.0,5.0,4.0,39.0,-9.0,22300019,2023-11-14,37.0,40,37.666667
178378,198307,22300027,1610612766,CHA,Charlotte,1630163,LaMelo Ball,LaMelo,G,,34.000000:59,12.0,28.0,0.429,2.0,9.0,0.222,11.0,12.0,0.917,0.0,5.0,5.0,5.0,1.0,0.0,6.0,2.0,37.0,-18.0,22300027,2023-11-17,34.0,59,34.983333
178495,198424,22300032,1610612758,SAC,Sacramento,1628368,De'Aaron Fox,De'Aaron,G,,40.000000:43,14.0,24.0,0.583,5.0,11.0,0.455,10.0,13.0,0.769,0.0,8.0,8.0,7.0,1.0,1.0,1.0,2.0,43.0,3.0,22300032,2023-11-17,40.0,43,40.716667
178603,198532,22300036,1610612747,LAL,Los Angeles,203076,Anthony Davis,Anthony,C,,34.000000:47,8.0,19.0,0.421,0.0,1.0,0.0,0.0,1.0,0.0,4.0,10.0,14.0,2.0,3.0,5.0,6.0,5.0,16.0,1.0,22300036,2023-11-17,34.0,47,34.783333
178683,198612,22300039,1610612754,IND,Indiana,1630169,Tyrese Haliburton,Tyrese,G,,38.000000:31,11.0,18.0,0.611,9.0,15.0,0.6,6.0,7.0,0.857,1.0,4.0,5.0,16.0,3.0,1.0,6.0,0.0,37.0,16.0,22300039,2023-11-21,38.0,31,38.516667
178720,198649,22300040,1610612755,PHI,Philadelphia,203954,Joel Embiid,Joel,C,,41.000000:02,9.0,21.0,0.429,1.0,3.0,0.333,13.0,16.0,0.813,3.0,10.0,13.0,5.0,2.0,5.0,6.0,6.0,32.0,3.0,22300040,2023-11-21,41.0,2,41.033333
178813,198742,22300044,1610612756,PHX,Phoenix,1626164,Devin Booker,Devin,G,,33.000000:22,15.0,21.0,0.714,3.0,4.0,0.75,7.0,9.0,0.778,1.0,4.0,5.0,4.0,4.0,1.0,5.0,3.0,40.0,28.0,22300044,2023-11-24,33.0,22,33.366667
