In [44]:
import os
import pandas as pd
import numpy as np
import time
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Data

In [7]:
if not os.path.exists('../../../data/players'):
    !7z x ../../../data/players.7z -o../../../data/

if not os.path.exists('../../../data/boxscoretraditionalv2'):
    !7z x ../../../data/season_21-22.7z -o../../../data/

In [9]:
dallas_id = teams.find_team_by_abbreviation('DAL')['id']
season_2122 = leaguegamefinder.LeagueGameFinder(team_id_nullable=dallas_id, season_nullable='2021-22', season_type_nullable=leaguegamefinder.SeasonTypeNullable.regular)
season = season_2122.get_data_frames()[0]
print(season.shape)
season.head()

(82, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22021,1610612742,DAL,Dallas Mavericks,22101219,2022-04-10,DAL vs. SAS,W,240,130,46,85,0.541,20,36,0.556,18,23,0.783,10,33,43,34,5,3,16,16,10.0
1,22021,1610612742,DAL,Dallas Mavericks,22101209,2022-04-08,DAL vs. POR,W,241,128,42,79,0.532,20,45,0.444,24,31,0.774,9,42,51,25,9,5,14,19,50.0
2,22021,1610612742,DAL,Dallas Mavericks,22101190,2022-04-06,DAL @ DET,W,238,131,45,80,0.563,15,33,0.455,26,34,0.765,8,33,41,28,8,1,10,24,18.0
3,22021,1610612742,DAL,Dallas Mavericks,22101167,2022-04-03,DAL @ MIL,W,239,118,39,76,0.513,16,38,0.421,24,27,0.889,2,36,38,28,7,3,12,20,6.0
4,22021,1610612742,DAL,Dallas Mavericks,22101152,2022-04-01,DAL @ WAS,L,239,103,34,78,0.436,11,38,0.289,24,26,0.923,5,31,36,14,3,3,13,16,-32.0


In [77]:
object_dtypes = {col: object for col in ['GAME_ID', 'TEAM_ID', 'PLAYER_ID']}
df = (
    pd.concat([pd.read_csv(f'../../../data/boxscoretraditionalv2/boxscoretraditionalv2_0_{game_id}.csv', dtype=object_dtypes) 
               for game_id in season.GAME_ID.iloc[:]])
    .query('TEAM_ABBREVIATION == "DAL"')
    .dropna(subset=['MIN'])
)
df[['_min', '_sec']] = df['MIN'].str.split(':', expand=True)
df['min_sec'] = df._min.astype(float) + df._sec.astype(int) / 60 
print(df.shape)
df.head()

(917, 32)


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,_min,_sec,min_sec
13,22101219,1610612742,DAL,Dallas,203493,Reggie Bullock,Reggie,F,,26.000000:37,1.0,6.0,0.167,1.0,3.0,0.333,1.0,1.0,1.0,1.0,2.0,3.0,2.0,0.0,0.0,1.0,3.0,4.0,16.0,26.0,37,26.616667
14,22101219,1610612742,DAL,Dallas,1627827,Dorian Finney-Smith,Dorian,F,,27.000000:15,6.0,7.0,0.857,4.0,5.0,0.8,0.0,0.0,0.0,0.0,3.0,3.0,1.0,1.0,0.0,0.0,0.0,16.0,14.0,27.0,15,27.25
15,22101219,1610612742,DAL,Dallas,203939,Dwight Powell,Dwight,C,,25.000000:05,5.0,6.0,0.833,0.0,0.0,0.0,2.0,6.0,0.333,3.0,4.0,7.0,2.0,0.0,1.0,3.0,2.0,12.0,11.0,25.0,5,25.083333
16,22101219,1610612742,DAL,Dallas,1628973,Jalen Brunson,Jalen,G,,24.000000:41,7.0,13.0,0.538,3.0,7.0,0.429,1.0,2.0,0.5,0.0,6.0,6.0,5.0,0.0,0.0,1.0,0.0,18.0,12.0,24.0,41,24.683333
17,22101219,1610612742,DAL,Dallas,1629029,Luka Doncic,Luka,G,,28.000000:45,8.0,18.0,0.444,2.0,5.0,0.4,8.0,8.0,1.0,1.0,7.0,8.0,9.0,2.0,1.0,4.0,1.0,26.0,19.0,28.0,45,28.75


In [78]:
df.to_csv('dallas_season_2021_22_boxscoretraditionalv2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 917 entries, 13 to 12
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GAME_ID            917 non-null    object 
 1   TEAM_ID            917 non-null    object 
 2   TEAM_ABBREVIATION  917 non-null    object 
 3   TEAM_CITY          917 non-null    object 
 4   PLAYER_ID          917 non-null    object 
 5   PLAYER_NAME        917 non-null    object 
 6   NICKNAME           917 non-null    object 
 7   START_POSITION     410 non-null    object 
 8   COMMENT            0 non-null      object 
 9   MIN                917 non-null    object 
 10  FGM                917 non-null    float64
 11  FGA                917 non-null    float64
 12  FG_PCT             917 non-null    float64
 13  FG3M               917 non-null    float64
 14  FG3A               917 non-null    float64
 15  FG3_PCT            917 non-null    float64
 16  FTM                917 non

In [34]:
COLS = ['min_sec', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
        'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']

# IForest

In [54]:
from pyod.models.iforest import IForest, check_array
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler

In [67]:
df_train = StandardScaler().fit_transform(df[COLS])
# df_train.info()
check_array(df_train)
CONTAMINATION = 0.1
clfs = [IForest(contamination=CONTAMINATION), OCSVM(contamination=CONTAMINATION), LOF(contamination=CONTAMINATION)]
preds = {}
cnts = {}
scores = {}
for clf in clfs:
    clf.fit(df_train)
    _key = str(clf).split('(')[0]
    _preds = clf.labels_  # binary labels (0: inliers, 1: outliers)
    preds[_key] = _preds
    scores[_key] = clf.decision_scores_  # raw outlier scores
    cnts[_key] = np.sum(_preds)
cnts

{'IForest': 92, 'OCSVM': 92, 'LOF': 92}

In [68]:
voted_preds = None
for _preds in preds.values():
    if voted_preds is None:
        voted_preds = _preds
    else:
        voted_preds = ((voted_preds == 1) & (_preds == 1)).astype(int)
    
np.sum(voted_preds)

8

In [79]:
df_rep = df.loc[voted_preds == 1].copy()
game_id_to_date = season.set_index('GAME_ID').GAME_DATE.to_dict()
df_rep['GAME_DATE'] = df_rep.GAME_ID.map(game_id_to_date)
df_rep

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,_min,_sec,min_sec,GAME_DATE
4,22101059,1610612742,DAL,Dallas,1629029,Luka Doncic,Luka,G,,29.000000:52,13.0,20.0,0.65,8.0,12.0,0.667,3.0,5.0,0.6,0.0,4.0,4.0,3.0,0.0,0.0,4.0,2.0,37.0,-30.0,29.0,52,29.866667,2022-03-19
5,22101014,1610612742,DAL,Dallas,1628467,Maxi Kleber,Maxi,,,28.000000:49,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,12.0,13.0,0.0,0.0,3.0,1.0,3.0,0.0,7.0,28.0,49,28.816667,2022-03-13
6,22100868,1610612742,DAL,Dallas,1628467,Maxi Kleber,Maxi,,,34.000000:55,5.0,10.0,0.5,3.0,8.0,0.375,6.0,8.0,0.75,2.0,4.0,6.0,0.0,1.0,5.0,1.0,3.0,19.0,13.0,34.0,55,34.916667,2022-02-15
17,22100778,1610612742,DAL,Dallas,1628973,Jalen Brunson,Jalen,G,,42.000000:33,10.0,18.0,0.556,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,4.0,2.0,0.0,6.0,5.0,20.0,3.0,42.0,33,42.55,2022-02-02
15,22100705,1610612742,DAL,Dallas,204001,Kristaps Porzingis,Kristaps,C,,25.000000:36,6.0,11.0,0.545,2.0,7.0,0.286,1.0,3.0,0.333,1.0,7.0,8.0,2.0,0.0,6.0,0.0,6.0,15.0,15.0,25.0,36,25.6,2022-01-23
15,22100663,1610612742,DAL,Dallas,1629029,Luka Doncic,Luka,G,,38.000000:40,4.0,17.0,0.235,0.0,6.0,0.0,12.0,14.0,0.857,2.0,9.0,11.0,12.0,3.0,3.0,2.0,2.0,20.0,12.0,38.0,40,38.666667,2022-01-17
13,22100410,1610612742,DAL,Dallas,204001,Kristaps Porzingis,Kristaps,F,,25.000000:17,7.0,16.0,0.438,3.0,8.0,0.375,7.0,9.0,0.778,4.0,9.0,13.0,0.0,0.0,5.0,0.0,2.0,24.0,12.0,25.0,17,25.283333,2021-12-13
17,22100069,1610612742,DAL,Dallas,1628467,Maxi Kleber,Maxi,,,29.000000:18,4.0,8.0,0.5,4.0,8.0,0.5,0.0,0.0,0.0,3.0,7.0,10.0,2.0,1.0,6.0,0.0,5.0,12.0,22.0,29.0,18,29.3,2021-10-28


In [80]:
df_rep.to_csv('predicted_anomalies.csv')