In [1]:
import pandas as pd
import numpy as np
import math as math
import statistics
import time
import datetime
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 1000) 
pd.set_option('display.width', 150)
NBATeams =['ATL','BKN','BOS','CHA','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM',
           'MIA','MIL','MIN','NOP','NYK','OKC','ORL','PHI','PHX','POR','SAC','SAS','TOR','UTA','WAS']

In [2]:
def convertDateColumn(df):
    locationToInsert = df.columns.get_loc('DATE')+1
    dates = pd.to_datetime(df['DATE'],format="%m/%d/%y").dt.date
    df = df.drop(columns='DATE')
    df.insert(locationToInsert,'DATE',dates)
    df = df.sort_values(by=['DATE','GAMEID']).reset_index(drop=True)
    return df
def featureEngineering(df):
    featuresToAddDict = {'OPPT':[],'PTSA':[],'2FGA':[],'2FGM':[],'2FG%':[],
                         'RESULT':[],'PTSDELTA':[],'H/A':[]}
    for i in range(len(df)):
        featuresToAddDict['OPPT'].append(df.iloc[i]['MATCH'][-3:])
        if i % 2 == 0: featuresToAddDict['PTSA'].append(df.iloc[i+1]['PTS'])
        else: featuresToAddDict['PTSA'].append(df.iloc[i-1]['PTS'])
        if df['W/L'][i] == 'L': featuresToAddDict['RESULT'].append(0)
        else: featuresToAddDict['RESULT'].append(1)
        featuresToAddDict['H/A'].append(df['MATCH'][i].count('vs'))
    df.insert(df.columns.get_loc('TEAM')+1,'OPPT',featuresToAddDict['OPPT'])
    df.insert(df.columns.get_loc('PTS')+1,'PTSA',featuresToAddDict['PTSA'])
    df.insert(df.columns.get_loc('FG%')+1,'2FGM',df['FGM'] - df['3PM'])
    df.insert(df.columns.get_loc('FG%')+1,'2FGA',df['FGA'] - df['3PA']) 
    df.insert(df.columns.get_loc('FG%')+1,'2FG%',df['2FGM']/df['2FGA'])
    df.insert(df.columns.get_loc('W/L')+1,'RESULT',featuresToAddDict['RESULT'])
    df = df.drop(columns=['W/L'])
    df.insert(df.columns.get_loc('PTSA')+1,'PTSDELTA',df['PTS']-df['PTSA'])
    df.insert(df.columns.get_loc('DATE')+1,'H/A',featuresToAddDict['H/A'])
    return df
def cleanBoxScore(boxScoreDF):
    if 'Unnamed: 22' in boxScoreDF.columns:
            boxScoreDF = boxScoreDF.drop(columns='Unnamed: 22')
    boxScoreDF['PLAYER']
    players = []
    for playerIter in range(5):
        players.append(boxScoreDF['PLAYER'].iloc[playerIter][:-1])
    for playerIter in range(5,len(boxScoreDF)):
        players.append(boxScoreDF['PLAYER'].iloc[playerIter])
    boxScoreDF = boxScoreDF.drop(columns='PLAYER')
    boxScoreDF.insert(0,'PLAYER',players)
    boxScoreDF = boxScoreDF.reset_index(drop=True).dropna()
    rowsToDrop = []
    for i in range(len(boxScoreDF)):
        if 'DNP' in boxScoreDF.iloc[i]['MIN'] or ' ' in boxScoreDF.iloc[i]['MIN'] or 'SUSPENSION' in boxScoreDF.iloc[i]['MIN']:
            rowsToDrop.append(i)

    boxScoreDF = boxScoreDF.drop(rowsToDrop)
    boxScoreDF['PTS_'] = boxScoreDF['PTS'].astype(int)
    boxScoreDF = boxScoreDF.drop(columns='PTS')
    boxScoreDF = boxScoreDF.rename(columns={'PTS_':'PTS'})
    return boxScoreDF

# Traditional.csv

In [3]:
traditionalDF = pd.read_csv('/Users/olutosinfashusi/jupyter/csv/Traditional.csv')
traditionalDF = convertDateColumn(traditionalDF)
traditionalDF = featureEngineering(traditionalDF)
traditionalDF = traditionalDF.sort_values(by=['DATE', 'GAMEID', 'H/A']).reset_index(drop=True)

# 1,230 TOTAL GAMES IN 2023-24 NBA REGULAR SEASON

In [4]:
len(np.unique(traditionalDF['GAMEID']))

1230

## DETECTED 599 GAMES WHERE TEAM LISTED AS BETTING FAVORITE MISSED BETTING SPREAD

In [5]:
results = {}
for team in NBATeams:
    results[team] = {}
    results[team]['teamListedAsBettingFavoriteButMissedBettingSpread'] = pd.DataFrame
    
for team in NBATeams:
    teamDF = traditionalDF[traditionalDF['TEAM'] == team]
    teamDF = teamDF[teamDF['SPREAD'] < 0]
    teamDF = teamDF.reset_index(drop=True)

    df = pd.DataFrame()
    for i in range(len(teamDF)):
        if teamDF.iloc[i]['PTSDELTA'] < 0:
            df = pd.concat([df,teamDF.iloc[i].to_frame().T],axis=0)
        if teamDF.iloc[i]['PTSDELTA'] > 0:
            if np.abs(teamDF.iloc[i]['SPREAD']) > teamDF.iloc[i]['PTSDELTA']:
                df = pd.concat([df,teamDF.iloc[i].to_frame().T],axis=0)
    results[team]['teamListedAsBettingFavoriteButMissedBettingSpread'] = df.reset_index(drop=True)
    
total = 0
for team in NBATeams:
    total = total + len(results[team]['teamListedAsBettingFavoriteButMissedBettingSpread'])
print(total)

599


In [6]:
playersDict = {}
GAMEIDs = []
players = []

for team in NBATeams:
    playersDict[team] = []
for team in NBATeams:
    teamDF = results[team]['teamListedAsBettingFavoriteButMissedBettingSpread']
    dateOfFirstGame = traditionalDF[traditionalDF['TEAM'] == team].head(1)['DATE'].values[0]
    teamDF = teamDF[teamDF['DATE'] > dateOfFirstGame]
    teamDF = teamDF.reset_index(drop=True);
    
    for i in range (len(teamDF)): 
        GAMEID = str(teamDF.iloc[i]['GAMEID'])
        GAMEIDs.append(GAMEID)
        boxScoreDF = pd.read_csv('/Users/olutosinfashusi/jupyter/csv/boxScoreCSV/boxScoreCSVs/'+ GAMEID +'.csv')
        boxScoreDF = boxScoreDF[boxScoreDF['TEAM'] ==  team]
        boxScoreDF.insert(1,'OPPT',teamDF.iloc[i]['OPPT'])
        boxScoreDF = boxScoreDF.reset_index(drop=True).dropna()
        boxScoreDF = cleanBoxScore(boxScoreDF);

        prevGamesDF = traditionalDF[traditionalDF['TEAM'] == team]
        prevGamesDF  = prevGamesDF[prevGamesDF['DATE'] < teamDF.iloc[i]['DATE']]
        prevGamesDF.reset_index(drop=True)
        
        prevGameIDs = prevGamesDF['GAMEID'].values
        prevOppts = prevGamesDF['OPPT'].values
        
        prevBoxScoresDF = pd.DataFrame()
        for prevIter in range (len(prevGameIDs)):
            GAMEID = str(prevGameIDs[prevIter])
            prevBoxScoreDF = pd.read_csv('/Users/olutosinfashusi/jupyter/csv/boxScoreCSV/boxScoreCSVs/'+ GAMEID +'.csv')
            prevBoxScoreDF = prevBoxScoreDF[prevBoxScoreDF['TEAM'] == team]
            prevBoxScoreDF.insert(1,'OPPT',prevOppts[prevIter])
            prevBoxScoreDF = cleanBoxScore(prevBoxScoreDF)
            prevBoxScoresDF = pd.concat([prevBoxScoresDF,prevBoxScoreDF],axis=0)
        players.append(prevBoxScoresDF['PLAYER'].values)
        prevBoxScoresDF.groupby('PLAYER')['PTS'].mean()
        boxScoreDF.groupby('PLAYER')['PTS'].mean()
        prevAVGPTS = round(pd.DataFrame(prevBoxScoresDF.groupby("PLAYER")['PTS'].mean()),1)

        prevAVGPTS = prevAVGPTS.rename(columns={'PTS':'PREV AVG PTS'})
        df = pd.DataFrame()
        df = pd.concat([boxScoreDF.groupby("PLAYER")['PTS'].mean(),
                        prevAVGPTS.groupby("PLAYER")['PREV AVG PTS'].mean()],axis=1).dropna()
        df['TEAM'] = teamDF.iloc[i]['TEAM']
        df['DATE'] = teamDF.iloc[i]['DATE']
        df['OPPT'] = teamDF.iloc[i]['OPPT']
        df['GAMEID'] = teamDF.iloc[i]['GAMEID']
        df['PTSDELTA'] = teamDF.iloc[i]['PTSDELTA']
        df['SPREAD'] = teamDF.iloc[i]['SPREAD']
    
        playersDict[team].append(df)

# INSPECTING 592 of 599 GAMES <br> FIRST GAME NOT APPLICABLE SINCE PLAYER YET TO HAVE PREVIOUS POINTS SCORED

In [7]:
print(len(GAMEIDs))

592


# 564 PLAYERS THAT PLAYED IN THESE GAME

In [8]:
playersSet = set()
for i in range(len(players)):
    playersSet.update(set(players[i]))
len(playersSet)

564

In [9]:
leagueDetectDF = pd.DataFrame()
for team in NBATeams:
    FLAG = []
    teamDetectDF = pd.DataFrame()
    for i in range (len(playersDict[team])):
        teamDetectDF = pd.concat([teamDetectDF,pd.DataFrame(playersDict[team][i])],axis=0)
    for i in range(len(teamDetectDF)):
        if 10+teamDetectDF.iloc[i]['PTS'] < teamDetectDF.iloc[i]['PREV AVG PTS']:
            FLAG.append('🚩')
        else: FLAG.append(' ')
    teamDetectDF['🚩'] = FLAG
    leagueDetectDF  = leagueDetectDF.append(teamDetectDF[teamDetectDF['🚩'] == '🚩'].sort_values(by='PLAYER'))
leagueDetectDF = leagueDetectDF.sort_values(by='TEAM').reset_index()

# DETECTED 219 SUCH IRREGULARITIES

In [10]:
len(leagueDetectDF)

219

In [11]:
leagueDetectDF

Unnamed: 0,PLAYER,PTS,PREV AVG PTS,TEAM,DATE,OPPT,GAMEID,PTSDELTA,SPREAD,🚩
0,Jalen Johnson,5.0,15.4,ATL,2024-01-13,WAS,22300543,-28,-8.0,🚩
1,Saddiq Bey,2.0,12.1,ATL,2023-11-09,ORL,22300172,1,-3.5,🚩
2,Trae Young,13.0,27.8,ATL,2024-01-12,IND,22300533,-18,-6.0,🚩
3,Trae Young,12.0,27.0,ATL,2024-02-14,CHA,22300777,-23,-7.0,🚩
4,Trae Young,11.0,26.7,ATL,2024-02-23,TOR,22300804,-2,-7.5,🚩
5,Trae Young,14.0,26.4,ATL,2024-04-10,CHA,22301159,-1,-8.5,🚩
6,Lonnie Walker IV,0.0,12.0,BKN,2024-01-27,HOU,22300645,2,-4.0,🚩
7,Lonnie Walker IV,0.0,14.0,BKN,2024-01-07,POR,22300498,-7,-9.0,🚩
8,Cam Thomas,9.0,20.7,BKN,2024-01-07,POR,22300498,-7,-9.0,🚩
9,Cam Thomas,8.0,20.6,BKN,2024-01-17,POR,22300578,-2,-5.0,🚩


# IDENTIFIED PLAYERS WITH MULTIPLE OCCURRENCES

In [12]:
df = pd.DataFrame(np.unique(leagueDetectDF['PLAYER'],return_counts=True)).T.rename(columns={0:'PLAYER',1:'COUNT'}).sort_values(by='COUNT',ascending=False).reset_index(drop=True)
df[df['COUNT'] > 1]

Unnamed: 0,PLAYER,COUNT
0,Jayson Tatum,6
1,Anthony Edwards,6
2,Donovan Mitchell,6
3,Devin Booker,5
4,Tim Hardaway Jr.,5
5,Stephen Curry,5
6,Chet Holmgren,5
7,Paul George,4
8,Jaylen Brown,4
9,Karl-Anthony Towns,4
