## Importing base dataset with unique identifiers and machine learning targets

In [1]:
# Importing initial data
import pandas as pd
# Read the base games table and discard its 'ID' column in one step
df = pd.read_csv('baseGames.csv').drop(columns=['ID'])
# Print the column / dtype / non-null summary of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16979 entries, 0 to 16978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GAME_DATE_EST   16979 non-null  object 
 1   GAME_ID         16979 non-null  int64  
 2   SEASON          16979 non-null  int64  
 3   HOME_TEAM_ID    16979 non-null  int64  
 4   HOME_W%         16979 non-null  float64
 5   AWAY_TEAM_ID    16979 non-null  int64  
 6   AWAY_W%         16979 non-null  float64
 7   HOME_TEAM_WINS  16979 non-null  int64  
 8   MARGIN          16979 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 1.2+ MB


## Importing stats dataset with basic and advanced stats (previous season)

In [2]:
# Importing team stats data
stats = pd.read_excel('teamStatsRaw.xlsx')

# Separating team statistics into groups - cannot use all 80 columns
# Both groups keep 'nextSeason' and 'teamID', the keys the later merges join on
stats_basic = stats[['nextSeason','teamID','Win %','FG%','3P%','FT%','REB','AST','TOV','STL','BLK']]
stats_advanced = stats[['nextSeason','teamID','MOV','PIE (Team)','Dean Oliver Team FF Rating','Dean Oliver Opp FF Rating',
                        'Dean Oliver Net FF Rating','Sully Team FF Rating','Sully Opp FF Rating','Sully Net FF Rating']]
# Make an independent copy of the base dataframe to use with advanced stats
# (a plain `df2 = df` would only bind a second name to the same object, so any
# in-place change to one would show up in the other)
df2 = df.copy()

## Adding basic stats for each team (previous season)

In [3]:
# Attach each home team's basic stats by matching (SEASON, HOME_TEAM_ID)
# against (nextSeason, teamID) in stats_basic.
# validate='many_to_one' makes pandas raise a MergeError if stats_basic holds more
# than one row for a key pair, instead of silently duplicating game rows.
df = pd.merge(df, stats_basic, how='left', left_on=['SEASON','HOME_TEAM_ID'],
              right_on=['nextSeason','teamID'], validate='many_to_one')

# Stats from previous season are named including PS
df = df.rename(columns={'Win %':'HOME_PS_W%', 'FG%':'HOME_PS_FG%', '3P%':'HOME_PS_3P%', 'FT%':'HOME_PS_FT%', 
                        'REB':'HOME_PS_REB', 'AST':'HOME_PS_AST', 'TOV':'HOME_PS_TOV', 'STL':'HOME_PS_STL', 'BLK':'HOME_PS_BLK'})
# The right-hand join keys are no longer needed once the merge is done
df = df.drop(columns=['nextSeason','teamID'])

In [4]:
# Attach each away team's basic stats by matching (SEASON, AWAY_TEAM_ID)
# against (nextSeason, teamID) in stats_basic.
# validate='many_to_one' makes pandas raise a MergeError if stats_basic holds more
# than one row for a key pair, instead of silently duplicating game rows.
df = pd.merge(df, stats_basic, how='left', left_on=['SEASON','AWAY_TEAM_ID'],
              right_on=['nextSeason','teamID'], validate='many_to_one')

# Stats from previous season are named including PS
df = df.rename(columns={'Win %':'AWAY_PS_W%', 'FG%':'AWAY_PS_FG%', '3P%':'AWAY_PS_3P%', 'FT%':'AWAY_PS_FT%', 
                        'REB':'AWAY_PS_REB', 'AST':'AWAY_PS_AST', 'TOV':'AWAY_PS_TOV', 'STL':'AWAY_PS_STL', 'BLK':'AWAY_PS_BLK'})
# The right-hand join keys are no longer needed once the merge is done
df = df.drop(columns=['nextSeason','teamID'])

## Finalize and export games with basic stats added as 'games_basic.csv'

In [5]:
# Rearrange columns as needed: game identifiers, home-team block,
# away-team block, then the two target columns
cols = (
    ['GAME_DATE_EST', 'GAME_ID', 'SEASON']
    + ['HOME_TEAM_ID', 'HOME_W%',
       'HOME_PS_W%', 'HOME_PS_FG%', 'HOME_PS_3P%', 'HOME_PS_FT%', 'HOME_PS_REB',
       'HOME_PS_AST', 'HOME_PS_TOV', 'HOME_PS_STL', 'HOME_PS_BLK']
    + ['AWAY_TEAM_ID', 'AWAY_W%',
       'AWAY_PS_W%', 'AWAY_PS_FG%', 'AWAY_PS_3P%', 'AWAY_PS_FT%', 'AWAY_PS_REB',
       'AWAY_PS_AST', 'AWAY_PS_TOV', 'AWAY_PS_STL', 'AWAY_PS_BLK']
    + ['HOME_TEAM_WINS', 'MARGIN']
)
# Keep exactly the listed columns, in the listed order
df = df.loc[:, cols]

# Export modified dataframe to a new csv file, "games_basic.csv"
# (the dataframe index is written as the first, unnamed column)
df.to_csv('games_basic.csv')

## Adding advanced stats for each team (previous season)

In [6]:
# Attach each home team's advanced stats by matching (SEASON, HOME_TEAM_ID)
# against (nextSeason, teamID) in stats_advanced.
# validate='many_to_one' makes pandas raise a MergeError if stats_advanced holds more
# than one row for a key pair, instead of silently duplicating game rows.
df2 = pd.merge(df2, stats_advanced, how='left', left_on=['SEASON','HOME_TEAM_ID'],
               right_on=['nextSeason','teamID'], validate='many_to_one')

# Stats from previous season are named including PS
df2 = df2.rename(columns={'MOV':'HOME_PS_MOV','PIE (Team)':'HOME_PS_PIE','Dean Oliver Team FF Rating':'HOME_PS_TEAM_DOFF',
                        'Dean Oliver Opp FF Rating':'HOME_PS_OPP_DOFF','Dean Oliver Net FF Rating':'HOME_PS_NET_DOFF',
                        'Sully Team FF Rating':'HOME_PS_TEAM_SFF','Sully Opp FF Rating':'HOME_PS_OPP_SFF',
                        'Sully Net FF Rating':'HOME_PS_NET_SFF'})
# The right-hand join keys are no longer needed once the merge is done
df2 = df2.drop(columns=['nextSeason','teamID'])

In [7]:
# Attach each away team's advanced stats by matching (SEASON, AWAY_TEAM_ID)
# against (nextSeason, teamID) in stats_advanced.
# validate='many_to_one' makes pandas raise a MergeError if stats_advanced holds more
# than one row for a key pair, instead of silently duplicating game rows.
df2 = pd.merge(df2, stats_advanced, how='left', left_on=['SEASON','AWAY_TEAM_ID'],
               right_on=['nextSeason','teamID'], validate='many_to_one')

# Stats from previous season are named including PS
df2 = df2.rename(columns={'MOV':'AWAY_PS_MOV','PIE (Team)':'AWAY_PS_PIE','Dean Oliver Team FF Rating':'AWAY_PS_TEAM_DOFF',
                        'Dean Oliver Opp FF Rating':'AWAY_PS_OPP_DOFF','Dean Oliver Net FF Rating':'AWAY_PS_NET_DOFF',
                        'Sully Team FF Rating':'AWAY_PS_TEAM_SFF','Sully Opp FF Rating':'AWAY_PS_OPP_SFF',
                        'Sully Net FF Rating':'AWAY_PS_NET_SFF'})
# The right-hand join keys are no longer needed once the merge is done
df2 = df2.drop(columns=['nextSeason','teamID'])

## Finalize and export games with advanced stats added as 'games_advanced.csv'

In [8]:
# Rearrange columns as needed: game identifiers, home-team block,
# away-team block, then the two target columns
cols = (
    ['GAME_DATE_EST', 'GAME_ID', 'SEASON']
    + ['HOME_TEAM_ID', 'HOME_W%',
       'HOME_PS_MOV', 'HOME_PS_PIE',
       'HOME_PS_TEAM_DOFF', 'HOME_PS_OPP_DOFF', 'HOME_PS_NET_DOFF',
       'HOME_PS_TEAM_SFF', 'HOME_PS_OPP_SFF', 'HOME_PS_NET_SFF']
    + ['AWAY_TEAM_ID', 'AWAY_W%',
       'AWAY_PS_MOV', 'AWAY_PS_PIE',
       'AWAY_PS_TEAM_DOFF', 'AWAY_PS_OPP_DOFF', 'AWAY_PS_NET_DOFF',
       'AWAY_PS_TEAM_SFF', 'AWAY_PS_OPP_SFF', 'AWAY_PS_NET_SFF']
    + ['HOME_TEAM_WINS', 'MARGIN']
)
# Keep exactly the listed columns, in the listed order
df2 = df2.loc[:, cols]

# Export modified dataframe to a new csv file, "games_advanced.csv"
# (the dataframe index is written as the first, unnamed column)
df2.to_csv('games_advanced.csv')