# Data

- In this chunk, I'm loading in the csv files I have collected for NBA data.
- The `defense_dashboard`, `box_outs`, `defense_2pt`, `defense_3pt`, and `hustle_stats` objects were collected from NBA.com statistics using a Data Scraper.
  - All from the 2023-2024 NBA Regular Season
- The `dpoy_voting` data was collected from [Basketball-Reference](https://www.basketball-reference.com/awards/awards_2024.html#all_dpoy).
  - All NBA Regular Seasons from 2013-2014 through 2023-2024

In [87]:
import pandas as pd

# Read the NBA.com tables; the paths are listed in the same order as the
# names they are unpacked into.
defense_dashboard, box_outs, defense_2pt, defense_3pt, hustle_stats = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/defense_dashboard.csv',
        'data/box_outs.csv',
        'data/defense_2pt.csv',
        'data/defense_3pt.csv',
        'data/hustle_stats.csv',
    )
]

# Defensive Player of the Year voting results from Basketball-Reference
dpoy_voting = pd.read_csv('data/dpoy_voting.csv')

We need to check the columns to see which ones we want to keep and whether any of them should be renamed.

In [88]:
# Collect every loaded table, then print each one's column index so we can
# decide which columns to keep and which need clearer names.
dfs = [
    defense_dashboard,
    box_outs,
    defense_2pt,
    defense_3pt,
    hustle_stats,
    dpoy_voting,
]
for df in dfs:
    print(df.columns)

Index(['Player', 'Team', 'Age', 'Position', 'GP', 'DFGM', 'DFGA', 'DFG_PCT',
       'FG_PCT'],
      dtype='object')
Index(['Player', 'Min', 'box_outs', 'off_box_outs', 'def_box_outs',
       'team_reb_pct', 'player_reb_pct'],
      dtype='object')
Index(['Player', 'Team', 'Age', 'Position', 'GP', 'DFGM', 'DFGA', 'DFG_PCT',
       'FG_PCT'],
      dtype='object')
Index(['Player', 'Team', 'Age', 'Position', 'GP', 'DFGM', 'DFGA', 'DFG_PCT',
       'FG_PCT'],
      dtype='object')
Index(['Player', 'Min', 'Deflections', 'off_LB', 'def_LB', 'charges',
       'contested_shots'],
      dtype='object')
Index(['Rank', 'Player', 'Age', 'Tm', 'Season', 'MVP', 'First', 'Pts Won',
       'Pts Max', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%',
       '3P%', 'FT%', 'WS', 'WS/48', 'DWS', 'DBPM', 'DRtg', '-9999'],
      dtype='object')


In [89]:
# Voting fields kept for the analysis; every other dpoy_voting column is dropped.
voting_columns = [
    'Player', 'Age', 'Tm', 'Season', 'MVP', 'First', 'Pts Won',
    'Pts Max', 'Share', 'G', 'MP', 'STL', 'BLK',
    'DWS', 'DBPM', 'DRtg',
]
dpoy_voting = dpoy_voting[voting_columns]

# Rows for the 2023-24 season only
in_2023_24 = dpoy_voting['Season'] == '2023-24'
dpoy_24 = dpoy_voting[in_2023_24]

# Tag the shot-defense columns with their shot type so the 2pt and 3pt tables
# stay distinguishable. Note that 'DFGA' becomes 'DFA_*' (not 'DFGA_*'); the
# later column selection uses these exact names.
defense_2pt.rename(
    columns={'DFGM': 'DFGM_2pt', 'DFGA': 'DFA_2pt', 'DFG_PCT': 'DFG_pct_2pt'},
    inplace=True,
)
defense_3pt.rename(
    columns={'DFGM': 'DFGM_3pt', 'DFGA': 'DFA_3pt', 'DFG_PCT': 'DFG_pct_3pt'},
    inplace=True,
)

# Filter the 2024 DPOY Candidates

In [90]:

# Names of the players who appear in the 2023-24 DPOY voting table
players = dpoy_24['Player']

# Keep only those players' rows in each NBA.com table
defense_dashboard, box_outs, defense_2pt, defense_3pt, hustle_stats = [
    table[table['Player'].isin(players)]
    for table in (defense_dashboard, box_outs, defense_2pt, defense_3pt, hustle_stats)
]

# Join everything on 'Player', one table at a time. Non-key columns shared by
# both sides of a join come back with _x/_y suffixes, so this merge order
# determines the column names the later selection relies on.
data = (
    defense_dashboard
    .merge(box_outs, on='Player')
    .merge(defense_2pt, on='Player')
    .merge(defense_3pt, on='Player')
    .merge(hustle_stats, on='Player')
)

# Inspect the merged column names
print(data.columns)

Index(['Player', 'Team_x', 'Age_x', 'Position_x', 'GP_x', 'DFGM', 'DFGA',
       'DFG_PCT', 'FG_PCT_x', 'Min_x', 'box_outs', 'off_box_outs',
       'def_box_outs', 'team_reb_pct', 'player_reb_pct', 'Team_y', 'Age_y',
       'Position_y', 'GP_y', 'DFGM_2pt', 'DFA_2pt', 'DFG_pct_2pt', 'FG_PCT_y',
       'Team', 'Age', 'Position', 'GP', 'DFGM_3pt', 'DFA_3pt', 'DFG_pct_3pt',
       'FG_PCT', 'Min_y', 'Deflections', 'off_LB', 'def_LB', 'charges',
       'contested_shots'],
      dtype='object')


Some columns are unnecessary so we will select the columns we want.

In [91]:
# Columns kept from the merged table; suffixed duplicates and offensive
# stats are left out.
cols_desired = [
    'Player', 'Team', 'Age', 'Position', 'GP',
    'DFGM', 'DFGA', 'DFG_PCT',
    'Min_x', 'box_outs', 'def_box_outs', 'team_reb_pct', 'player_reb_pct',
    'DFGM_2pt', 'DFA_2pt', 'DFG_pct_2pt',
    'DFGM_3pt', 'DFA_3pt', 'DFG_pct_3pt',
    'Deflections', 'def_LB', 'charges', 'contested_shots',
]

# Subset in one step and give the minutes column ('Min_x') its final name 'MP'
data = data[cols_desired].rename(columns={'Min_x': 'MP'})

# Confirm the resulting column set
print(data.columns)

Index(['Player', 'Team', 'Age', 'Position', 'GP', 'DFGM', 'DFGA', 'DFG_PCT',
       'MP', 'box_outs', 'def_box_outs', 'team_reb_pct', 'player_reb_pct',
       'DFGM_2pt', 'DFA_2pt', 'DFG_pct_2pt', 'DFGM_3pt', 'DFA_3pt',
       'DFG_pct_3pt', 'Deflections', 'def_LB', 'charges', 'contested_shots'],
      dtype='object')


# Rank the Players
- I decided to add variables ending in '_rank' in order to rank the players by their respective performance in each category
- `higher_is_better` indicates the columns in which a higher value indicates a positive trend. 
  - e.g. Drawing more charges than another player is a defensive benefit.
- `lower_is_better` indicates the columns in which a lower value indicates better performance
  - e.g. Lower Defended Field Goal % often (but not always!) indicates better defense

In [92]:
# Stats where a larger value is treated as better defense
higher_is_better = [
    'DFGA', 'box_outs', 'def_box_outs', 'team_reb_pct',
    'player_reb_pct', 'DFA_2pt', 'DFA_3pt', 'Deflections',
    'def_LB', 'charges', 'contested_shots',
]

# Stats where a smaller value is treated as better defense
# NOTE(review): 'MP' is ranked as lower-is-better here -- confirm that is intended.
lower_is_better = [
    'DFGM', 'DFG_PCT', 'MP',
    'DFGM_2pt', 'DFG_pct_2pt', 'DFGM_3pt', 'DFG_pct_3pt',
]

# Add one '<stat>_rank' column per stat, with rank 1 as the best value:
# descending order for higher-is-better stats, ascending for lower-is-better.
for stat_group, rank_ascending in ((higher_is_better, False), (lower_is_better, True)):
    for col in stat_group:
        data[f'{col}_rank'] = data[col].rank(ascending=rank_ascending)

# Snapshot of just the rank columns, used to compute the average rank
rankings = data.filter(regex='_rank$')

# Remove any column whose name starts with 'off_' (offensive stats)
drop_columns = data.filter(regex='^off_').columns
data = data.drop(columns=drop_columns)

In [93]:
# Average each player's per-stat ranks into a single score (lower is better)
data['average_rank'] = rankings.mean(axis=1)

# Order players from best to worst average rank with a fresh 0..n-1 index
data = data.sort_values(by='average_rank').reset_index(drop=True)

# Attach the 2023-24 voting results, discard the voting table's copies of
# minutes/age and its team column, then restore the unsuffixed names.
data = (
    pd.merge(data, dpoy_24, on='Player')
    .drop(columns=['MP_y', 'Age_y', 'Tm'])
    .rename(columns={'MP_x': 'MP', 'Age_x': 'Age'})
)

# Persist the cleaned table.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if readers of this file do not expect it.
data.to_csv('data/clean.csv')