In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

In [2]:
# Importing datasets
# Salary data set: https://www.kaggle.com/datasets/hultm28/nba-player-salary-data-2002-2017/data
# Season data set: https://www.kaggle.com/code/koki25ando/nba-salary-prediction-using-multiple-regression/input

season_data = pd.read_csv("Seasons_Stats.csv") # Contains data of a players performance each season (pts, ast, etc)
# The latest season this goes up to is 2016-2017

salary_data = pd.read_csv("NBASalaryData03-17.csv") # Salaries of players in a given season
# The latest season this goes up to is 2017-2018

# Cleaning data

#### Season data

In [3]:
season_data.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


In [4]:
pd.unique(season_data.columns)

array(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS',
       'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS',
       'WS/48', 'blank2', 'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS'], dtype=object)

In [5]:
# Dropping first column
season_data.drop(columns = season_data.columns[0], inplace = True)

In [6]:
# Check which columns have NAs
for col in season_data.columns:
    print(col + " "  + str(season_data[col].isna().sum()))

Year 67
Player 67
Pos 67
Age 75
Tm 67
G 67
GS 6458
MP 553
PER 590
TS% 153
3PAr 5852
FTr 166
ORB% 3899
DRB% 3899
TRB% 3120
AST% 2136
STL% 3899
BLK% 3899
TOV% 5109
USG% 5051
blanl 24691
OWS 106
DWS 106
WS 106
WS/48 590
blank2 24691
OBPM 3894
DBPM 3894
BPM 3894
VORP 3894
FG 67
FGA 67
FG% 166
3P 5764
3PA 5764
3P% 9275
2P 67
2PA 67
2P% 195
eFG% 166
FT 67
FTA 67
FT% 925
ORB 3894
DRB 3894
TRB 379
AST 67
STL 3894
BLK 3894
TOV 5046
PF 67
PTS 67


In [7]:
season_data.isna().any()

Year      True
Player    True
Pos       True
Age       True
Tm        True
G         True
GS        True
MP        True
PER       True
TS%       True
3PAr      True
FTr       True
ORB%      True
DRB%      True
TRB%      True
AST%      True
STL%      True
BLK%      True
TOV%      True
USG%      True
blanl     True
OWS       True
DWS       True
WS        True
WS/48     True
blank2    True
OBPM      True
DBPM      True
BPM       True
VORP      True
FG        True
FGA       True
FG%       True
3P        True
3PA       True
3P%       True
2P        True
2PA       True
2P%       True
eFG%      True
FT        True
FTA       True
FT%       True
ORB       True
DRB       True
TRB       True
AST       True
STL       True
BLK       True
TOV       True
PF        True
PTS       True
dtype: bool

In [8]:
# Drop rows that are all NAs
season_data.dropna(how = "all", inplace = True)

# Removing seasons prior to the 2002-2003 season
season_data = season_data[season_data.Year >= 2003]

In [9]:
# Making season years more descriptive
def seasonYears(year_col):
    begin = year_col - 1
    season_year = str(begin) + "-" + str(year_col)
    return(season_year)

season_data["Year"] = season_data["Year"].astype(int)
season_data["Year"] = season_data["Year"].apply(lambda x:seasonYears(x))

# Changing column names
season_data.rename(columns= {"Year": "season"}, inplace = True)
season_data.rename(columns= {"Tm": "team"}, inplace = True)

In [10]:
season_data.shape

(8671, 52)

In [11]:
# Printing all players
names = pd.unique(season_data.Player)
for name in names:
    print(name)
    
# Some players have * next to their name. These are HOF players, but we will need to remove the * for 
# the data cleaning

Tariq Abdul-Wahad
Shareef Abdur-Rahim
Courtney Alexander
Malik Allen
Ray Allen
Rafer Alston
John Amaechi
Chris Andersen
Derek Anderson
Kenny Anderson
Shandon Anderson
Robert Archibald
Gilbert Arenas
Brandon Armstrong
Darrell Armstrong
Carlos Arroyo
Chucky Atkins
Stacey Augmon
Dalibor Bagaric
Vin Baker
Brent Barry
Jon Barry
Maceo Baston
Mengke Bateer
Mike Batiste
Tony Battie
Shane Battier
Lonny Baxter
Raja Bell
Jonathan Bender
Corey Benjamin
Travis Best
Mike Bibby
Chauncey Billups
Corie Blount
Mark Blount
Calvin Booth
Carlos Boozer
Ruben Boumtje-Boumtje
Bruce Bowen
Ryan Bowen
Earl Boykins
Michael Bradley
Shawn Bradley
Elton Brand
J.R. Bremer
Jamison Brewer
Primoz Brezec
Damone Brown
Devin Brown
Kedrick Brown
Kwame Brown
P.J. Brown
Randy Brown
Tierre Brown
Rick Brunson
Kobe Bryant
Mark Bryant
Greg Buckner
Pat Burke
Caron Butler
Rasual Butler
Jason Caffey
Marcus Camby
Elden Campbell
Brian Cardinal
Anthony Carter
Vince Carter
Sam Cassell
Kelvin Cato
Tyson Chandler
Calbert Cheaney
Chris Chi

In [12]:
season_data["Player"] = season_data["Player"].str.replace("\*", "", regex = True)

One of the biggest challenges with this data pre-processing is addressing the issue when a player is traded during the season. When this occurs, that player will have two or more rows showing their statistics for each team that they played for. However, the machine learning models in this case will interpret these rows as being multiple players, when in fact they only represent one player. This can lead to issues in the training/test data as the data may be skewed if a player performs drastically different on one team than on another. This is addressed by "combining" the rows of each player that played on multiple teams in one season.

In [13]:
dup_df = season_data[season_data.duplicated(subset = ["season", "Player"], keep = False)]
# keep = False here is essential as it marks ALL duplicates as True, not limiting 
# it up till the first or last occurence

non_dup_df = season_data[~season_data.duplicated(subset = ["season", "Player"], keep = False)]

In [14]:
agg_functions = {"season": "first", "PTS": "sum", "AST": "sum", "BLK": "sum", "TOV": "sum", 
                "STL": "sum", "ORB": "sum", "DRB": "sum", "MP": "sum", "G": "sum", "GS": "sum", "TOV": "sum",
                "PF": "sum", "Pos": "first", "Age": "first", "FTA": "sum", "FGA": "sum", "3PA": "sum",
                "PER": "mean", "WS": "sum"}  
# Most of these are sums, ie total number of games, points, etc that a player has accumulated during their time with
# multiple teams in a particular season. 

dup_df = dup_df.groupby("Player").aggregate(agg_functions).reset_index()
dup_df["team"] = "TOT"

#dup_df[dup_df.duplicated(["Player", "season"])]

In [15]:
dup_df = dup_df[['season', 'Player', 'Pos', 'Age', 'team', 'PTS', 'AST', 'BLK', 'TOV', 'STL', 'ORB',
       'DRB', 'MP', 'G', 'GS', 'PF', 'FTA', 'FGA', '3PA',
       'PER', 'WS']]
dup_df.head()

Unnamed: 0,season,Player,Pos,Age,team,PTS,AST,BLK,TOV,STL,...,DRB,MP,G,GS,PF,FTA,FGA,3PA,PER,WS
0,2014-2015,A.J. Price,PG,28.0,TOT,266.0,92.0,0.0,28.0,14.0,...,52.0,648.0,52.0,0.0,30.0,48.0,274.0,114.0,8.8,0.6
1,2010-2011,Aaron Brooks,PG,26.0,TOT,3310.0,1160.0,52.0,566.0,232.0,...,444.0,7676.0,368.0,88.0,710.0,574.0,2968.0,1236.0,11.911111,8.2
2,2009-2010,Aaron Gray,C,25.0,TOT,338.0,88.0,38.0,98.0,38.0,...,260.0,1332.0,138.0,12.0,236.0,96.0,286.0,2.0,10.566667,2.0
3,2004-2005,Aaron Williams,C,33.0,TOT,590.0,50.0,58.0,94.0,44.0,...,320.0,2212.0,180.0,18.0,416.0,150.0,478.0,0.0,8.183333,3.7
4,2009-2010,Acie Law,PG,25.0,TOT,656.0,220.0,2.0,120.0,86.0,...,130.0,1920.0,154.0,2.0,154.0,226.0,522.0,104.0,11.685714,2.0


In [16]:
non_dup_df = non_dup_df[['season', 'Player', 'Pos', 'Age', 'team', 'PTS', 'AST', 'BLK', 'TOV', 'STL', 'ORB',
       'DRB', 'MP', 'G', 'GS', 'PF', 'FTA', 'FGA', '3PA', '3P', 'FG', 'FT',
       'PER', 'WS']]

non_dup_df.head()

Unnamed: 0,season,Player,Pos,Age,team,PTS,AST,BLK,TOV,STL,...,GS,PF,FTA,FGA,3PA,3P,FG,FT,PER,WS
16006,2002-2003,Tariq Abdul-Wahad,SG,28.0,DAL,57.0,21.0,3.0,7.0,6.0,...,0.0,26.0,6.0,58.0,1.0,0.0,27.0,3.0,12.4,0.4
16007,2002-2003,Shareef Abdur-Rahim,PF,26.0,ATL,1608.0,242.0,38.0,212.0,87.0,...,81.0,240.0,541.0,1183.0,60.0,21.0,566.0,455.0,19.9,9.7
16008,2002-2003,Courtney Alexander,PG,25.0,NOH,523.0,79.0,6.0,68.0,31.0,...,7.0,125.0,146.0,505.0,57.0,19.0,193.0,118.0,9.3,1.1
16009,2002-2003,Malik Allen,PF,24.0,MIA,767.0,54.0,78.0,128.0,37.0,...,73.0,234.0,121.0,790.0,4.0,0.0,335.0,97.0,9.9,0.9
16013,2002-2003,Rafer Alston,PG,26.0,TOR,366.0,192.0,15.0,86.0,38.0,...,4.0,120.0,54.0,335.0,130.0,51.0,139.0,37.0,14.4,1.2


In [17]:
season_data_new = pd.concat([non_dup_df, dup_df])

In [18]:
# Getting average game statistics for each player (ie points per game, rebounds per game, etc)
season_data_new["ppg"] = season_data_new["PTS"] / season_data_new["G"]
season_data_new["ast_pg"] = season_data_new["AST"] / season_data_new["G"]
season_data_new["blk_pg"] = season_data_new["BLK"] / season_data_new["G"]
season_data_new["tov_pg"] = season_data_new["TOV"] / season_data_new["G"]
season_data_new["stl_pg"] = season_data_new["STL"] / season_data_new["G"]
season_data_new["reb_pg"] = (season_data_new["ORB"] + season_data_new["DRB"]) / season_data_new["G"]
season_data_new["min_pg"] = season_data_new["MP"] / season_data_new["G"]
season_data_new["FG%"] = season_data_new["FG"] / season_data_new["FGA"]
season_data_new["FT%"] = season_data_new["FT"] / season_data_new["FTA"]
season_data_new["3P%"] = season_data_new["3P"] / season_data_new["3PA"]

In [19]:
season_data_new = season_data_new.sort_values("season")
season_data_new.reset_index(drop = True)
season_data_new.head()

#season_data_new[season_data_new.duplicated(["Player", "season"])]

Unnamed: 0,season,Player,Pos,Age,team,PTS,AST,BLK,TOV,STL,...,ppg,ast_pg,blk_pg,tov_pg,stl_pg,reb_pg,min_pg,FG%,FT%,3P%
16006,2002-2003,Tariq Abdul-Wahad,SG,28.0,DAL,57.0,21.0,3.0,7.0,6.0,...,4.071429,1.5,0.214286,0.5,0.428571,2.857143,14.571429,0.465517,0.5,0.0
16364,2002-2003,Zach Randolph,PF,21.0,POR,650.0,41.0,14.0,62.0,42.0,...,8.441558,0.532468,0.181818,0.805195,0.545455,4.454545,16.896104,0.512621,0.757764,0.0
16363,2002-2003,Igor Rakocevic,PG,24.0,MIN,78.0,33.0,0.0,23.0,4.0,...,1.857143,0.785714,0.0,0.547619,0.095238,0.404762,5.809524,0.37931,0.805556,0.416667
16362,2002-2003,Vladimir Radmanovic,PF,22.0,SEA,724.0,97.0,22.0,100.0,64.0,...,10.055556,1.347222,0.305556,1.388889,0.888889,4.486111,26.527778,0.41018,0.705882,0.354949
16361,2002-2003,Joel Przybilla,C,23.0,MIL,48.0,12.0,45.0,19.0,10.0,...,1.5,0.375,1.40625,0.59375,0.3125,4.53125,17.0625,0.391304,0.5,


In [20]:
season_data_new.shape
# note that the dimensions are smaller from season_data

(6668, 34)

In [21]:
season_data_new.to_csv("Season_Stats_New.csv")

#### Salary Data

In [22]:
salary_data.head()

Unnamed: 0,team,salary,player,position,season
0,Minnesota Timberwolves,25200000.0,Kevin Garnett,PF,2002-2003
1,Portland Trail Blazers,13500000.0,Damon Stoudamire,PG,2002-2003
2,Seattle SuperSonics,13080000.0,Gary Payton,PG,2002-2003
3,Seattle SuperSonics,12375000.0,Ray Allen,SG,2002-2003
4,New York Knicks,12375000.0,Latrell Sprewell,SG,2002-2003


In [23]:
# Renaming columns
salary_data.rename(columns = {"player": "Player"}, inplace = True)

# Scaling salary to millions
salary_data["salary"] = salary_data["salary"] / 1000000

In [24]:
dup_df_s = salary_data[salary_data.duplicated(subset = ["season", "Player"], keep = False)]
dup_df_s.head()

Unnamed: 0,team,salary,Player,position,season
1666,New Jersey Nets,1.1802,Marcus Williams,PG,2007-2008
1772,San Antonio Spurs,0.050254,Marcus Williams,SG,2007-2008
4020,Oklahoma City Thunder,0.473604,Chris Wright,SF,2012-2013
4031,Minnesota Timberwolves,0.346781,Chris Johnson,C,2012-2013
4065,Memphis Grizzlies,0.055718,Chris Johnson,SG,2012-2013


In [25]:
non_dup_df_s = salary_data[~salary_data.duplicated(subset = ["season", "Player"], keep = False)]
non_dup_df_s.head()

Unnamed: 0,team,salary,Player,position,season
0,Minnesota Timberwolves,25.2,Kevin Garnett,PF,2002-2003
1,Portland Trail Blazers,13.5,Damon Stoudamire,PG,2002-2003
2,Seattle SuperSonics,13.08,Gary Payton,PG,2002-2003
3,Seattle SuperSonics,12.375,Ray Allen,SG,2002-2003
4,New York Knicks,12.375,Latrell Sprewell,SG,2002-2003


In [26]:
agg_functions = {"season": "first", "position": "first", "salary": "mean"}
dup_df_s = dup_df_s.groupby("Player").aggregate(agg_functions).reset_index()
dup_df_s["team"] = 'Two or more teams'

In [27]:
dup_df_s = dup_df_s[["team", "salary", "Player", "position", "season"]]
dup_df_s.head()

Unnamed: 0,team,salary,Player,position,season
0,Two or more teams,0.201249,Chris Johnson,C,2012-2013
1,Two or more teams,0.250731,Chris Wright,SF,2012-2013
2,Two or more teams,0.615227,Marcus Williams,PG,2007-2008


In [28]:
salary_data_new = pd.concat([non_dup_df_s, dup_df_s])
#salary_data_new[salary_data_new.duplicated(["Player", "season"])]

salary_data_new.to_csv("NBASalaryData03-17_New.csv")

#### Joined Data

In [29]:
print(pd.unique(season_data_new.team))
# Note TOT means a player played for 2 or more teams in a season

print(pd.unique(salary_data_new.team))

['DAL' 'POR' 'MIN' 'SEA' 'MIL' 'DET' 'NYK' 'SAC' 'BOS' 'ATL' 'LAC' 'MEM'
 'SAS' 'CLE' 'LAL' 'UTA' 'NOH' 'TOR' 'NJN' 'DEN' 'ORL' 'PHI' 'WAS' 'GSW'
 'CHI' 'HOU' 'PHO' 'IND' 'MIA' 'TOT' 'CHA' 'NOK' 'OKC' 'BRK' 'NOP' 'CHO']
['Minnesota Timberwolves' 'Portland Trail Blazers' 'Seattle SuperSonics'
 'New York Knicks' 'Boston Celtics' 'Phoenix Suns' 'San Antonio Spurs'
 'Orlando Magic' 'Indiana Pacers' 'Los Angeles Lakers' 'Toronto Raptors'
 'Dallas Mavericks' 'Sacramento Kings' 'Miami Heat' 'Milwaukee Bucks'
 'Atlanta Hawks' 'Philadelphia 76ers' 'New Jersey Nets' 'Houston Rockets'
 'Utah Jazz' 'Denver Nuggets' 'Detroit Pistons' 'New Orleans Hornets'
 'Chicago Bulls' 'Memphis Grizzlies' 'Golden State Warriors' 'LA Clippers'
 'Washington Wizards' 'Cleveland Cavaliers' 'Charlotte Bobcats'
 'NO/Oklahoma City\r\r\n Hornets' 'Brooklyn Nets'
 'NO/Oklahoma City Hornets' 'Oklahoma City Thunder' 'Madrid Real Madrid'
 'null Unknown' 'New Orleans Pelicans' 'Charlotte Hornets' 'Milan Olimpia'
 'Bilbao Bas

In [30]:
# Abbreviating NBA team names
def teamName(tm_col):
    nba_teams = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Charlotte Bobcats': 'CHA',
    'Charlotte Hornets': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'LA Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New Orleans Hornets': 'NOH',
    'New York Knicks': 'NYK',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'New Jersey Nets': 'NJN',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Seattle SuperSonics': 'SEA',   
    'Toronto Raptors': 'TOR',
    'Two or more teams': 'TOT',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS'}
    
    for key in nba_teams:
        if tm_col == key:
            tm_abbrv = nba_teams[key]
            return(tm_abbrv)
        
salary_data_new["team"] = salary_data_new["team"].apply(lambda x: teamName(x))

In [31]:
salary_data_new.head()

Unnamed: 0,team,salary,Player,position,season
0,MIN,25.2,Kevin Garnett,PF,2002-2003
1,POR,13.5,Damon Stoudamire,PG,2002-2003
2,SEA,13.08,Gary Payton,PG,2002-2003
3,SEA,12.375,Ray Allen,SG,2002-2003
4,NYK,12.375,Latrell Sprewell,SG,2002-2003


In [32]:
joined = season_data_new.merge(salary_data_new, on = ["Player", "season", "team"], how = "inner")
joined = joined.drop(columns = ["position"])
joined.head()

Unnamed: 0,season,Player,Pos,Age,team,PTS,AST,BLK,TOV,STL,...,ast_pg,blk_pg,tov_pg,stl_pg,reb_pg,min_pg,FG%,FT%,3P%,salary
0,2002-2003,Zach Randolph,PF,21.0,POR,650.0,41.0,14.0,62.0,42.0,...,0.532468,0.181818,0.805195,0.545455,4.454545,16.896104,0.512621,0.757764,0.0,1.096
1,2002-2003,Vladimir Radmanovic,PF,22.0,SEA,724.0,97.0,22.0,100.0,64.0,...,1.347222,0.305556,1.388889,0.888889,4.486111,26.527778,0.41018,0.705882,0.354949,1.561
2,2002-2003,Joel Przybilla,C,23.0,MIL,48.0,12.0,45.0,19.0,10.0,...,0.375,1.40625,0.59375,0.3125,4.53125,17.0625,0.391304,0.5,,1.862
3,2002-2003,Vitaly Potapenko,C,27.0,SEA,104.0,4.0,8.0,25.0,9.0,...,0.153846,0.307692,0.961538,0.346154,3.423077,15.5,0.44086,0.758621,,5.238
4,2002-2003,Scot Pollard,C,27.0,SAC,103.0,6.0,15.0,15.0,13.0,...,0.26087,0.652174,0.652174,0.565217,4.608696,14.130435,0.45977,0.605263,,4.827


In [33]:
joined.shape

(4433, 35)

In [34]:
pd.unique(joined.columns)
#joined[joined.duplicated(["season", "Player"])]

array(['season', 'Player', 'Pos', 'Age', 'team', 'PTS', 'AST', 'BLK',
       'TOV', 'STL', 'ORB', 'DRB', 'MP', 'G', 'GS', 'PF', 'FTA', 'FGA',
       '3PA', '3P', 'FG', 'FT', 'PER', 'WS', 'ppg', 'ast_pg', 'blk_pg',
       'tov_pg', 'stl_pg', 'reb_pg', 'min_pg', 'FG%', 'FT%', '3P%',
       'salary'], dtype=object)

In [35]:
joined.to_csv("joined.csv")

In [36]:
joined.isna().any()

season    False
Player    False
Pos       False
Age       False
team      False
PTS       False
AST       False
BLK       False
TOV       False
STL       False
ORB       False
DRB       False
MP        False
G         False
GS        False
PF        False
FTA       False
FGA       False
3PA       False
3P         True
FG         True
FT         True
PER        True
WS        False
ppg       False
ast_pg    False
blk_pg    False
tov_pg    False
stl_pg    False
reb_pg    False
min_pg    False
FG%        True
FT%        True
3P%        True
salary    False
dtype: bool