In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [131]:
# stats tables
# Each season's CSV is read once, tagged with its season id and collected in a
# list; every list is concatenated a single time after the loop. Growing a
# frame with pd.concat inside the loop re-copies all accumulated rows on each
# pass, and concatenating onto an empty DataFrame is deprecated in recent
# pandas releases.
mvp_frames = []
team_frames = []
game_frames = []
totals_frames = []

# season ids 12..21; the per-game and totals files carry both the id and id + 1
# in their names
for i in range(12, 22):
    # mvp share data
    mvp_i = pd.read_csv('../data/mvp/{}_mvp.csv'.format(i))
    mvp_i['Season'] = i
    mvp_frames.append(mvp_i)

    # team win-loss data
    team_i = pd.read_csv('../data/team/{}_team.csv'.format(i))
    team_i['Season'] = i
    team_frames.append(team_i)

    # player per-game data
    game_i = pd.read_csv('../data/per_game/{}_{}_game.csv'.format(i, i + 1))
    game_i['Season'] = i
    game_frames.append(game_i)

    # player season total data
    totals_i = pd.read_csv('../data/totals/{}_{}_totals.csv'.format(i, i + 1))
    totals_i['Season'] = i
    totals_frames.append(totals_i)

# one concat per table; ignore_index gives each result a fresh 0..n-1 index
mvp = pd.concat(mvp_frames, ignore_index=True)
team = pd.concat(team_frames, ignore_index=True)
game = pd.concat(game_frames, ignore_index=True)
totals = pd.concat(totals_frames, ignore_index=True)

In [132]:
## clean mvp share data ##
# keep only the player name, the vote share and the season tag
mvp_columns = ['Player', 'Share', 'Season']
mvp = mvp.loc[:, mvp_columns]

In [133]:
# preview the first rows of the trimmed mvp table
mvp.head()

Unnamed: 0,Player,Share,Season
0,LeBron James,0.998,12
1,Kevin Durant,0.632,12
2,Carmelo Anthony,0.393,12
3,Chris Paul,0.239,12
4,Kobe Bryant,0.152,12


In [134]:
## clean team data ##

# full franchise name -> abbreviation, so the team table can later be joined
# against the 'Tm' column of the player tables
teams = {
    'Atlanta Hawks': 'ATL',
    'Brooklyn Nets': 'BRK',
    'Boston Celtics': 'BOS',
    'Charlotte Hornets': 'CHO',
    'Charlotte Bobcats': 'CHA',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New Orleans Hornets': 'NOH',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS'
}

# select the needed columns, swap names for abbreviations, then rename the
# column to 'Tm'; names missing from the mapping are left unchanged by replace
team = (
    team.loc[:, ['Team', 'W/L%', 'Season']]
    .replace({'Team': teams})
    .rename(columns={'Team': 'Tm'})
)

In [135]:
# preview the first rows of the team table after abbreviation and renaming
team.head()

Unnamed: 0,Tm,W/L%,Season
0,OKC,0.732,12
1,MIA,0.805,12
2,SAS,0.707,12
3,LAC,0.683,12
4,DEN,0.695,12


In [136]:
## clean game data ##
# strip literal '*' characters from player names (regex=False treats the
# asterisk as plain text) and discard the 'Player-additional' column
player_names = game['Player'].str.replace('*', '', regex=False)
game = game.assign(Player=player_names).drop(columns='Player-additional')

In [137]:
# preview the first rows of the cleaned per-game table
game.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,1,Quincy Acy,PF,22,TOR,29,0,11.8,1.4,2.6,...,1.0,1.6,2.7,0.4,0.4,0.5,0.6,1.8,4.0,12
1,2,Jeff Adrien,PF,26,CHA,52,5,13.7,1.4,3.2,...,1.3,2.5,3.8,0.7,0.3,0.5,0.6,1.5,4.0,12
2,3,Arron Afflalo,SF,27,ORL,64,64,36.0,6.2,14.1,...,0.5,3.3,3.7,3.2,0.6,0.2,2.2,2.1,16.5,12
3,4,Josh Akognon,PG,26,DAL,3,0,3.0,0.7,1.3,...,0.0,0.3,0.3,0.3,0.0,0.0,0.0,1.0,1.7,12
4,5,Cole Aldrich,C,24,TOT,45,0,8.6,1.0,1.8,...,0.7,2.0,2.7,0.2,0.1,0.5,0.5,1.3,2.2,12


In [138]:
## clean season totals data ##
# columns removed from the totals table before it is joined to the per-game table
dropped_columns = ['Rk', 'Pos', 'Age', 'Tm', 'G', 'GS', 'Player-additional']
# join keys keep their names; every other column gets a 'T_' prefix so it
# cannot collide with a per-game column of the same name
key_columns = ['Player', 'Season']

totals = (
    totals
    .assign(Player=totals['Player'].str.replace('*', '', regex=False))
    .drop(columns=dropped_columns)
    .rename(columns=lambda col: col if col in key_columns else 'T_' + col)
)

In [139]:
# preview the first rows of the prefixed season-totals table
totals.head()

Unnamed: 0,Player,T_MP,T_FG,T_FGA,T_FG%,T_3P,T_3PA,T_3P%,T_2P,T_2PA,...,T_ORB,T_DRB,T_TRB,T_AST,T_STL,T_BLK,T_TOV,T_PF,T_PTS,Season
0,Quincy Acy,342,42,75,0.56,1,2,0.5,41,73,...,30,47,77,11,13,15,17,53,116,12
1,Jeff Adrien,713,72,168,0.429,0,2,0.0,72,166,...,68,128,196,36,18,27,32,80,209,12
2,Arron Afflalo,2307,397,905,0.439,72,240,0.3,325,665,...,29,210,239,206,40,11,138,137,1057,12
3,Josh Akognon,9,2,4,0.5,1,2,0.5,1,2,...,0,1,1,1,0,0,0,3,5,12
4,Cole Aldrich,388,44,80,0.55,0,0,,44,80,...,30,90,120,9,5,23,23,60,100,12


In [167]:
# combine per-game and season total stats
# NOTE(review): the join uses only (Player, Season). If either table holds
# several rows for one player in one season, this inner join pairs every
# combination of them -- confirm the keys are unique in both tables.
combined = pd.merge(game, totals, how='inner', on=['Player', 'Season'])
data = combined.drop(columns='Rk')
print(data.shape)
data.head()

(5949, 53)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,T_FT%,T_ORB,T_DRB,T_TRB,T_AST,T_STL,T_BLK,T_TOV,T_PF,T_PTS
0,Quincy Acy,PF,22,TOR,29,0,11.8,1.4,2.6,0.56,...,0.816,30,47,77,11,13,15,17,53,116
1,Jeff Adrien,PF,26,CHA,52,5,13.7,1.4,3.2,0.429,...,0.65,68,128,196,36,18,27,32,80,209
2,Arron Afflalo,SF,27,ORL,64,64,36.0,6.2,14.1,0.439,...,0.857,29,210,239,206,40,11,138,137,1057
3,Josh Akognon,PG,26,DAL,3,0,3.0,0.7,1.3,0.5,...,,0,1,1,1,0,0,0,3,5
4,Cole Aldrich,C,24,TOT,45,0,8.6,1.0,1.8,0.55,...,0.6,30,90,120,9,5,23,23,60,100


In [168]:
# add team win percentage to dataset
# inner join: player rows whose (Tm, Season) pair has no row in `team` are
# dropped here, which is why the row count falls after this step
data = pd.merge(data, team, how='inner', on=['Tm', 'Season'])
print(data.shape)
data.head()

(4480, 54)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,T_ORB,T_DRB,T_TRB,T_AST,T_STL,T_BLK,T_TOV,T_PF,T_PTS,W/L%
0,Quincy Acy,PF,22,TOR,29,0,11.8,1.4,2.6,0.56,...,30,47,77,11,13,15,17,53,116,0.415
1,Alan Anderson,SF,30,TOR,65,2,23.0,3.6,9.5,0.383,...,34,114,148,103,48,7,80,131,693,0.415
2,Andrea Bargnani,PF,27,TOR,35,25,28.7,4.9,12.2,0.399,...,26,102,128,38,21,23,54,64,443,0.415
3,DeMar DeRozan,SG,23,TOR,82,82,36.7,6.7,15.0,0.445,...,48,272,320,204,76,24,151,173,1485,0.415
4,Landry Fields,SF,24,TOR,51,22,20.3,2.0,4.4,0.457,...,59,149,208,60,32,8,42,63,240,0.415


In [169]:
## remove outliers by thresholding stats

# one boolean mask per rule, all built from the same unfiltered frame and
# combined once; the reference figures in the comments are from statmuse

# lowest points per game by mvp -- 13.8
scored_enough = data['PTS'] > 12

# lowest recorded number of games started by an mvp -- 49
started_enough = data['GS'] > 40

# worst win percentage -- 45.8%
won_enough = data['W/L%'] >= 0.45

# empty three point percentages are NaN, and NaN >= 0 is False, so these two
# masks drop the rows with missing values
has_3p_pct = data['3P%'] >= 0
has_total_3p_pct = data['T_3P%'] >= 0

data = data[scored_enough & started_enough & won_enough & has_3p_pct & has_total_3p_pct]

print(data.shape)
data.head()

(576, 54)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,T_ORB,T_DRB,T_TRB,T_AST,T_STL,T_BLK,T_TOV,T_PF,T_PTS,W/L%
53,Shawn Marion,PF,34,DAL,67,67,30.0,5.3,10.3,0.514,...,146,379,525,163,75,47,103,111,812,0.5
54,O.J. Mayo,SG,25,DAL,82,82,35.5,5.6,12.5,0.449,...,36,255,291,361,93,23,210,195,1255,0.5
56,Dirk Nowitzki,PF,34,DAL,53,47,31.3,6.5,13.7,0.471,...,37,326,363,132,38,37,70,93,917,0.5
90,Chris Bosh,C,28,MIA,74,74,33.2,6.6,12.3,0.535,...,131,370,501,123,66,101,128,168,1232,0.805
96,LeBron James,PF,28,MIA,76,76,37.9,10.1,17.8,0.565,...,97,513,610,551,129,67,226,110,2036,0.805


In [170]:
# add player mvp share to dataset
# The join keys are named explicitly. Without `on`, merge joins on every
# column the two frames happen to share, so an extra common column would
# silently change which rows match.
data = data.merge(mvp, how='left', on=['Player', 'Season'])
# the left join leaves NaN for players with no row in `mvp`; treat as zero share
data['Share'] = data['Share'].fillna(0)
# position and team abbreviation are not kept in the final dataset
data = data.drop(['Pos', 'Tm'], axis=1)
print(data.shape)
data.head()

(576, 53)


Unnamed: 0,Player,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,T_DRB,T_TRB,T_AST,T_STL,T_BLK,T_TOV,T_PF,T_PTS,W/L%,Share
0,Shawn Marion,34,67,67,30.0,5.3,10.3,0.514,0.3,1.1,...,379,525,163,75,47,103,111,812,0.5,0.0
1,O.J. Mayo,25,82,82,35.5,5.6,12.5,0.449,1.7,4.3,...,255,291,361,93,23,210,195,1255,0.5,0.0
2,Dirk Nowitzki,34,53,47,31.3,6.5,13.7,0.471,1.2,3.0,...,326,363,132,38,37,70,93,917,0.5,0.0
3,Chris Bosh,28,74,74,33.2,6.6,12.3,0.535,0.3,1.0,...,370,501,123,66,101,128,168,1232,0.805,0.0
4,LeBron James,28,76,76,37.9,10.1,17.8,0.565,1.4,3.3,...,513,610,551,129,67,226,110,2036,0.805,0.998


In [172]:
# write the assembled dataset to disk; no `index` argument is passed, so
# pandas' default applies and the row index is written as the first column
data.to_csv('../data/final_stats.csv')