# NBA

Load packages

In [1]:
import pandas as pd
import numpy as np

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

# pandas display limits: at most 10 rows and 500 columns are rendered
pd.options.display.max_rows = 10
pd.options.display.max_columns = 500

Retrieve data

In [2]:
# function to retrieve data from basketball-reference.com
def extract(url, i, last_year=2020):
    """Download one table per season and stack them into a single DataFrame.

    Parameters
    ----------
    url : str
        URL template with one ``{}`` placeholder that is filled with the year.
    i : int
        First year to download (inclusive).
    last_year : int, default 2020
        Last year to download (inclusive).

    Returns
    -------
    pandas.DataFrame
        The first table found on each page, concatenated in year order, with
        an added ``season`` column holding the year the rows came from.  Row
        labels are kept as parsed, so they repeat from one season to the next.

    Raises
    ------
    ValueError
        If ``i`` is greater than ``last_year`` (there would be nothing to
        concatenate).
    """
    if i > last_year:
        raise ValueError(
            'first year {} is after last year {}: no seasons to download'.format(i, last_year)
        )

    dfs = []
    for year in range(i, last_year + 1):
        # only the first table on each page is used
        df = pd.read_html(url.format(year), header=None)[0]
        df['season'] = year
        dfs.append(df)
    return pd.concat(dfs)

In [3]:
# download every season into three raw dataframes
TOTALS_URL = 'https://www.basketball-reference.com/leagues/NBA_{}_totals.html'
ADVANCED_URL = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'
AWARDS_URL = 'https://www.basketball-reference.com/awards/awards_{}.html'

# the awards pages are read from 1956 onwards, the other two from 1950
totals = extract(TOTALS_URL, 1950)
advanced = extract(ADVANCED_URL, 1950)
awards = extract(AWARDS_URL, 1956)

In [4]:
# retrieve the number of games in each season
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_seasons', header=None)[0]
# the table is parsed with a multi-level header; drop its top level
df.columns = df.columns.droplevel()
# these three labels select six columns; copy so the rename below acts on an
# independent frame rather than on a slice of df
games = df[['Regular season','No. of games[e]','Finals']].copy()
games.columns = ['Season', 'a','b','games', 'season', 'c']
# split 'games' on the en dash into integer-named columns (0, 1, ...);
# expand=True yields missing values for missing cells instead of failing
# while a frame is built from a ragged list
games = games.join(games['games'].str.split('–', expand=True), how='inner')

Prepare data

In [5]:
# keep only useful fields and remove duplicated entries for a player in a season
# duplicated entries happen when a player is part of multiple teams in the same season
# (drop_duplicates keeps the first row found for each player/season pair)
totals_new = totals.drop_duplicates(['Player','season'])
totals_new = totals_new[['Player','G','MP','TRB','AST','PTS','season']]

advanced_new = advanced.drop_duplicates(['Player','season'])
advanced_new = advanced_new[['Player','PER','USG%', 'WS', 'VORP','BPM', 'season']]

# flatten the multi-level header of the awards table in place; guarded so that
# re-running this cell does not fail once the header is already flat
if awards.columns.nlevels > 1:
    awards.columns = awards.columns.droplevel()
# after flattening, the column added by extract() is labelled '' -> rename it to 'season'
awards_new = awards[['Player','Share','']].rename(columns={'':'season'})

# find minimum games to be eligible for season rankings:
# a fixed share of the first number found in the season's 'games' text
MIN_GAMES_SHARE = 0.7
games['min_games'] = MIN_GAMES_SHARE*pd.to_numeric(games[0])
games_new = games[['Season', 'season', 'min_games']]

In [6]:
# join tables
data = pd.merge(totals_new, advanced_new, on=['Player', 'season'], how='left')
# remove every literal '*' from player names before matching against the awards table;
# regex=False because '*' on its own is not a valid regular expression and the
# default of `regex` differs between pandas versions
data['Player'] = data['Player'].str.replace('*', '', regex=False)
data = pd.merge(data, awards_new, on=['Player', 'season'], how='left')
data = pd.merge(data, games_new, on=['season'], how='left')

# clean data
# drop rows whose Player value is the literal header text 'Player';
# .copy() so the assignments below act on an independent frame, not on a slice
data = data[data['Player'] != 'Player'].copy()
# everything except the two text columns is converted to numbers (unparseable -> NaN)
cols = data.columns.drop(['Player', 'Season'])
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')

# create per game stats
data['PTS_G'] = data['PTS'] / data['G']
data['MP_G'] = data['MP'] / data['G']
# long format: one row per player, season and statistic ('variable' / 'metric')
data = pd.melt(data, id_vars=['Player','season', 'Season', 'min_games', 'G', 'MP'], value_name='metric')

# remove nulls (any row with a missing value in any column)
data = data.dropna(axis=0)

# creation of new fields
# a season is eligible when the player reached the minimum number of games
data['eligible_season'] = data['G'] >= data['min_games']
data['metric_weight'] = data['MP'] * data['metric']
# rank eligible rows inside each (season, statistic) group, highest metric first,
# ties broken by order of appearance; non-eligible rows get no rank (NaN)
data['rank'] = (
    data[data['eligible_season']]
    .groupby(['season', 'variable'])['metric']
    .rank(method='first', ascending=False)
)
# top 10 of each group; rows without a rank compare as False
data['top'] = data['rank'] <= 10

# rename columns
data = data.rename(columns={'G':'games_played', 'MP':'minutes_played', 'Player':'player_name',
                            'season':'year', 'Season':'season'})

# transpose to get the final data: 'metric' and 'rank' become rows tagged by 'type'
final = pd.melt(data, id_vars=['player_name','season', 'year', 'games_played', 'minutes_played','top', 'eligible_season',
                               'variable','metric_weight'], value_vars=['metric','rank'], var_name='type', value_name='value')

Save data

In [7]:
# write the long-format table to disk (row index included), then preview its first rows
final.to_csv(path_or_buf='nba.csv')
final.head()

Unnamed: 0,player_name,season,year,games_played,minutes_played,top,eligible_season,variable,metric_weight,type,value
0,Paul Arizin,1951–52,1952,66,2939.0,True,True,TRB,2189555.0,metric,745.0
1,Cliff Barker,1951–52,1952,44,494.0,False,False,TRB,40014.0,metric,81.0
2,Don Barksdale,1951–52,1952,62,2014.0,False,True,TRB,1210414.0,metric,601.0
3,Leo Barnhorst,1951–52,1952,66,2344.0,False,True,TRB,1007920.0,metric,430.0
4,Elmer Behnke,1951–52,1952,4,55.0,False,False,TRB,935.0,metric,17.0
