In [3]:
from basketball import *

import numpy as np
import pickle
import pandas as pd

## Get Salary Cap info

In [4]:
url = 'https://www.basketball-reference.com/contracts/salary-cap-history.html'

tables = get_tables(url)
df_caps = tables[0]

df_caps.columns = ['Year', 'Salary Cap', 'Salary Cap (2015)']
df_caps.drop(0, inplace=True)
df_caps.reset_index(inplace=True, drop=True)

# Money to float
df_caps['Salary Cap'].replace('\D', '', regex=True, inplace=True)
df_caps['Salary Cap (2015)'].replace('\D', '', regex=True, inplace=True)

df_caps['Salary Cap'] = pd.to_numeric(df_caps['Salary Cap'])
df_caps['Salary Cap (2015)'] = pd.to_numeric(df_caps['Salary Cap (2015)'])

df_caps['Year'] = df_caps['Year'].apply(lambda x: x[:2] + x[-2:])

In [5]:
df_caps.head()

Unnamed: 0,Year,Salary Cap,Salary Cap (2015)
0,1985,3600000,7934034.0
1,1986,4233000,9153509.0
2,1987,4945000,10317292.0
3,1988,6164000,12354015.0
4,1989,7232000,13829137.0


## Get Stats

In [46]:
# Advanced tables
with open('databases.pickle', 'rb') as f:
    databases = pickle.load(f)


yearly_dfs = []

for year in databases.keys():
    if year != 2015:
        adv_stats, adv_sals = database_to_stats_and_salaries(pos=8, year=year, database=databases[year])
        per36_stats, per_sals = database_to_stats_and_salaries(pos=6, year=year, database=databases[year])

        df_adv = stats_salary_join(year=year, dfs=adv_stats, targets=adv_sals)
        df_per_36 = stats_salary_join(year=year, dfs=per36_stats, targets=per_sals)
        
        # drop the extra salary column
        df_per_36.drop('Salary', axis=1, inplace=True)
        
        # concat and append the normal stats and advanced stats
        yearly_dfs.append(pd.concat([df_per_36, df_adv], join='inner', axis=1))
        
df_stats = pd.concat(yearly_dfs)
df_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,OWS,DWS,WS,WS/48,Unnamed: 18_level_0,OBPM,DBPM,BPM,VORP,Salary
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Paul Pierce,BOS,2008,1,30,80,80,2874,6.4,13.8,0.464,1.8,4.6,...,6.7,5.7,12.4,0.207,,3.3,1.5,4.7,4.9,16360094
Ray Allen,BOS,2008,2,32,73,73,2624,6.0,13.5,0.445,2.5,6.2,...,5.6,4.1,9.7,0.177,,3.2,-0.1,3.1,3.4,16000000
Kevin Garnett,BOS,2008,3,31,71,71,2328,8.3,15.3,0.539,0.0,0.2,...,6.6,6.2,12.9,0.265,,2.7,4.7,7.4,5.5,23750000
Rajon Rondo,BOS,2008,4,21,77,77,2306,5.5,11.1,0.492,0.1,0.3,...,2.3,4.9,7.2,0.15,,-0.1,2.7,2.6,2.7,1229280
Kendrick Perkins,BOS,2008,5,23,78,78,1912,4.0,6.6,0.615,0.0,0.0,...,1.9,4.3,6.2,0.156,,-1.7,4.3,2.6,2.2,4480912


## Getting Attendence data

In [7]:
### EXPERIMENT WITH GET TABLES 
espn_url = 'http://www.espn.com/nba/attendance'
attend = get_tables(espn_url)

In [8]:
def clean_attend(attend):
    df_a = attend[0]

    columns = ['RK', 'TEAM', 'HOME GMS', 'HM TOTAL', 'HM AVG', 'HM PCT', 'ROAD GMS', 'ROAD AVG',
              'ROAD PCT', 'OVERALL GMS', 'OVERALL AVG', 'OVERALL PCT']

    df_a.rename(mapper={n:column for n, column in zip(range(len(columns)), columns)}, axis=1, inplace=True)

    df_a.drop(labels=[0,1], inplace=True)
    df_a.reset_index(drop=True, inplace=True)
    
    for col in df_a.columns:
        df_a[col] = df_a[col].apply(str.strip)
    
    return df_a

In [9]:
# retrieve and clean the data
attends = {}
for year in range(2008, 2019):
    url = 'http://www.espn.com/nba/attendance/_/year/{}'.format(year)
    attend = get_tables(url)
    attends[year] = clean_attend(attend)

## Feature Engineering

In [60]:
def prepare_dataframe(df, features):
    df = df.copy()

    # let's drop players with low minutes played per game, say less than 10
    df['MPperG'] = df['MP'] / df['G']
    
    # drop blank columns
    df.drop('\xa0', inplace=True, axis=1)

    # We want over 10 minutes played per game and nonzero salary
    mask = ((df['MPperG'] > 10) & (df['Salary'] > 0))
    df = df[mask]

    X = df[features].values
    y = df['Salary'].values
    
    return X, y, df

In [61]:
adv_features = ['Rk', 'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

_, _, df = prepare_dataframe(df_stats, adv)
df.head()

TypeError: prepare_dataframe() missing 1 required positional argument: 'features'

In [58]:
df_adv.columns

Index(['Rk', 'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', ' ', 'OWS', 'DWS', 'WS',
       'WS/48', ' ', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Salary'],
      dtype='object', name=0)