In [1]:
import selenium
import pickle
import basketball
import requests

import pandas as pd

from bs4 import BeautifulSoup

In [5]:
# Load the previously pickled database. Only unpickle files you trust:
# pickle.load can execute arbitrary code embedded in the file.
with open('database.pickle', 'rb') as db_file:
    database = pickle.load(db_file)

# The keys of the loaded mapping are the team codes.
teams = [*database.keys()]
print(teams)

['MIA', 'ORL', 'PHI', 'NYK', 'BOS', 'WAS', 'NJN', 'IND', 'ATL', 'DET', 'MIL', 'CHH', 'TOR', 'CLE', 'CHI', 'SAS', 'UTA', 'HOU', 'MIN', 'DAL', 'DEN', 'VAN', 'POR', 'LAL', 'PHO', 'SAC', 'SEA', 'GSW', 'LAC']


In [6]:
# Narrow the focus to a single team for now: San Antonio (key 'SAS').
spurs_dfs = database['SAS']

In [7]:
# This is the per-36-minute dataframe (index 6 of the team's tables).
# I want to use it in order to normalize by minutes played.

spurs_dfs[6].head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,Rk,,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,Tim Duncan,22,50,50,1963,7.7,15.5,.495,0.0,...,.690,2.9,7.6,10.5,2.2,0.8,2.3,2.7,2.7,19.9
2,2,Avery Johnson,33,50,50,1672,4.7,9.9,.473,0.0,...,.568,0.5,2.1,2.5,7.9,1.1,0.2,2.4,2.2,10.5
3,3,David Robinson,33,49,49,1554,6.2,12.2,.509,0.0,...,.658,3.4,8.0,11.4,2.4,1.6,2.8,2.5,3.3,18.0
4,4,Sean Elliott,30,50,50,1509,5.0,12.1,.410,0.9,...,.757,0.8,4.2,5.1,2.8,0.6,0.4,1.7,2.5,13.4


__Column meanings for Per 36 Min table.__
___

Rk -- Rank

Age -- Age of Player at the start of February 1st of that season.

G -- Games

GS -- Games Started

MP -- Minutes Played

FG -- Field Goals Per 36 Minutes

FGA -- Field Goal Attempts Per 36 Minutes

FG% -- Field Goal Percentage

3P -- 3-Point Field Goals Per 36 Minutes

3PA -- 3-Point Field Goal Attempts Per 36 Minutes

3P% -- 3-Point Field Goal Percentage

2P -- 2-Point Field Goals Per 36 Minutes

2PA -- 2-Point Field Goal Attempts Per 36 Minutes

2P% -- 2-Point Field Goal Percentage

FT -- Free Throws Per 36 Minutes

FTA -- Free Throw Attempts Per 36 Minutes

FT% -- Free Throw Percentage

ORB -- Offensive Rebounds Per 36 Minutes

DRB -- Defensive Rebounds Per 36 Minutes

TRB -- Total Rebounds Per 36 Minutes

AST -- Assists Per 36 Minutes

STL -- Steals Per 36 Minutes

BLK -- Blocks Per 36 Minutes

TOV -- Turnovers Per 36 Minutes

PF -- Personal Fouls Per 36 Minutes

PTS -- Points Per 36 Minutes


In [8]:
# Collect the per-36-minute table (index 6) for every team, promoting the
# first row of each table to its column headers along the way.

dfs_per_36 = {}

for team, dfs in database.items():
    table = dfs[6].copy()
    # Label column 1 of the header row so the player-name column is addressable.
    table.iloc[0, 1] = 'Name'
    header = table.iloc[0, :]
    # Remove the header row (index label 0) and renumber the remaining rows.
    table = table.drop(0).reset_index(drop=True)
    table.columns = header
    dfs_per_36[team] = table

dfs_per_36['SAS']

Unnamed: 0,Rk,Name,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Tim Duncan,22,50,50,1963,7.7,15.5,0.495,0.0,...,0.69,2.9,7.6,10.5,2.2,0.8,2.3,2.7,2.7,19.9
1,2,Avery Johnson,33,50,50,1672,4.7,9.9,0.473,0.0,...,0.568,0.5,2.1,2.5,7.9,1.1,0.2,2.4,2.2,10.5
2,3,David Robinson,33,49,49,1554,6.2,12.2,0.509,0.0,...,0.658,3.4,8.0,11.4,2.4,1.6,2.8,2.5,3.3,18.0
3,4,Sean Elliott,30,50,50,1509,5.0,12.1,0.41,0.9,...,0.757,0.8,4.2,5.1,2.8,0.6,0.4,1.7,2.5,13.4
4,5,Mario Elie,35,47,37,1291,4.4,9.2,0.471,1.1,...,0.866,1.0,2.8,3.8,2.5,1.3,0.3,1.7,2.5,12.7
5,6,Jaren Jackson,31,47,13,861,4.5,11.9,0.38,2.2,...,0.821,0.9,3.3,4.1,2.0,1.7,0.4,1.5,2.6,12.6
6,7,Steve Kerr,33,44,0,734,3.3,8.5,0.391,1.2,...,0.886,0.3,1.9,2.2,2.4,1.1,0.1,1.1,1.4,9.4
7,8,Jerome Kersey,36,45,0,699,3.5,10.3,0.34,0.2,...,0.429,2.2,4.5,6.7,2.1,1.9,0.7,1.5,4.7,7.5
8,9,Antonio Daniels,23,47,0,614,4.9,10.7,0.454,0.3,...,0.754,0.8,2.4,3.2,6.2,1.8,0.4,2.6,2.3,12.9
9,10,Malik Rose,24,47,0,608,5.5,11.9,0.463,0.0,...,0.671,5.3,5.4,10.8,1.7,2.4,1.3,3.3,7.1,16.8


In [12]:
# We also want the salaries for all the players. The salary table is the
# last entry (index -1) of each team's tables.
salaries = {}

for team, dfs in database.items():
    new_df = dfs[-1].copy()
    # Promote the first row to column headers, labelling column 1 'Name' first.
    new_df.iloc[0, 1] = 'Name'
    header = new_df.iloc[0, :]
    new_df = new_df.drop(0).reset_index(drop=True)
    new_df.columns = header

    # Money string -> number: strip every non-digit character, then convert.
    # The cleaned Series is assigned back to the frame instead of calling
    # replace(..., inplace=True) on `new_df['Salary']`: an in-place change on
    # that intermediate object is not guaranteed to reach the frame, and it
    # does not under pandas Copy-on-Write. The raw string keeps `\D` from
    # being read as an (invalid) string escape.
    # NOTE(review): stripping non-digits would also drop a decimal point or
    # minus sign; this assumes whole-dollar, non-negative salaries -- confirm.
    new_df['Salary'] = pd.to_numeric(
        new_df['Salary'].replace(r'\D', '', regex=True)
    )
    salaries[team] = new_df

salaries

{'MIA': 0   Rk             Name    Salary
 0    1  Alonzo Mourning  13130000
 1    2     Tim Hardaway   5600000
 2    3   Jamal Mashburn   5033000
 3    4       P.J. Brown   4480000
 4    5      Dan Majerle   3290000
 5    6    Voshon Lenard   2837000
 6    7  Mark Strickland   2100000
 7    8   Duane Causwell   1454000
 8    9      Terry Mills   1145000
 9   10     Terry Porter   1000000
 10  11     Blue Edwards    850000
 11  12     Keith Askins    725000
 12  13     Marty Conlon    662500
 13  14      Rex Walters    537500, 'ORL': 0   Rk               Name   Salary
 0    1  Anfernee Hardaway  8505000
 1    2       Horace Grant  7843000
 2    3         Mark Price  5488000
 3    4       Isaac Austin  5000000
 4    5      Nick Anderson  3200000
 5    6       Derek Strong  3000000
 6    7  Darrell Armstrong  2900000
 7    8     Gerald Wilkins  2000000
 8    9         Yinka Dare  1657000
 9   10          Bo Outlaw  1150000
 10  11     Michael Doleac  1050000
 11  12      Danny Schayes  1

In [10]:
# Count the player rows actually present across all teams instead of
# assuming every team's salary table has exactly 14 rows.
sum(len(team_salaries) for team_salaries in salaries.values())

406

In [13]:
# Persist both cleaned lookups to disk, one pickle file per dictionary.

for path, payload in (('per_36_min.pickle', dfs_per_36),
                      ('salaries.pickle', salaries)):
    with open(path, 'wb') as out_file:
        pickle.dump(payload, out_file)