**Library Imports**

In [1]:
import requests
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

Before scraping, I wanted to create a dictionary, keyed by year, of all the teams that qualified for the playoffs in that year. Unfortunately this wasn't easily scrapeable, so I had to build the dictionary manually (below):

In [2]:
# Playoff qualifiers per season, keyed by the season's ending year (as a string).
# Each season lists exactly 16 distinct teams. The codes must match the `team_id`
# values scraped from basketball-reference.com, because the `in_playoff` label is
# assigned by a plain membership test against these lists.
# Franchise codes change over time on Basketball-Reference, e.g. Charlotte is
# 'CHH' (through 2002), 'CHA' (Bobcats, 2005-2014) and 'CHO' (2015 onward);
# New Orleans is 'NOH' through 2013 and 'NOP' afterwards; Washington is 'WSB'
# through 1997 and 'WAS' afterwards.
playoff_teams = {
    '2020': ['LAL', 'MIA', 'BOS', 'DEN', 'MIL', 'TOR', 'LAC', 'HOU',
             'ORL', 'BRK', 'PHI', 'IND', 'POR', 'DAL', 'UTA', 'OKC'],
    '2019': ['MIL', 'TOR', 'PHI', 'BOS', 'GSW', 'DEN', 'POR', 'HOU',
             'DET', 'ORL', 'BRK', 'IND', 'LAC', 'SAS', 'OKC', 'UTA'],
    '2018': ['TOR', 'BOS', 'PHI', 'CLE', 'HOU', 'GSW', 'NOP', 'UTA',
             'WAS', 'MIL', 'MIA', 'IND', 'MIN', 'SAS', 'POR', 'OKC'],
    '2017': ['BOS', 'CLE', 'TOR', 'WAS', 'SAS', 'HOU', 'UTA', 'GSW',
             'CHI', 'IND', 'MIL', 'ATL', 'POR', 'MEM', 'OKC', 'LAC'],
    # 2015-16 Charlotte Hornets are 'CHO' (was 'CHA').
    '2016': ['CLE', 'TOR', 'MIA', 'ATL', 'GSW', 'SAS', 'OKC', 'POR',
             'DET', 'IND', 'CHO', 'BOS', 'HOU', 'MEM', 'DAL', 'LAC'],
    '2015': ['ATL', 'CLE', 'CHI', 'WAS', 'GSW', 'HOU', 'LAC', 'MEM',
             'BRK', 'BOS', 'MIL', 'TOR', 'NOP', 'DAL', 'SAS', 'POR'],
    # 2013-14 Charlotte Bobcats are 'CHA' (was 'CHO').
    '2014': ['IND', 'MIA', 'BRK', 'WAS', 'SAS', 'OKC', 'LAC', 'POR',
             'ATL', 'CHA', 'TOR', 'CHI', 'DAL', 'MEM', 'GSW', 'HOU'],
    '2013': ['MIA', 'NYK', 'IND', 'CHI', 'OKC', 'SAS', 'GSW', 'MEM',
             'MIL', 'BOS', 'ATL', 'BRK', 'HOU', 'LAL', 'DEN', 'LAC'],
    '2012': ['PHI', 'MIA', 'IND', 'BOS', 'SAS', 'OKC', 'LAL', 'LAC',
             'CHI', 'NYK', 'ORL', 'ATL', 'UTA', 'DAL', 'DEN', 'MEM'],
    # 2010-11 New Orleans Hornets are 'NOH' (was 'NOP').
    '2011': ['CHI', 'MIA', 'BOS', 'ATL', 'MEM', 'LAL', 'DAL', 'OKC',
             'IND', 'PHI', 'NYK', 'ORL', 'SAS', 'NOH', 'POR', 'DEN'],
    '2010': ['CLE', 'ORL', 'ATL', 'BOS', 'LAL', 'SAS', 'PHO', 'UTA',
             'CHI', 'CHA', 'MIL', 'MIA', 'OKC', 'DAL', 'POR', 'DEN'],
    # 2009: duplicate 'LAL' replaced by the missing 'ORL'; 'NOP' -> 'NOH'.
    '2009': ['CLE', 'BOS', 'ATL', 'LAL', 'DEN', 'DAL', 'HOU', 'ORL',
             'DET', 'CHI', 'PHI', 'MIA', 'UTA', 'NOH', 'SAS', 'POR'],
    '2008': ['BOS', 'DET', 'ORL', 'CLE', 'LAL', 'NOH', 'SAS', 'UTA',
             'ATL', 'PHI', 'TOR', 'WAS', 'DEN', 'DAL', 'PHO', 'HOU'],
    # 2007: typo 'GWS' corrected to 'GSW'.
    '2007': ['CHI', 'CLE', 'DET', 'NJN', 'GSW', 'PHO', 'SAS', 'UTA',
             'MIA', 'WAS', 'ORL', 'TOR', 'DAL', 'LAL', 'DEN', 'HOU'],
    '2006': ['CLE', 'DET', 'MIA', 'NJN', 'DAL', 'LAC', 'PHO', 'SAS',
             'WAS', 'MIL', 'CHI', 'IND', 'MEM', 'DEN', 'LAL', 'SAC'],
    '2005': ['DET', 'IND', 'MIA', 'WAS', 'DAL', 'PHO', 'SAS', 'SEA',
             'PHI', 'BOS', 'NJN', 'CHI', 'HOU', 'MEM', 'DEN', 'SAC'],
    '2004': ['DET', 'IND', 'MIA', 'NJN', 'LAL', 'MIN', 'SAC', 'SAS',
             'MIL', 'BOS', 'NOH', 'NYK', 'HOU', 'DEN', 'DAL', 'MEM'],
    '2003': ['BOS', 'DET', 'NJN', 'PHI', 'DAL', 'LAL', 'SAC', 'SAS',
             'IND', 'ORL', 'MIL', 'NOH', 'POR', 'MIN', 'UTA', 'PHO'],
    '2002': ['BOS', 'CHH', 'DET', 'NJN', 'DAL', 'LAL', 'SAC', 'SAS',
             'PHI', 'ORL', 'TOR', 'IND', 'MIN', 'POR', 'UTA', 'SEA'],
    '2001': ['CHH', 'MIL', 'PHI', 'TOR', 'DAL', 'LAL', 'SAC', 'SAS',
             'MIA', 'ORL', 'IND', 'NYK', 'UTA', 'POR', 'PHO', 'MIN'],
    '2000': ['IND', 'MIA', 'NYK', 'PHI', 'LAL', 'PHO', 'POR', 'UTA',
             'MIL', 'DET', 'TOR', 'CHH', 'SAC', 'SAS', 'MIN', 'SEA'],
    '1999': ['ATL', 'IND', 'NYK', 'PHI', 'LAL', 'POR', 'SAS', 'UTA',
             'DET', 'MIL', 'MIA', 'ORL', 'HOU', 'PHO', 'MIN', 'SAC'],
    '1998': ['CHH', 'CHI', 'IND', 'NYK', 'LAL', 'SAS', 'SEA', 'UTA',
             'ATL', 'NJN', 'CLE', 'MIA', 'POR', 'PHO', 'MIN', 'HOU'],
    # 1996-97 Washington Bullets are 'WSB' (was 'WAS').
    '1997': ['ATL', 'CHI', 'MIA', 'NYK', 'HOU', 'LAL', 'SEA', 'UTA',
             'DET', 'WSB', 'ORL', 'CHH', 'MIN', 'POR', 'PHO', 'LAC'],
    '1996': ['ATL', 'CHI', 'NYK', 'ORL', 'HOU', 'SAS', 'SEA', 'UTA',
             'IND', 'MIA', 'CLE', 'DET', 'LAL', 'PHO', 'SAC', 'POR'],
    '1995': ['CHI', 'IND', 'NYK', 'ORL', 'HOU', 'LAL', 'PHO', 'SAS',
             'CHH', 'ATL', 'CLE', 'BOS', 'UTA', 'SEA', 'POR', 'DEN'],
    '1994': ['ATL', 'CHI', 'IND', 'NYK', 'DEN', 'HOU', 'PHO', 'UTA',
             'MIA', 'CLE', 'ORL', 'NJN', 'SEA', 'POR', 'GSW', 'SAS'],
    '1993': ['CHH', 'CHI', 'CLE', 'NYK', 'HOU', 'PHO', 'SAS', 'SEA',
             'BOS', 'ATL', 'NJN', 'IND', 'LAC', 'LAL', 'POR', 'UTA'],
    '1992': ['BOS', 'CHI', 'CLE', 'NYK', 'PHO', 'POR', 'SEA', 'UTA',
             'IND', 'MIA', 'NJN', 'DET', 'SAS', 'LAL', 'GSW', 'LAC'],
    '1991': ['BOS', 'CHI', 'DET', 'PHI', 'GSW', 'LAL', 'POR', 'UTA',
             'IND', 'NYK', 'ATL', 'MIL', 'SAS', 'HOU', 'SEA', 'PHO'],
    '1990': ['CHI', 'DET', 'NYK', 'PHI', 'LAL', 'PHO', 'POR', 'SAS',
             'MIL', 'IND', 'BOS', 'CLE', 'HOU', 'UTA', 'DAL', 'DEN'],
}

Next I built a 'for loop' to scrape the basic stats for players over the past 31 seasons (1990-2020). These metrics are your typical old-fashioned stats associated with basketball: FG%, 3PFG%, rebounds, assists, steals, blocks, and points per game. I also wanted to make sure I included the team and year along with the player's name so I will know which set of statistics is associated with which player and which year of their career. This will also allow for an easier joining of dataframes further on.

In [3]:
#basic stats ALL YEARS
# Scrape the per-game table of every season from 2020 back to 1990 and collect
# one record per `full_table` row. Every value is the raw cell text (a string),
# except `year`, which is the integer season.
players = []
for year in reversed(range(1990, 2021)):
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    res = requests.get(url)
    # Fail with a clear HTTP error (e.g. when rate limited) instead of an
    # opaque AttributeError on a missing table further down.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find('table', {'id': 'per_game_stats'})
    for row in table.find('tbody').find_all('tr', {'class':'full_table'}):
        player = {}
        player['name'] = row.find('td', {'data-stat': 'player'}).text
        player['team'] = row.find('td', {'data-stat': 'team_id'}).text
        player['year'] = year
        player['games_played'] = row.find('td', {'data-stat': 'g'}).text
        player['mpg'] = row.find('td', {'data-stat': 'mp_per_g'}).text
        player['fg%'] = row.find('td', {'data-stat': 'fg_pct'}).text
        player['3pfg%'] = row.find('td', {'data-stat': 'fg3_pct'}).text
        player['rebounds'] = row.find('td', {'data-stat': 'trb_per_g'}).text
        player['assists'] = row.find('td', {'data-stat': 'ast_per_g'}).text
        player['steals'] = row.find('td', {'data-stat': 'stl_per_g'}).text
        player['blocks'] = row.find('td', {'data-stat': 'blk_per_g'}).text
        player['points_per_game'] = row.find('td', {'data-stat': 'pts_per_g'}).text
        players.append(player)

    # Pause after every season's request. This must live inside the loop:
    # sleeping once after the loop leaves the 31 requests unthrottled.
    time.sleep(5)

Since we will want to use this scrape for modeling, it's a good idea to save it to a pandas df and then save it to a csv for further usage in our modeling notebook:

In [4]:
# Turn the list of scraped per-game records into a DataFrame and display it.
df_basic = pd.DataFrame(data=players)
df_basic

Unnamed: 0,name,team,year,games_played,mpg,fg%,3pfg%,rebounds,assists,steals,blocks,points_per_game
0,Steven Adams,OKC,2020,63,26.7,.592,.333,9.3,2.3,0.8,1.1,10.9
1,Bam Adebayo,MIA,2020,72,33.6,.557,.143,10.2,5.1,1.1,1.3,15.9
2,LaMarcus Aldridge,SAS,2020,53,33.1,.493,.389,7.4,2.4,0.7,1.6,18.9
3,Kyle Alexander,MIA,2020,2,6.5,.500,,1.5,0.0,0.0,0.0,1.0
4,Nickeil Alexander-Walker,NOP,2020,47,12.6,.368,.346,1.8,1.9,0.4,0.2,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...
13940,Orlando Woolridge,LAL,1990,62,22.9,.556,.000,3.0,1.5,0.6,0.7,12.7
13941,Haywoode Workman,ATL,1990,6,2.7,.667,,0.5,0.3,0.5,0.0,1.0
13942,James Worthy*,LAL,1990,80,37.0,.548,.306,6.0,3.6,1.2,0.6,21.1
13943,Danny Young,POR,1990,82,17.0,.421,.271,1.5,2.8,1.0,0.0,4.7


Performing same basic stats scrape for the players for this current season (2021). These are the players we will be analyzing and using our models to predict if they are having a "playoff caliber" season.

In [5]:
#2021 basic
# (record key, `data-stat` attribute) pairs for the per-game cells to extract.
# Their order is the order of the keys that follow name/team/year in each record.
per_game_columns = (
    ('games_played', 'g'),
    ('mpg', 'mp_per_g'),
    ('fg%', 'fg_pct'),
    ('3pfg%', 'fg3_pct'),
    ('rebounds', 'trb_per_g'),
    ('assists', 'ast_per_g'),
    ('steals', 'stl_per_g'),
    ('blocks', 'blk_per_g'),
    ('points_per_game', 'pts_per_g'),
)

players_2021_basic = []
year = 2021
url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find('table', {'id': 'per_game_stats'})
rows = table.find('tbody').find_all('tr', {'class':'full_table'})
for row in rows:
    # Identifying fields first, then the stat cells, all as raw cell text.
    player = {
        'name': row.find('td', {'data-stat': 'player'}).text,
        'team': row.find('td', {'data-stat': 'team_id'}).text,
        'year': year,
    }
    for column, stat in per_game_columns:
        player[column] = row.find('td', {'data-stat': stat}).text
    players_2021_basic.append(player)

# Pause before any further request is made to the site.
time.sleep(5)

Same as above, saving the scrape to a pandas df:

In [6]:
# Turn the list of scraped 2021 per-game records into a DataFrame and display it.
df_2021_basic = pd.DataFrame(data=players_2021_basic)
df_2021_basic

Unnamed: 0,name,team,year,games_played,mpg,fg%,3pfg%,rebounds,assists,steals,blocks,points_per_game
0,Precious Achiuwa,MIA,2021,37,13.5,.585,,3.9,0.5,0.4,0.5,5.8
1,Jaylen Adams,MIL,2021,7,2.6,.125,.000,0.4,0.3,0.0,0.0,0.3
2,Steven Adams,NOP,2021,35,27.6,.624,.000,9.2,2.0,0.9,0.6,8.2
3,Bam Adebayo,MIA,2021,33,33.9,.563,.333,9.5,5.4,0.9,1.0,19.2
4,LaMarcus Aldridge,SAS,2021,21,25.9,.464,.360,4.5,1.7,0.4,0.9,13.7
...,...,...,...,...,...,...,...,...,...,...,...,...
492,Delon Wright,DET,2021,31,29.1,.470,.392,4.5,4.8,1.4,0.5,10.6
493,Thaddeus Young,CHI,2021,32,25.1,.605,.227,6.1,4.3,1.3,0.6,12.2
494,Trae Young,ATL,2021,36,34.7,.433,.380,4.1,9.3,0.9,0.3,26.7
495,Cody Zeller,CHO,2021,20,23.5,.522,.158,7.5,2.3,0.7,0.6,10.0


Now we will perform a separate scrape of the advanced statistics for each player over the same 1990-2020 seasons. Since the tables are organized in different orders on the basketball-reference website, it's important to include common columns to join on with our other scrapes, so once again we are making sure to include the player's name, team, and the year associated with the data. We also create our target variable by assigning a 1 to the player row if their scraped team is in the playoff teams dictionary from above for that corresponding year (and a 0 if not). **Refer to ReadMe for more info on 'Playoff Caliber' definition**

In [7]:
#advanced stats ALL YEARS:
# Scrape the advanced-stats table of every season from 2020 back to 1990 and
# collect one record per `full_table` row. Values are raw cell text (strings),
# except `year` (int) and the `in_playoff` target (1/0).
players_advanced = []
for year in reversed(range(1990, 2021)):
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
    res = requests.get(url)
    # Fail with a clear HTTP error (e.g. when rate limited) instead of an
    # opaque AttributeError on a missing table further down.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find('table', {'id': 'advanced_stats'})
    # Look up the season's qualifiers once per season rather than once per row;
    # a missing season raises KeyError here instead of a TypeError on `in None`.
    qualified = set(playoff_teams[str(year)])
    for row in table.find('tbody').find_all('tr', {'class':'full_table'}):
        player = {}
        player['name'] = row.find('td', {'data-stat': 'player'}).text
        player['team'] = row.find('td', {'data-stat': 'team_id'}).text
        player['year'] = year
        player['position'] = row.find('td', {'data-stat': 'pos'}).text
        player['player_efficiency'] = row.find('td', {'data-stat': 'per'}).text
        player['true_shooting'] = row.find('td', {'data-stat': 'ts_pct'}).text
        player['plus_minus'] = row.find('td', {'data-stat': 'bpm'}).text
        player['win_shares_percentage'] = row.find('td', {'data-stat': 'ws_per_48'}).text
        # Target label: 1 when the scraped team code is among that season's qualifiers.
        # NOTE(review): rows for players with several teams in a season presumably
        # carry an aggregate team code (e.g. 'TOT') and would be labelled 0 — confirm.
        player['in_playoff'] = 1 if player['team'] in qualified else 0
        players_advanced.append(player)

    # Pause after every season's request. This must live inside the loop:
    # sleeping once after the loop leaves the 31 requests unthrottled.
    time.sleep(5)

Saving advanced scrape to a dataframe:

In [8]:
df_advanced = pd.DataFrame(players_advanced)
df_advanced

Unnamed: 0,name,team,year,position,player_efficiency,true_shooting,plus_minus,win_shares_percentage,in_playoff
0,Steven Adams,OKC,2020,C,20.5,.604,2.9,.185,1
1,Bam Adebayo,MIA,2020,PF,20.3,.598,3.4,.168,1
2,LaMarcus Aldridge,SAS,2020,C,19.7,.571,1.4,.122,0
3,Kyle Alexander,MIA,2020,C,4.7,.500,-9.6,-0.003,1
4,Nickeil Alexander-Walker,NOP,2020,SG,8.9,.473,-4.6,-0.020,0
...,...,...,...,...,...,...,...,...,...
13940,Orlando Woolridge,LAL,1990,SF,17.6,.601,1.5,.161,1
13941,Haywoode Workman,ATL,1990,PG,26.1,.773,13.6,.357,0
13942,James Worthy*,LAL,1990,SF,19.8,.586,3.8,.172,1
13943,Danny Young,POR,1990,PG,11.5,.508,0.9,.103,1


Scraping the same advanced statistics for the current players (except there is no in_playoff column, since that is what we are predicting for the 2021 players):

In [9]:
#advanced stats 2021:
# (record key, `data-stat` attribute) pairs for the advanced cells to extract.
# Their order is the order of the keys that follow name/team/year in each record.
advanced_columns = (
    ('position', 'pos'),
    ('player_efficiency', 'per'),
    ('true_shooting', 'ts_pct'),
    ('plus_minus', 'bpm'),
    ('win_shares_percentage', 'ws_per_48'),
)

players_2021_advanced = []

year = 2021
url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find('table', {'id': 'advanced_stats'})
rows = table.find('tbody').find_all('tr', {'class':'full_table'})
for row in rows:
    # Identifying fields first, then the stat cells, all as raw cell text.
    player = {
        'name': row.find('td', {'data-stat': 'player'}).text,
        'team': row.find('td', {'data-stat': 'team_id'}).text,
        'year': year,
    }
    for column, stat in advanced_columns:
        player[column] = row.find('td', {'data-stat': stat}).text
    players_2021_advanced.append(player)

# Pause before any further request is made to the site.
time.sleep(5)

Saving to a dataframe:

In [10]:
# Turn the list of scraped 2021 advanced records into a DataFrame and display it.
df_2021_advanced = pd.DataFrame(data=players_2021_advanced)
df_2021_advanced

Unnamed: 0,name,team,year,position,player_efficiency,true_shooting,plus_minus,win_shares_percentage
0,Precious Achiuwa,MIA,2021,PF,15.5,.588,-2.7,.112
1,Jaylen Adams,MIL,2021,PG,-6.6,.125,-19.8,-0.248
2,Steven Adams,NOP,2021,C,16.4,.606,-0.7,.115
3,Bam Adebayo,MIA,2021,C,22.1,.631,4.4,.195
4,LaMarcus Aldridge,SAS,2021,C,15.0,.545,-1.2,.078
...,...,...,...,...,...,...,...,...
492,Delon Wright,DET,2021,SG,16.8,.576,3.0,.142
493,Thaddeus Young,CHI,2021,PF,21.1,.612,3.6,.173
494,Trae Young,ATL,2021,PG,23.1,.597,4.2,.167
495,Cody Zeller,CHO,2021,C,17.4,.568,0.3,.146


Now we will want to merge the dataframes into one to allow us to build our models. We will perform an inner join on 3 common columns. Using multiple columns avoids mismatched joins: a player will likely appear multiple times (due to playing multiple years), so additionally joining on year and team ensures we are joining the right advanced stats for a player for the correct year of their career.

In [11]:
# Inner-join the per-game and advanced frames on the (name, year, team) key,
# keeping only rows present in both, then preview the first rows.
df_m = df_basic.merge(df_advanced, on=['name', 'year', 'team'], how='inner')
df_m.head()

Unnamed: 0,name,team,year,games_played,mpg,fg%,3pfg%,rebounds,assists,steals,blocks,points_per_game,position,player_efficiency,true_shooting,plus_minus,win_shares_percentage,in_playoff
0,Steven Adams,OKC,2020,63,26.7,0.592,0.333,9.3,2.3,0.8,1.1,10.9,C,20.5,0.604,2.9,0.185,1
1,Bam Adebayo,MIA,2020,72,33.6,0.557,0.143,10.2,5.1,1.1,1.3,15.9,PF,20.3,0.598,3.4,0.168,1
2,LaMarcus Aldridge,SAS,2020,53,33.1,0.493,0.389,7.4,2.4,0.7,1.6,18.9,C,19.7,0.571,1.4,0.122,0
3,Kyle Alexander,MIA,2020,2,6.5,0.5,,1.5,0.0,0.0,0.0,1.0,C,4.7,0.5,-9.6,-0.003,1
4,Nickeil Alexander-Walker,NOP,2020,47,12.6,0.368,0.346,1.8,1.9,0.4,0.2,5.7,SG,8.9,0.473,-4.6,-0.02,0


Same method for the 2021 players:

In [12]:
# Inner-join the 2021 per-game and advanced frames on the (name, year, team) key,
# keeping only rows present in both, then preview the first rows.
df_m_2021 = df_2021_basic.merge(df_2021_advanced, on=['name', 'year', 'team'], how='inner')
df_m_2021.head()

Unnamed: 0,name,team,year,games_played,mpg,fg%,3pfg%,rebounds,assists,steals,blocks,points_per_game,position,player_efficiency,true_shooting,plus_minus,win_shares_percentage
0,Precious Achiuwa,MIA,2021,37,13.5,0.585,,3.9,0.5,0.4,0.5,5.8,PF,15.5,0.588,-2.7,0.112
1,Jaylen Adams,MIL,2021,7,2.6,0.125,0.0,0.4,0.3,0.0,0.0,0.3,PG,-6.6,0.125,-19.8,-0.248
2,Steven Adams,NOP,2021,35,27.6,0.624,0.0,9.2,2.0,0.9,0.6,8.2,C,16.4,0.606,-0.7,0.115
3,Bam Adebayo,MIA,2021,33,33.9,0.563,0.333,9.5,5.4,0.9,1.0,19.2,C,22.1,0.631,4.4,0.195
4,LaMarcus Aldridge,SAS,2021,21,25.9,0.464,0.36,4.5,1.7,0.4,0.9,13.7,C,15.0,0.545,-1.2,0.078


Now let's save these dataframes to csv for EDA/Data Cleaning and then our modeling purposes:

In [24]:
# NOTE(review): the export below is commented out, so running this cell writes
# nothing; uncomment it to (re)generate data/players_all.csv from df_m.
# df_m.to_csv('data/players_all.csv')

In [25]:
# NOTE(review): the export below is commented out, so running this cell writes
# nothing; uncomment it to (re)generate data/players_2021.csv from df_m_2021.
# df_m_2021.to_csv('data/players_2021.csv')