# Web Scraping Basketball Reference Data 

I will be using the website Basketball-Reference.com to scrape data on all players from this year's season, 2021-2022. This website provides clean sports data, which makes it easy to scrape the data and build my own dataframe.

First we import the libraries that we need in order to scrape the webpage. After that we will extract the headers and the rows, then combine them into a dataframe.

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# url to scrape: per-game player stats page for the 2021-2022 season (NBA_2022)
url = 'https://www.basketball-reference.com/leagues/NBA_2022_per_game.html'

In [4]:
# collect HTML data; the `with` block closes the HTTP response once it has
# been parsed instead of leaving the connection open
with urlopen(url) as html:
    # create beautiful soup object from HTML (the markup is read here, so the
    # response is no longer needed after this line)
    soup = BeautifulSoup(html, features='lxml')

In [5]:
# the first <tr> on the page is used as the header row; collect its <th> labels
header_row = soup.find_all('tr', limit=1)[0]
headers = [th.get_text() for th in header_row.find_all('th')]
# the data rows are built from <td> cells only and carry no rank value,
# so drop 'Rk' to keep the headers aligned with the row values
headers.remove('Rk')
headers

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# get rows from table: every <tr> after the header row, each turned into
# the list of its <td> texts (a <tr> without <td> cells gives an empty list)
rows = soup.find_all('tr')[1:]
rows_data = [[td.get_text() for td in row.find_all('td')] for row in rows]

rows_data[0:1]

[['Precious Achiuwa',
  'C',
  '22',
  'TOR',
  '73',
  '28',
  '23.6',
  '3.6',
  '8.3',
  '.439',
  '0.8',
  '2.1',
  '.359',
  '2.9',
  '6.1',
  '.468',
  '.486',
  '1.1',
  '1.8',
  '.595',
  '2.0',
  '4.5',
  '6.5',
  '1.1',
  '0.5',
  '0.6',
  '1.2',
  '2.1',
  '9.1']]

In [7]:
# now we create a dataframe with the data from rows and the headers
# (each inner list of rows_data becomes one row; the bare `stats` below
# only displays the result in the notebook)
stats = pd.DataFrame(rows_data, columns=headers)
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,.439,...,.595,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,.547,...,.543,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,.557,...,.753,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,.402,...,.625,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,.550,...,.873,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,.465,...,.481,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3
838,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,.460,...,.904,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4
839,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,.526,...,.623,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3
840,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,.567,...,.776,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2


We will now check to see if the data has any missing values, and if it does we will drop those rows.

In [8]:
# check to see if we have any missing values
# (a <tr> with no <td> cells is an empty list in rows_data, so every column
# of that row is None in the dataframe)
stats.isnull().sum()

Player    30
Pos       30
Age       30
Tm        30
G         30
GS        30
MP        30
FG        30
FGA       30
FG%       30
3P        30
3PA       30
3P%       30
2P        30
2PA       30
2P%       30
eFG%      30
FT        30
FTA       30
FT%       30
ORB       30
DRB       30
TRB       30
AST       30
STL       30
BLK       30
TOV       30
PF        30
PTS       30
dtype: int64

In [9]:
# drop the rows with missing values (axis=0 drops rows; inplace=True
# modifies `stats` directly instead of returning a new dataframe)
stats.dropna(axis=0, inplace=True)

In [10]:
# check to make sure we dropped all the rows with missing values
# (every per-column count should now be 0)
stats.isnull().sum()

Player    0
Pos       0
Age       0
Tm        0
G         0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       0
2P        0
2PA       0
2P%       0
eFG%      0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
dtype: int64

In [11]:
# we reset the index of the dataframe after dropping the rows that are missing values
# (drop=True discards the old index instead of keeping it as a new column)
stats = stats.reset_index(drop=True)


In [12]:
# inspect the dtype of every column
stats.dtypes 

Player    object
Pos       object
Age       object
Tm        object
G         object
GS        object
MP        object
FG        object
FGA       object
FG%       object
3P        object
3PA       object
3P%       object
2P        object
2PA       object
2P%       object
eFG%      object
FT        object
FTA       object
FT%       object
ORB       object
DRB       object
TRB       object
AST       object
STL       object
BLK       object
TOV       object
PF        object
PTS       object
dtype: object

Looking at the dataframe, we see that all the columns have the object dtype. This is a problem because the values need to be numeric if we want to perform any data analysis on them. But first I will make a copy of the data.

In [13]:
# work on a copy so the changes below do not touch `stats`
test = stats.copy()

When looking at the data, some players had an empty string in some of their columns, which causes problems when converting the data to floats, so I will put a zero in place of those data points.

In [14]:
# put a '0' wherever a numeric column holds an empty string, so that the
# float conversion that follows does not fail on ''
numeric_columns = [column for column in test.columns
                   if column not in ('Player', 'Pos', 'Tm')]
for column in numeric_columns:
    # assign the whole column back rather than writing test[column][i] = ...:
    # chained indexing can write to a temporary copy and leave `test`
    # unchanged (and always does so under pandas Copy-on-Write)
    test[column] = test[column].replace('', '0')

Now we can finally convert the columns that we need to floats in order to perform data analysis.

In [15]:
# cast every numeric column from object to float in a single assignment
float_columns = [
    'Age', 'G', 'GS', 'MP', 'FG', 'FGA',
    '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'FG%', '3P%', '2P%', 'eFG%', 'FT%',
]
test[float_columns] = test[float_columns].astype(float)

In [16]:
# confirm the conversion: display the dtype of every column
test.dtypes

Player     object
Pos        object
Age       float64
Tm         object
G         float64
GS        float64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

Now I will write a function to gather stats data from other years instead of just the most recent season.

In [17]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


# create a function that will take in a year and create a dataframe by scraping basketball reference
def scrape_nba_data(years):
    """Scrape the per-game player stats page of one season into a dataframe.

    Args:
        years: The season identifier inserted into the URL
            (``NBA_{years}_per_game.html``), e.g. ``2022``. Despite the
            plural name, a single value is expected.

    Returns:
        A ``pandas.DataFrame`` with one row per ``<tr>`` after the header row
        and one column per header label except ``'Rk'``. All values are the
        raw cell texts (strings). A ``<tr>`` that contains no ``<td>`` cells
        yields a row of missing values.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        ValueError: If the header row has no ``'Rk'`` label.
        IndexError: If the page contains no ``<tr>`` element.
    """
    # the url to scrape
    url = f'https://www.basketball-reference.com/leagues/NBA_{years}_per_game.html'

    # Parse inside the `with` block so the HTTP response is closed as soon as
    # BeautifulSoup has read it, instead of leaking the connection.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features='lxml')

    # get the headers for the columns from the first <tr> of the page
    header_row = soup.find_all('tr', limit=1)[0]
    headers = [th.get_text() for th in header_row.find_all('th')]
    # rows are built from <td> cells only, so 'Rk' has no matching value
    headers.remove('Rk')

    # get the rows from the table: every <tr> after the header row
    rows_data = [[td.get_text() for td in row.find_all('td')]
                 for row in soup.find_all('tr')[1:]]

    # now we create a dataframe with the data from rows and the headers
    return pd.DataFrame(rows_data, columns=headers)

Next, we create a function to clean the data that we get from scraping the website.

In [18]:
def clean_dataframe(stats, text_columns=('Player', 'Pos', 'Tm')):
    """Return a cleaned copy of a scraped stats dataframe.

    Rows containing any missing value are dropped, the index is renumbered
    from 0, and every column not listed in ``text_columns`` is converted to
    float. Empty strings in those columns are treated as ``0``.

    The input dataframe is left unmodified.

    Args:
        stats: Dataframe of raw cell texts (strings), as produced by scraping.
        text_columns: Names of the columns to leave as text. Every other
            column is converted to float.

    Returns:
        A new dataframe with the same columns as ``stats``.

    Raises:
        ValueError: If a non-text column holds a value that is neither an
            empty string nor parseable as a float.
    """
    # dropna() without inplace=True returns a new frame, so the caller's
    # dataframe is not mutated as a side effect.
    cleaned = stats.dropna(axis=0).reset_index(drop=True)

    for column in cleaned.columns:
        if column in text_columns:
            continue
        # Replace the whole column in one assignment. Element-wise chained
        # assignment (frame[column][i] = ...) may write to a temporary copy
        # and is a no-op under pandas Copy-on-Write.
        cleaned[column] = cleaned[column].replace('', '0').astype(float)

    return cleaned
