In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
from urllib.request import urlopen

In [22]:
from bs4 import BeautifulSoup

In [27]:
# Seasons to scrape: 1991 through 2023 inclusive, kept both as ints and as
# the string form that the scrape_* functions concatenate into page URLs.
integer_years = [*range(1991, 2024)]
string_years = [str(season) for season in integer_years]
print(string_years)

['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']


In [28]:
def scrape_advanced_stats(years):
    """Scrape per-player advanced stats for each season and save them to CSV.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_advanced.html``
    is downloaded and parsed. Column names come from the ``<th>`` cells of the
    first ``<tr>`` (with ``'Rk'`` removed and ``'Yr'`` prepended); every later
    ``<tr>`` that contains ``<td>`` cells becomes one record prefixed with the
    season. The per-season frames are stacked, printed, and written to
    ``advanced_stats.csv`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings (e.g. ``['1991', '1992']``).

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr',
       'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    # Seed the list with an empty, fully-labelled frame so the result keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=expected_columns)]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_advanced.html'
        # BeautifulSoup consumes the response while constructing the tree, so
        # the connection can be closed as soon as parsing is done.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')
        # Replaces ordinary spaces inside header labels; the non-breaking-space
        # label dropped further down is not affected by this.
        headers = [label.replace(' ', 'Blank') for label in headers]

        rows_data = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            # Rows without any <td> cell yield an empty list and are skipped.
            if cells:
                rows_data.append([year] + cells)

        year_advanced_stats_df = pd.DataFrame(rows_data, columns=headers)
        # Drop every column whose header is a lone non-breaking space.
        year_advanced_stats_df = year_advanced_stats_df.drop(columns='\xa0')
        season_frames.append(year_advanced_stats_df)

    # One concat at the end replaces repeated DataFrame.append calls, which
    # were removed in pandas 2.0 and copied the whole frame on every season.
    advanced_stats_df = pd.concat(season_frames)
    print(advanced_stats_df)
    advanced_stats_df.to_csv("advanced_stats.csv", index=False)

In [30]:
# Scrape advanced stats for every season in string_years; the function prints
# the combined frame and writes advanced_stats.csv.
scrape_advanced_stats(string_years)

       Yr              Player Pos Age   Tm   G    MP   PER   TS%  3PAr  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   290  13.1  .499  .000  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  1505  12.2  .448  .099  ...   
2    1991          Mark Acres   C  28  ORL  68  1313   9.2  .551  .014  ...   
3    1991       Michael Adams  PG  28  DEN  66  2346  22.3  .530  .397  ...   
4    1991        Mark Aguirre  SF  31  DET  78  2006  16.7  .526  .086  ...   
..    ...                 ...  ..  ..  ...  ..   ...   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   795  14.1  .573  .172  ...   
675  2023          Trae Young  PG  24  ATL  73  2541  22.0  .573  .331  ...   
676  2023      Omer Yurtseven   C  24  MIA   9    83  16.7  .675  .259  ...   
677  2023         Cody Zeller   C  30  MIA  15   217  16.4  .659  .034  ...   
678  2023         Ivica Zubac   C  25  LAC  76  2170  16.7  .661  .004  ...   

     TOV%  USG%   OWS   DWS    WS   WS/48  OBPM  DB

In [26]:
def scrape_totals_stats(years):
    """Scrape per-player season totals for each season and save them to CSV.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_totals.html`` is
    downloaded and parsed. Column names come from the ``<th>`` cells of the
    first ``<tr>`` (with ``'Rk'`` removed and ``'Yr'`` prepended); every later
    ``<tr>`` that contains ``<td>`` cells becomes one record prefixed with the
    season. The per-season frames are stacked, printed, and written to
    ``totals_stats.csv`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF','PTS']
    # Seed the list with an empty, fully-labelled frame so the result keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=expected_columns)]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_totals.html'
        # The tree is fully built inside the constructor, so the response can
        # be closed immediately afterwards.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        rows_data = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            # Rows without any <td> cell yield an empty list and are skipped.
            if cells:
                rows_data.append([year] + cells)

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0, and quadratic in the number of seasons).
    totals_stats_df = pd.concat(season_frames)
    print(totals_stats_df)
    totals_stats_df.to_csv("totals_stats.csv", index=False)

In [28]:
# Scrape season totals for every season in string_years; the function prints
# the combined frame and writes totals_stats.csv.
scrape_totals_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   55   116  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  417  1009  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313  109   214  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346  560  1421  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006  420   909  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795  108   198  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541  597  1390  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   16    27  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   37    59  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170  326   514  ...   

      FT%  ORB  DRB  TRB  AST  STL BLK  TOV   PF   PTS  
0    .568   27   62   89   12 

In [29]:
def scrape_pergame_stats(years):
    """Scrape per-player per-game stats for each season and save them to CSV.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_per_game.html``
    is downloaded and parsed. Column names come from the ``<th>`` cells of the
    first ``<tr>`` (with ``'Rk'`` removed and ``'Yr'`` prepended); every later
    ``<tr>`` that contains ``<td>`` cells becomes one record prefixed with the
    season. The per-season frames are stacked, printed, and written to
    ``pergame_stats.csv`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF','PTS']
    # Seed the list with an empty, fully-labelled frame so the result keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=expected_columns)]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_game.html'
        # The tree is fully built inside the constructor, so the response can
        # be closed immediately afterwards.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        rows_data = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            # Rows without any <td> cell yield an empty list and are skipped.
            if cells:
                rows_data.append([year] + cells)

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0, and quadratic in the number of seasons).
    pergame_stats_df = pd.concat(season_frames)
    print(pergame_stats_df)
    pergame_stats_df.to_csv("pergame_stats.csv", index=False)

In [30]:
# Scrape per-game stats for every season in string_years; the function prints
# the combined frame and writes pergame_stats.csv.
scrape_pergame_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   6.7  1.3   2.7  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  22.5  6.2  15.1  ...   
2    1991          Mark Acres   C  28  ORL  68   0  19.3  1.6   3.1  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  35.5  8.5  21.5  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  25.7  5.4  11.7  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9  14.7  2.0   3.7  ...   
675  2023          Trae Young  PG  24  ATL  73  73  34.8  8.2  19.0  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0   9.2  1.8   3.0  ...   
677  2023         Cody Zeller   C  30  MIA  15   2  14.5  2.5   3.9  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  28.6  4.3   6.8  ...   

      FT%  ORB  DRB  TRB   AST  STL  BLK  TOV   PF   PTS  
0    .568  0.6  1.4  2.1   0

In [31]:
def scrape_per36min_stats(years):
    """Scrape per-player per-minute stats for each season and save them to CSV.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_per_minute.html``
    is downloaded and parsed. Column names come from the ``<th>`` cells of the
    first ``<tr>`` (with ``'Rk'`` removed and ``'Yr'`` prepended); every later
    ``<tr>`` that contains ``<td>`` cells becomes one record prefixed with the
    season. The per-season frames are stacked, printed, and written to
    ``per36min_stats.csv`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF','PTS']
    # Seed the list with an empty, fully-labelled frame so the result keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=expected_columns)]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_minute.html'
        # The tree is fully built inside the constructor, so the response can
        # be closed immediately afterwards.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        rows_data = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            # Rows without any <td> cell yield an empty list and are skipped.
            if cells:
                rows_data.append([year] + cells)

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0, and quadratic in the number of seasons).
    per36min_stats_df = pd.concat(season_frames)
    print(per36min_stats_df)
    per36min_stats_df.to_csv("per36min_stats.csv", index=False)

In [32]:
# Scrape per-minute stats for every season in string_years; the function
# prints the combined frame and writes per36min_stats.csv.
scrape_per36min_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   6.8  14.4  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  10.0  24.1  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313   3.0   5.9  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346   8.6  21.8  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006   7.5  16.3  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795   4.9   9.0  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541   8.5  19.7  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   6.9  11.7  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   6.1   9.8  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170   5.4   8.5  ...   

      FT%  ORB  DRB   TRB   AST  STL  BLK  TOV   PF   PTS  
0    .568  3.4 

In [33]:
def scrape_per100pos_stats(years):
    """Scrape per-player per-possession stats for each season and save them to CSV.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_per_poss.html``
    is downloaded and parsed. Column names come from the ``<th>`` cells of the
    first ``<tr>`` (with ``'Rk'`` removed and ``'Yr'`` prepended); every later
    ``<tr>`` that contains ``<td>`` cells becomes one record prefixed with the
    season. The per-season frames are stacked, printed, and written to
    ``per100pos_stats.csv`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF','PTS', 'ORtg', 'DRtg']
    # Seed the list with an empty, fully-labelled frame so the result keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=expected_columns)]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_poss.html'
        # The tree is fully built inside the constructor, so the response can
        # be closed immediately afterwards.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        rows_data = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            # Rows without any <td> cell yield an empty list and are skipped.
            if cells:
                rows_data.append([year] + cells)

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0, and quadratic in the number of seasons).
    per100pos_stats_df = pd.concat(season_frames)
    print(per100pos_stats_df)
    per100pos_stats_df.to_csv("per100pos_stats.csv", index=False)

In [34]:
# Scrape per-possession stats for every season in string_years; the function
# prints the combined frame and writes per100pos_stats.csv.
scrape_per100pos_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   9.1  19.1  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  11.7  28.3  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313   4.0   7.9  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346  10.1  25.6  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006  10.9  23.7  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795   6.7  12.3  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541  11.2  26.1  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   9.6  16.2  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   8.5  13.6  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170   7.4  11.6  ...   

      TRB   AST  STL  BLK  TOV   PF   PTS ORtg DRtg     
0    14.7   2.0  0

In [46]:
def scrape_team_data(years):
    """Scrape team standings for each season and save them to ``teams_data.csv``.

    For every entry in ``years`` the page
    ``https://www.basketball-reference.com/leagues/NBA_<year>_standings.html``
    is downloaded and parsed. The ``<th>`` texts collected from the first
    ``<tr>`` are split into stat headers (everything after the first label up
    to and including ``"SRS"``) and row titles (what follows). Row titles are
    cut at the next ``"Eastern Conference"`` label if there is one, and the
    repeated stat headers, ``"Western Conference"`` and any division names are
    removed, leaving the team names. Those names are paired, in order, with the
    ``<td>`` rows of the page.

    Two derived columns are added per season:

    * ``Playoffs``: ``"Y"`` when the scraped team name contains ``"*"``,
      else ``"N"`` (the ``"*"`` is then stripped from ``Team``).
    * ``Losing_season``: ``"Y"`` when ``float(W/L%) < 0.5``, else ``"N"``.

    Parameters
    ----------
    years : iterable of str
        Season labels; each one is concatenated into the URL, so they must be
        strings.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``"SRS"`` or ``"Western Conference"`` is missing from the collected
        labels, or a stat header does not reappear among the row titles.

    Notes
    -----
    NOTE(review): requests are issued back-to-back with no delay or retry;
    confirm that is acceptable for the target site.
    """
    team_columns = ["Year", "Team", "W", "L",
                    "W/L%", "GB", "PS/G", "PA/G",
                    "SRS", "Playoffs",
                    "Losing_season"]
    # Seed the list with an empty, fully-labelled frame so the output keeps
    # these columns (in this order) even when ``years`` is empty.
    season_frames = [pd.DataFrame(columns=team_columns)]

    # Division labels that may appear among the row titles; hoisted because
    # the list does not change between seasons.
    divisions = ["Atlantic Division", "Central Division",
                 "Southeast Division", "Northwest Division",
                 "Pacific Division", "Southwest Division",
                 "Midwest Division"]

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_standings.html'
        # The tree is fully built inside the constructor, so the response can
        # be closed immediately afterwards.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        titles = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]

        # Stat headers run from the second label through "SRS"; everything
        # after that is treated as row titles.
        srs_end = titles.index("SRS") + 1
        headers = titles[1:srs_end]
        titles = titles[srs_end:]

        # Keep only the labels before the next "Eastern Conference", if any.
        try:
            row_titles = titles[0:titles.index("Eastern Conference")]
        except ValueError:
            row_titles = titles

        # Strip the second copy of the stat headers and the conference label.
        for stat_label in headers:
            row_titles.remove(stat_label)
        row_titles.remove("Western Conference")
        # Not every division label is present for every season.
        for division in divisions:
            if division in row_titles:
                row_titles.remove(division)

        # Rows with no <td> cells are dropped; the remainder is truncated to
        # the number of team names found above.
        team_stats = []
        for row in soup.find_all('tr')[1:]:
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:
                team_stats.append(cells)
        team_stats = team_stats[0:len(row_titles)]

        team_stats = [[year, team] + cells
                      for team, cells in zip(row_titles, team_stats)]

        year_standings = pd.DataFrame(team_stats, columns=["Year", "Team"] + headers)

        # Playoffs must be derived before the "*" marker is stripped.
        year_standings["Playoffs"] = ["Y" if "*" in ele else "N" for ele in year_standings["Team"]]
        year_standings["Team"] = [ele.replace('*', '') for ele in year_standings["Team"]]
        year_standings["Losing_season"] = ["Y" if float(ele) < .5 else "N" for ele in year_standings["W/L%"]]

        season_frames.append(year_standings)

    # Single concat instead of DataFrame.append in the loop (removed in
    # pandas 2.0, and quadratic in the number of seasons).
    teams_df = pd.concat(season_frames)
    teams_df.to_csv("teams_data.csv", index=False)

In [50]:
# Scrape team standings for every season in string_years; the function writes
# teams_data.csv and prints nothing.
scrape_team_data(string_years)

In [None]:
# NOTE(review): planning note kept as a bare string expression in a code cell;
# it has no effect on the scraped data. A markdown cell would suit it better.
'Next step is to scrape awards data for MVP and DPOY for those seasons'

In [None]:
# NOTE(review): planning note kept as a bare string expression in a code cell;
# it has no effect on the scraped data. A markdown cell would suit it better.
'Next step is to pull additional years from 1980-1990 for more data points and combine to the current files'

In [35]:
# NOTE(review): planning note kept as a bare string expression; running the
# cell only echoes the string back as its output. A markdown cell would suit it better.
'Next step is to join all the separate files together using players, years, and teams as joinkeys'

'Next step is to join all the separate files together using players, years, and teams as joinkeys'