In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from urllib.request import urlopen

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Seasons 1991-2023, kept as ints and as the strings the scraper URLs are built from.
integer_years = [*range(1991, 2024)]
string_years = [str(season) for season in integer_years]
print(string_years)

['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']


In [5]:
# Seasons 1980-1990, kept as ints and as the strings the scraper URLs are built from.
integer_years2 = [*range(1980, 1991)]
string_years2 = [str(season) for season in integer_years2]
print(string_years2)

['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990']


In [6]:
def scrape_advanced_stats(years, output_path="advanced_stats2.csv"):
    """Scrape per-player advanced stats for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_advanced.html`` league page on
    basketball-reference.com is downloaded, every table row that contains
    ``<td>`` cells is collected (values stay as the scraped strings), and a
    leading ``Yr`` column holding the season is added. The per-season frames are
    stacked, printed, and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"advanced_stats2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If the header row has no ``'Rk'`` entry.
    KeyError
        If the scraped table has no blank (``'\\xa0'``) column to drop.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr',
                        'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
                        'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_advanced.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table_rows = soup.findAll('tr')

        # Column names come from the first <tr>. 'Rk' is removed so the header
        # count lines up with the <td>-only rows built below.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')
        headers = [name.replace(' ', 'Blank') for name in headers]

        # Rows without any <td> come back as empty lists and are skipped.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [[year] + cells for cells in rows_data if cells]

        season_df = pd.DataFrame(rows_data, columns=headers)
        # Drop the columns whose header is a non-breaking space.
        season_df = season_df.drop(columns='\xa0')
        season_frames.append(season_df)

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        advanced_stats_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in advanced_stats_df.columns if c not in expected_columns]
        advanced_stats_df = advanced_stats_df.reindex(columns=expected_columns + extra_columns)
    else:
        advanced_stats_df = pd.DataFrame(columns=expected_columns)

    print(advanced_stats_df)
    advanced_stats_df.to_csv(output_path, index=False)

In [7]:
# Scrape the 1980-1990 seasons (string_years2); the result is written to advanced_stats2.csv.
scrape_advanced_stats(string_years2)

       Yr                Player Pos Age   Tm   G    MP   PER   TS%  3PAr  ...  \
0    1980  Kareem Abdul-Jabbar*   C  32  LAL  82  3143  25.3  .639  .001  ...   
1    1980         Tom Abernethy  PF  25  GSW  67  1222  11.0  .511  .003  ...   
2    1980           Alvan Adams   C  25  PHO  75  2168  19.2  .571  .002  ...   
3    1980       Tiny Archibald*  PG  31  BOS  80  2864  15.3  .574  .023  ...   
4    1980         Dennis Awtrey   C  31  CHI  26   560   7.4  .524  .000  ...   
..    ...                   ...  ..  ..  ...  ..   ...   ...   ...   ...  ...   
454  1990     Orlando Woolridge  SF  30  LAL  62  1421  17.6  .601  .009  ...   
455  1990      Haywoode Workman  PG  24  ATL   6    16  26.1  .773  .000  ...   
456  1990         James Worthy*  SF  28  LAL  80  2960  19.8  .586  .038  ...   
457  1990           Danny Young  PG  27  POR  82  1393  11.5  .508  .180  ...   
458  1990         Michael Young  SF  29  LAC  45   459  16.0  .520  .134  ...   

     TOV%  USG%  OWS  DWS  

In [30]:
# Scrape the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes advanced_stats2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'advanced_stats.csv' — presumably renamed by hand between runs; confirm.
scrape_advanced_stats(string_years)

       Yr              Player Pos Age   Tm   G    MP   PER   TS%  3PAr  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   290  13.1  .499  .000  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  1505  12.2  .448  .099  ...   
2    1991          Mark Acres   C  28  ORL  68  1313   9.2  .551  .014  ...   
3    1991       Michael Adams  PG  28  DEN  66  2346  22.3  .530  .397  ...   
4    1991        Mark Aguirre  SF  31  DET  78  2006  16.7  .526  .086  ...   
..    ...                 ...  ..  ..  ...  ..   ...   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   795  14.1  .573  .172  ...   
675  2023          Trae Young  PG  24  ATL  73  2541  22.0  .573  .331  ...   
676  2023      Omer Yurtseven   C  24  MIA   9    83  16.7  .675  .259  ...   
677  2023         Cody Zeller   C  30  MIA  15   217  16.4  .659  .034  ...   
678  2023         Ivica Zubac   C  25  LAC  76  2170  16.7  .661  .004  ...   

     TOV%  USG%   OWS   DWS    WS   WS/48  OBPM  DB

In [8]:
def scrape_totals_stats(years, output_path="totals_stats2.csv"):
    """Scrape per-player season totals for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_totals.html`` league page on
    basketball-reference.com is downloaded, every table row that contains
    ``<td>`` cells is collected (values stay as the scraped strings), and a
    leading ``Yr`` column holding the season is added. The per-season frames are
    stacked, printed, and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"totals_stats2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If the header row has no ``'Rk'`` entry.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
                        'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
                        'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_totals.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table_rows = soup.findAll('tr')

        # Column names come from the first <tr>. 'Rk' is removed so the header
        # count lines up with the <td>-only rows built below.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        # Rows without any <td> come back as empty lists and are skipped.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [[year] + cells for cells in rows_data if cells]

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        totals_stats_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in totals_stats_df.columns if c not in expected_columns]
        totals_stats_df = totals_stats_df.reindex(columns=expected_columns + extra_columns)
    else:
        totals_stats_df = pd.DataFrame(columns=expected_columns)

    print(totals_stats_df)
    totals_stats_df.to_csv(output_path, index=False)

In [9]:
# Scrape the 1980-1990 seasons (string_years2); the result is written to totals_stats2.csv.
scrape_totals_stats(string_years2)

       Yr                Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1980  Kareem Abdul-Jabbar*   C  32  LAL  82      3143  835  1383  ...   
1    1980         Tom Abernethy  PF  25  GSW  67      1222  153   318  ...   
2    1980           Alvan Adams   C  25  PHO  75      2168  465   875  ...   
3    1980       Tiny Archibald*  PG  31  BOS  80  80  2864  383   794  ...   
4    1980         Dennis Awtrey   C  31  CHI  26       560   27    60  ...   
..    ...                   ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
454  1990     Orlando Woolridge  SF  30  LAL  62   2  1421  306   550  ...   
455  1990      Haywoode Workman  PG  24  ATL   6   0    16    2     3  ...   
456  1990         James Worthy*  SF  28  LAL  80  80  2960  711  1298  ...   
457  1990           Danny Young  PG  27  POR  82   8  1393  138   328  ...   
458  1990         Michael Young  SF  29  LAC  45   2   459   92   194  ...   

       FT%  ORB  DRB  TRB  AST  STL  BLK  TOV   PF   PTS  
0   

In [28]:
# Scrape the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes totals_stats2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'totals_stats.csv' — presumably renamed by hand between runs; confirm.
scrape_totals_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   55   116  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  417  1009  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313  109   214  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346  560  1421  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006  420   909  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795  108   198  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541  597  1390  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   16    27  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   37    59  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170  326   514  ...   

      FT%  ORB  DRB  TRB  AST  STL BLK  TOV   PF   PTS  
0    .568   27   62   89   12 

In [10]:
def scrape_pergame_stats(years, output_path="pergame_stats2.csv"):
    """Scrape per-player per-game stats for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_per_game.html`` league page on
    basketball-reference.com is downloaded, every table row that contains
    ``<td>`` cells is collected (values stay as the scraped strings), and a
    leading ``Yr`` column holding the season is added. The per-season frames are
    stacked, printed, and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"pergame_stats2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If the header row has no ``'Rk'`` entry.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
                        'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
                        'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_game.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table_rows = soup.findAll('tr')

        # Column names come from the first <tr>. 'Rk' is removed so the header
        # count lines up with the <td>-only rows built below.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        # Rows without any <td> come back as empty lists and are skipped.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [[year] + cells for cells in rows_data if cells]

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        pergame_stats_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in pergame_stats_df.columns if c not in expected_columns]
        pergame_stats_df = pergame_stats_df.reindex(columns=expected_columns + extra_columns)
    else:
        pergame_stats_df = pd.DataFrame(columns=expected_columns)

    print(pergame_stats_df)
    pergame_stats_df.to_csv(output_path, index=False)

In [11]:
# Scrape the 1980-1990 seasons (string_years2); the result is written to pergame_stats2.csv.
scrape_pergame_stats(string_years2)

       Yr                Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1980  Kareem Abdul-Jabbar*   C  32  LAL  82      38.3  10.2  16.9  ...   
1    1980         Tom Abernethy  PF  25  GSW  67      18.2   2.3   4.7  ...   
2    1980           Alvan Adams   C  25  PHO  75      28.9   6.2  11.7  ...   
3    1980       Tiny Archibald*  PG  31  BOS  80  80  35.8   4.8   9.9  ...   
4    1980         Dennis Awtrey   C  31  CHI  26      21.5   1.0   2.3  ...   
..    ...                   ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
454  1990     Orlando Woolridge  SF  30  LAL  62   2  22.9   4.9   8.9  ...   
455  1990      Haywoode Workman  PG  24  ATL   6   0   2.7   0.3   0.5  ...   
456  1990         James Worthy*  SF  28  LAL  80  80  37.0   8.9  16.2  ...   
457  1990           Danny Young  PG  27  POR  82   8  17.0   1.7   4.0  ...   
458  1990         Michael Young  SF  29  LAC  45   2  10.2   2.0   4.3  ...   

       FT%  ORB  DRB   TRB  AST  STL  BLK  TOV   PF

In [30]:
# Scrape the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes pergame_stats2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'pergame_stats.csv' — presumably renamed by hand between runs; confirm.
scrape_pergame_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   6.7  1.3   2.7  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  22.5  6.2  15.1  ...   
2    1991          Mark Acres   C  28  ORL  68   0  19.3  1.6   3.1  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  35.5  8.5  21.5  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  25.7  5.4  11.7  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9  14.7  2.0   3.7  ...   
675  2023          Trae Young  PG  24  ATL  73  73  34.8  8.2  19.0  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0   9.2  1.8   3.0  ...   
677  2023         Cody Zeller   C  30  MIA  15   2  14.5  2.5   3.9  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  28.6  4.3   6.8  ...   

      FT%  ORB  DRB  TRB   AST  STL  BLK  TOV   PF   PTS  
0    .568  0.6  1.4  2.1   0

In [12]:
def scrape_per36min_stats(years, output_path="per36min_stats2.csv"):
    """Scrape per-player per-36-minute stats for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_per_minute.html`` league page
    on basketball-reference.com is downloaded, every table row that contains
    ``<td>`` cells is collected (values stay as the scraped strings), and a
    leading ``Yr`` column holding the season is added. The per-season frames are
    stacked, printed, and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"per36min_stats2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If the header row has no ``'Rk'`` entry.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
                        'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                        'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_minute.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table_rows = soup.findAll('tr')

        # Column names come from the first <tr>. 'Rk' is removed so the header
        # count lines up with the <td>-only rows built below.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        # Rows without any <td> come back as empty lists and are skipped.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [[year] + cells for cells in rows_data if cells]

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        per36min_stats_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in per36min_stats_df.columns if c not in expected_columns]
        per36min_stats_df = per36min_stats_df.reindex(columns=expected_columns + extra_columns)
    else:
        per36min_stats_df = pd.DataFrame(columns=expected_columns)

    print(per36min_stats_df)
    per36min_stats_df.to_csv(output_path, index=False)

In [13]:
# Scrape the 1980-1990 seasons (string_years2); the result is written to per36min_stats2.csv.
scrape_per36min_stats(string_years2)

       Yr                Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0    1980  Kareem Abdul-Jabbar*   C  32  LAL  82      3143  9.6  15.8  ...   
1    1980         Tom Abernethy  PF  25  GSW  67      1222  4.5   9.4  ...   
2    1980           Alvan Adams   C  25  PHO  75      2168  7.7  14.5  ...   
3    1980       Tiny Archibald*  PG  31  BOS  80  80  2864  4.8  10.0  ...   
4    1980         Dennis Awtrey   C  31  CHI  26       560  1.7   3.9  ...   
..    ...                   ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
454  1990     Orlando Woolridge  SF  30  LAL  62   2  1421  7.8  13.9  ...   
455  1990      Haywoode Workman  PG  24  ATL   6   0    16  4.5   6.8  ...   
456  1990         James Worthy*  SF  28  LAL  80  80  2960  8.6  15.8  ...   
457  1990           Danny Young  PG  27  POR  82   8  1393  3.6   8.5  ...   
458  1990         Michael Young  SF  29  LAC  45   2   459  7.2  15.2  ...   

       FT%  ORB  DRB   TRB  AST  STL  BLK  TOV   PF   PTS  
0  

In [32]:
# Scrape the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes per36min_stats2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'per36min_stats.csv' — presumably renamed by hand between runs; confirm.
scrape_per36min_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   6.8  14.4  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  10.0  24.1  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313   3.0   5.9  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346   8.6  21.8  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006   7.5  16.3  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795   4.9   9.0  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541   8.5  19.7  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   6.9  11.7  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   6.1   9.8  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170   5.4   8.5  ...   

      FT%  ORB  DRB   TRB   AST  STL  BLK  TOV   PF   PTS  
0    .568  3.4 

In [9]:
def scrape_per100pos_stats(years, output_path="per100pos_stats2.csv"):
    """Scrape per-player per-100-possession stats for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_per_poss.html`` league page on
    basketball-reference.com is downloaded, every table row that contains
    ``<td>`` cells is collected (values stay as the scraped strings), and a
    leading ``Yr`` column holding the season is added. The per-season frames are
    stacked, printed, and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"per100pos_stats2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If the header row has no ``'Rk'`` entry.
    """
    expected_columns = ['Yr', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
                        'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                        'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                        'ORtg', 'DRtg']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_per_poss.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table_rows = soup.findAll('tr')

        # Column names come from the first <tr>. 'Rk' is removed so the header
        # count lines up with the <td>-only rows built below.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers.remove('Rk')
        headers.insert(0, 'Yr')

        # Rows without any <td> come back as empty lists and are skipped.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [[year] + cells for cells in rows_data if cells]

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        per100pos_stats_df = pd.concat(season_frames)
        # Declared columns first; any header the site adds beyond them ends up
        # after 'DRtg'.
        extra_columns = [c for c in per100pos_stats_df.columns if c not in expected_columns]
        per100pos_stats_df = per100pos_stats_df.reindex(columns=expected_columns + extra_columns)
    else:
        per100pos_stats_df = pd.DataFrame(columns=expected_columns)

    print(per100pos_stats_df)
    per100pos_stats_df.to_csv(output_path, index=False)

In [15]:
# Scrape the 1980-1990 seasons (string_years2); the result is written to per100pos_stats2.csv.
scrape_per100pos_stats(string_years2)

       Yr                Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1980  Kareem Abdul-Jabbar*   C  32  LAL  82      3143  12.3  20.3  ...   
1    1980         Tom Abernethy  PF  25  GSW  67      1222   6.0  12.4  ...   
2    1980           Alvan Adams   C  25  PHO  75      2168   9.8  18.5  ...   
3    1980       Tiny Archibald*  PG  31  BOS  80  80  2864   6.3  13.0  ...   
4    1980         Dennis Awtrey   C  31  CHI  26       560   2.3   5.0  ...   
..    ...                   ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
454  1990     Orlando Woolridge  SF  30  LAL  62   2  1421  10.7  19.3  ...   
455  1990      Haywoode Workman  PG  24  ATL   6   0    16   6.3   9.4  ...   
456  1990         James Worthy*  SF  28  LAL  80  80  2960  12.0  21.9  ...   
457  1990           Danny Young  PG  27  POR  82   8  1393   4.6  11.0  ...   
458  1990         Michael Young  SF  29  LAC  45   2   459   9.7  20.5  ...   

      TRB   AST  STL  BLK  TOV   PF   PTS ORtg DRtg

In [34]:
# Scrape the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes per100pos_stats2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'per100pos_stats.csv' — presumably renamed by hand between runs; confirm.
scrape_per100pos_stats(string_years)

       Yr              Player Pos Age   Tm   G  GS    MP    FG   FGA  ...  \
0    1991      Alaa Abdelnaby  PF  22  POR  43   0   290   9.1  19.1  ...   
1    1991  Mahmoud Abdul-Rauf  PG  21  DEN  67  19  1505  11.7  28.3  ...   
2    1991          Mark Acres   C  28  ORL  68   0  1313   4.0   7.9  ...   
3    1991       Michael Adams  PG  28  DEN  66  66  2346  10.1  25.6  ...   
4    1991        Mark Aguirre  SF  31  DET  78  13  2006  10.9  23.7  ...   
..    ...                 ...  ..  ..  ...  ..  ..   ...   ...   ...  ...   
674  2023      Thaddeus Young  PF  34  TOR  54   9   795   6.7  12.3  ...   
675  2023          Trae Young  PG  24  ATL  73  73  2541  11.2  26.1  ...   
676  2023      Omer Yurtseven   C  24  MIA   9   0    83   9.6  16.2  ...   
677  2023         Cody Zeller   C  30  MIA  15   2   217   8.5  13.6  ...   
678  2023         Ivica Zubac   C  25  LAC  76  76  2170   7.4  11.6  ...   

      TRB   AST  STL  BLK  TOV   PF   PTS ORtg DRtg     
0    14.7   2.0  0

In [16]:
def scrape_team_data(years, output_path="teams_data2.csv"):
    """Scrape team standings for each season and save them as one CSV.

    For every entry in ``years`` the ``NBA_<year>_standings.html`` page on
    basketball-reference.com is downloaded. Team names are taken from the
    ``<th>`` cells, the numeric columns from the ``<td>`` cells, and two derived
    columns are added: ``Playoffs`` (``"Y"`` when the scraped team name contains
    ``*``) and ``Losing_season`` (``"Y"`` when ``W/L%`` is below .5). The
    per-season frames are stacked and written to ``output_path`` without the
    index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"teams_data2.csv"``. Pass a different
        path to keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If an expected header (``"SRS"``, ``"Western Conference"``, a repeated
        column header) is missing, or a ``W/L%`` value is not numeric.
    """
    expected_columns = ["Year", "Team", "W", "L",
                        "W/L%", "GB", "PS/G", "PA/G",
                        "SRS", "Playoffs",
                        "Losing_season"]
    divisions = ["Atlantic Division", "Central Division",
                 "Southeast Division", "Northwest Division",
                 "Pacific Division", "Southwest Division",
                 "Midwest Division"]
    season_frames = []

    for year in years:
        url = 'https://www.basketball-reference.com/leagues/NBA_' + year + '_standings.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")

        # NOTE(review): the slicing below expects the <th> texts of the first
        # <tr> found by the parser to contain the column headers AND the
        # conference/division/team labels — confirm against the live page.
        titles = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

        # Column headers: everything after the first title up to and including "SRS".
        srs_position = titles.index("SRS")
        headers = titles[1:srs_position + 1]
        titles = titles[srs_position + 1:]

        # Keep only the titles before a later "Eastern Conference" entry, when
        # there is one; otherwise use everything that is left.
        try:
            row_titles = titles[0:titles.index("Eastern Conference")]
        except ValueError:
            row_titles = titles
        # Strip one repeat of each column header, the western conference label
        # and any division labels, leaving the team names.
        for header in headers:
            row_titles.remove(header)
        row_titles.remove("Western Conference")
        for division in divisions:
            try:
                row_titles.remove(division)
            except ValueError:
                # Not every season has every division.
                continue

        team_stats = [[td.getText() for td in tr.findAll('td')]
                      for tr in soup.findAll('tr')[1:]]
        team_stats = [cells for cells in team_stats if cells != []]
        # Only the first len(row_titles) data rows are paired with a team name.
        team_stats = team_stats[0:len(row_titles)]

        for position, cells in enumerate(team_stats):
            cells.insert(0, row_titles[position])
            cells.insert(0, year)

        headers.insert(0, "Team")
        headers.insert(0, "Year")

        year_standings = pd.DataFrame(team_stats, columns=headers)

        # The asterisk must be read before it is stripped from the name.
        year_standings["Playoffs"] = ["Y" if "*" in name else "N" for name in year_standings["Team"]]
        year_standings["Team"] = [name.replace('*', '') for name in year_standings["Team"]]
        year_standings["Losing_season"] = ["Y" if float(pct) < .5 else "N" for pct in year_standings["W/L%"]]

        season_frames.append(year_standings)

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        teams_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in teams_df.columns if c not in expected_columns]
        teams_df = teams_df.reindex(columns=expected_columns + extra_columns)
    else:
        teams_df = pd.DataFrame(columns=expected_columns)

    teams_df.to_csv(output_path, index=False)

In [17]:
# Scrape standings for the 1980-1990 seasons (string_years2); written to teams_data2.csv.
scrape_team_data(string_years2)

In [50]:
# Scrape standings for the 1991-2023 seasons (string_years).
# NOTE(review): by default this call also writes teams_data2.csv, the same file as the
# 1980-1990 run, so whichever call runs last wins. The combine cell reads
# 'teams_data.csv' — presumably renamed by hand between runs; confirm.
scrape_team_data(string_years)

In [18]:
'Next step is to consolidate all seasons for each scrape'

'Next step is to consolidate all seasons for each scrape'

In [20]:
# Read both advanced-stats scrapes, stack advanced_stats.csv on top of
# advanced_stats2.csv with a fresh 0..n-1 index, and save the result.
advanced_stats1, advanced_stats2 = (
    pd.read_csv(csv_name) for csv_name in ('advanced_stats.csv', 'advanced_stats2.csv')
)
advanced_stats_combined = pd.concat(
    [advanced_stats1, advanced_stats2],
    axis=0,
    ignore_index=True,
)
advanced_stats_combined.to_csv("advanced_stats_combined.csv", index=False)

In [21]:
# Show the combined advanced-stats frame as this cell's output.
advanced_stats_combined

Unnamed: 0,Yr,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,1991,Alaa Abdelnaby,PF,22,POR,43,290,13.1,0.499,0.000,...,14.0,22.1,0.0,0.5,0.5,0.079,-3.4,-1.2,-4.6,-0.2
1,1991,Mahmoud Abdul-Rauf,PG,21,DEN,67,1505,12.2,0.448,0.099,...,9.5,27.2,-0.7,-0.3,-1.0,-0.031,-2.0,-3.0,-5.0,-1.1
2,1991,Mark Acres,C,28,ORL,68,1313,9.2,0.551,0.014,...,14.0,9.3,1.4,1.1,2.5,0.090,-2.8,-0.2,-3.0,-0.3
3,1991,Michael Adams,PG,28,DEN,66,2346,22.3,0.530,0.397,...,12.7,28.5,5.8,0.4,6.3,0.128,6.0,-0.7,5.3,4.3
4,1991,Mark Aguirre,SF,31,DET,78,2006,16.7,0.526,0.086,...,10.9,25.7,2.8,2.7,5.5,0.132,1.2,0.2,1.4,1.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23141,1990,Orlando Woolridge,SF,30,LAL,62,1421,17.6,0.601,0.009,...,10.0,21.9,3.3,1.5,4.8,0.161,1.3,0.2,1.5,1.3
23142,1990,Haywoode Workman,PG,24,ATL,6,16,26.1,0.773,0.000,...,0.0,10.2,0.1,0.0,0.1,0.357,-0.2,13.8,13.6,0.1
23143,1990,James Worthy*,SF,28,LAL,80,2960,19.8,0.586,0.038,...,10.0,23.0,7.5,3.1,10.6,0.172,3.6,0.2,3.8,4.3
23144,1990,Danny Young,PG,27,POR,82,1393,11.5,0.508,0.180,...,17.5,12.9,1.0,1.9,3.0,0.103,-1.2,2.1,0.9,1.0


In [23]:
# Read both totals scrapes, stack totals_stats.csv on top of totals_stats2.csv
# with a fresh 0..n-1 index, and save the result.
totals_stats1, totals_stats2 = (
    pd.read_csv(csv_name) for csv_name in ('totals_stats.csv', 'totals_stats2.csv')
)
totals_stats_combined = pd.concat(
    [totals_stats1, totals_stats2],
    axis=0,
    ignore_index=True,
)
totals_stats_combined.to_csv("totals_stats_combined.csv", index=False)

In [24]:
# Show the combined totals frame as this cell's output.
totals_stats_combined

Unnamed: 0,Yr,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1991,Alaa Abdelnaby,PF,22,POR,43,0.0,290,55,116,...,0.568,27,62,89,12,4,12,22,39,135
1,1991,Mahmoud Abdul-Rauf,PG,21,DEN,67,19.0,1505,417,1009,...,0.857,34,87,121,206,55,4,110,149,942
2,1991,Mark Acres,C,28,ORL,68,0.0,1313,109,214,...,0.653,140,219,359,25,25,25,42,218,285
3,1991,Michael Adams,PG,28,DEN,66,66.0,2346,560,1421,...,0.879,58,198,256,693,147,6,240,162,1752
4,1991,Mark Aguirre,SF,31,DET,78,13.0,2006,420,909,...,0.757,134,240,374,139,47,20,128,209,1104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23141,1990,Orlando Woolridge,SF,30,LAL,62,2.0,1421,306,550,...,0.733,49,136,185,96,39,46,73,160,788
23142,1990,Haywoode Workman,PG,24,ATL,6,0.0,16,2,3,...,1.000,0,3,3,2,3,0,0,3,6
23143,1990,James Worthy*,SF,28,LAL,80,80.0,2960,711,1298,...,0.782,160,318,478,288,99,49,160,190,1685
23144,1990,Danny Young,PG,27,POR,82,8.0,1393,138,328,...,0.813,29,93,122,231,82,4,80,84,383


In [25]:
# Read both per-game scrapes, stack pergame_stats.csv on top of
# pergame_stats2.csv with a fresh 0..n-1 index, and save the result.
pergame_stats1, pergame_stats2 = (
    pd.read_csv(csv_name) for csv_name in ('pergame_stats.csv', 'pergame_stats2.csv')
)
pergame_stats_combined = pd.concat(
    [pergame_stats1, pergame_stats2],
    axis=0,
    ignore_index=True,
)
pergame_stats_combined.to_csv("pergame_stats_combined.csv", index=False)

In [27]:
# Show the combined per-game frame as this cell's output.
pergame_stats_combined

Unnamed: 0,Yr,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1991,Alaa Abdelnaby,PF,22,POR,43,0.0,6.7,1.3,2.7,...,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1
1,1991,Mahmoud Abdul-Rauf,PG,21,DEN,67,19.0,22.5,6.2,15.1,...,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1
2,1991,Mark Acres,C,28,ORL,68,0.0,19.3,1.6,3.1,...,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2
3,1991,Michael Adams,PG,28,DEN,66,66.0,35.5,8.5,21.5,...,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5
4,1991,Mark Aguirre,SF,31,DET,78,13.0,25.7,5.4,11.7,...,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23141,1990,Orlando Woolridge,SF,30,LAL,62,2.0,22.9,4.9,8.9,...,0.733,0.8,2.2,3.0,1.5,0.6,0.7,1.2,2.6,12.7
23142,1990,Haywoode Workman,PG,24,ATL,6,0.0,2.7,0.3,0.5,...,1.000,0.0,0.5,0.5,0.3,0.5,0.0,0.0,0.5,1.0
23143,1990,James Worthy*,SF,28,LAL,80,80.0,37.0,8.9,16.2,...,0.782,2.0,4.0,6.0,3.6,1.2,0.6,2.0,2.4,21.1
23144,1990,Danny Young,PG,27,POR,82,8.0,17.0,1.7,4.0,...,0.813,0.4,1.1,1.5,2.8,1.0,0.0,1.0,1.0,4.7


In [28]:
# Read both per-36-minute scrapes, stack per36min_stats.csv on top of
# per36min_stats2.csv with a fresh 0..n-1 index, and save the result.
per36min_stats1, per36min_stats2 = (
    pd.read_csv(csv_name) for csv_name in ('per36min_stats.csv', 'per36min_stats2.csv')
)
per36min_stats_combined = pd.concat(
    [per36min_stats1, per36min_stats2],
    axis=0,
    ignore_index=True,
)
per36min_stats_combined.to_csv("per36min_stats_combined.csv", index=False)

In [29]:
# Show the combined per-36-minute frame as this cell's output.
per36min_stats_combined

Unnamed: 0,Yr,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1991,Alaa Abdelnaby,PF,22,POR,43,0.0,290,6.8,14.4,...,0.568,3.4,7.7,11.0,1.5,0.5,1.5,2.7,4.8,16.8
1,1991,Mahmoud Abdul-Rauf,PG,21,DEN,67,19.0,1505,10.0,24.1,...,0.857,0.8,2.1,2.9,4.9,1.3,0.1,2.6,3.6,22.5
2,1991,Mark Acres,C,28,ORL,68,0.0,1313,3.0,5.9,...,0.653,3.8,6.0,9.8,0.7,0.7,0.7,1.2,6.0,7.8
3,1991,Michael Adams,PG,28,DEN,66,66.0,2346,8.6,21.8,...,0.879,0.9,3.0,3.9,10.6,2.3,0.1,3.7,2.5,26.9
4,1991,Mark Aguirre,SF,31,DET,78,13.0,2006,7.5,16.3,...,0.757,2.4,4.3,6.7,2.5,0.8,0.4,2.3,3.8,19.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23141,1990,Orlando Woolridge,SF,30,LAL,62,2.0,1421,7.8,13.9,...,0.733,1.2,3.4,4.7,2.4,1.0,1.2,1.8,4.1,20.0
23142,1990,Haywoode Workman,PG,24,ATL,6,0.0,16,4.5,6.8,...,1.000,0.0,6.8,6.8,4.5,6.8,0.0,0.0,6.8,13.5
23143,1990,James Worthy*,SF,28,LAL,80,80.0,2960,8.6,15.8,...,0.782,1.9,3.9,5.8,3.5,1.2,0.6,1.9,2.3,20.5
23144,1990,Danny Young,PG,27,POR,82,8.0,1393,3.6,8.5,...,0.813,0.7,2.4,3.2,6.0,2.1,0.1,2.1,2.2,9.9


In [30]:
# Read both per-100-possession scrapes, stack per100pos_stats.csv on top of
# per100pos_stats2.csv with a fresh 0..n-1 index, and save the result.
per100pos_stats1, per100pos_stats2 = (
    pd.read_csv(csv_name) for csv_name in ('per100pos_stats.csv', 'per100pos_stats2.csv')
)
per100pos_stats_combined = pd.concat(
    [per100pos_stats1, per100pos_stats2],
    axis=0,
    ignore_index=True,
)
per100pos_stats_combined.to_csv("per100pos_stats_combined.csv", index=False)

In [31]:
# Show the combined per-100-possession frame as this cell's output.
per100pos_stats_combined

Unnamed: 0,Yr,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,Unnamed: 31
0,1991,Alaa Abdelnaby,PF,22,POR,43,0.0,290,9.1,19.1,...,14.7,2.0,0.7,2.0,3.6,6.4,22.2,99.0,103.0,
1,1991,Mahmoud Abdul-Rauf,PG,21,DEN,67,19.0,1505,11.7,28.3,...,3.4,5.8,1.5,0.1,3.1,4.2,26.4,96.0,118.0,
2,1991,Mark Acres,C,28,ORL,68,0.0,1313,4.0,7.9,...,13.2,0.9,0.9,0.9,1.5,8.0,10.5,116.0,109.0,
3,1991,Michael Adams,PG,28,DEN,66,66.0,2346,10.1,25.6,...,4.6,12.5,2.6,0.1,4.3,2.9,31.5,113.0,115.0,
4,1991,Mark Aguirre,SF,31,DET,78,13.0,2006,10.9,23.7,...,9.7,3.6,1.2,0.5,3.3,5.4,28.7,109.0,105.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23141,1990,Orlando Woolridge,SF,30,LAL,62,2.0,1421,10.7,19.3,...,6.5,3.4,1.4,1.6,2.6,5.6,27.6,118.0,108.0,
23142,1990,Haywoode Workman,PG,24,ATL,6,0.0,16,6.3,9.4,...,9.4,6.3,9.4,0.0,0.0,9.4,18.8,175.0,94.0,
23143,1990,James Worthy*,SF,28,LAL,80,80.0,2960,12.0,21.9,...,8.0,4.8,1.7,0.8,2.7,3.2,28.4,119.0,108.0,
23144,1990,Danny Young,PG,27,POR,82,8.0,1393,4.6,11.0,...,4.1,7.8,2.8,0.1,2.7,2.8,12.9,109.0,105.0,


In [32]:
# Read both standings scrapes, stack teams_data.csv on top of teams_data2.csv
# with a fresh 0..n-1 index, and save the result.
teams_data1, teams_data2 = (
    pd.read_csv(csv_name) for csv_name in ('teams_data.csv', 'teams_data2.csv')
)
teams_data_combined = pd.concat(
    [teams_data1, teams_data2],
    axis=0,
    ignore_index=True,
)
teams_data_combined.to_csv("teams_data_combined.csv", index=False)

In [33]:
# Show the combined standings frame as this cell's output.
teams_data_combined

Unnamed: 0,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Playoffs,Losing_season
0,1991,Boston Celtics,56,26,0.683,—,111.5,105.7,5.22,Y,N
1,1991,Philadelphia 76ers,44,38,0.537,12.0,105.4,105.6,-0.39,Y,N
2,1991,New York Knicks,39,43,0.476,17.0,103.1,103.3,-0.43,Y,Y
3,1991,Washington Bullets,30,52,0.366,26.0,101.4,106.4,-4.84,N,Y
4,1991,New Jersey Nets,26,56,0.317,30.0,102.9,107.5,-4.53,N,Y
...,...,...,...,...,...,...,...,...,...,...,...
1219,1990,Phoenix Suns,54,28,0.659,9.0,114.9,107.8,7.09,Y,N
1220,1990,Seattle SuperSonics,41,41,0.500,22.0,106.9,105.9,1.40,N,N
1221,1990,Golden State Warriors,37,45,0.451,26.0,116.3,119.4,-2.55,N,Y
1222,1990,Los Angeles Clippers,30,52,0.366,33.0,103.8,107.2,-2.80,N,Y


In [None]:
'Next step is to scrape awards data for MVP and DPOY for those seasons'

In [62]:
def scrape_mvps(years, output_path="mvps2.csv"):
    """Scrape the first awards table (MVP voting) for each season into one CSV.

    For every entry in ``years`` the ``awards_<year>.html`` page on
    basketball-reference.com is downloaded and the first ``<table>`` on the
    page is read. Column names come from that table's second ``<tr>``; each row
    with ``<td>`` cells is prefixed with the season (``Yr``) and its 1-based
    position in the table (filling the ``Rank`` column). The per-season frames
    are stacked and written to ``output_path`` without the index.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``. They are
        concatenated into the URL, so they must be strings.
    output_path : str, optional
        Destination CSV; defaults to ``"mvps2.csv"``. Pass a different path to
        keep several scrapes instead of overwriting one file.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    IndexError
        If the page has no ``<table>`` or the table has fewer than two rows.
    """
    expected_columns = ['Yr', 'Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max',
                        'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%',
                        'FT%', 'WS', 'WS/48']
    season_frames = []
    for year in years:
        url = 'https://www.basketball-reference.com/awards/awards_' + year + '.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        table = soup.findAll('table')[0]
        table_rows = table.findAll('tr')

        # The second <tr> holds the column names; the first one is skipped.
        headers = [th.getText() for th in table_rows[1].findAll('th')]
        headers.insert(0, 'Yr')

        # Rows without any <td> (e.g. the header row itself) come back empty
        # and are skipped before ranks are assigned.
        rows_data = [[td.getText() for td in tr.findAll('td')] for tr in table_rows[1:]]
        rows_data = [cells for cells in rows_data if cells]
        # Rank is the row's 1-based position, inserted right after the season.
        rows_data = [[year, rank] + cells for rank, cells in enumerate(rows_data, start=1)]

        season_frames.append(pd.DataFrame(rows_data, columns=headers))

    if season_frames:
        # One concat instead of DataFrame.append in the loop: append was removed
        # in pandas 2.0 and copied the growing frame on every iteration.
        mvps_df = pd.concat(season_frames)
        # Declared columns first, then anything extra the site returned.
        extra_columns = [c for c in mvps_df.columns if c not in expected_columns]
        mvps_df = mvps_df.reindex(columns=expected_columns + extra_columns)
    else:
        mvps_df = pd.DataFrame(columns=expected_columns)

    mvps_df.to_csv(output_path, index=False)

In [61]:
# Scrape MVP voting for the 1991-2023 seasons (string_years); written to mvps2.csv.
scrape_mvps(string_years)

In [63]:
# Scrape MVP voting for the 1980-1990 seasons (string_years2).
# NOTE(review): by default this call also writes mvps2.csv, the same file as the
# 1991-2023 run, so whichever call runs last wins — rename or copy the first
# result before running this cell if both ranges are needed.
scrape_mvps(string_years2)

In [80]:
def scrape_dpoys(years):
    """Work-in-progress scraper for Defensive Player of the Year voting.

    For each season in ``years`` the ``awards_<year>.html`` page on
    basketball-reference.com is fetched and the fifth ``<table>`` found by
    BeautifulSoup is printed for inspection. Nothing is collected or written
    yet: the parsing and CSV steps below are still commented out.

    Parameters
    ----------
    years : iterable of str
        Season identifiers as used in the URL, e.g. ``'1991'``.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    IndexError
        If the parsed page exposes fewer than five ``<table>`` tags.
    """
    # Only used by the commented-out draft below.
    dpoys_df = pd.DataFrame(columns = ['Yr', 'Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'G','MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'DWS', 'DBPM','DRtg'])
    for year in years:
        url = 'https://www.basketball-reference.com/awards/awards_' + year + '.html'
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
        # NOTE(review): index 4 is a guess at where the DPOY table sits. The
        # site appears to ship some secondary tables inside HTML comments, in
        # which case they are not returned by findAll('table') — confirm by
        # inspecting the page's comment nodes.
        table = soup.findAll('table')[4]
        # TODO: when enabling the draft below, replace DataFrame.append (removed
        # in pandas 2.0) with a list of frames and a single pd.concat.
#         headers = [th.getText() for th in table.findAll('tr', limit=2)[1].findAll('th')]
#         headers.insert(0, 'Yr')
#         rows = table.findAll('tr')[1:]
#         rows_data = [[td.getText() for td in rows[i].findAll('td')]
#                     for i in range(len(rows))]
#         rows_data = list(filter(None,rows_data))
#         for i in range(0, len(rows_data)):
#             rows_data[i].insert(0, year)
#             rows_data[i].insert(1, i+1)
        
#         year_dpoys_df = pd.DataFrame(rows_data, columns = headers)
#         dpoys_df = dpoys_df.append(year_dpoys_df)
        
#         print(dpoys_df)
        print(table)
        
#     dpoys_df.to_csv("dpoys.csv", index=False)

In [43]:
# Scratch cell: fetch and parse the 1991 awards page to investigate the DPOY table.
url = 'https://www.basketball-reference.com/awards/awards_1991.html'
# Close the HTTP response as soon as the page has been parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, features = 'lxml')

# Prints only the response object's repr, not the page markup; print(soup) would show the HTML.
print(html)
# print(soup.findAll("hakeem olajuwon trophy"))

### Why isn't DPOY in the HTML listed the same as the other tables?
# NOTE(review): the site appears to embed some secondary tables inside HTML
# comments, which findAll('table') does not return — confirm by searching the
# page's comment nodes for the DPOY table.

<http.client.HTTPResponse object at 0x7f829c88dca0>


In [None]:
'Next step is to pull additional years from 1980-1990 for more data points and combine to the current files'

In [35]:
'Next step is to join all the separate files together using players, years, and teams as joinkeys'

'Next step is to join all the separate files together using players, years, and teams as joinkeys'