In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Define functions to scrape stats

In [2]:
def scrapeMVP(year):
    address_string='https://www.basketball-reference.com/awards/awards_'+str(year)+'.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to MVP table
    mvpSoup = soup.find(id='all_mvp');
    
    # Extract the player names
    player_name_entries = mvpSoup.findAll("td", {"data-stat" : "player"});
    player_name_list = [];
    for item in player_name_entries:
        player_name_list.append(str(item.contents[0].contents[0]));
    
    # Extract the first place votes earned
    votes_first_entries = mvpSoup.findAll("td", {"data-stat" : "votes_first"});
    votes_first_list = [];
    for item in votes_first_entries:
        score = int(float(item.contents[0]));
        votes_first_list.append(score);
    
    # Extract the total MVP votes earned
    points_won_entries = mvpSoup.findAll("td", {"data-stat" : "points_won"});
    points_won_list = [];
    for item in points_won_entries:
        score = int(float(item.contents[0]));
        points_won_list.append(score);
    
    # This just checks that nothing went wrong in extracting the data
    # makes sure the lengths are the same
    if len(player_name_list)!=len(points_won_list):
        print('player_name_list not same length as points_won_list');
        return 'Failed'
    
    mvpVotesData=pd.DataFrame(data={'player': player_name_list, 'votes_first': votes_first_list,'points_won': points_won_list}).set_index('player');
    mvpVotesData.to_csv('MVP_Data/mvpVotes'+str(year)+'.csv', index=False);
    return mvpVotesData;

In [3]:
def scrapeAdvancedStats(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_advanced.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_advanced_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:29];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    advData=pd.DataFrame(data=raw_dict).set_index('player');
    #possData.to_csv('seasonData'+str(year)+'.csv', index=False);
    advData.to_csv('Advanced_Data/advancedStats'+str(year)+'.csv', index=False);
    return advData;

In [4]:
def scrapeSeasonStatsPerG(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_game.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_game_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:30];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Game_Data/seasonData'+str(year)+'_per_g.csv', index=False);
    return possData;

In [5]:
def scrapeSeasonStatsPerMin(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_minute.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_minute_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:29];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Game_Data/seasonData'+str(year)+'_per_min.csv', index=False);
    return possData;

In [6]:
def scrapeSeasonStats(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_poss.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_poss_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:32];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Poss_Data/seasonData'+str(year)+'.csv', index=False);
    return possData;

# Merge dataframes

In [7]:
#Merge season stats and MVP votes dataframes
def mergeWithMVP(ssdf,mvpdf):
    mergedpd = pd.merge(ssdf,mvpdf, on= 'player', how ='outer');
    mergedpd['votes_first']=mergedpd['votes_first'].fillna(0);
    mergedpd['points_won']=mergedpd['points_won'].fillna(0);
    return mergedpd;

def mergeAandB(ssdf,advdf):
    mergedpd = pd.merge(ssdf, advdf, on='player', how='outer');
    return mergedpd;

# Loop over years and data sets

In [8]:
#Loop over years, and save resulting dataframe as CSV file
yearList = range(1981,2020,1);
for year in yearList:
    #scrape the data from webpages
    ssdf = scrapeSeasonStats(year);
    advdf = scrapeAdvancedStats(year);
    mvpdf = scrapeMVP(year);
    sspgdf = scrapeSeasonStatsPerG(year);
    sspmdf = scrapeSeasonStatsPerMin(year);
    
    #merge the data
    df = mergeAandB(ssdf,advdf);
    lf = mergeAandB(df,sspgdf);
    zf = mergeAandB(lf,sspmdf);
    pf = mergeWithMVP(zf,mvpdf);
    
    pf.to_csv('Raw_Data/data_'+str(year)+'.csv', index=True);

https://www.basketball-reference.com/leagues/NBA_1981_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1981_advanced.html
https://www.basketball-reference.com/awards/awards_1981.html
https://www.basketball-reference.com/leagues/NBA_1981_per_game.html
https://www.basketball-reference.com/leagues/NBA_1981_per_minute.html
https://www.basketball-reference.com/leagues/NBA_1982_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1982_advanced.html
https://www.basketball-reference.com/awards/awards_1982.html
https://www.basketball-reference.com/leagues/NBA_1982_per_game.html
https://www.basketball-reference.com/leagues/NBA_1982_per_minute.html
https://www.basketball-reference.com/leagues/NBA_1983_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1983_advanced.html
https://www.basketball-reference.com/awards/awards_1983.html
https://www.basketball-reference.com/leagues/NBA_1983_per_game.html
https://www.basketball-reference.com/leagues/NBA_1983_per_minute.

https://www.basketball-reference.com/leagues/NBA_2005_per_game.html
https://www.basketball-reference.com/leagues/NBA_2005_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2006_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2006_advanced.html
https://www.basketball-reference.com/awards/awards_2006.html
https://www.basketball-reference.com/leagues/NBA_2006_per_game.html
https://www.basketball-reference.com/leagues/NBA_2006_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2007_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2007_advanced.html
https://www.basketball-reference.com/awards/awards_2007.html
https://www.basketball-reference.com/leagues/NBA_2007_per_game.html
https://www.basketball-reference.com/leagues/NBA_2007_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2008_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2008_advanced.html
https://www.basketball-reference.com/awards/awards_2008.

### Normalize data based on best score

In [4]:
yearList = range(1981,2020,1);
categories = ['player','pos_x','age_x','team_id_x','g_x','mp_x','fg_per_poss','fga_per_poss','fg_pct_x'
              ,'fg3_per_poss','fg3a_per_poss','fg3_pct_x','fg2_per_poss','fg2a_per_poss','fg2_pct_x','ft_per_poss'
              ,'fta_per_poss','ft_pct_x','orb_per_poss','drb_per_poss','trb_per_poss','ast_per_poss','stl_per_poss'
              ,'blk_per_poss','tov_per_poss','pf_per_poss','pts_per_poss','off_rtg','def_rtg','per','ts_pct'
              ,'fg3a_per_fga_pct','fta_per_fga_pct','orb_pct','drb_pct','trb_pct','ast_pct','stl_pct','blk_pct'
              ,'tov_pct','usg_pct','ws-dum','ows','dws','ws','ws_per_48','bpm-dum','obpm','dbpm','bpm','vorp'
              ,'mp_per_g','fg_per_g','fga_per_g','fg3_per_g','fg3a_per_g'
              ,'fg2_per_g','fg2a_per_g','efg_pct','ft_per_g','fta_per_g','orb_per_g','drb_per_g','trb_per_g'
              ,'ast_per_g','stl_per_g','blk_per_g','tov_per_g','pf_per_g','pts_per_g','fg_per_mp','fga_per_mp'
              ,'fg3_per_mp','fg3a_per_mp','fg2_per_mp','fg2a_per_mp','ft_per_mp','fta_per_mp','orb_per_mp'
              ,'drb_per_mp','trb_per_mp','ast_per_mp','stl_per_mp','blk_per_mp','tov_per_mp','pf_per_mp','pts_per_mp'
              ,'votes_first','points_won']
for year in yearList: 
    file2 = pd.read_csv('Raw_Data/data_' + str(year) + '.csv');
    file2 = file2[categories]
    file2 = file2.drop_duplicates(subset=['player'])
    #file2 = file2.drop(file2[file2.mp_x <1000].index)
    for i in range(4,len(categories)):
        cat = file2[[categories[i]]]
        max_cat = np.amax(cat)
        max_val = max_cat.values
        file2[[categories[i]]] = file2[[categories[i]]]/max_val

    
    pd.DataFrame(file2.to_csv('Cut_Data/data_'+str(year)+'_Norm.csv', index = False));
    
    
    

### Merge files for training/validating and testing

In [5]:
yearList = range(1982,2009,1);
file1 = pd.read_csv('Cut_Data/data_1981_Norm.csv')
for year in yearList: 
    file2 = pd.read_csv('Cut_Data/data_' + str(year) + '_Norm.csv');
    file1 = pd.concat([file1, file2]); 
    print(file1)
pd.DataFrame(file1).to_csv("DataTrainSet.csv", index=False)

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
309         Mike Woodson    SG     23       TOT  0.988095  0.685992   
310    Orlando Woolridge    SF     22       CHI  0.892857  0.349617   
311          Sam Worthen    PG     24       UTA  0.059524  0.006474   
312         Larry Wright    PG     27       DET  0.011905  0.001766   
313         Rich Yonakor    PF     23       SAS  0.119048  0.020600   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716      0.000000  ...    0.625000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
329    Orlando Woolridge    SF     27       NJN  0.914634  0.804023   
330         James Worthy    SF     25       LAL  1.000000  0.859189   
331          Brad Wright    PF     24       NYK  0.170732  0.042060   
332          Danny Young    PG     24       SEA  0.890244  0.451692   
333          Perry Young    SG     23       TOT  0.109756  0.021945   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716           0.0  ...    0.625000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
381     Haywoode Workman    PG     25       WSB  0.890244  0.613575   
382         James Worthy    SF     29       LAL  0.951220  0.907391   
383        Howard Wright    PF     23       TOT  0.182927  0.049472   
384          A.J. Wynder    PG     26       BOS  0.073171  0.011765   
385          Danny Young    PG     28       POR  0.914634  0.270588   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716       0.00000  ...    0.625000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
423           David Wood    PF     31       TOT  0.746988  0.223315   
424          Randy Woods    PG     25       DEN  0.096386  0.020827   
425     Haywoode Workman    PG     30       IND  0.927711  0.336708   
426       Sharone Wright     C     23       TOT  0.686747  0.414811   
427         George Zídek     C     22       CHH  0.855422  0.256870   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716      0.000000  ...    0.625000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
436        David Wingate    SG     37       SEA  0.012048  0.002614   
437     Rubén Wolkowyski    PF     27       SEA  0.409639  0.088586   
438    Metta World Peace    SF     21       CHI  0.915663  0.686320   
439      Lorenzen Wright     C     25       ATL  0.855422  0.577403   
440          Wang Zhizhi     C     23       DAL  0.060241  0.011037   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716      0.000000  ...    0.625000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1          Tom Abernethy    SF     26       TOT  0.464286  0.087211   
2            Alvan Adams     C     26       PHO  0.892857  0.601112   
3         Darrell Allums    PF     22       DAL  0.261905  0.080773   
4         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
..                   ...   ...    ...       ...       ...       ...   
459         Qyntel Woods    SF     23       MIA  0.035714  0.011806   
460    Metta World Peace    SF     25       IND  0.083333  0.085891   
461        Dorell Wright    SF     19       MIA  0.035714  0.007969   
462      Lorenzen Wright     C     29       MEM  0.952381  0.675030   
463          Wang Zhizhi     C     27       MIA  0.238095  0.027155   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb_per_mp  \
0       0.839744      0.658046  0.856716          0.00  ...    0.625000 

In [11]:
yearList = range(2010,2015,1);
file1 = pd.read_csv('Cut_Data/data_2009_Norm.csv')
for year in yearList: 
    file2 = pd.read_csv('Cut_Data/data_' + str(year) + '_Norm.csv');
    file1 = pd.concat([file1, file2]); 
    print(file1)
pd.DataFrame(file1).to_csv("DataValidateSet.csv", index=False)

             player pos_x  age_x team_id_x       g_x      mp_x  fg_per_poss  \
0        Alex Acker    SG     26       TOT  0.304878  0.060875     0.308594   
1      Hassan Adams    SG     24       TOR  0.146341  0.015907     0.156250   
2     Arron Afflalo    SG     23       DET  0.902439  0.377485     0.230469   
3      Maurice Ager    SG     24       NJN  0.243902  0.029673     0.324219   
4      Blake Ahearn    PG     24       SAS  0.036585  0.005812     0.222656   
..              ...   ...    ...       ...       ...       ...          ...   
437   Dorell Wright    SF     24       MIA  0.878049  0.461871     0.479167   
438   Julian Wright    SF     22       NOH  0.829268  0.268910     0.479167   
439      Nick Young    SG     24       WAS  0.902439  0.438716     0.597222   
440       Sam Young    SF     24       MEM  0.975610  0.407842     0.604167   
441  Thaddeus Young    PF     21       PHI  0.817073  0.662241     0.645833   

     fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb

             player pos_x  age_x team_id_x       g_x      mp_x  fg_per_poss  \
0        Alex Acker    SG   26.0       TOT  0.304878  0.060875     0.308594   
1      Hassan Adams    SG   24.0       TOR  0.146341  0.015907     0.156250   
2     Arron Afflalo    SG   23.0       DET  0.902439  0.377485     0.230469   
3      Maurice Ager    SG   24.0       NJN  0.243902  0.029673     0.324219   
4      Blake Ahearn    PG   24.0       SAS  0.036585  0.005812     0.222656   
..              ...   ...    ...       ...       ...       ...          ...   
477     Tony Wroten    PG   20.0       PHI  0.867470  0.565343     0.180608   
478      Nick Young    SG   28.0       LAL  0.771084  0.579757     0.197719   
479  Thaddeus Young    PF   25.0       PHI  0.951807  0.870596     0.197719   
480     Cody Zeller     C   21.0       CHA  0.987952  0.453555     0.119772   
481    Tyler Zeller     C   24.0       CLE  0.843373  0.336003     0.146388   

     fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb

In [12]:
yearList = range(2016,2020,1);
file1 = pd.read_csv('Cut_Data/data_2015_Norm.csv')
for year in yearList: 
    file2 = pd.read_csv('Cut_Data/data_' + str(year) + '_Norm.csv');
    file1 = pd.concat([file1, file2]); 
    print(file1)
pd.DataFrame(file1).to_csv("DataTestSet.csv", index=False)

             player pos_x  age_x team_id_x       g_x      mp_x  fg_per_poss  \
0        Quincy Acy    PF   24.0       NYK  0.819277  0.431734     0.246032   
1      Jordan Adams    SG   20.0       MEM  0.361446  0.083194     0.293651   
2      Steven Adams     C   21.0       OKC  0.843373  0.594096     0.242063   
3       Jeff Adrien    PF   28.0       MIN  0.204819  0.072123     0.178571   
4     Arron Afflalo    SG   29.0       TOT  0.939759  0.839316     0.297619   
..              ...   ...    ...       ...       ...       ...          ...   
471       Joe Young    PG   23.0       IND  0.500000  0.122880     0.311284   
472      Nick Young    SG   30.0       LAL  0.658537  0.330560     0.237354   
473  Thaddeus Young    PF   27.0       BRK  0.890244  0.770240     0.404669   
474     Cody Zeller     C   23.0       CHO  0.890244  0.567680     0.252918   
475    Tyler Zeller     C   26.0       BOS  0.731707  0.227200     0.369650   

     fga_per_poss  fg_pct_x  fg3_per_poss  ...  drb