In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Define functions to scrape stats

In [31]:
def scrapeMVP(year):
    address_string='https://www.basketball-reference.com/awards/awards_'+str(year)+'.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to MVP table
    mvpSoup = soup.find(id='all_mvp');
    
    # Extract the player names
    player_name_entries = mvpSoup.findAll("td", {"data-stat" : "player"});
    player_name_list = [];
    for item in player_name_entries:
        player_name_list.append(str(item.contents[0].contents[0]));
    
    # Extract the first place votes earned
    votes_first_entries = mvpSoup.findAll("td", {"data-stat" : "votes_first"});
    votes_first_list = [];
    for item in votes_first_entries:
        score = int(float(item.contents[0]));
        votes_first_list.append(score);
    
    # Extract the total MVP votes earned
    points_won_entries = mvpSoup.findAll("td", {"data-stat" : "points_won"});
    points_won_list = [];
    for item in points_won_entries:
        score = int(float(item.contents[0]));
        points_won_list.append(score);
    
    # This just checks that nothing went wrong in extracting the data
    # makes sure the lengths are the same
    if len(player_name_list)!=len(points_won_list):
        print('player_name_list not same length as points_won_list');
        return 'Failed'
    
    mvpVotesData=pd.DataFrame(data={'player': player_name_list, 'votes_first': votes_first_list,'points_won': points_won_list}).set_index('player');
    mvpVotesData.to_csv('MVP_Data/mvpVotes'+str(year)+'.csv', index=False);
    return mvpVotesData;

In [32]:
def scrapeAdvancedStats(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_advanced.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_advanced_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:29];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    advData=pd.DataFrame(data=raw_dict).set_index('player');
    #possData.to_csv('seasonData'+str(year)+'.csv', index=False);
    advData.to_csv('Advanced_Data/advancedStats'+str(year)+'.csv', index=False);
    return advData;

In [33]:
def scrapeSeasonStatsPerG(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_game.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_game_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:30];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Game_Data/seasonData'+str(year)+'_per_g.csv', index=False);
    return possData;

In [34]:
def scrapeSeasonStatsPerMin(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_minute.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_minute_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:29];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Game_Data/seasonData'+str(year)+'_per_min.csv', index=False);
    return possData;

In [35]:
def scrapeSeasonStats(year):
    address_string='https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_per_poss.html';
    print(address_string);
    page = requests.get(address_string);
    soup = BeautifulSoup(page.text, 'html.parser');
    
    # First, restrict to per_poss_stats table
    perstatsSoup = soup.find(id='all_per_poss_stats');
    
    # Identify the header id's
    data_stat_entries = perstatsSoup.findAll("th")[0:32];
    #data_stat_entries[1].attrs #this lists the attributes associated with the HTML tag
    data_stats = [];
    for data_stat in data_stat_entries:
        data_stats.append(str(data_stat.attrs['data-stat']));
    
    raw_dict = {};
    for data_stat in data_stats[1::]:
        #print('Working on '+data_stat);
        given_data_stat_entries = perstatsSoup.findAll("td", {"data-stat" : data_stat});
        given_data_stat_list = [];
        for item in given_data_stat_entries:
            #some data entries are actually hyperlinks
            #I need the deepest content, which I get using descendants method
            val = list(item.descendants);
            if len(val)==0:
                val = float('NaN')
            elif len(val)==1:
                val = val[0];
            else:
                val = val[1]
            given_data_stat_list.append(val);
        #append to dictionary using update method
        raw_dict.update({data_stat:given_data_stat_list});
    
    #build the dataframe using the dictonary containing the scraped data
    possData=pd.DataFrame(data=raw_dict).set_index('player');
    possData.to_csv('Per_Poss_Data/seasonData'+str(year)+'.csv', index=False);
    return possData;

# Merge dataframes

In [36]:
#Merge season stats and MVP votes dataframes
def mergeSSandMVP(ssdf,mvpdf):
    mergedpd = pd.merge(ssdf,mvpdf, on= 'player', how ='outer');
    mergedpd['votes_first']=mergedpd['votes_first'].fillna(0);
    mergedpd['points_won']=mergedpd['points_won'].fillna(0);
    return mergedpd;

def mergeASandSS(ssdf,advdf):
    mergedpd = pd.merge(ssdf, advdf, on='player', how='outer');
    return mergedpd;

def mergeWithPerG(file, sspgdf):
    mergedpd = pd.merge(file, sspgdf, on='player', how='outer');
    return mergedpd;

# Loop over years and data sets

In [38]:
#Loop over years, and save resulting dataframe as CSV file
yearList = range(1981,2020,1);
for year in yearList:
    #scrape the data from webpages
    ssdf = scrapeSeasonStats(year);
    advdf = scrapeAdvancedStats(year);
    mvpdf = scrapeMVP(year);
    sspgdf = scrapeSeasonStatsPerG(year);
    sspmdf = scrapeSeasonStatsPerMin(year);
    
    #merge the data
    df = mergeASandSS(ssdf,advdf);
    pf = mergeSSandMVP(df, mvpdf); 
    lf = mergeWithPerG(pf, sspgdf); 
    zf = mergeWithPerG(lf, sspmdf)
    
    zf.to_csv('Raw_Data/data_'+str(year)+'.csv', index=True);

https://www.basketball-reference.com/leagues/NBA_1981_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1981_advanced.html
https://www.basketball-reference.com/awards/awards_1981.html
https://www.basketball-reference.com/leagues/NBA_1981_per_game.html
https://www.basketball-reference.com/leagues/NBA_1981_per_minute.html
https://www.basketball-reference.com/leagues/NBA_1982_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1982_advanced.html
https://www.basketball-reference.com/awards/awards_1982.html
https://www.basketball-reference.com/leagues/NBA_1982_per_game.html
https://www.basketball-reference.com/leagues/NBA_1982_per_minute.html
https://www.basketball-reference.com/leagues/NBA_1983_per_poss.html
https://www.basketball-reference.com/leagues/NBA_1983_advanced.html
https://www.basketball-reference.com/awards/awards_1983.html
https://www.basketball-reference.com/leagues/NBA_1983_per_game.html
https://www.basketball-reference.com/leagues/NBA_1983_per_minute.

https://www.basketball-reference.com/leagues/NBA_2005_per_game.html
https://www.basketball-reference.com/leagues/NBA_2005_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2006_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2006_advanced.html
https://www.basketball-reference.com/awards/awards_2006.html
https://www.basketball-reference.com/leagues/NBA_2006_per_game.html
https://www.basketball-reference.com/leagues/NBA_2006_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2007_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2007_advanced.html
https://www.basketball-reference.com/awards/awards_2007.html
https://www.basketball-reference.com/leagues/NBA_2007_per_game.html
https://www.basketball-reference.com/leagues/NBA_2007_per_minute.html
https://www.basketball-reference.com/leagues/NBA_2008_per_poss.html
https://www.basketball-reference.com/leagues/NBA_2008_advanced.html
https://www.basketball-reference.com/awards/awards_2008.

In [8]:
yearList = range(1981,2020,1);
categories = ['player','pos_x','age_x','team_id_x','g_x','mp_x','fg_per_poss','fga_per_poss','fg_pct_x'
              ,'fg3_per_poss','fg3a_per_poss','fg3_pct_x','fg2_per_poss','fg2a_per_poss','fg2_pct_x','ft_per_poss'
              ,'fta_per_poss','ft_pct_x','orb_per_poss','drb_per_poss','trb_per_poss','ast_per_poss','stl_per_poss'
              ,'blk_per_poss','tov_per_poss','pf_per_poss','pts_per_poss','off_rtg','def_rtg','per','ts_pct'
              ,'fg3a_per_fga_pct','fta_per_fga_pct','orb_pct','drb_pct','trb_pct','ast_pct','stl_pct','blk_pct'
              ,'tov_pct','usg_pct','ws-dum','ows','dws','ws','ws_per_48','bpm-dum','obpm','dbpm','bpm','vorp'
              ,'votes_first','points_won','mp_per_g','fg_per_g','fga_per_g','fg3_per_g','fg3a_per_g'
              ,'fg2_per_g','fg2a_per_g','efg_pct','ft_per_g','fta_per_g','orb_per_g','drb_per_g','trb_per_g'
              ,'ast_per_g','stl_per_g','blk_per_g','tov_per_g','pf_per_g','pts_per_g','fg_per_mp','fga_per_mp'
              ,'fg3_per_mp','fg3a_per_mp','fg2_per_mp','fg2a_per_mp','ft_per_mp','fta_per_mp','orb_per_mp'
              ,'drb_per_mp','trb_per_mp','ast_per_mp','stl_per_mp','blk_per_mp','tov_per_mp','pf_per_mp','pts_per_mp']
for year in yearList: 
    file2 = pd.read_csv('Raw_Data/data_' + str(year) + '.csv');
    file2 = file2[categories]
    file2 = file2.drop_duplicates(subset=['player'])
    #file2 = file2.drop(file2[file2.mp_x <1000].index)
    for i in range(4,len(categories)):
        cat = file2[[categories[i]]]
        max_cat = np.amax(cat)
        max_val = max_cat.values
        file2[[categories[i]]] = file2[[categories[i]]]/max_val

    
    pd.DataFrame(file2.to_csv('Cut_Data/data_'+str(year)+'_Norm.csv', index = False));
    
    
    

In [3]:
yearList = range(1982,2017,1);
file1 = pd.read_csv('Cut_Data/data_1981_Norm.csv')
for year in yearList: 
    file2 = pd.read_csv('Cut_Data/data_' + str(year) + '_Norm.csv');
    file1 = pd.concat([file1, file2]); 
    print(file1)
pd.DataFrame(file1).to_csv("DataTrainSetFull_Norm.csv", index=False)

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1            Alvan Adams     C     26       PHO  0.892857  0.601112   
2         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
3           James Bailey    PF     23       SEA  0.976190  0.743049   
4           Greg Ballard    SF     26       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
198         Sly Williams    SF     24       NYK  0.714286  0.447616   
199      Bill Willoughby    SF     24       HOU  0.821429  0.434079   
200        Brian Winters    SG     29       MIL  0.726190  0.538258   
201         Mike Woodson    SG     23       TOT  0.988095  0.685992   
202    Orlando Woolridge    SF     22       CHI  0.892857  0.349617   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...    0.670000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1            Alvan Adams     C     26       PHO  0.892857  0.601112   
2         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
3           James Bailey    PF     23       SEA  0.976190  0.743049   
4           Greg Ballard    SF     26       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
205        Randy Wittman    SF     29       TOT  0.780488  0.344086   
206             Joe Wolf     C     24       LAC  0.804878  0.445469   
207         Mike Woodson    SG     30       HOU  0.987805  0.694009   
208    Orlando Woolridge    SF     29       LAL  0.902439  0.458065   
209         James Worthy    SF     27       LAL  0.987805  0.909370   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...    0.670000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1            Alvan Adams     C     26       PHO  0.892857  0.601112   
2         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
3           James Bailey    PF     23       SEA  0.976190  0.743049   
4           Greg Ballard    SF     26       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
221        Walt Williams    SF     24       SAC  0.939024  0.814936   
222         Kevin Willis    PF     32       TOT  0.817073  0.711098   
223           David Wood    PF     30       GSW  0.951220  0.397501   
224     Haywoode Workman    PG     29       IND  0.841463  0.305861   
225       Sharone Wright     C     22       PHI  0.963415  0.608152   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...    0.670000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1            Alvan Adams     C     26       PHO  0.892857  0.601112   
2         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
3           James Bailey    PF     23       SEA  0.976190  0.743049   
4           Greg Ballard    SF     26       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
237        Walt Williams    SF     30       HOU  0.867470  0.459773   
238   Corliss Williamson    SF     27       TOT  0.831325  0.489689   
239         Kevin Willis     C     38       TOT  0.939759  0.531513   
240    Metta World Peace    SF     21       CHI  0.915663  0.686320   
241      Lorenzen Wright     C     25       ATL  0.855422  0.577403   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...    0.670000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C     33       LAL  0.952381  0.870939   
1            Alvan Adams     C     26       PHO  0.892857  0.601112   
2         Tiny Archibald    PG     32       BOS  0.952381  0.825285   
3           James Bailey    PF     23       SEA  0.976190  0.743049   
4           Greg Ballard    SF     26       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
249      Marvin Williams    PF     19       ATL  0.963415  0.576832   
250          Mo Williams    PG     23       MIL  0.707317  0.452423   
251         Qyntel Woods    SF     24       NYK  0.597561  0.299350   
252    Metta World Peace    SF     26       TOT  0.682927  0.652187   
253      Lorenzen Wright     C     30       MEM  0.951220  0.499113   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...        0.67 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C   33.0       LAL  0.952381  0.870939   
1            Alvan Adams     C   26.0       PHO  0.892857  0.601112   
2         Tiny Archibald    PG   32.0       BOS  0.952381  0.825285   
3           James Bailey    PF   23.0       SEA  0.976190  0.743049   
4           Greg Ballard    SF   26.0       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
251        Dorell Wright    SF   25.0       GSW  0.987952  0.975209   
252           Nick Young    SG   25.0       WAS  0.771084  0.630307   
253            Sam Young    SF   25.0       MEM  0.939759  0.488689   
254       Thaddeus Young    PF   22.0       PHI  0.987952  0.661605   
255        Manu Gin�bili   NaN    NaN       NaN       NaN       NaN   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...    0.670000 

                  player pos_x  age_x team_id_x       g_x      mp_x  \
0    Kareem Abdul-Jabbar     C   33.0       LAL  0.952381  0.870939   
1            Alvan Adams     C   26.0       PHO  0.892857  0.601112   
2         Tiny Archibald    PG   32.0       BOS  0.952381  0.825285   
3           James Bailey    PF   23.0       SEA  0.976190  0.743049   
4           Greg Ballard    SF   26.0       WSB  0.976190  0.763828   
..                   ...   ...    ...       ...       ...       ...   
254          Tony Wroten    PG   20.0       PHI  0.867470  0.565343   
255           Nick Young    SG   28.0       LAL  0.771084  0.579757   
256       Thaddeus Young    PF   25.0       PHI  0.951807  0.870596   
257          Cody Zeller     C   21.0       CHA  0.987952  0.453555   
258         Tyler Zeller     C   24.0       CLE  0.843373  0.336003   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.839744      0.683582  0.856716      0.000000  ...     0.67000 

In [10]:
yearList = range(2016,2020,1);
file1 = pd.read_csv('Cut_Data/data_2015_Norm.csv')
for year in yearList: 
    file2 = pd.read_csv('Cut_Data/data_' + str(year) + '_Norm.csv');
    file1 = pd.concat([file1, file2]); 
    print(file1)
pd.DataFrame(file1).to_csv("DataTestSetFull_Norm.csv", index=False)

                player pos_x  age_x team_id_x       g_x      mp_x  \
0           Quincy Acy    PF     24       NYK  0.819277  0.431734   
1         Steven Adams     C     21       OKC  0.843373  0.594096   
2        Arron Afflalo    SG     29       TOT  0.939759  0.839316   
3    LaMarcus Aldridge    PF     29       POR  0.855422  0.842670   
4          Lavoy Allen     C     25       IND  0.759036  0.358940   
..                 ...   ...    ...       ...       ...       ...   
269    Marvin Williams    PF     29       CHO  0.987805  0.748160   
270    Justise Winslow    SF     19       MIA  0.951220  0.714240   
271         Nick Young    SG     30       LAL  0.658537  0.330560   
272     Thaddeus Young    PF     27       BRK  0.890244  0.770240   
273        Cody Zeller     C     23       CHO  0.890244  0.567680   

     fg_per_poss  fga_per_poss  fg_pct_x  fg3_per_poss  ...  fta_per_mp  \
0       0.452555      0.421875  0.646479      0.132075  ...    0.264706   
1       0.445255     