In [9]:
from urllib.request import urlopen
import time
from bs4 import BeautifulSoup, Comment
import requests
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
from scipy.spatial.distance import cdist
from datetime import date
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides the warning rather than fixing the assignments that trigger it.
pd.options.mode.chained_assignment = None  # default='warn'

In [10]:
def mvp_scraper(url):
    """Scrape the first block of data rows on a page into a DataFrame.

    The page is parsed for every ``<tr>`` element. The second ``<tr>`` supplies
    the column headers (its ``<th>`` texts) and the following ``<tr>`` elements
    supply the data (their ``<td>`` texts). Collection stops at the first row
    that has no ``<td>`` cells, so only the leading run of data rows is kept.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        One row per collected table row; every value is the raw cell text.
        The first header is skipped because its cell is a ``<th>`` and is
        therefore not part of the ``<td>`` data.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as html:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so results could differ
        # between machines. 'html.parser' matches the other scrapers here.
        stats_page = BeautifulSoup(html, 'html.parser')

    table_rows = stats_page.findAll('tr')

    # Collect table headers from the second <tr>
    column_headers = [cell.getText() for cell in table_rows[1].findAll('th')]

    # Keep data rows until the first row without any <td> cells
    mvp = []
    for table_row in table_rows[2:]:
        cells = [col.getText() for col in table_row.findAll('td')]
        if not cells:
            break
        mvp.append(cells)

    return pd.DataFrame(mvp, columns=column_headers[1:])

In [11]:
def mvp_compiler(start_year, end_year):
    """Scrape and combine the MVP voting tables for a range of seasons.

    Parameters
    ----------
    start_year : int
        First season to scrape.
    end_year : int
        Last season to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame
        All seasons stacked with a fresh 0..n-1 index. A ``Season`` column is
        inserted as the third column, blanks are replaced by 0, and every
        column from the fifth onward is converted to float.

    Raises
    ------
    ValueError
        If ``end_year`` is earlier than ``start_year`` (nothing to scrape).
    """
    year_range = list(range(start_year, end_year + 1))
    total = len(year_range)
    if total == 0:
        raise ValueError('end_year must not be earlier than start_year')

    # Keep the per-season frames in a list. Writing them into vars()/locals()
    # only works by accident on older interpreters and fails on Python 3.13+,
    # where each locals() call returns an independent snapshot.
    season_frames = []
    for completed, year in enumerate(year_range, start=1):
        url = 'https://www.basketball-reference.com/awards/awards_{}.html#mvp'.format(year)
        df = mvp_scraper(url)
        # Add Season column in the 3rd position
        df.insert(2, 'Season', year)
        season_frames.append(df)
        # Pause between requests
        time.sleep(1)
        # Report progress from the number of seasons finished, not from
        # `year % total`, which fired at unrelated years.
        if 2 * completed == total:
            print('Compiled 50% of {} season'.format(total))

        if completed == total:
            print('Compiled 100% of {} season'.format(total))

    # Compile all the seasons into one df with a clean 0..n-1 index
    mvp_df = pd.concat(season_frames, ignore_index=True)

    # Fill 0s for players who did not shoot any 3s
    mvp_df = mvp_df.fillna(value='')
    mvp_df = mvp_df.replace('', 0)
    # Convert all statistical columns into floats. Assign by column label:
    # the `.iloc[:, 4:] = ...` form is the statement pandas warned about when
    # this notebook ran.
    stat_columns = mvp_df.columns[4:]
    mvp_df[stat_columns] = mvp_df[stat_columns].astype(float)
    return mvp_df

In [5]:
# Scrape and stack the MVP voting tables for the 2013 through 2022 seasons
# (mvp_compiler makes one request per season), then display the result.
mvp_df = mvp_compiler(start_year = 2013, end_year = 2022)
mvp_df

Compiled 50% of 10 season
Compiled 100% of 10 season


  mvp_df.iloc[:, 4:] = mvp_df.iloc[:, 4:].astype(float)


Unnamed: 0,Player,Age,Season,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,LeBron James,28,2013,MIA,120.0,1207.0,1210.0,0.998,76.0,37.9,26.8,8.0,7.3,1.7,0.9,0.565,0.406,0.753,19.3,0.322
1,Kevin Durant,24,2013,OKC,0.0,765.0,1210.0,0.632,81.0,38.5,28.1,7.9,4.6,1.4,1.3,0.510,0.416,0.905,18.9,0.291
2,Carmelo Anthony,28,2013,NYK,1.0,475.0,1210.0,0.393,67.0,37.0,28.7,6.9,2.6,0.8,0.5,0.449,0.379,0.830,9.5,0.184
3,Chris Paul,27,2013,LAC,0.0,289.0,1210.0,0.239,70.0,33.4,16.9,3.7,9.7,2.4,0.1,0.481,0.328,0.885,13.9,0.287
4,Kobe Bryant,34,2013,LAL,0.0,184.0,1210.0,0.152,78.0,38.6,27.3,5.6,6.0,1.4,0.3,0.463,0.324,0.839,10.9,0.174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,Stephen Curry,33,2022,GSW,0.0,4.0,1000.0,0.004,64.0,34.5,25.5,5.2,6.3,1.3,0.4,0.437,0.380,0.923,8.0,0.173
126,Chris Paul,36,2022,PHO,0.0,2.0,1000.0,0.002,65.0,32.9,14.7,4.4,10.8,1.9,0.3,0.493,0.317,0.837,9.4,0.210
127,DeMar DeRozan,32,2022,CHI,0.0,1.0,1000.0,0.001,76.0,36.1,27.9,5.2,4.9,0.9,0.3,0.504,0.352,0.877,8.8,0.154
128,Kevin Durant,33,2022,BRK,0.0,1.0,1000.0,0.001,55.0,37.2,29.9,7.4,6.4,0.9,0.9,0.518,0.383,0.910,8.4,0.198


In [6]:
# Save the compiled MVP voting results to CSV, leaving out the index column.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that directory exists. Consider a path relative to a configurable data dir.
mvp_df.to_csv('/Users/yushunli/Documents/Data Science/NBA Projects/MVP Predictor/mvp_results_full.csv', index = False)

In [14]:
# Load the team lookup table. team_stats_cleaner uses it as its default `mapping`,
# merging on 'Team' and selecting 'Tm' afterwards, so this cell must run before
# that function is defined.
# NOTE(review): presumably this file holds 'Team' (full name) and 'Tm' (abbreviation)
# columns — confirm against the CSV.
# NOTE(review): absolute, machine-specific path; the cell only runs where it exists.
team_name = pd.read_csv('/Users/yushunli/Documents/Data Science/NBA Projects/MVP Predictor/team_names.csv')

In [16]:
def standings_scraper(url):
    """Scrape the expanded standings table from a standings page.

    The table with id ``expanded_standings`` is read from an HTML comment
    inside the ``all_expanded_standings`` div, so the comment is extracted and
    parsed a second time.

    Parameters
    ----------
    url : str
        Address of the standings page.

    Returns
    -------
    pandas.DataFrame
        The ``' Team'`` and ``' Overall'`` columns of the table, as raw text.
        Column names are ``'<over header> <header>'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the expected div, comment or table is not present on the page.
    """
    response = requests.get(url)
    # Stop on an HTTP error status instead of parsing an error page and
    # failing later with an unrelated AttributeError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table within the commented section
    commented_div = soup.find('div', {'id': 'all_expanded_standings'})
    if commented_div is None:
        raise ValueError(f"No 'all_expanded_standings' div found at {url}")
    comment = next(
        (elem for elem in commented_div.children if isinstance(elem, Comment)),
        None,
    )
    if comment is None:
        raise ValueError(f"No commented-out table found in 'all_expanded_standings' at {url}")
    table_soup = BeautifulSoup(str(comment), 'html.parser')
    table = table_soup.find('table', {'id': 'expanded_standings'})
    if table is None:
        raise ValueError(f"No 'expanded_standings' table found at {url}")

    # Extract the group headers (over_header) and the per-column headers
    over_header_cells = table.find_all('th', {'class': 'over_header'})
    headers = [header.get_text(strip=True) for header in table.find_all('th', {'scope': 'col'})]

    # Repeat each over_header once per column it spans
    combined_headers = []
    for cell in over_header_cells:
        combined_headers.extend([cell.get_text(strip=True)] * int(cell['colspan']))

    final_headers = [f'{over_header} {header}' for over_header, header in zip(combined_headers, headers)]
    # Drop the first header: its column is not among the <td> cells read below
    final_headers.pop(0)

    # Extract the data from each row that has <td> cells
    data = []
    for row in table.find_all('tr'):
        row_data = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if row_data:
            data.append(row_data)

    # Create a DataFrame
    team_df = pd.DataFrame(data, columns=final_headers)
    return team_df[[' Team', ' Overall']]

In [17]:
def team_stats_cleaner(df, year, mapping = team_name):
    """Turn scraped standings into one (Tm, Season, Wins) row per team.

    Parameters
    ----------
    df : pandas.DataFrame
        Standings with a ``' Team'`` column and a ``' Overall'`` column whose
        text starts with the win count followed by ``-``.
    year : int
        Season written to the ``Season`` column.
    mapping : pandas.DataFrame
        Lookup table merged on ``'Team'``; the ``'Tm'`` column selected from
        the result is expected to come from it. Defaults to the module-level
        ``team_name`` table.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Tm', 'Season', 'Wins']``. ``Tm`` is missing for teams that
        are absent from ``mapping`` (left join).

    Raises
    ------
    KeyError
        If ``df`` has no ``' Team'`` column.
    """
    # Work on a renamed copy instead of editing `df` in place, so the caller's
    # frame is untouched and the function can be re-run on the same input.
    cleaned = df.rename(columns={' Team': 'Team'}, errors='raise')

    # Identify Wins: the leading digits before the dash. expand=False gives a
    # Series, so a plain column is assigned rather than a one-column frame.
    cleaned['Wins'] = cleaned[' Overall'].str.extract(r'^(\d+)-', expand=False).astype(int)

    # Add Year
    cleaned['Season'] = year

    # Join to Abbreviation
    return cleaned.merge(mapping, on = 'Team', how = 'left')[['Tm', 'Season', 'Wins']]

In [18]:
# Per-table scraping configuration, stored column-wise: position k of each list
# describes one page (a label, a URL template with a `{}` slot for the season
# year, and the id of the div that wraps the table).
bballref = dict(
    table_name=['player_counting', 'player_advanced'],
    url=[
        'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html',
        'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html',
    ],
    table_id=['div_per_game_stats', 'div_advanced_stats'],
)

# Same configuration as a DataFrame: one row per table to scrape.
bballref_tables = pd.DataFrame.from_dict(bballref)
bballref_tables

Unnamed: 0,table_name,url,table_id
0,player_counting,https://www.basketball-reference.com/leagues/N...,div_per_game_stats
1,player_advanced,https://www.basketball-reference.com/leagues/N...,div_advanced_stats


In [19]:
def bball_ref_scraper(url, year, table_id):
    """Scrape one stats table into a DataFrame of raw cell text.

    Parameters
    ----------
    url : str
        Page address; a ``{}`` placeholder, if present, is filled with ``year``.
    year : int
        Season substituted into ``url``.
    table_id : str
        ``id`` of the ``div`` with class ``table_container`` that wraps the table.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` in the table body. Columns are the ``<thead>``
        header texts minus the first. Rows without ``<td>`` cells are kept and
        come out as all-missing rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the table container is not present on the page.
    """
    scrape_url = url.format(year)

    response = requests.get(scrape_url)
    # Stop on an HTTP error status instead of parsing an error page and
    # failing later with an unrelated AttributeError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # find the table container div element with the given class and id
    table_div = soup.find('div', {'class': 'table_container', 'id': table_id})
    if table_div is None:
        raise ValueError(f"No table container with id '{table_id}' found at {scrape_url}")
    # find the table element inside the table container div
    table = table_div.find('table')

    # extract the table headers; the first one is dropped because its column
    # is not among the <td> cells collected below
    headers = [th.text for th in table.find('thead').find_all('th')]
    headers.pop(0)

    # extract the table rows and data
    rows = [
        [td.text for td in tr.find_all('td')]
        for tr in table.find('tbody').find_all('tr')
    ]

    return pd.DataFrame(rows, columns=headers)

In [20]:
def player_stats_cleaner(df):
    """Return one cleaned row per player from a scraped stats table.

    Steps, in order:
    1. drop every row that contains a missing value;
    2. replace empty-string cells with 0;
    3. drop the combined ``'TOT'`` rows;
    4. convert ``G`` to int and keep, for each player, the row with the most
       games.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped table with at least ``Player``, ``Tm`` and ``G`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per player, ordered by the ``Player`` group keys, keeping the
        original index labels.
    """
    # Drop rows with missing values on a new frame: dropna(inplace=True) would
    # silently change the caller's DataFrame.
    cleaned = df.dropna()
    # Fill 0s for players who did not shoot any 3s (blank cells are scraped as
    # ''). No fillna is needed: dropna already removed every missing value.
    cleaned = cleaned.replace('', 0)
    # Remove multiple team rows; copy so the assignment below targets an
    # independent frame rather than a filtered view.
    cleaned = cleaned[cleaned['Tm'] != 'TOT'].copy()
    cleaned['G'] = cleaned['G'].astype(int)
    return cleaned.loc[cleaned.groupby('Player')['G'].idxmax()]

In [21]:
def stats_collection(start_year, end_year, bballref_tables = bballref_tables):
    """Scrape player stats and team wins for a range of seasons.

    For each season the standings are scraped and reduced to wins per team.
    Every table listed in ``bballref_tables`` is then scraped, cleaned and
    merged on Player/Pos/Age/Tm/G, and the wins are joined on ``Tm`` and
    ``Season``.

    Parameters
    ----------
    start_year : int
        First season to scrape.
    end_year : int
        Last season to scrape (inclusive).
    bballref_tables : pandas.DataFrame
        One row per table to scrape, with ``url`` (template with a ``{}`` slot
        for the year) and ``table_id`` columns. Defaults to the module-level
        ``bballref_tables``.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked (index not reset). ``Season`` is the third column,
        every column from the fifth onward is float, and ``Wins`` comes from
        the standings join.

    Raises
    ------
    ValueError
        If ``end_year`` is earlier than ``start_year`` or ``bballref_tables``
        has no rows.
    """
    year_range = list(range(start_year, end_year + 1))
    total = len(year_range)
    if total == 0:
        raise ValueError('end_year must not be earlier than start_year')

    # Keep the per-season frames in a list. Writing them into vars()/locals()
    # only works by accident on older interpreters and fails on Python 3.13+,
    # where each locals() call returns an independent snapshot.
    season_frames = []
    for completed, year in enumerate(year_range, start=1):
        # Scrape Team Wins
        standings_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_standings.html'
        standings = standings_scraper(standings_url)
        team_df = team_stats_cleaner(df = standings, year = year)

        # Scrape Player Stats: Counting + Advanced
        merged_df = None
        for _, row in bballref_tables.iterrows():
            url = row['url'].format(year)
            df = bball_ref_scraper(url = url, year = year, table_id = row['table_id'])
            df_cleaned = player_stats_cleaner(df)
            # Pause between requests
            time.sleep(1)
            # Merge the dataframes: the first table seeds the result, so this
            # does not depend on the index labels of bballref_tables.
            if merged_df is None:
                merged_df = df_cleaned
            else:
                merged_df = pd.merge(merged_df, df_cleaned, on=["Player", "Pos", "Age", "Tm",
                                                                "G"], how="inner")
        if merged_df is None:
            raise ValueError('bballref_tables has no rows to scrape')

        # Drop the columns whose header is a non-breaking space
        merged_df = merged_df.drop(columns=['\xa0'])
        # Convert everything from the fifth column onward to float
        stat_columns = merged_df.columns[4:]
        merged_df[stat_columns] = merged_df[stat_columns].astype(float)
        # 'MP' exists in both tables; drop the right-hand copy, which carries
        # pandas' default '_y' merge suffix
        merged_df = merged_df.drop(columns = 'MP_y')
        # Add Season as the third column, then attach team wins
        merged_df.insert(2, 'Season', year)
        merged_df = merged_df.merge(team_df, on = ['Tm', 'Season'], how = 'left')
        season_frames.append(merged_df)

        # Report progress from the number of seasons finished, not from
        # `year % total`, which fired at unrelated years.
        if 2 * completed == total:
            print('Compiled 50% of {} season'.format(total))

        if completed == total:
            print('Compiled 100% of {} season'.format(total))

    # Compile all the seasons into one df
    stats_df = pd.concat(season_frames)
    return stats_df

In [24]:
# Scrape the 2023 season only and write the result as CSV, without the index,
# to a file named '2023_data' (relative path, no file extension).
stats_collection(start_year = 2023, end_year = 2023).to_csv('2023_data', index = False)

Compiled 100% of 1 season


Unnamed: 0,Player,Pos,Season,Age,Tm,G,GS,MP_x,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Wins
470,Stanley Umude,SG,2023,23,DET,1.0,0.0,2.0,0.0,1.0,...,40.0,0.0,0.0,0.0,0.626,15.9,32.7,48.6,0.0,17
508,Tyler Dorsey,SG,2023,26,DAL,3.0,0.0,2.7,1.3,1.7,...,28.3,0.1,0.0,0.1,0.367,16.5,1.6,18.0,0.0,38
399,Nikola Jokić,C,2023,27,DEN,69.0,69.0,33.7,9.4,14.8,...,27.2,11.2,3.8,14.9,0.308,8.5,4.5,13.0,8.8,53
252,Joel Embiid,C,2023,28,PHI,66.0,66.0,34.6,11.0,20.1,...,37.0,8.4,3.9,12.3,0.259,6.8,2.3,9.2,6.4,54
166,Giannis Antetokounmpo,PF,2023,28,MIL,63.0,63.0,32.1,11.2,20.3,...,38.8,4.9,3.7,8.6,0.204,5.8,2.7,8.5,5.4,58
283,Justin Champagnie,SF,2023,21,TOR,3.0,0.0,3.7,1.0,1.0,...,11.6,0.1,0.0,0.1,0.332,6.1,1.9,8.0,0.0,41
341,Luka Dončić,PG,2023,23,DAL,66.0,66.0,36.2,10.9,22.0,...,37.6,7.3,2.9,10.2,0.204,7.6,1.4,9.0,6.6,38
22,Anthony Davis,C,2023,29,LAL,56.0,54.0,34.0,9.7,17.2,...,28.4,5.5,3.4,9.0,0.226,4.6,1.7,6.3,4.0,43
247,Jimmy Butler,SF,2023,33,MIA,64.0,64.0,33.4,7.5,13.9,...,25.6,9.4,2.9,12.3,0.277,6.7,2.0,8.7,5.8,44
463,Shai Gilgeous-Alexander,PG,2023,24,OKC,68.0,68.0,35.5,10.4,20.3,...,32.8,8.4,3.0,11.4,0.226,5.8,1.5,7.3,5.6,40


In [26]:
# NOTE(review): `stats_23` is not assigned in any of the cells visible here, so on a
# fresh kernel (Restart & Run All) this line would raise NameError unless it is defined
# elsewhere. Presumably it held the result of stats_collection(2023, 2023) — confirm
# and assign it explicitly before this cell.
stats_23.to_csv('2023_player_data', index = False)