In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import time

In [3]:
def pct_stf(string):
    """Convert a percentage given as text (e.g. '53.6') to a fraction rounded to 3 decimals.

    Raises ValueError if the text cannot be parsed as a float.
    """
    percent = float(string)
    fraction = percent / 100
    return round(fraction, 3)

#function scrapes a college player's performance statistics from their most recent season from sports-reference.com...
#if the most recent season is unavailable, it scrapes the player's performance statistics averaged across...
#their college basketball career.
def get_player_data(first, last):
    """Scrape a college player's profile and summary statistics from sports-reference.com.

    The player page is looked up at '<first>-<last>-1.html' and, when that page has no
    player info block, at '<first>-<last>-2.html'. Statistics are read from the first <p>
    of each cell in the 'stats_pullout' box (most recent season); if any of those values
    cannot be parsed as a number, the second <p> of every cell is used instead (career).

    Args:
        first: player's first name as used in the page URL (lower-cased before use).
        last: player's last name as used in the page URL (lower-cased before use).

    Returns:
        dict with keys 'fullname', 'position', 'height', 'weight', 'gp', 'ppg', 'rpg',
        'apg', 'fgpct', 'fg3pct', 'ftpct', 'efgpct'.

    Raises:
        RuntimeError: if a request fails, neither candidate page contains the player
            info block, or the statistics box cannot be read.
    """
    soup, info = _find_player_page(first, last)

    #scrapes player's position, height, and weight.
    #the first 19 characters of the first paragraph are dropped; presumably this is the
    #'Position:' label plus padding -- TODO confirm against the live page markup.
    position = info.p.text[19:]
    position = position.replace('\n\n\n\n', '')
    info_list = info.findAll('p')
    size_spans = info_list[1].findAll('span')
    height = size_spans[0].text
    weight = size_spans[1].text

    pullout = soup.find('div', attrs={'class': 'stats_pullout'})
    try:
        try:
            #first <p> of each cell: most recent season.
            stats = _read_pullout_stats(pullout, 0)
        except ValueError:
            #a value that is not numeric triggers the career fallback for ALL stats.
            print('Could not find most recent season stats. Retrieving career stats.')
            stats = _read_pullout_stats(pullout, 1)
    except (AttributeError, IndexError, ValueError) as exc:
        #fail here with a clear message instead of hitting an unbound-variable error below.
        raise RuntimeError(
            f'Statistics scraping error. Could not read stats for {first} {last}.'
        ) from exc

    #put all scraped information into a dictionary and return the dictionary.
    player_dict = {'fullname': first + ' ' + last, 'position': position,
                   'height': height, 'weight': weight}
    player_dict.update(stats)

    return player_dict


def _find_player_page(first, last):
    """Return (soup, meta_div) for the first candidate player page that has the info block."""
    slug = first.lower() + '-' + last.lower()
    status = None

    for suffix in ('1', '2'):
        url = 'https://www.sports-reference.com/cbb/players/' + slug + '-' + suffix + '.html'
        try:
            #a timeout keeps a stalled connection from hanging the notebook indefinitely.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            raise RuntimeError(f'URL error. Request to {url} failed.') from exc

        status = response.status_code
        soup = BeautifulSoup(response.text, 'html.parser')
        header = soup.find('div', attrs={'id': 'info', 'class': 'players'})
        info = header.find('div', attrs={'id': 'meta'}) if header is not None else None
        if info is not None:
            return soup, info

    raise RuntimeError(
        f'URL error. No player page found for {first} {last} (last HTTP status: {status}).'
    )


def _read_pullout_stats(pullout, p_index):
    """Parse the eight summary stats, taking the p_index-th <p> of each stat cell."""
    basic_cells = pullout.find('div', attrs={'class': 'p1'}).findAll('div')
    shooting_cells = pullout.find('div', attrs={'class': 'p2'}).findAll('div')

    def cell_text(cells, i):
        return cells[i].findAll('p')[p_index].text

    #key order matches the order of the returned player dictionary.
    return {'gp': int(cell_text(basic_cells, 0)),
            'ppg': float(cell_text(basic_cells, 1)),
            'rpg': float(cell_text(basic_cells, 2)),
            'apg': float(cell_text(basic_cells, 3)),
            'fgpct': pct_stf(cell_text(shooting_cells, 0)),
            'fg3pct': pct_stf(cell_text(shooting_cells, 1)),
            'ftpct': pct_stf(cell_text(shooting_cells, 2)),
            'efgpct': pct_stf(cell_text(shooting_cells, 3))}

In [7]:
#uses the get_player_data() function for the first 10 players selected in the 2024 NBA...
#draft that played college basketball in the prior season.
#the retrieved data is collected as a list of per-player dictionaries and converted into a...
#dataframe.

#columns kept in the dataframe, in display order.
stat_columns = ['fullname', 'ppg', 'rpg', 'apg', 'fgpct', 'fg3pct']

# Reed Sheppard was selected 3rd in the 2024 NBA draft
# Stephon Castle was selected 4th in the 2024 NBA draft
# Donovan Clingan was selected 7th in the 2024 NBA draft
# Rob Dillingham was selected 8th in the 2024 NBA draft
# Zach Edey was selected 9th in the 2024 NBA draft
# Cody Williams was selected 10th in the 2024 NBA draft
# Devin Carter was selected 13th in the 2024 NBA draft
# Carlton 'Bub' Carrington was selected 14th in the 2024 NBA draft
# Kel'el Ware was selected 15th in the 2024 NBA draft
# Jared McCain was selected 16th in the 2024 NBA draft

top_10_college_drafted = [{'first': 'Reed', 'last': 'Sheppard'},
                          {'first': 'Stephon', 'last': 'Castle'},
                          {'first': 'Donovan', 'last': 'Clingan'},
                          {'first': 'Rob', 'last': 'Dillingham'},
                          {'first': 'Zach', 'last': 'Edey'},
                          {'first': 'Cody', 'last': 'Williams'},
                          {'first': 'Devin', 'last': 'Carter'},
                          {'first': 'Carlton', 'last': 'Carrington'},
                          {'first': 'Kelel', 'last': 'Ware'},
                          {'first': 'Jared', 'last': 'McCain'}]

#one record per player; avoids naming a variable `dict`, which would shadow the builtin.
player_rows = []
last_index = len(top_10_college_drafted) - 1

for index, player in enumerate(top_10_college_drafted):
    player_dict = get_player_data(player['first'], player['last'])
    player_rows.append({key: player_dict[key] for key in stat_columns})

    #sidesteps server requests limit; no wait is needed after the final request.
    if index < last_index:
        lag = np.random.uniform(low=7, high=15)
        print(f'waiting {round(lag, 1)} seconds...')
        time.sleep(lag)

df = pd.DataFrame(player_rows, columns=stat_columns)
df

waiting 11.3 seconds...
waiting 14.3 seconds...
waiting 10.0 seconds...
waiting 7.1 seconds...
waiting 9.5 seconds...
waiting 7.3 seconds...
Could not find most recent season stats. Retrieving career stats.
waiting 7.6 seconds...
waiting 12.9 seconds...
waiting 9.4 seconds...
waiting 9.2 seconds...


Unnamed: 0,fullname,ppg,rpg,apg,fgpct,fg3pct
0,Reed Sheppard,12.5,4.1,4.5,0.536,0.521
1,Stephon Castle,11.1,4.7,2.9,0.472,0.267
2,Donovan Clingan,13.0,7.4,1.5,0.639,0.25
3,Rob Dillingham,15.2,2.9,3.9,0.475,0.444
4,Zach Edey,25.2,12.2,2.0,0.623,0.5
5,Cody Williams,11.9,3.0,1.6,0.552,0.415
6,Devin Carter,14.3,3.4,1.2,0.439,0.402
7,Carlton Carrington,13.8,5.2,4.1,0.412,0.322
8,Kelel Ware,15.9,9.9,1.5,0.586,0.425
9,Jared McCain,14.3,5.0,1.9,0.462,0.414


In [9]:
#exports the scraped dataframe to an excel file, leaving out the dataframe index column.
df.to_excel(excel_writer='2024_drafted_college_players_v2.xlsx', index=False)