In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [2]:
def SoupFromURL(url, suppressOutput=True):

    if not suppressOutput:
        print(url)
    try:
        r = requests.get(url)
    except:
        return None

    return BeautifulSoup(r.text, "html5lib")

### Scraping player page for stats

The function below scrapes any given player url page for stats

In [3]:
def ncaa_stats_dict(url):
    player_page = SoupFromURL(str(url))
    
    #player profile info
    a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
    a_soup = BeautifulSoup(str(a),"html5lib")
    a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
    
    try:
        n = str(a_soup.text.split('\n')[1])
        name = n.replace("\t","")
    except:
        name = None
    
    try:
        p = str(a_soup.text.split('\n')[13])
        position = p.replace("  ","")
    except:
        position = None
    
    try:
        ft = float(str(a_soup.text.split('\n')[17])[2])*12
        inch = float(str(a_soup.text.split('\n')[17])[4])
        height = ft+inch
    except:
        height = None
    
    try:
        weight = str(a_soup.text.split('\n')[17])[7:12]
        weight = re.sub('[^0-9]','', weight)
    except:
        weight = None
    
    ### high school
    hs_list = []
    try:
        hs = str(a_soup.text.split('\n')[22])
        high_school = hs.replace("  High School: ","")
        hs_list.append(high_school)
    
        hs_2 = str(a_soup.text.split('\n')[26])
        high_school_2 = hs_2.replace("  High School: ","")
        hs_list.append(high_school_2)
        
        hs_list = [x for x in hs_list if not '  ' in x]
        hs_list.append(None)
    except:
        hs_list.append(None)
    
    while '' in hs_list:
        hs_list.remove('')
        
    ### college
    col_list = []
    try:
        c_4 = str(a_soup.text.split('\n')[18])
        college_4 = c_4.replace("  School: ","")
        college_4 = college_4.replace("  Schools: ","")
        col_list.append(college_4)
    
        c = str(a_soup.text.split('\n')[28])
        college = c.replace("  School: ","")
        college = college.replace("  Schools: ","")
        col_list.append(college)
    
        c_2 = str(a_soup.text.split('\n')[32])
        college_2 = c_2.replace("  School: ","")
        college_2 = college_2.replace("  Schools: ","")
        col_list.append(college_2)
    
        c_3 = str(a_soup.text.split('\n')[24])
        college_3 = c_3.replace("  School: ","")
        college_3 = college_3.replace("  Schools: ","")
        col_list.append(college_3)

        col_list = [x for x in col_list if not '  ' in x]
        col_list.append(None)
    except:
        col_list.append(None)

    while '' in col_list:
        col_list.remove('')
    
    #player stats
    stat_list = []
    try:
        p = player_page.find_all('div', attrs={'class':'stats_pullout'})[0]
        p_soup = BeautifulSoup(str(p),"html5lib")
        stat = p_soup.text.split('\n')[1::4][2:]
    
        #stat_list = []
        for s in stat:
            try:
                if s == str(""):
                    stat_list.append(None)
                else:
                    stat_list.append(float(s))
            except:
                stat_list.append(None)
    except:
        stat_list.extend([None,None,None,None,None,None,None,None,None,None])
    
    #season count and draft year
    season_count = player_page.find_all('div', attrs={'class':'overthrow table_container'})[0]
    season_count_soup = BeautifulSoup(str(season_count),"html5lib")
    season_count = season_count_soup.text.split('\n')
    season_count = season_count[33:37]
    season_count = [x for x in season_count if not 'Career' in x]
    season_count = [x for x in season_count if not '  ' in x]
    final_season_count = len(list(filter(None, season_count)))
    
    draft_year_list = list(filter(None, season_count))
    if len(draft_year_list) == 4:
        final_year = str(draft_year_list[3])[5:7]
    elif len(draft_year_list) == 3:
        final_year = str(draft_year_list[2])[5:7]
    elif len(draft_year_list) == 2:
        final_year = str(draft_year_list[1])[5:7]
    elif len(draft_year_list) == 1:
        final_year = str(draft_year_list[0])[5:7]

    if int(final_year[0]) in [0,1]:
        final_draft_year = int(final_year) + 2000
    elif int(final_year[0]) in [9,8,7,6,5,4,3,2]:
        final_draft_year = int(final_year) + 1900
    
    #creating player dictionary
    player_dict = {
        'player_name':name,
        'position':position,
        'height(inches)':height,
        'weight(lbs)':weight,
        'high_school':hs_list[0],
        'college':col_list[0],
        'draft_year':final_draft_year,
        'years_in_college':final_season_count,
        'games':stat_list[0],
        'points':stat_list[1],
        'rebounds':stat_list[2],
        'assists':stat_list[3],
        'fg_percent':stat_list[4],
        '3_fg_percent':stat_list[5],
        'free_throw_percent':stat_list[6],
        'effective_fg_percent':stat_list[7],
        'player_efficiency_rating':stat_list[8],
        'win_shares':stat_list[9]
    }
    return player_dict

In [4]:
ncaa_stats_dict('https://www.sports-reference.com/cbb/players/rasheed-wallace-1.html')

{'3_fg_percent': 25.0,
 'assists': 0.8,
 'college': 'UNC',
 'draft_year': 1995,
 'effective_fg_percent': 63.6,
 'fg_percent': 63.5,
 'free_throw_percent': 62.1,
 'games': 69.0,
 'height(inches)': 73.0,
 'high_school': None,
 'player_efficiency_rating': None,
 'player_name': 'Rasheed Wallace',
 'points': 13.0,
 'position': 'Forward',
 'rebounds': 7.4,
 'weight(lbs)': '225',
 'win_shares': None,
 'years_in_college': 2}

### Get index page urls

I noticed that there is a pattern to the player index page url's on sports-reference. I followed the pattern to get a list of url's.

In [5]:
import string
index_urls = []
for letter in string.ascii_lowercase:
    letter_page = 'https://www.sports-reference.com/cbb/players/{}-index.html'.format(letter)
    index_urls.append(letter_page)

index_urls

['https://www.sports-reference.com/cbb/players/a-index.html',
 'https://www.sports-reference.com/cbb/players/b-index.html',
 'https://www.sports-reference.com/cbb/players/c-index.html',
 'https://www.sports-reference.com/cbb/players/d-index.html',
 'https://www.sports-reference.com/cbb/players/e-index.html',
 'https://www.sports-reference.com/cbb/players/f-index.html',
 'https://www.sports-reference.com/cbb/players/g-index.html',
 'https://www.sports-reference.com/cbb/players/h-index.html',
 'https://www.sports-reference.com/cbb/players/i-index.html',
 'https://www.sports-reference.com/cbb/players/j-index.html',
 'https://www.sports-reference.com/cbb/players/k-index.html',
 'https://www.sports-reference.com/cbb/players/l-index.html',
 'https://www.sports-reference.com/cbb/players/m-index.html',
 'https://www.sports-reference.com/cbb/players/n-index.html',
 'https://www.sports-reference.com/cbb/players/o-index.html',
 'https://www.sports-reference.com/cbb/players/p-index.html',
 'https:

### Get player urls

Within each of the index pages, there was a pattern within the HTML code that references all the links on the page. I scraped all the links and discarded the ones that weren't relevant such as the links to schools or other parts of the site that weren't player pages.

In [6]:
def get_player_urls(index_url_link):

    player_urls = []
    index_page = SoupFromURL(index_url_link)
    index_names = index_page.find_all('p')
    index_soup = BeautifulSoup(str(index_names),"html5lib")
    links = index_soup('a', href=True)
    
    for l in links:
        try:
            player_urls.append('https://www.sports-reference.com' + l.attrs['href'])
        except:
            pass
    
    player_urls = [x for x in player_urls if not 'schools' in x]
    player_urls.pop()
    player_urls.pop()
    player_urls.pop()
    player_urls.pop()
    return player_urls

In [7]:
get_player_urls('https://www.sports-reference.com/cbb/players/x-index.html')

['https://www.sports-reference.com/cbb/players/jeff-xavier-1.html',
 'https://www.sports-reference.com/cbb/players/ioannis-xenakis-1.html',
 'https://www.sports-reference.com/cbb/players/aaron-xia-1.html',
 'https://www.sports-reference.com/cbb/players/ji-xiang-1.html',
 'https://www.sports-reference.com/cbb/players/oliver-xu-1.html',
 'https://www.sports-reference.com/cbb/players/tao-xu-1.html']

### Generate Stats Dataframe Of All Players On Index Page

Now that I have a function to scrape player pages for their stats and a list of index_urls with a function to get all the page urls, I decided to create another function below where you pass in the index url to generate a dataframe with player stats for all players listed on that page.

In [8]:
def generate_player_df(first_letter):
    
    index_url_link = 'https://www.sports-reference.com/cbb/players/{}-index.html'.format(first_letter)
    
    player_urls = []
    index_page = SoupFromURL(index_url_link)
    index_names = index_page.find_all('p')
    index_soup = BeautifulSoup(str(index_names),"html5lib")
    links = index_soup('a', href=True)
    
    for l in links:
        try:
            player_urls.append('https://www.sports-reference.com' + l.attrs['href'])
        except:
            pass
    
    player_urls = [x for x in player_urls if not 'schools' in x]
    player_urls.pop()
    player_urls.pop()
    player_urls.pop()
    player_urls.pop()
    
    player_stats_list=[]
    for url in player_urls:
        try:
            player_stats_list.append(ncaa_stats_dict(url))
            print(url)
        except:
            pass
    
    df = pd.DataFrame(player_stats_list)
    df = df[[
        'player_name',
        'position',
        'height(inches)',
        'weight(lbs)',
        'high_school',
        'college',
        'draft_year',
        'years_in_college',
        'games',
        'points',
        'rebounds',
        'assists',
        'fg_percent',
        '3_fg_percent',
        'free_throw_percent',
        'effective_fg_percent',
        'player_efficiency_rating',
        'win_shares'
    ]]
    
    return df

In [9]:
x_df = generate_player_df('x')

https://www.sports-reference.com/cbb/players/jeff-xavier-1.html
https://www.sports-reference.com/cbb/players/ioannis-xenakis-1.html
https://www.sports-reference.com/cbb/players/aaron-xia-1.html
https://www.sports-reference.com/cbb/players/ji-xiang-1.html
https://www.sports-reference.com/cbb/players/oliver-xu-1.html
https://www.sports-reference.com/cbb/players/tao-xu-1.html


In [10]:
x_df

Unnamed: 0,player_name,position,height(inches),weight(lbs),high_school,college,draft_year,years_in_college,games,points,rebounds,assists,fg_percent,3_fg_percent,free_throw_percent,effective_fg_percent,player_efficiency_rating,win_shares
0,Jeff Xavier,Guard,72.0,183,,Manhattan and Providence,2009,4,124.0,11.5,3.8,1.8,40.8,34.2,78.8,51.5,,9.5
1,Ioannis Xenakis,Center,84.0,213,,Delaware,2003,2,24.0,2.6,0.9,0.0,44.4,,43.8,44.4,,0.1
2,Aaron Xia,Forward,80.0,203,,Citadel,2006,3,73.0,2.6,2.1,0.1,45.7,30.8,69.2,47.0,,1.4
3,Ji Xiang,Forward,73.0,208,,Hawaii,2010,2,28.0,1.4,1.2,0.1,40.9,0.0,55.6,40.9,,0.3
4,Oliver Xu,Guard,74.0,170,Hong Kong International School,Rice,2016,1,11.0,0.5,0.2,0.1,33.3,33.3,50.0,41.7,-7.1,-0.1
5,Tao Xu,Center,73.0,277,Haverford School (PA),San Francisco,2014,2,36.0,2.6,1.1,0.2,42.5,,66.7,42.5,2.1,-0.2


In [None]:
player_page = SoupFromURL(str(url))

#player profile info
a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
a_soup = BeautifulSoup(str(a),"html5lib")
a = player_page.find_all('div', attrs={'class':'nothumb'})[0]

try:
    n = str(a_soup.text.split('\n')[1])
    name = n.replace("\t","")
except:
    name = None

try:
    p = str(a_soup.text.split('\n')[13])
    position = p.replace("  ","")
except:
    position = None

try:
    ft = float(str(a_soup.text.split('\n')[17])[2])*12
    inch = float(str(a_soup.text.split('\n')[17])[4])
    height = ft+inch
except:
    height = None

Code below to troubleshoot college column

In [None]:
player_page = SoupFromURL('https://www.sports-reference.com/cbb/players/jordan-aaberg-1.html')
    
a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
a_soup = BeautifulSoup(str(a),"html5lib")


col_list = []
try:
    c_4 = str(a_soup.text.split('\n')[18])
    college_4 = c_4.replace("  School: ","")
    college_4 = college_4.replace("  Schools: ","")
    col_list.append(college_4)
    
    c = str(a_soup.text.split('\n')[28])
    college = c.replace("  School: ","")
    college = college.replace("  Schools: ","")
    col_list.append(college)
    
    c_2 = str(a_soup.text.split('\n')[32])
    college_2 = c_2.replace("  School: ","")
    college_2 = college_2.replace("  Schools: ","")
    col_list.append(college_2)
    
    c_3 = str(a_soup.text.split('\n')[24])
    college_3 = c_3.replace("  School: ","")
    college_3 = college_3.replace("  Schools: ","")
    col_list.append(college_3)

        #final_college = list(filter(None, col_list))
        #final_college.append(None)
    col_list = [x for x in col_list if not '  ' in x]
    col_list.append(None)
except:
    col_list.append(None)

while '' in col_list:
    col_list.remove('')
    
col_list

In [None]:
player_page = SoupFromURL('https://www.sports-reference.com/cbb/players/_-ziegenhorn-1.html')

col_list_2 = []
a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
a_soup = BeautifulSoup(str(a),"html5lib")

col_5 = str(a_soup.text.split('\n')[18])

col_list_2.append(col_5)

col_list_2