In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [2]:
def SoupFromURL(url, suppressOutput=True, timeout=30):
    """Download ``url`` and parse the response body with the html5lib parser.

    The HTTP status code is not inspected: an error page returned by the
    server is parsed like any other response.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    suppressOutput : bool, optional
        When False, ``url`` is printed before the request is made.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        server cannot block the notebook forever.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed document, or None when the request itself fails.
    """
    if not suppressOutput:
        print(url)
    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only request-level failures (connection, timeout, bad URL, ...) are
        # treated as "no page"; KeyboardInterrupt and programming errors are
        # deliberately left to propagate.
        return None

    return BeautifulSoup(r.text, "html5lib")

def remove_values_from_list(the_list, val):
    """Return a new list with every item equal to ``val`` left out.

    ``the_list`` itself is not modified; the order of the kept items is
    preserved.
    """
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item)
    return kept

### Scraping player page for stats

The function below scrapes a single player page, given its URL, for profile information and statistics.

In [3]:
def _season_end_year(season_label):
    """Return the calendar year in which a 'YYYY-YY' season label ends, or None."""
    match = re.match(r'(\d{4})-(\d{2})', str(season_label))
    if match is None:
        return None
    start_year = int(match.group(1))
    # Take the century from the start year instead of guessing it from the
    # two-digit suffix, then roll over for labels such as '1999-00'.
    end_year = start_year - start_year % 100 + int(match.group(2))
    if end_year < start_year:
        end_year += 100
    return end_year


def _float_or_none(fields, index):
    """Return ``float(fields[index])``, or None if the slot is missing or not numeric."""
    try:
        return float(fields[index])
    except (IndexError, ValueError):
        return None


def ncaa_stats_dict(url):
    """Scrape one player page into a flat dictionary of profile data and stats.

    Parsing is positional: values are read from fixed offsets in the page
    text, and any individual field whose slot is missing or malformed is
    reported as None instead of aborting the whole scrape.

    Parameters
    ----------
    url : str
        Address of the player page; it is passed through ``str`` before
        being fetched with ``SoupFromURL``.

    Returns
    -------
    dict
        Keys: ``player_name``, ``position``, ``height_inches``,
        ``weight_lbs``, ``college``, ``draft_year``, ``years_in_college``,
        ``games``, ``minutes_per_game``, ``points``, ``rebounds``,
        ``assists``, ``steals``, ``blocks``, ``turnovers``, ``fg_percent``,
        ``3_fg_percent``, ``free_throw_percent``, ``effective_fg_percent``,
        ``player_efficiency_rating`` and ``win_shares``.
        ``draft_year`` is the ending year of the last season row found
        (None when no season row is found or its label is not 'YYYY-YY').

    Raises
    ------
    AttributeError
        If the page could not be downloaded (``SoupFromURL`` returned None).
    IndexError
        If the page has no ``nothumb`` div or no
        ``overthrow table_container`` div.
    """
    player_page = SoupFromURL(str(url))

    # --- player profile info -------------------------------------------
    a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
    a_soup = BeautifulSoup(str(a),"html5lib")
    n = a_soup.text.split('\n')
    n = remove_values_from_list(n,'')
    n = remove_values_from_list(n,'  ')
    # Drop label/decoration lines so the remaining lines sit at fixed indexes.
    removal = ['(','  Hometown','  High School', '  More','  Position']
    n = [x for x in n if not x.startswith(tuple(removal))]

    try:
        name = n[0].replace('\t','')
    except IndexError:
        name = None

    try:
        position = n[2].replace('  ','')
    except IndexError:
        position = None

    try:
        college = n[4].replace('  School: ','')
    except IndexError:
        college = None

    # NOTE(review): n[3] presumably looks like '  6-9, 220lb ...' (feet,
    # inches, weight at fixed character offsets) -- confirm against a page.
    try:
        ft = int(n[3][:3].replace(' ',''))*12
        inch = int(re.sub('[^0-9]','',n[3][3:6]))
        height = ft + inch
    except (IndexError, ValueError):
        height = None

    try:
        weight = int(re.sub('[^0-9]','',n[3][7:11]))
    except (IndexError, ValueError):
        weight = None

    # --- player stats (summary pullout) ---------------------------------
    try:
        p = player_page.find_all('div', attrs={'class':'stats_pullout'})[0]
        p_soup = BeautifulSoup(str(p),"html5lib")
        stat = p_soup.text.split('\n')[1::4][2:]
    except IndexError:
        # No pullout on the page: every summary stat becomes None below.
        stat = []

    stat_list = []
    for s in stat:
        try:
            stat_list.append(float(s))
        except ValueError:
            # Empty or non-numeric slot.
            stat_list.append(None)
    # The dictionary below reads indexes 0-9; pad so a short pullout yields
    # None values instead of an IndexError.
    stat_list.extend([None] * (10 - len(stat_list)))

    # --- season count and draft year ------------------------------------
    season_count = player_page.find_all('div', attrs={'class':'overthrow table_container'})[0]
    season_count_soup = BeautifulSoup(str(season_count),"html5lib")
    season_lines = season_count_soup.text.split('\n')[20:37]
    seasons = [
        x for x in season_lines
        if x and 'Career' not in x and '  ' not in x
    ]
    final_season_count = len(seasons)
    # The last remaining row is the final season, however many rows there are
    # (including zero, or more than four).
    final_draft_year = _season_end_year(seasons[-1]) if seasons else None

    # --- steals, blocks, turnovers and minutes played per game ----------
    additional_stats = player_page.find_all('td', attrs={'class':'right'})
    # str() of the result list joins the cells with ', ', which is what the
    # split(',') below relies on.
    additional_soup = BeautifulSoup(str(additional_stats),"html5lib")
    additional_fields = additional_soup.text.split(',')
    steals = _float_or_none(additional_fields, -5)
    blocks = _float_or_none(additional_fields, -4)
    turnovers = _float_or_none(additional_fields, -3)
    minutes = _float_or_none(additional_fields, -20)

    # --- creating player dictionary -------------------------------------
    player_dict = {
        'player_name':name,
        'position':position,
        'height_inches':height,
        'weight_lbs':weight,
        'college':college,
        'draft_year':final_draft_year,
        'years_in_college':final_season_count,
        'games':stat_list[0],
        'minutes_per_game':minutes,
        'points':stat_list[1],
        'rebounds':stat_list[2],
        'assists':stat_list[3],
        'steals':steals,
        'blocks':blocks,
        'turnovers':turnovers,
        'fg_percent':stat_list[4],
        '3_fg_percent':stat_list[5],
        'free_throw_percent':stat_list[6],
        'effective_fg_percent':stat_list[7],
        'player_efficiency_rating':stat_list[8],
        'win_shares':stat_list[9]
    }
    return player_dict

In [4]:
# Spot-check the scraper on a single player page (this performs a live HTTP request).
ncaa_stats_dict('https://www.sports-reference.com/cbb/players/larry-bird-1.html')

{'3_fg_percent': None,
 'assists': 4.6,
 'blocks': None,
 'college': 'Indiana State',
 'draft_year': 1979,
 'effective_fg_percent': None,
 'fg_percent': 53.3,
 'free_throw_percent': 82.2,
 'games': 94.0,
 'height_inches': 81,
 'minutes_per_game': None,
 'player_efficiency_rating': None,
 'player_name': 'Larry Bird',
 'points': 30.3,
 'position': 'Forward',
 'rebounds': 13.3,
 'steals': 0.9,
 'turnovers': 1.8,
 'weight_lbs': 220,
 'win_shares': None,
 'years_in_college': 3}

### Get index page urls

I noticed that the player index page URLs on sports-reference follow a pattern (one page per letter of the alphabet), so I used that pattern to generate the list of URLs.

In [5]:
import string

# One index page per lowercase ASCII letter, 'a' through 'z'.
index_urls = [
    'https://www.sports-reference.com/cbb/players/{}-index.html'.format(letter)
    for letter in string.ascii_lowercase
]

index_urls

['https://www.sports-reference.com/cbb/players/a-index.html',
 'https://www.sports-reference.com/cbb/players/b-index.html',
 'https://www.sports-reference.com/cbb/players/c-index.html',
 'https://www.sports-reference.com/cbb/players/d-index.html',
 'https://www.sports-reference.com/cbb/players/e-index.html',
 'https://www.sports-reference.com/cbb/players/f-index.html',
 'https://www.sports-reference.com/cbb/players/g-index.html',
 'https://www.sports-reference.com/cbb/players/h-index.html',
 'https://www.sports-reference.com/cbb/players/i-index.html',
 'https://www.sports-reference.com/cbb/players/j-index.html',
 'https://www.sports-reference.com/cbb/players/k-index.html',
 'https://www.sports-reference.com/cbb/players/l-index.html',
 'https://www.sports-reference.com/cbb/players/m-index.html',
 'https://www.sports-reference.com/cbb/players/n-index.html',
 'https://www.sports-reference.com/cbb/players/o-index.html',
 'https://www.sports-reference.com/cbb/players/p-index.html',
 'https:

### Get player urls

Within each index page, the HTML follows a pattern that exposes all the links on the page. I scraped every link and discarded the ones that weren't relevant, such as links to schools or to other parts of the site that aren't player pages.

In [6]:
def get_player_urls(index_url_link):
    """Collect absolute player-page URLs linked from one index page.

    Every ``<a href=...>`` found inside the page's ``<p>`` elements is turned
    into an absolute URL; links containing ``'schools'`` are discarded, and
    so are the last four remaining links.

    Parameters
    ----------
    index_url_link : str
        Address of an index page.

    Returns
    -------
    list of str
        Player-page URLs in page order. The list is empty when four or fewer
        candidate links remain after the ``'schools'`` filter.

    Raises
    ------
    AttributeError
        If the page could not be downloaded (``SoupFromURL`` returned None).
    """
    index_page = SoupFromURL(index_url_link)
    index_names = index_page.find_all('p')
    index_soup = BeautifulSoup(str(index_names),"html5lib")
    links = index_soup('a', href=True)

    # href=True only matches anchors that carry an href attribute, so the
    # lookup below cannot fail and needs no try/except.
    player_urls = [
        'https://www.sports-reference.com' + l.attrs['href'] for l in links
    ]
    player_urls = [x for x in player_urls if 'schools' not in x]

    # NOTE(review): the last four links are presumably site navigation rather
    # than players -- confirm against the live page. Slicing (unlike four
    # pop() calls) does not raise when fewer than four links are present.
    return player_urls[:-4]

In [7]:
# Try the link extractor on the 'x' index page (this performs a live HTTP request).
get_player_urls('https://www.sports-reference.com/cbb/players/x-index.html')

['https://www.sports-reference.com/cbb/players/jeff-xavier-1.html',
 'https://www.sports-reference.com/cbb/players/ioannis-xenakis-1.html',
 'https://www.sports-reference.com/cbb/players/aaron-xia-1.html',
 'https://www.sports-reference.com/cbb/players/ji-xiang-1.html',
 'https://www.sports-reference.com/cbb/players/oliver-xu-1.html',
 'https://www.sports-reference.com/cbb/players/tao-xu-1.html']

### Generate Stats Dataframe Of All Players On Index Page

Now that I have a function to scrape a player page for stats, a list of index_urls, and a function to get all the player page URLs from an index page, I created another function below: you pass in the first letter of an index page (for example 'x') and it generates a dataframe with player stats for every player listed on that page.

In [8]:
def generate_player_df(first_letter):
    """Scrape every player listed on one index page into a DataFrame.

    Each successfully scraped URL is printed; a URL whose scrape raises is
    reported with the exception and skipped, so one bad page does not abort
    the run.

    Parameters
    ----------
    first_letter : str
        Letter substituted into the index-page URL template,
        e.g. ``'x'`` for ``.../players/x-index.html``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped player, with the columns in the fixed order
        listed below. The frame is empty (but still has these columns) when
        no player could be scraped.
    """
    index_url_link = 'https://www.sports-reference.com/cbb/players/{}-index.html'.format(first_letter)

    # Re-use the shared link extractor instead of repeating its body here.
    player_urls = get_player_urls(index_url_link)

    player_stats_list = []
    for url in player_urls:
        try:
            player_stats_list.append(ncaa_stats_dict(url))
        except Exception as exc:
            # Best effort: keep going, but say which page was dropped and why.
            # KeyboardInterrupt is not an Exception, so Ctrl-C still stops a
            # long scrape.
            print('skipped {}: {!r}'.format(url, exc))
            continue
        print(url)

    columns = [
        'player_name',
        'position',
        'height_inches',
        'weight_lbs',
        'college',
        'draft_year',
        'years_in_college',
        'games',
        'minutes_per_game',
        'points',
        'rebounds',
        'assists',
        'steals',
        'blocks',
        'turnovers',
        'fg_percent',
        '3_fg_percent',
        'free_throw_percent',
        'effective_fg_percent',
        'player_efficiency_rating',
        'win_shares'
    ]

    # Passing columns= fixes the column order and also works for an empty
    # list, where selecting the columns afterwards would raise KeyError.
    return pd.DataFrame(player_stats_list, columns=columns)

In [9]:
# Build the stats DataFrame for the 'x' index page; each scraped URL is printed as it completes.
x_df = generate_player_df('x')

https://www.sports-reference.com/cbb/players/jeff-xavier-1.html
https://www.sports-reference.com/cbb/players/ioannis-xenakis-1.html
https://www.sports-reference.com/cbb/players/aaron-xia-1.html
https://www.sports-reference.com/cbb/players/ji-xiang-1.html
https://www.sports-reference.com/cbb/players/oliver-xu-1.html
https://www.sports-reference.com/cbb/players/tao-xu-1.html


In [10]:
# Display the frame built from the 'x' index page.
x_df

Unnamed: 0,player_name,position,height_inches,weight_lbs,college,draft_year,years_in_college,games,minutes_per_game,points,...,assists,steals,blocks,turnovers,fg_percent,3_fg_percent,free_throw_percent,effective_fg_percent,player_efficiency_rating,win_shares
0,Jeff Xavier,Guard,72,183,Schools: Manhattan and Providence,2008,3,124.0,28.0,11.5,...,1.8,1.8,0.2,1.9,40.8,34.2,78.8,51.5,,9.5
1,Ioannis Xenakis,Center,84,213,Delaware,2003,2,24.0,,2.6,...,0.0,0.0,0.2,,44.4,,43.8,44.4,,0.1
2,Aaron Xia,Forward,80,203,Citadel,2006,3,73.0,,2.6,...,0.1,0.2,0.4,,45.7,30.8,69.2,47.0,,1.4
3,Ji Xiang,Forward,82,208,Hawaii,2010,2,28.0,4.6,1.4,...,0.1,0.0,0.2,0.2,40.9,0.0,55.6,40.9,,0.3
4,Oliver Xu,Guard,74,170,Rice,2016,1,11.0,2.6,0.5,...,0.1,0.0,0.0,0.5,33.3,33.3,50.0,41.7,-7.1,-0.1
5,Tao Xu,Center,83,277,San Francisco,2014,2,36.0,11.9,2.6,...,0.2,0.2,0.1,1.1,42.5,,66.7,42.5,2.1,-0.2


### Versions utilized for scraping

In [2]:
import sys
import bs4
import re

In [3]:
# Record the interpreter and library versions the scrape was run with.
# `pd` and `np` come from the import cell at the top of the notebook;
# `sys`, `bs4` and `re` are imported in the cell just above.
print('Python version:', sys.version_info)
print('BeautifulSoup version:', bs4.__version__)
print('Pandas version:', pd.__version__)
print('Numpy version:',np.__version__)
print('RegEx version:',re.__version__)

Python version: sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)
BeautifulSoup version: 4.5.3
Pandas version: 0.19.2
Numpy version: 1.12.1
RegEx version: 2.2.1
