In [1]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def SoupFromURL(url, suppressOutput=True, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    suppressOutput : bool, optional
        When False, the URL is printed before it is requested.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole crawl.

    Returns
    -------
    BeautifulSoup or None
        The page parsed with the ``html5lib`` parser, or ``None`` when the
        request fails (connection error, timeout or HTTP error status).
    """
    if not suppressOutput:
        print(url)

    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are swallowed; anything else is a bug
        # and is allowed to surface.
        return None

    return BeautifulSoup(response.text, "html5lib")

### Scraping a player page for stats

The function below scrapes the stats from any given player page URL.

In [3]:
# Profile fields, in the order they are placed in the returned dict.
_PROFILE_KEYS = (
    'player_name',
    'position',
    'height(inches)',
    'high_school',
    'college',
    'years_in_college',
)

# Stat fields, in the order their values are read from the stats_pullout box.
_STAT_KEYS = (
    'games',
    'points',
    'rebounds',
    'assists',
    'fg_percent',
    '3_fg_percent',
    'free_throw_percent',
    'effective_fg_percent',
    'player_efficiency_rating',
    'win_shares',
)


def _section_lines(page, css_class):
    """Return the text lines of the first matching ``div``, or ``None`` if none matches.

    The matched element is re-parsed on its own before its text is split, so
    the fixed line offsets used by the scraper keep pointing at the same lines.
    """
    matches = page.find_all('div', attrs={'class': css_class})
    if not matches:
        return None
    return BeautifulSoup(str(matches[0]), "html5lib").text.split('\n')


def _line_at(lines, index):
    """Return ``lines[index]``, or ``None`` when the section is shorter than that."""
    return lines[index] if index < len(lines) else None


def _parse_height_inches(height_line):
    """Convert a profile line such as ``"  6-10"`` into total inches.

    The feet digit is expected at offset 2 of the line, followed by a dash and
    one or two inch digits. Returns ``None`` for any other layout.
    """
    match = re.match(r".{2}(\d)-(\d{1,2})(?!\d)", height_line)
    if match is None:
        return None
    return float(match.group(1)) * 12 + float(match.group(2))


def _first_clean_value(lines, indexes, labels):
    """Return the first usable value among ``lines[i]`` for ``i`` in ``indexes``.

    Each candidate has every label in ``labels`` stripped out. A candidate is
    rejected when it is empty or still contains a double space, which marks a
    line that is not the field being looked for. Offsets beyond the end of
    ``lines`` are skipped. Returns ``None`` when nothing qualifies.
    """
    for index in indexes:
        value = _line_at(lines, index)
        if value is None:
            continue
        for label in labels:
            value = value.replace(label, "")
        if value and '  ' not in value:
            return value
    return None


def ncaa_stats_dict(url):
    """Scrape one player page into a flat dict of profile fields and stats.

    Every field is best-effort: a value that cannot be located on the page is
    reported as ``None`` rather than raising. If the page cannot be downloaded
    at all, every field is ``None``.

    Parameters
    ----------
    url : str
        Address of the player page.

    Returns
    -------
    dict
        Keys ``player_name``, ``position``, ``height(inches)``,
        ``high_school``, ``college``, ``years_in_college`` followed by the ten
        stat keys ``games`` ... ``win_shares``. Stats and height are floats,
        ``years_in_college`` is an int, the remaining fields are strings.
    """
    player_dict = dict.fromkeys(_PROFILE_KEYS + _STAT_KEYS)

    player_page = SoupFromURL(str(url))
    if player_page is None:
        # SoupFromURL signals a failed download with None.
        return player_dict

    # --- player profile info -------------------------------------------
    # Fields are located by fixed line offsets inside the "nothumb" block.
    profile_lines = _section_lines(player_page, 'nothumb') or []

    name_line = _line_at(profile_lines, 1)
    if name_line is not None:
        player_dict['player_name'] = name_line.replace("\t", "")

    position_line = _line_at(profile_lines, 13)
    if position_line is not None:
        player_dict['position'] = position_line.replace("  ", "")

    height_line = _line_at(profile_lines, 17)
    if height_line is not None:
        player_dict['height(inches)'] = _parse_height_inches(height_line)

    # The school lines move around between pages, so several offsets are
    # tried in a fixed order and the first clean value wins.
    player_dict['high_school'] = _first_clean_value(
        profile_lines, (22, 26), ("  High School: ",))
    player_dict['college'] = _first_clean_value(
        profile_lines, (18, 28, 32, 24), ("  School: ", "  Schools: "))

    # --- player stats -----------------------------------------------------
    stat_values = []
    stat_lines = _section_lines(player_page, 'stats_pullout')
    if stat_lines is not None:
        # Every 4th line starting at line 1 holds a value; the first two of
        # those are skipped.
        for cell in stat_lines[1::4][2:]:
            try:
                stat_values.append(float(cell))
            except ValueError:
                # Blank or non-numeric cell.
                stat_values.append(None)
    # Pad so a short (or missing) stats box cannot raise IndexError below.
    stat_values.extend([None] * (len(_STAT_KEYS) - len(stat_values)))
    player_dict.update(zip(_STAT_KEYS, stat_values))

    # --- season count -----------------------------------------------------
    season_lines = _section_lines(player_page, 'overthrow table_container')
    if season_lines is not None:
        # Lines 33-36 of the table text are counted as seasons unless they
        # are blank, contain 'Career' or contain a double space.
        seasons = [
            line for line in season_lines[33:37]
            if line and 'Career' not in line and '  ' not in line
        ]
        player_dict['years_in_college'] = len(seasons)

    return player_dict

In [4]:
# Smoke test: scrape one known player page and display the resulting dict.
ncaa_stats_dict('https://www.sports-reference.com/cbb/players/jordan-aaberg-1.html')

{'3_fg_percent': None,
 'assists': 0.2,
 'college': 'North Dakota State',
 'effective_fg_percent': 61.9,
 'fg_percent': 61.9,
 'free_throw_percent': 78.8,
 'games': 115.0,
 'height(inches)': 81.0,
 'high_school': 'Rothsay',
 'player_efficiency_rating': 17.9,
 'player_name': 'Jordan Aaberg',
 'points': 3.5,
 'position': 'Forward',
 'rebounds': 2.3,
 'win_shares': 4.6,
 'years_in_college': 4}

### Get index page urls

I noticed that the player index page URLs on sports-reference follow a pattern: they differ only by a single letter. I used that pattern to build the list of URLs.

In [5]:
import string

# The index pages differ only by a lowercase letter, so fill the template
# once for each of a-z.
index_urls = [
    'https://www.sports-reference.com/cbb/players/{}-index.html'.format(letter)
    for letter in string.ascii_lowercase
]

index_urls

['https://www.sports-reference.com/cbb/players/a-index.html',
 'https://www.sports-reference.com/cbb/players/b-index.html',
 'https://www.sports-reference.com/cbb/players/c-index.html',
 'https://www.sports-reference.com/cbb/players/d-index.html',
 'https://www.sports-reference.com/cbb/players/e-index.html',
 'https://www.sports-reference.com/cbb/players/f-index.html',
 'https://www.sports-reference.com/cbb/players/g-index.html',
 'https://www.sports-reference.com/cbb/players/h-index.html',
 'https://www.sports-reference.com/cbb/players/i-index.html',
 'https://www.sports-reference.com/cbb/players/j-index.html',
 'https://www.sports-reference.com/cbb/players/k-index.html',
 'https://www.sports-reference.com/cbb/players/l-index.html',
 'https://www.sports-reference.com/cbb/players/m-index.html',
 'https://www.sports-reference.com/cbb/players/n-index.html',
 'https://www.sports-reference.com/cbb/players/o-index.html',
 'https://www.sports-reference.com/cbb/players/p-index.html',
 'https:

### Get player urls

Within each index page, the HTML follows a pattern that exposes every link on the page. I scraped all of the links and discarded the irrelevant ones, such as links to schools or to other parts of the site that aren't player pages.

In [6]:
def get_player_urls(index_url_link):
    """Collect the player page URLs listed on one index page.

    Parameters
    ----------
    index_url_link : str
        Address of a player index page.

    Returns
    -------
    list of str
        Absolute URLs built from every link found inside the page's ``<p>``
        elements, minus links containing ``'schools'`` and minus the last
        four remaining links. Empty when the page cannot be downloaded or
        when four or fewer links remain.
    """
    # The last few links in the <p> elements are not player pages.
    # NOTE(review): presumably site navigation/footer links - confirm the
    # count against the live page.
    trailing_non_player_links = 4

    index_page = SoupFromURL(index_url_link)
    if index_page is None:
        # Download failed; there is nothing to list.
        return []

    index_names = index_page.find_all('p')
    index_soup = BeautifulSoup(str(index_names), "html5lib")

    # href=True only matches anchors that carry an href attribute, so the
    # lookup below cannot fail.
    player_urls = [
        'https://www.sports-reference.com' + link.attrs['href']
        for link in index_soup('a', href=True)
    ]
    player_urls = [x for x in player_urls if 'schools' not in x]

    # Slicing (unlike repeated pop()) is safe when fewer than four links remain.
    return player_urls[:-trailing_non_player_links]

In [7]:
# Example: list the player page URLs found on the "x" index page.
get_player_urls('https://www.sports-reference.com/cbb/players/x-index.html')

['https://www.sports-reference.com/cbb/players/jeff-xavier-1.html',
 'https://www.sports-reference.com/cbb/players/ioannis-xenakis-1.html',
 'https://www.sports-reference.com/cbb/players/aaron-xia-1.html',
 'https://www.sports-reference.com/cbb/players/ji-xiang-1.html',
 'https://www.sports-reference.com/cbb/players/oliver-xu-1.html',
 'https://www.sports-reference.com/cbb/players/tao-xu-1.html']

### Generate Stats Dataframe Of All Players On Index Page

Now that I have a function that scrapes a player page for stats, a list of `index_urls`, and a function that collects every player URL from an index page, I wrote the function below: pass in an index URL and it returns a dataframe with the stats of every player listed on that page.

In [8]:
def generate_player_df(index_url_link):
    """Scrape every player on an index page into one DataFrame.

    Parameters
    ----------
    index_url_link : str
        Address of a player index page.

    Returns
    -------
    pandas.DataFrame
        One row per player URL returned by ``get_player_urls``, with the
        columns in the fixed order listed below. When no player URLs are
        found the frame is empty but still has these columns.

    Notes
    -----
    Each player URL is printed once its page has been scraped, which serves
    as a progress log.
    """
    columns = [
        'player_name',
        'position',
        'height(inches)',
        'high_school',
        'college',
        'years_in_college',
        'games',
        'points',
        'rebounds',
        'assists',
        'fg_percent',
        '3_fg_percent',
        'free_throw_percent',
        'effective_fg_percent',
        'player_efficiency_rating',
        'win_shares'
    ]

    player_stats_list = []
    # Reuse the URL collector instead of repeating its body here.
    for url in get_player_urls(index_url_link):
        player_stats_list.append(ncaa_stats_dict(url))
        print(url)

    # Passing columns= fixes the order and keeps an empty result well-formed;
    # selecting with df[[...]] would raise KeyError on an empty frame.
    return pd.DataFrame(player_stats_list, columns=columns)

In [9]:
# Build the stats dataframe for every player on the "z" index page.
# One page is requested per player and each URL is printed once it is scraped.
z_df = generate_player_df('https://www.sports-reference.com/cbb/players/z-index.html')

https://www.sports-reference.com/cbb/players/marvin-zaandam-1.html
https://www.sports-reference.com/cbb/players/kevin-zabo-1.html
https://www.sports-reference.com/cbb/players/dan-zachary-1.html
https://www.sports-reference.com/cbb/players/danny-zachary-1.html
https://www.sports-reference.com/cbb/players/michael-zachary-1.html
https://www.sports-reference.com/cbb/players/randy-zachary-1.html
https://www.sports-reference.com/cbb/players/richard-zacher-1.html
https://www.sports-reference.com/cbb/players/nick-zachery-1.html
https://www.sports-reference.com/cbb/players/scott-zack-1.html
https://www.sports-reference.com/cbb/players/steve-zack-1.html
https://www.sports-reference.com/cbb/players/bill-zadel-1.html
https://www.sports-reference.com/cbb/players/manny-zafires-1.html
https://www.sports-reference.com/cbb/players/manny-zafiros-1.html
https://www.sports-reference.com/cbb/players/petrolsav-zafirov-1.html
https://www.sports-reference.com/cbb/players/todd-zafirovski-1.html
https://www.spo

https://www.sports-reference.com/cbb/players/john-zeides-1.html
https://www.sports-reference.com/cbb/players/_-zeiff-1.html
https://www.sports-reference.com/cbb/players/_-zeiff-2.html
https://www.sports-reference.com/cbb/players/dick-zeiger-1.html
https://www.sports-reference.com/cbb/players/bo-zeigler-1.html
https://www.sports-reference.com/cbb/players/dave-zeigler-1.html
https://www.sports-reference.com/cbb/players/demitrius-zeigler-1.html
https://www.sports-reference.com/cbb/players/derrick-zeigler-1.html
https://www.sports-reference.com/cbb/players/ernie-zeigler-2.html
https://www.sports-reference.com/cbb/players/kenny-zeigler-1.html
https://www.sports-reference.com/cbb/players/marcus-zeigler-1.html
https://www.sports-reference.com/cbb/players/mitchell-zeigler-1.html
https://www.sports-reference.com/cbb/players/omar-zeigler-1.html
https://www.sports-reference.com/cbb/players/trey-zeigler-1.html
https://www.sports-reference.com/cbb/players/nick-zeisloft-1.html
https://www.sports-ref

https://www.sports-reference.com/cbb/players/david-zingg-1.html
https://www.sports-reference.com/cbb/players/john-zinirich-1.html
https://www.sports-reference.com/cbb/players/dave-zink-1.html
https://www.sports-reference.com/cbb/players/gary-zinkgraf-1.html
https://www.sports-reference.com/cbb/players/chuck-zinky-1.html
https://www.sports-reference.com/cbb/players/jim-zinn-1.html
https://www.sports-reference.com/cbb/players/rob-zinn-1.html
https://www.sports-reference.com/cbb/players/terry-zinn-1.html
https://www.sports-reference.com/cbb/players/walt-zinn-1.html
https://www.sports-reference.com/cbb/players/paul-zinser-1.html
https://www.sports-reference.com/cbb/players/bill-zinsky-1.html
https://www.sports-reference.com/cbb/players/jim-zinsky-1.html
https://www.sports-reference.com/cbb/players/mark-ziolko-1.html
https://www.sports-reference.com/cbb/players/john-zipp-1.html
https://www.sports-reference.com/cbb/players/_-zippel-1.html
https://www.sports-reference.com/cbb/players/jon-ziri

In [None]:
# Preview the first rows of the "z" dataframe.
z_df.head()

The code below troubleshoots the `college` column.

In [None]:
# Troubleshooting aid: repeat the college lookup on one known player page so
# the surviving candidate values can be inspected.
player_page = SoupFromURL('https://www.sports-reference.com/cbb/players/jordan-aaberg-1.html')

a = player_page.find_all('div', attrs={'class':'nothumb'})[0]
a_soup = BeautifulSoup(str(a),"html5lib")

# Split the profile text once; every candidate below is one of its lines.
profile_lines = a_soup.text.split('\n')

col_list = []
try:
    # Line offsets where the school has been seen, in the order they are tried.
    for line_index in (18, 28, 32, 24):
        candidate = str(profile_lines[line_index])
        candidate = candidate.replace("  School: ","")
        candidate = candidate.replace("  Schools: ","")
        col_list.append(candidate)

    # A leftover double space means the line was not a school line.
    col_list = [x for x in col_list if not '  ' in x]
    col_list.append(None)
except IndexError:
    # The profile block is shorter than one of the offsets above.
    col_list.append(None)

while '' in col_list:
    col_list.remove('')

col_list

In [None]:
# Probe: show the raw text of profile line 18 for a page whose college value
# needs checking.
player_page = SoupFromURL('https://www.sports-reference.com/cbb/players/_-ziegenhorn-1.html')

col_list_2 = []
a = player_page.find_all('div', attrs={'class': 'nothumb'})[0]
a_soup = BeautifulSoup(str(a), "html5lib")

# Name the split text so the offset lookup reads on its own line.
profile_lines = a_soup.text.split('\n')
col_5 = str(profile_lines[18])
col_list_2.append(col_5)

col_list_2