In [1]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re

## Grab names for all NCAA teams

In [2]:
# Download the index page that lists every school on sports-reference.
url = "https://www.sports-reference.com/cbb/schools/"
response = requests.get(url)
# Stop here with the HTTP error (e.g. rate limiting) instead of failing later
# with an AttributeError when no <table> can be found in an error page.
response.raise_for_status()
teams_text = response.text
team_soup = BeautifulSoup(teams_text, "lxml")
# The first <table> on the page is used; its <tr> rows are parsed in the next cell.
team_table = team_soup.find('table')
team_rows = team_table.find_all('tr')


In [3]:
# Build teams_list: school name -> [school url, text of every <td> cell in the row].
# Header rows are repeated inside the table and contain no <td> cells, so rows are
# skipped based on their content rather than on their position in the table.
teams_list = {}
for row in team_rows:
    items = row.find_all('td')
    link = items[0].find('a') if items else None
    if link is None:
        # header row, or a row whose first cell has no school link
        continue
    school, url = link.text, link['href']
    teams_list[school] = [url] + [cell.text for cell in items]

In [4]:
# Save the scraped teams_list dictionary to disk.
with open('teams_list.pickle', 'wb') as out_file:
    pickle.dump(teams_list, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Create Dictionary of D1 Teams for each season

In [5]:
def teams(teams_list, season):
    '''
    Select the teams whose recorded year range contains the given season.

    input: teams_list, a dictionary mapping team name -> list in which items 3 and 4
           are the first and last year (values convertible with int());
           season, the year of interest
    output: a dictionary holding the entries of teams_list for which
            first year <= season <= last year (values are the same list objects)
    '''
    return {
        school: info
        for school, info in teams_list.items()
        if int(info[3]) <= season <= int(info[4])
    }

In [6]:
def team_name_dict(season_teams, season):
    '''
    Create team_lookup for names - url name as key, values include name, formal name, and team url

    One schedule page is downloaded per team ("<team url><season>-schedule.html").

    input: the teams identified for a particular season (dictionary whose values are lists
           with the team url at index 0 and the formal name at index 1), the year of the season
    output: a dictionary where the keys are the simple names (4th "/"-separated piece of the
            team url) and the values are [name, formal name, team url]; name is the text of
            the first link inside the first table of the schedule page
    '''
    team_lookup = {}
    for team_info in season_teams.values():
        team_url = team_info[0]
        sched_url = "https://www.sports-reference.com" + team_url + str(season) + "-schedule.html"
        sched_response = requests.get(sched_url)
        sched_soup = BeautifulSoup(sched_response.text, "lxml")
        sched_table = sched_soup.find('table')
        # Plain str.split replaces re.split("\/", ...), whose "\/" is an invalid
        # escape sequence in a normal string literal.
        simple_name = team_url.split("/")[3]
        name = sched_table.find('a').text
        formal_name = team_info[1]
        team_lookup[simple_name] = [name, formal_name, team_url]
    return team_lookup

In [11]:
#analysis was done for years 2014-2019, omitting 2020 because the season was cut short due to covid-19
for i in [2014, 2015, 2016, 2017, 2018, 2019]:
    season_teams = teams(teams_list, i)
    # Use the loop's own season for the schedule pages (this was hard-coded to 2019,
    # so every team_lookup_<year>.pickle was built from 2019 schedule pages).
    team_lookup = team_name_dict(season_teams, i)
    with open(f'team_lookup_{i}.pickle', 'wb') as handle:
        pickle.dump(team_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# NOTE(review): the cells in this notebook only write 'team_lookup_<year>.pickle' files;
# confirm that 'team_lookup.pickle' exists and is the file intended here.
# The with-block closes the file handle, which the bare open() call never did.
with open('team_lookup.pickle', "rb") as handle:
    team_lookup = pickle.load(handle)

## Helper functions to web-scrape data

In [7]:
def grab_gamelog(season_teams, season):
    '''
    This grabs the gamelogs for all teams, for the season that is specified.

    One page is downloaded per team ("<team url><season>-gamelogs.html"); each url is
    printed as it is requested.

    Input: Dictionary where the keys are the teams for specified season and each value is a
           list with the team url at index 0; specified season
    Output: A dictionary with one entry per game row of every team. The key is the box score
            url followed by the team's simple name; the value is
            [box score url, simple name, text of every <td> cell in the row].

    Rows are selected by content: any row that has <td> cells and a link in its first cell
    is a game row. The previous index-based filter (i % 22, i % 23, ...) also discarded
    real games, e.g. row index 46 because 46 % 23 == 0.
    '''
    gamelog = {}
    for team_info in season_teams.values():
        team_url = team_info[0]
        game_url = "https://www.sports-reference.com" + team_url + str(season) + "-gamelogs.html"
        print(game_url)
        game_response = requests.get(game_url)
        gamelog_soup = BeautifulSoup(game_response.text, "lxml")
        gamelog_table = gamelog_soup.find('table')
        gamelog_rows = gamelog_table.find_all('tr')
        # team url looks like "/cbb/schools/<simple name>/", so piece 3 is the simple name
        team = team_url.split("/")[3]
        for row in gamelog_rows:
            items = row.find_all('td')
            link = items[0].find('a') if items else None
            if link is None:
                # header row (no <td> cells) or a row without a box score link
                continue
            url = link['href']
            gamelog[url + team] = [url, team] + [cell.text for cell in items]
    return gamelog

In [8]:
def grab_gamelog_advanced(season_teams, season):
    '''
    This grabs the gamelogs with ADVANCED STATS for all teams, for the season that is specified.

    Same scraping approach as the basic gamelog scraper, but one
    "<team url><season>-gamelogs-advanced.html" page is downloaded per team.

    Input: Dictionary where the keys are the teams for specified season and each value is a
           list with the team url at index 0; specified season
    Output: A dictionary with one entry per game row of every team. The key is the box score
            url followed by the team's simple name; the value is
            [box score url, simple name, text of every <td> cell in the row].

    Rows are selected by content: any row that has <td> cells and a link in its first cell
    is a game row. The previous index-based filter (i % 22, i % 23, ...) also discarded
    real games, e.g. row index 46 because 46 % 23 == 0.
    '''
    gamelog = {}
    for team_info in season_teams.values():
        team_url = team_info[0]
        game_url = "https://www.sports-reference.com" + team_url + str(season) + "-gamelogs-advanced.html"
        game_response = requests.get(game_url)
        gamelog_soup = BeautifulSoup(game_response.text, "lxml")
        gamelog_table = gamelog_soup.find('table')
        gamelog_rows = gamelog_table.find_all('tr')
        # team url looks like "/cbb/schools/<simple name>/", so piece 3 is the simple name
        team = team_url.split("/")[3]
        for row in gamelog_rows:
            items = row.find_all('td')
            link = items[0].find('a') if items else None
            if link is None:
                # header row (no <td> cells) or a row without a box score link
                continue
            url = link['href']
            gamelog[url + team] = [url, team] + [cell.text for cell in items]
    return gamelog

In [9]:
def combine(gamelog, advanced_gamelog):
    '''
    Combines the two dictionaries so that there is a single dictionary housing both basic and advanced stats

    Input: gamelog and advanced_gamelog, dictionaries keyed by game whose values are sequences
    Output: a NEW dictionary with the keys of gamelog; each value is the basic entry followed
            by the advanced entry for the same key

    Neither input is modified, so calling this again (e.g. re-running a cell) gives the
    same result instead of appending the advanced stats a second time.

    Raises: KeyError if a key of gamelog is missing from advanced_gamelog
    '''
    combo_log = {}
    for game, basic_stats in gamelog.items():
        combo_log[game] = list(basic_stats) + list(advanced_gamelog[game])
    return combo_log

## Get Line Data

In [34]:
def get_line_data(year_nums):
    """
    Helper function to load data from http://www.thepredictiontracker.com/basketball.php

    For each entry of year_nums the CSV named "ncaabb<year - 1>.csv" is read with pandas;
    the resulting frames are concatenated in input order and returned as one DataFrame.
    """
    template = "http://www.thepredictiontracker.com/ncaabb{}.csv"
    season_frames = [pd.read_csv(template.format(yr - 1)) for yr in year_nums]
    return pd.concat(season_frames)

In [35]:
# Load the line data for year numbers 14 through 19 and save the combined frame to disk.
lines = get_line_data(list(range(14, 20)))
with open('lines.pickle', 'wb') as out_file:
    pickle.dump(lines, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## CODE TO GRAB THE DATA

In [12]:
## Pull multiple seasons

# Seasons scraped by this cell; one set of three pickle files is written per season.
my_seasons = [2015, 2019]

for i in my_seasons:
    # Teams whose year range in teams_list contains season i
    this_season = teams(teams_list, i)
    
    # Basic game logs: scrape, then save to gamelog_<season>.pickle
    this_gamelog = grab_gamelog(this_season, i)
    with open(f'gamelog_{i}.pickle', 'wb') as handle:
        pickle.dump(this_gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'gamelog {i}')        
        
    # Advanced game logs: scrape, then save to advanced_gamelog_<season>.pickle
    this_advanced_gamelog = grab_gamelog_advanced(this_season, i)
    with open(f'advanced_gamelog_{i}.pickle', 'wb') as handle:
        pickle.dump(this_advanced_gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'advanced_gamelog {i}')  
    
    # Basic + advanced stats per game, saved to combo_log_<season>.pickle
    this_combo_log = combine(this_gamelog, this_advanced_gamelog)
    with open(f'combo_log_{i}.pickle', 'wb') as handle:
        pickle.dump(this_combo_log, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'done with {i} season')

https://www.sports-reference.com/cbb/schools/abilene-christian/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/air-force/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/akron/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/alabama-am/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/alabama/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/alabama-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/alabama-birmingham/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/albany-ny/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/alcorn-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/american/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/appalachian-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/arizona-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/arizona/2015-gamelogs.html
https://ww

https://www.sports-reference.com/cbb/schools/hartford/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/harvard/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/hawaii/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/high-point/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/hofstra/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/holy-cross/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/houston-baptist/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/houston/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/howard/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/idaho-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/idaho/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/illinois/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/illinois-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/scho

https://www.sports-reference.com/cbb/schools/northern-kentucky/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/northwestern-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/northwestern/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/notre-dame/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/oakland/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/ohio/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/ohio-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/oklahoma/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/oklahoma-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/old-dominion/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/nebraska-omaha/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/oral-roberts/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/oregon/2015-gamelogs.html
https://www

https://www.sports-reference.com/cbb/schools/valparaiso/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/vanderbilt/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/vermont/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/villanova/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/virginia/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/virginia-commonwealth/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/virginia-military-institute/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/virginia-tech/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/wagner/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/wake-forest/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/washington/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/washington-state/2015-gamelogs.html
https://www.sports-reference.com/cbb/schools/weber-state/2015-gamelogs.h

https://www.sports-reference.com/cbb/schools/eastern-illinois/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/eastern-kentucky/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/eastern-michigan/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/eastern-washington/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/elon/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/evansville/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/fairfield/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/fairleigh-dickinson/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/florida-am/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/florida-atlantic/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/florida/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/florida-gulf-coast/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/florida-internat

https://www.sports-reference.com/cbb/schools/mount-st-marys/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/murray-state/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/navy/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/nebraska/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/nevada/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/nevada-las-vegas/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/new-hampshire/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/new-mexico/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/new-mexico-state/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/new-orleans/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/niagara/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/nicholls-state/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/njit/2019-gamelogs.html
https://www.sports-r

https://www.sports-reference.com/cbb/schools/stony-brook/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/syracuse/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/temple/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/tennessee-state/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/tennessee-tech/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/tennessee/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/tennessee-martin/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas-am/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas-am-corpus-christi/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas-christian/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas-southern/2019-gamelogs.html
https://www.sports-reference.com/cbb/schools/texas-state/2019-gamelogs.html


## Future work

## Grab data from single game - hold off on pulling individual game data for now

Input url for each game
Output data table

In [None]:
def game_box_score(url):
    '''
    Download a single box score page and return every <table> element found in it.

    input: url of the box score page,
           e.g. 'https://www.sports-reference.com/cbb/boxscores/2020-03-11-21-stanford.html'
    output: the result of find_all('table') on the parsed page
            (author's note: this will grab 8 tables; the last 4 will be basic and
            advanced stats, for each team)
    '''
    box = requests.get(url)
    boxscores = box.text
    # parsed with html5lib here, unlike the lxml parser used elsewhere in this notebook
    soup = BeautifulSoup(boxscores, "html5lib")
    tables = soup.find_all('table')

    return tables

Pull Data from class "scorebox_meta": Date, Location <- this does not show up in a table

Pull Data from table "line-score": <- this also does not show up in a table...
Output: Away Team, Home Team, Away Score 1H, Away Score 2H, Away Score F, Home Score 1H, Home Score 2H, Home Score F

? What about overtimes...

https://stackoverflow.com/questions/49766150/when-scraping-data-from-basketball-reference-how-come-certain-tables-are-comment

# Debugging - To delete

In [None]:
# NOTE(review): this cell originally called grab_gamelog2(season15, 2015), but neither name
# is defined anywhere in this notebook. grab_gamelog_advanced and teams(teams_list, 2015)
# are presumably what was meant - confirm before relying on this cell.
season15 = teams(teams_list, 2015)
advanced_gamelog_2015 = grab_gamelog_advanced(season15, 2015)
with open('advanced_gamelog_2015.pickle', 'wb') as handle:
    pickle.dump(advanced_gamelog_2015, handle, protocol=pickle.HIGHEST_PROTOCOL)
# The message names the season explicitly instead of reading a leftover loop variable `i`.
print('advanced_gamelog 2015')

In [None]:
# Display the whole advanced_gamelog_2015 dictionary as the cell output.
advanced_gamelog_2015

In [13]:

# Debugging: fetch one team's game-log page (Michigan, 2018) and keep the table's <tr> rows
# in gamelog_rows so individual rows can be inspected in the following cells.
game_url = "https://www.sports-reference.com/cbb/schools/michigan/2018-gamelogs.html"
print(game_url)
game_response = requests.get(game_url)
gamelog_text = game_response.text
gamelog_soup = BeautifulSoup(gamelog_text,"lxml")
# first <table> found on the page
gamelog_table = gamelog_soup.find('table')
gamelog_rows = gamelog_table.find_all('tr')
# simple team name appended to each gamelog key in the following cells
team = "michigan"

https://www.sports-reference.com/cbb/schools/michigan/2018-gamelogs.html


In [33]:
# Rebuild the gamelog for the rows fetched above, selecting game rows by content:
# a row counts as a game when it has <td> cells and a link in its first cell.
# (An index-based filter such as i % 23 != 0 drops real games, e.g. row 46.)
gamelog = {}
for row in gamelog_rows:
    items = row.find_all('td')
    link = items[0].find('a') if items else None
    if link is None:
        # header row (no <td> cells) or a row without a box score link
        continue
    url = link['href']
    gamelog[url + team] = [url, team] + [cell.text for cell in items]

In [29]:
# Display the whole gamelog dictionary as the cell output.
gamelog

{'/cbb/boxscores/2017-11-11-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-11-19-michigan.html',
  'michigan',
  '2017-11-11',
  '',
  'North Florida',
  'W',
  '86',
  '66',
  '33',
  '71',
  '.465',
  '9',
  '25',
  '.360',
  '11',
  '16',
  '.688',
  '12',
  '35',
  '21',
  '12',
  '2',
  '10',
  '12',
  '',
  '22',
  '50',
  '.440',
  '9',
  '15',
  '.600',
  '13',
  '17',
  '.765',
  '6',
  '28',
  '11',
  '5',
  '4',
  '23',
  '19'],
 '/cbb/boxscores/2017-11-13-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-13-19-michigan.html',
  'michigan',
  '2017-11-13',
  '',
  'Central Michigan',
  'W',
  '72',
  '65',
  '26',
  '59',
  '.441',
  '10',
  '34',
  '.294',
  '10',
  '12',
  '.833',
  '6',
  '25',
  '14',
  '5',
  '3',
  '4',
  '11',
  '',
  '22',
  '51',
  '.431',
  '10',
  '24',
  '.417',
  '11',
  '12',
  '.917',
  '7',
  '32',
  '12',
  '2',
  '2',
  '14',
  '18'],
 '/cbb/boxscores/2017-11-16-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-16-19-michigan.html'

In [27]:
# Inspect the raw <tr> element at index 46 of gamelog_rows.
gamelog_rows[46]

<tr id="sgl-basic.20180402"><th class="right" data-stat="game_season" scope="row">41</th><td class="left" data-stat="date_game"><a href="/cbb/boxscores/2018-04-02-21-villanova.html">2018-04-02</a></td><td class="left" data-stat="game_location">N</td><td class="left" data-stat="opp_id"><a href="/cbb/schools/villanova/2018.html">Villanova</a></td><td class="left" csk="2" data-stat="game_result">L</td><td class="right" data-stat="pts">62</td><td class="right" data-stat="opp_pts">79</td><td class="right" data-stat="fg">24</td><td class="right" data-stat="fga">55</td><td class="right" data-stat="fg_pct">.436</td><td class="right" data-stat="fg3">3</td><td class="right" data-stat="fg3a">23</td><td class="right" data-stat="fg3_pct">.130</td><td class="right" data-stat="ft">11</td><td class="right" data-stat="fta">18</td><td class="right" data-stat="ft_pct">.611</td><td class="right" data-stat="orb">5</td><td class="right" data-stat="trb">26</td><td class="right" data-stat="ast">6</td><td clas

In [31]:
# Add the game stored in row 46 to gamelog by hand, using the same key/value layout
# as the other entries: key = box score url + team, value = [url, team, cell texts...].
items = gamelog_rows[46].find_all('td')
link = items[0].find('a')
if link is not None:
    date = link.text
    url = link['href']
    gamelog[url + team] = [url, team] + [cell.text for cell in items]

In [32]:
# Display the whole gamelog dictionary as the cell output.
gamelog

{'/cbb/boxscores/2017-11-11-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-11-19-michigan.html',
  'michigan',
  '2017-11-11',
  '',
  'North Florida',
  'W',
  '86',
  '66',
  '33',
  '71',
  '.465',
  '9',
  '25',
  '.360',
  '11',
  '16',
  '.688',
  '12',
  '35',
  '21',
  '12',
  '2',
  '10',
  '12',
  '',
  '22',
  '50',
  '.440',
  '9',
  '15',
  '.600',
  '13',
  '17',
  '.765',
  '6',
  '28',
  '11',
  '5',
  '4',
  '23',
  '19'],
 '/cbb/boxscores/2017-11-13-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-13-19-michigan.html',
  'michigan',
  '2017-11-13',
  '',
  'Central Michigan',
  'W',
  '72',
  '65',
  '26',
  '59',
  '.441',
  '10',
  '34',
  '.294',
  '10',
  '12',
  '.833',
  '6',
  '25',
  '14',
  '5',
  '3',
  '4',
  '11',
  '',
  '22',
  '51',
  '.431',
  '10',
  '24',
  '.417',
  '11',
  '12',
  '.917',
  '7',
  '32',
  '12',
  '2',
  '2',
  '14',
  '18'],
 '/cbb/boxscores/2017-11-16-19-michigan.htmlmichigan': ['/cbb/boxscores/2017-11-16-19-michigan.html'