In [1]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re

## Grab names for all NCAA teams

In [2]:
url = "https://www.sports-reference.com/cbb/schools/"
response = requests.get(url)
teams_text = response.text
team_soup = BeautifulSoup(teams_text, "lxml")
team_table = team_soup.find('table')
team_rows = team_table.find_all('tr')


In [3]:
#note that after every 20 teams, there is another header row
teams_list = {}
for i in [i for i in range(1,len(team_rows)) if i % 21 != 0]:
    items = team_rows[i].find_all('td')
    link = items[0].find('a')
    school, url = link.text, link['href']
    teams_list[school] = [url] + [i.text for i in items]

In [5]:
with open('teams_list.pickle', 'wb') as handle:
    pickle.dump(teams_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Create Team_

In [6]:
#find teams in D1 for particular season
def teams(teams_list, season):
    '''
    input: season, this will be the year of interest
    output: a list of all the D1 teams that played that year, with their formal name
    '''
    season_teams = {}
    for team in teams_list.keys():
        print(team)
        if int(teams_list[team][3]) <= season <= int(teams_list[team][4]):
            season_teams[team] = teams_list[team]
    return season_teams

In [7]:
season_teams = teams(teams_list, 2019)

Abilene Christian Wildcats
Air Force Falcons
Akron Zips
Alabama A&M Bulldogs
Alabama Crimson Tide
Alabama State Hornets
Alabama-Birmingham Blazers
Albany (NY) Great Danes
Alcorn State Braves
Allegheny Gators
American Eagles
Amherst Lord Jeffs
Appalachian State Mountaineers
Arizona State Sun Devils
Arizona Wildcats
Arkansas Razorbacks
Arkansas State Red Wolves
Arkansas-Pine Bluff Golden Lions
Armstrong Pirates
Army Black Knights
Auburn Tigers
Augusta State Jaguars
Augustana (IL) Vikings
Austin Peay Governors
Baker University Wildcats
Baldwin-Wallace Yellow Jackets
Ball State Cardinals
Baltimore Super Bees
Baylor Bears
Belmont Bruins
Beloit Buccaneers
Bethune-Cookman Wildcats
Binghamton Bearcats
Birmingham-Southern Panthers
Bloomsburg Huskies
Boise State Broncos
Boston College Eagles
Boston University Terriers
Bowling Green State Falcons
Bradley Braves
Brigham Young College 
Brigham Young Cougars
Brooklyn Bulldogs
Brown Bears
Bryant Bulldogs
Bucknell Bison
Buffalo Bulls
Butler Bulldogs
C

In [None]:
#create team_lookup for names - url name as key, values include name, formal name, and team url

def team_name_dict(season_teams, season):
    '''
    input: the teams
    output: a dictionary where the keys are the simple names, 
    '''
    team_lookup = {}
    for key in season_teams:
        sched_url = "https://www.sports-reference.com" + season_teams[key][0] + str(season) +"-schedule.html"
        sched_response = requests.get(sched_url)
        sched_text = sched_response.text
        sched_soup = BeautifulSoup(sched_text,"lxml")
        sched_table = sched_soup.find('table')
        simple_name = re.split("\/",season_teams[key][0])[3]
        sched_rows = sched_table.find_all('tr')
        name = sched_table.find('a').text
        formal_name = season_teams[key][1]
        team_lookup[simple_name] = [name] + [formal_name] + [season_teams[key][0]]
    return team_lookup

In [None]:
team_lookup = team_name_dict(season_teams, 2019)

In [None]:
with open('team_lookup.pickle', 'wb') as handle:
    pickle.dump(team_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
team_lookup = pickle.load(open('team_lookup.pickle', "rb" ))

## GRAB gamelogs for each team for 2019

In [None]:
gamelog = {}
for key in season_teams:
    game_url = "https://www.sports-reference.com" + season_teams[key][0] + str(season) +"-gamelogs.html"
    game_response = requests.get(game_url)
    gamelog_text = game_response.text
    gamelog_soup = BeautifulSoup(gamelog_text,"lxml")
    gamelog_table = gamelog_soup.find('table')
    gamelog_rows = gamelog_table.find_all('tr')
    team = re.split("\/",teams[key][0])[3]
    for i in [i for i in range(2,len(gamelog_rows)) if i % 22 != 0 and i % 23 != 0 ]:
        items = gamelog_rows[i].find_all('td')
        link = items[0].find('a')
        date, url = link.text, link['href']
        gamelog[url + team] = [url] + [team] + [i.text for i in items]

In [None]:
with open('gamelog.pickle', 'wb') as handle:
    pickle.dump(gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
advanced_gamelog = {}

def grab_advanced_gamelog(season_teams, season):
    for key in season_teams:
        advanced_game_url = "https://www.sports-reference.com" + season_teams[key][0] + str(season) +"-gamelogs-advanced.html"
        advanced_game_response = requests.get(game_url)
        advanced_gamelog_text = advanced_game_response.text
        advanced_gamelog_soup = BeautifulSoup(advanced_gamelog_text,"lxml")
        advanced_gamelog_table = advanced_gamelog_soup.find('table')
        advanced_gamelog_rows = advanced_gamelog_table.find_all('tr')
        team = re.split("\/",season_teams[key][0])[3]
        for i in [i for i in range(2,len(advanced_gamelog_rows)) if i % 22 != 0 and i % 23 != 0 ]:
            advanced_items = advanced_gamelog_rows[i].find_all('td')
            advanced_link = items[0].find('a')
            date, url = link.text, link['href']
            advanced_gamelog[url + team] = [url] + [team] + [i.text for i in items]
            
            
            
    return advanced_gamelog

In [18]:
advanced_gamelog = grab_advanced_gamelog(season_teams, 2019)

In [19]:
with open('advanced_gamelog.pickle', 'wb') as handle:
    pickle.dump(advanced_gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
gamelog = pickle.load(open('gamelog.pickle', "rb" ))
advanced_gamelog = pickle.load(open('advanced_gamelog.pickle', "rb" ))

In [20]:
advanced_gamelog

{'/cbb/boxscores/2018-11-06-19-abilene-christian.htmlabilene-christian': ['/cbb/boxscores/2018-11-06-19-abilene-christian.html',
  'abilene-christian',
  '2018-11-06',
  '',
  'Arlington Baptist',
  'W',
  '107',
  '54',
  '142.7',
  '72.0',
  '74.8',
  '.318',
  '.348',
  '.704',
  '69.1',
  '63.4',
  '17.3',
  '16.2',
  '',
  '.705',
  '12.6',
  '50.0',
  '.212',
  '',
  '.418',
  '27.7',
  '86.2',
  '.265'],
 '/cbb/boxscores/2018-11-09-20-abilene-christian.htmlabilene-christian': ['/cbb/boxscores/2018-11-09-20-abilene-christian.html',
  'abilene-christian',
  '2018-11-09',
  '',
  'Arkansas State',
  'W',
  '94',
  '73',
  '122.1',
  '94.8',
  '77.1',
  '.564',
  '.255',
  '.674',
  '59.4',
  '54.8',
  '10.4',
  '0.0',
  '',
  '.636',
  '20.5',
  '39.3',
  '.436',
  '',
  '.492',
  '18.5',
  '75.0',
  '.254'],
 '/cbb/boxscores/2018-11-15-21-denver.htmlabilene-christian': ['/cbb/boxscores/2018-11-15-21-denver.html',
  'abilene-christian',
  '2018-11-15',
  '@',
  'Denver',
  'W',
  '

In [21]:
def combine(gamelog, advanced_gamelog):
    combo_log = {}
    for game in gamelog.keys():
        combo_log[game] = gamelog[game].extend(advanced_gamelog[game])
    return combo_log

In [None]:
combo_log = combine(gamelog, advanced_gamelog)


with open('combo_log.pickle', 'wb') as handle:
    pickle.dump(combo_log, handle, protocol=pickle.HIGHEST_PROTOCOL)

## GRAB URL FOR ALL GAMES

## Grab data from single game - hold off on pulling individual game data for now

Input url for each game
Output data table

In [None]:
def game_box_score(url)

# example url = 'https://www.sports-reference.com/cbb/boxscores/2020-03-11-21-stanford.html'
#note this will grab 8 tables (the last 4 will be basic and advanced stats, for each team)
    box = requests.get(url)
    boxscores = box.text
    soup = BeautifulSoup(boxscores, "html5lib")
    tables = soup.find_all('table')
    
    return tables

Pull Data from class "scorebox_meta": Date, Location <- this does not show up in a table

Pull Data from table "line-score": <- this also does not show up in a table...
Output: Away Team, Home Team, Away Score 1H, Away Score 2H, Away Score F, Home Score 1H, Home Score 2h, Home Score F

? What about overtimes...

https://stackoverflow.com/questions/49766150/when-scraping-data-from-basketball-reference-how-come-certain-tables-are-comment