In [455]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re

## Grab names for all NCAA teams

In [80]:
url = "https://www.sports-reference.com/cbb/schools/"
response = requests.get(url)
teams_text = response.text
team_soup = BeautifulSoup(teams_text, "lxml")
team_table = team_soup.find('table')
team_rows = team_table.find_all('tr')

In [250]:
#note that after every 20 teams, there is another header row
teams = {}
for i in [i for i in range(1,len(team_rows)) if i % 21 != 0]:
    items = team_rows[i].find_all('td')
    link = items[0].find('a')
    school, url = link.text, link['href']
    teams[school] = [url] + [i.text for i in items]

In [370]:
with open('teams_list.pickle', 'wb') as handle:
    pickle.dump(teams, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [381]:
#find teams in D1 for particular season
season = 2019
season_teams = []
for team in teams.keys():
    if int(teams[team][3]) <= season <= int(teams[team][4]):
        season_teams.append(team)


In [560]:
#create team_lookup for names - url name as key, values include name, formal name, and team url
team_lookup = {}
for key in season_teams:
    sched_url = "https://www.sports-reference.com" + teams[key][0] + str(season) +"-schedule.html"
    sched_response = requests.get(sched_url)
    sched_text = sched_response.text
    sched_soup = BeautifulSoup(sched_text,"lxml")
    sched_table = sched_soup.find('table')
    simple_name = re.split("\/",teams[key][0])[3]
    sched_rows = sched_table.find_all('tr')
    name = sched_table.find('a').text
    formal_name = teams[key][1]
    team_lookup[simple_name] = [name] + [formal_name] + [teams[key][0]]

In [564]:
with open('teams_lookup.pickle', 'wb') as handle:
    pickle.dump(team_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)

## GRAB gamelogs for each team for 2019

In [None]:
gamelog = {}
for key in season_teams:
    game_url = "https://www.sports-reference.com" + teams[key][0] + str(season) +"-gamelogs.html"
    game_response = requests.get(game_url)
    gamelog_text = game_response.text
    gamelog_soup = BeautifulSoup(gamelog_text,"lxml")
    gamelog_table = gamelog_soup.find('table')
    gamelog_rows = gamelog_table.find_all('tr')
    team = re.split("\/",teams[key][0])[3]
    for i in [i for i in range(2,len(gamelog_rows)) if i % 22 != 0 and i % 23 != 0 ]:
        items = gamelog_rows[i].find_all('td')
        link = items[0].find('a')
        date, url = link.text, link['href']
        gamelog[url + team] = [url] + [team] + [i.text for i in items]

In [384]:
with open('gamelog.pickle', 'wb') as handle:
    pickle.dump(gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [427]:
games = pd.DataFrame.from_dict(gamelog, orient = "index")

In [441]:
columns = ['url','Team','Date','Court','Opponent','Result','Tm','Opp','FG','FGA','FG%','3P','3PA','3P%','FT','FTA',
           'FT%','ORB','TRB','AST','STL','BLK','TOV','PF','blank','FG_O','FGA_O','FG%_O','3P_O','3PA_O','3P%_O','FT_O',
           'FTA_O','FT%_O','RB_O','TRB_O','AST_O','STL_O','BLK_O','TOV_O','PF_O']
games.columns=columns
games.drop(columns='blank');

In [636]:
#formatting data frame
games['Date'] = pd.to_datetime(games['Date'])
num_cols_basic = ['Tm','Opp','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','TRB','AST','STL','BLK',
                  'TOV','PF','blank','FG_O','FGA_O','FG%_O','3P_O','3PA_O','3P%_O','FT_O','FTA_O','FT%_O','RB_O',
                  'TRB_O','AST_O','STL_O','BLK_O','TOV_O','PF_O']
for i in num_cols_basic:
    games[i] = pd.to_numeric(games[i])

In [484]:
#Grab advanced stats for each team
advanced_gamelog = {}
for key in season_teams:
    game_url = "https://www.sports-reference.com" + teams[key][0] + str(season) +"-gamelogs-advanced.html"
    game_response = requests.get(game_url)
    gamelog_text = game_response.text
    gamelog_soup = BeautifulSoup(gamelog_text,"lxml")
    gamelog_table = gamelog_soup.find('table')
    gamelog_rows = gamelog_table.find_all('tr')
    team = re.split("\/",teams[key][0])[3]
    for i in [i for i in range(2,len(gamelog_rows)) if i % 22 != 0 and i % 23 != 0 ]:
        items = gamelog_rows[i].find_all('td')
        link = items[0].find('a')
        date, url = link.text, link['href']
        advanced_gamelog[url + team] = [url] + [team] + [i.text for i in items]

In [485]:
with open('advanced_gamelog.pickle', 'wb') as handle:
    pickle.dump(advanced_gamelog, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [487]:
advanced_games = pd.DataFrame.from_dict(advanced_gamelog, orient = "index")

In [540]:
columns = ['url','Team','Date','Court','Opponent','Result','Tm','Opp','ORtg','DRtg','Pace','FTr','3PAr','TS%','TRB%',
           'AST%','STL%','BLK%','blank1','OeFG%','OTOV%','ORB%','OFT/FGA','blank2','DeFG%','DTOV%','DRB%','DFT/FGA']
advanced_games.columns=columns
advanced_games.drop(columns=['blank1','blank2']);

In [640]:
advanced_games['Date'] = pd.to_datetime(advanced_games['Date'])
num_cols_advanced = ['Tm','Opp','ORtg','DRtg','Pace','FTr','3PAr','TS%','TRB%',
           'AST%','STL%','BLK%','blank1','OeFG%','OTOV%','ORB%','OFT/FGA','blank2','DeFG%','DTOV%','DRB%','DFT/FGA']
for i in num_cols_advanced:
    advanced_games[i] = pd.to_numeric(advanced_games[i])

In [587]:
advanced_games[advanced_games['Team']=='california'][1:2]

Unnamed: 0,url,Team,Date,Court,Opponent,Result,Tm,Opp,ORtg,DRtg,...,blank1,OeFG%,OTOV%,ORB%,OFT/FGA,blank2,DeFG%,DTOV%,DRB%,DFT/FGA
/cbb/boxscores/2018-11-13-22-california.htmlcalifornia,/cbb/boxscores/2018-11-13-22-california.html,california,2018-11-13,,Hampton,W,80,66,108.1,89.2,...,,0.577,13.4,17.2,0.385,,0.331,10.0,70.0,0.268


In [641]:
# Final DF
df = pd.DataFrame(games['url'])
df['Team'] = [team_lookup[x][0] for x in games['Team']]
df['Date'] = games['Date']
df['Opponent'] = games['Opponent']
df['Court'] = games['Court']

In [642]:
games[games['Team']=='california'][1:2]

Unnamed: 0,url,Team,Date,Court,Opponent,Result,Tm,Opp,FG,FGA,...,FT_O,FTA_O,FT%_O,RB_O,TRB_O,AST_O,STL_O,BLK_O,TOV_O,PF_O
/cbb/boxscores/2018-11-13-22-california.htmlcalifornia,/cbb/boxscores/2018-11-13-22-california.html,california,2018-11-13,,Hampton,W,80,66,26,52,...,19,22,0.864,12,36,5,4,1,9,22


In [643]:
df['PD'] = games['Tm']-games['Opp']

In [644]:
df.head()

Unnamed: 0,url,Team,Date,Opponent,Court,PD
/cbb/boxscores/2018-11-09-23-california.htmlcalifornia,/cbb/boxscores/2018-11-09-23-california.html,California,2018-11-09,Yale,N,-17
/cbb/boxscores/2018-11-13-22-california.htmlcalifornia,/cbb/boxscores/2018-11-13-22-california.html,California,2018-11-13,Hampton,,14
/cbb/boxscores/2018-11-19-19-california.htmlcalifornia,/cbb/boxscores/2018-11-19-19-california.html,California,2018-11-19,St. John's (NY),N,-3
/cbb/boxscores/2018-11-20-17-temple.htmlcalifornia,/cbb/boxscores/2018-11-20-17-temple.html,California,2018-11-20,Temple,N,-17
/cbb/boxscores/2018-11-26-21-california.htmlcalifornia,/cbb/boxscores/2018-11-26-21-california.html,California,2018-11-26,Santa Clara,,12


In [620]:
games.Court.unique()

array(['N', '', '@'], dtype=object)

## GRAB URL FOR ALL GAMES

## Grab data from single game - hold off on pulling individual game data for now

Input url for each game
Output data table

In [7]:
url = 'https://www.sports-reference.com/cbb/boxscores/2020-03-11-21-stanford.html'

box = requests.get(url)
boxscores = box.text
soup = BeautifulSoup(boxscores, "html5lib")

In [65]:
tables = soup.find_all('table')
#note this will grab 8 tables (the last 4 will be basic and advanced stats, for each team)

Pull Data from class "scorebox_meta": Date, Location <- this does not show up in a table

Pull Data from table "line-score": <- this also does not show up in a table...
Output: Away Team, Home Team, Away Score 1H, Away Score 2H, Away Score F, Home Score 1H, Home Score 2h, Home Score F

? What about overtimes...

## Scratchwork

In [583]:
#test case for gamelogs
cal_url = "https://www.sports-reference.com/cbb/schools/california/2019-gamelogs.html"
cal_response = requests.get(cal_url)
gamelog_text = cal_response.text
gamelog_soup = BeautifulSoup(gamelog_text,"lxml")
gamelog_table = gamelog_soup.find('table')
gamelog_rows = gamelog_table.find_all('tr')
#test case continued
cal_log = {}
team = 'california'
for i in [i for i in range(2,len(gamelog_rows)) if i % 22 != 0 and i % 23 != 0 ]:
    items = gamelog_rows[i].find_all('td')
    link = items[0].find('a')
    date, url = link.text, link['href']
    cal_log[url + team] = [url] + [team] + [i.text for i in items]