In [None]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re

## Grab names for all NCAA teams

In [None]:
# Download the Sports-Reference schools index page and parse it with the lxml parser.
url = "https://www.sports-reference.com/cbb/schools/"
response = requests.get(url)
teams_text = response.text
team_soup = BeautifulSoup(teams_text, "lxml")
# Only the first <table> on the page is used; its <tr> rows are parsed in the next cell.
team_table = team_soup.find('table')
team_rows = team_table.find_all('tr')


In [None]:
# Build the school lookup from the index table rows.
#   key:   school name (text of the link in the row's first <td>)
#   value: [school url (the link's href)] + the text of every <td> in the row
# The table repeats its header row periodically (originally noted as "after every 20 teams").
# Rows are filtered by content rather than by position: a row with no <td> cells, or whose
# first cell holds no link, is skipped, so the loop does not depend on where headers fall.
# NOTE(review): assumes header rows contain no <td> cells - confirm against the live page.
teams_list = {}
for row in team_rows:
    cells = row.find_all('td')
    if not cells:
        continue
    link = cells[0].find('a')
    if link is None:
        continue
    # `href` (not `url`) so the index-page `url` defined earlier is not overwritten
    school, href = link.text, link['href']
    teams_list[school] = [href] + [cell.text for cell in cells]

In [None]:
# Write the school lookup to disk.
with open('teams_list.pickle', mode='wb') as out_file:
    pickle.dump(teams_list, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Create Dictionary of D1 Teams for each season

In [None]:
def teams(teams_list, season):
    '''
    Select the teams whose year range covers a particular season

    input:
        teams_list: dictionary keyed by team; each value is a list whose entries at
                    index 3 and 4 are the lower and upper bound years (convertible with int)
        season: this will be the year of interest
    output: a dictionary holding the entries of teams_list (same keys, same value objects)
            for which lower bound <= season <= upper bound
    '''
    return {
        school: record
        for school, record in teams_list.items()
        if int(record[3]) <= season <= int(record[4])
    }

In [None]:
def team_name_dict(season_teams, season):
    '''
    Create team_lookup for names - url name as key, values include name, formal name, and team url

    input:
        season_teams: the teams identified for a particular season; each value is a list whose
                      first entry is the team's relative url and whose second entry is the
                      formal name (the dictionary keys themselves are not used)
        season: the year of the season, inserted into each schedule-page url
    output: a dictionary where the keys are the simple names (4th "/"-separated piece of the
            relative url) and each value is
            [text of the first link in the schedule table, formal name, relative url]

    One HTTP request is made per team.
    '''
    team_lookup = {}
    for record in season_teams.values():
        school_path, formal_name = record[0], record[1]
        sched_url = "https://www.sports-reference.com" + school_path + str(season) + "-schedule.html"
        sched_soup = BeautifulSoup(requests.get(sched_url).text, "lxml")
        sched_table = sched_soup.find('table')
        # str.split gives the same pieces as the regex split did, without the "\/" pattern,
        # which is an invalid string escape (SyntaxWarning on Python 3.12+)
        simple_name = school_path.split("/")[3]
        name = sched_table.find('a').text
        team_lookup[simple_name] = [name, formal_name, school_path]
    return team_lookup

In [None]:
#analysis was done for years 2014-2019, omitting 2020 because the season was cut short due to covid-19
# Build and save one name lookup per season. The loop's own season is passed to
# team_name_dict so that each team_lookup_<year>.pickle is scraped from that year's
# schedule pages rather than from a single fixed year.
for season in [2014, 2015, 2016, 2017, 2018, 2019]:
    season_teams = teams(teams_list, season)
    team_lookup = team_name_dict(season_teams, season)
    with open(f'team_lookup_{season}.pickle', 'wb') as handle:
        pickle.dump(team_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# NOTE(review): no cell in this notebook writes 'team_lookup.pickle' (the loop above writes
# 'team_lookup_<year>.pickle'), so this file presumably comes from an earlier run - confirm
# which season it should hold. pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call left open.
with open('team_lookup.pickle', 'rb') as handle:
    team_lookup = pickle.load(handle)

## Helper functions to web-scrape data

In [None]:
def grab_gamelog(season_teams, season):
    '''
    This grabs the gamelogs for all teams, for the season that is specified.

    Input:
        season_teams: dictionary whose values are lists starting with the team's relative url;
                      the dictionary keys themselves are not used
        season: specified season, inserted into each gamelog url
    Output: A dictionary with one entry per game row per team. The key is the href of the link
            in the row's first <td> followed by the team's simple name (4th "/"-separated piece
            of its relative url); the value is [href, simple name] + the text of every <td>
            in that row. The dictionary will hold all teams' information

    Each url is printed as it is requested. One HTTP request is made per team.
    '''
    gamelog = {}
    for record in season_teams.values():
        school_path = record[0]
        game_url = "https://www.sports-reference.com" + school_path + str(season) + "-gamelogs.html"
        print(game_url)
        gamelog_soup = BeautifulSoup(requests.get(game_url).text, "lxml")
        gamelog_rows = gamelog_soup.find('table').find_all('tr')
        # str.split replaces the regex "\/", which is an invalid string escape
        team = school_path.split("/")[3]
        for row in gamelog_rows:
            cells = row.find_all('td')
            # Select rows by content instead of by index: rows without <td> cells or without
            # a link in the first cell are skipped wherever they occur, and no game row is
            # dropped merely because its position is a multiple of 22 or 23.
            if not cells:
                continue
            link = cells[0].find('a')
            if link is None:
                continue
            url = link['href']
            gamelog[url + team] = [url, team] + [cell.text for cell in cells]
    return gamelog

In [None]:
def grab_gamelog_advanced(season_teams, season):
    '''
    This grabs the gamelogs with ADVANCED STATS for all teams, for the season that is specified.

    Note this follows the same steps as the grab_gamelog function, except it requests the
    "-gamelogs-advanced.html" page and does not print the url

    Input:
        season_teams: dictionary whose values are lists starting with the team's relative url;
                      the dictionary keys themselves are not used
        season: specified season, inserted into each gamelog url
    Output: A dictionary with one entry per game row per team. The key is the href of the link
            in the row's first <td> followed by the team's simple name (4th "/"-separated piece
            of its relative url); the value is [href, simple name] + the text of every <td>
            in that row. The dictionary will hold all teams' information

    One HTTP request is made per team.
    '''
    gamelog = {}
    for record in season_teams.values():
        school_path = record[0]
        game_url = "https://www.sports-reference.com" + school_path + str(season) + "-gamelogs-advanced.html"
        gamelog_soup = BeautifulSoup(requests.get(game_url).text, "lxml")
        gamelog_rows = gamelog_soup.find('table').find_all('tr')
        # str.split replaces the regex "\/", which is an invalid string escape
        team = school_path.split("/")[3]
        for row in gamelog_rows:
            cells = row.find_all('td')
            # Select rows by content instead of by index: rows without <td> cells or without
            # a link in the first cell are skipped wherever they occur, and no game row is
            # dropped merely because its position is a multiple of 22 or 23.
            if not cells:
                continue
            link = cells[0].find('a')
            if link is None:
                continue
            url = link['href']
            gamelog[url + team] = [url, team] + [cell.text for cell in cells]
    return gamelog

In [None]:
def combine(gamelog, advanced_gamelog):
    '''
    Combines the two dictionaries so that there is a single dictionary housing both basic and advanced stats

    input:
        gamelog: dictionary of basic stats; each value is a sequence of entries for one game
        advanced_gamelog: dictionary of advanced stats with (at least) the same keys
    output: a NEW dictionary with the keys of gamelog; each value is a new list holding the
            basic entries followed by the advanced entries for that key
    raises: KeyError if a key of gamelog is missing from advanced_gamelog

    Neither input is modified, so calling this again on the same inputs gives the same
    result instead of appending the advanced entries a second time.
    '''
    return {
        game: [*basic, *advanced_gamelog[game]]
        for game, basic in gamelog.items()
    }

## Get Line Data

In [None]:
def get_line_data(year_nums):
    """
    Helper function to load data from http://www.thepredictiontracker.com/basketball.php

    For every number in year_nums, the CSV named "ncaabb<number - 1>.csv" is read from the
    site with pandas; the resulting frames are concatenated in input order and returned.
    """
    template = "http://www.thepredictiontracker.com/ncaabb{}.csv"
    frames = [pd.read_csv(template.format(year - 1)) for year in year_nums]
    return pd.concat(frames)

In [None]:
# Year numbers handed to get_line_data, which reads the file numbered one lower for each.
line_years = [14, 15, 16, 17, 18, 19]
lines = get_line_data(line_years)
# Write the combined frame to disk.
with open('lines.pickle', mode='wb') as out_file:
    pickle.dump(lines, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## CODE TO GRAB THE DATA

In [None]:
## Pull multiple seasons

my_seasons = [2015, 2019]

for season in my_seasons:
    # teams whose year range covers this season
    this_season = teams(teams_list, season)

    # basic game logs: scrape, then save (written before the combine step below)
    this_gamelog = grab_gamelog(this_season, season)
    with open(f'gamelog_{season}.pickle', mode='wb') as out_file:
        pickle.dump(this_gamelog, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'gamelog {season}')

    # advanced game logs: scrape, then save
    this_advanced_gamelog = grab_gamelog_advanced(this_season, season)
    with open(f'advanced_gamelog_{season}.pickle', mode='wb') as out_file:
        pickle.dump(this_advanced_gamelog, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'advanced_gamelog {season}')

    # basic + advanced in a single dictionary, saved last
    this_combo_log = combine(this_gamelog, this_advanced_gamelog)
    with open(f'combo_log_{season}.pickle', mode='wb') as out_file:
        pickle.dump(this_combo_log, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'done with {season} season')