This is the web scraper for getting NCAA Men's Basketball season averages. It also gets matchup data for every year.

In [1]:
import pandas as pd
import statsmodels.formula.api as stats
import statsmodels.api as sm
import numpy as np
import math
import sklearn.metrics as sk
from bs4 import BeautifulSoup
import requests
import json
from os import path

In [2]:
#Returns Page data from given URL
def GetPageData(URL, timeout=30):
    """Fetch ``URL`` with an HTTP GET and return the ``requests`` response.

    Parameters
    ----------
    URL : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a stalled connection would block
        the whole scraping run indefinitely.

    Returns
    -------
    requests.Response
        The response object; callers read ``.content`` from it. The HTTP
        status code is not checked here.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    return requests.get(URL, timeout=timeout)

In [3]:
#Sample data scraper for barttorvik.com main stats page. Additional data available
#The season is defined once so the URL and the output filename cannot drift apart.
SCRAPE_YEAR = 2010
URL = 'https://barttorvik.com/trank.php?year=' + str(SCRAPE_YEAR) + '&sort=&top=0&conlimit=All#'


def StatWithoutLowrow(cell):
    """Return ``cell.text`` with the text of its 'lowrow' child cut off the end.

    Returns None when the cell has no element with class 'lowrow' (the header
    cells are skipped this way), so callers can ignore such cells explicitly
    instead of relying on a swallowed exception.
    """
    lowrow = cell.find(class_='lowrow')
    if lowrow is None:
        return None
    text = cell.text
    return text[:len(text)-len(lowrow.text)]


def ScrapeStatColumn(soup, css_class, header=None):
    """Collect the stat values of every cell carrying ``css_class``.

    Cells whose full text equals ``header`` (when given) and cells without a
    'lowrow' child are skipped; every other cell contributes its text minus
    the trailing 'lowrow' text.
    """
    values = []
    for cell in soup.find_all(class_=css_class):
        if header is not None and cell.text == header:
            continue
        value = StatWithoutLowrow(cell)
        if value is not None:
            values.append(value)
    return values


TeamData = {}

page = GetPageData(URL)
soup = BeautifulSoup(page.content, 'html.parser')

#College Name (everything before the first '(' in the team cell)
Names = [cell.text.split('(')[0] for cell in soup.find_all(class_='teamname')]
TeamData['Name']=Names

#Games played; the first matching cell is dropped (presumably the column header)
Games = [cell.text for cell in soup.find_all(class_='6 mobileout')]
if not Games:
    #Fail with a clear message rather than writing an empty CSV
    raise ValueError('No team rows found at ' + URL)
Games = Games[1:]
TeamData['GamesPlayed']=Games

#Team Record: overall record comes from the link, conference record from the 'lowrow' element.
#A leading apostrophe is prepended to both values, exactly as before.
ConferenceRecord = []
TotalRecord = []
for rec in soup.find_all(class_='5'):
    if rec.text != 'Rec':
        TotalRecord.append("'"+rec.find('a').text)
        ConferenceRecord.append("'"+rec.find(class_='lowrow').text)
TeamData['ConfRecord']=ConferenceRecord
TeamData['TotalRecord']=TotalRecord

#(output column, CSS class of the table cell, header text to skip or None)
#Order matters: it is the column order of the CSV.
STAT_COLUMNS = [
    ('OffEff', '1', 'AdjOE'),          #Offensive Efficiency
    ('DefEff', '2', 'AdjDE'),          #Defensive Efficiency
    ('Turnovers', '11', None),         #Turnovers
    ('ForcedTurnovers', '12', None),   #Forced Turnovers
    ('2PFGOff', '16', None),           #2P FG% Offensive
    ('2PFGDef', '17', None),           #2P FG% Defensive
    ('3PFGOff', '18', None),           #3P FG% Offensive
    ('3PFGDef', '19', None),           #3P FG% Defensive
    ('FTOff', '9', None),              #Free Throws Offensive
    ('FTDef', '10', None),             #Free Throws Defensive
]
for column, css_class, header in STAT_COLUMNS:
    TeamData[column] = ScrapeStatColumn(soup, css_class, header)

#pandas raises ValueError here if the scraped columns ended up with different lengths
Stats = pd.DataFrame(data = TeamData)
Stats.to_csv('FullSeasonStats_' + str(SCRAPE_YEAR) + '.csv', index = False)


In [12]:
#Remove Tournament seeding in title of old team names
def StripSeeding(name):
    """Keep the leading words of ``name`` up to the first all-digit word or 'vs.'.

    The kept words are re-joined with single spaces, and any remaining 'vs.'
    substring inside a kept word is removed.
    """
    kept = []
    for part in name.split():
        #Everything from the first number / 'vs.' token onwards is discarded
        if part.isdigit() or part == 'vs.':
            break
        kept.append(part)
    return ' '.join(kept).replace('vs.','')


years = list(range(2021,2022))
for year in years:
    #Backslash is escaped explicitly; '\F' is not a valid escape sequence
    filename = 'FullStats\\FullSeasonStats_'+str(year)+'.csv'
    with open(filename,'r') as f:
        df = pd.read_csv(f)
    df['Name'] = [StripSeeding(nam) for nam in df['Name']]
    #The cleaned names overwrite the original file in place
    df.to_csv(filename, index = False)

In [31]:
#Retrieve stats from a team's season
def GetTeamStats(TeamName,Year):
    """Download a team's barttorvik.com season page and return it parsed.

    Parameters
    ----------
    TeamName : str
        Team name as it should appear in the ``team`` query parameter
        (e.g. 'Michigan', 'Texas A&M').
    Year : int or str
        Season to request.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the 'html.parser' backend.
    """
    from urllib.parse import quote_plus

    #quote_plus turns spaces into '+' and '&' into '%26' like the previous
    #manual replacements, and also escapes any other reserved character.
    Url = ('https://www.barttorvik.com/team.php?team=' + quote_plus(str(TeamName))
           + '&year=' + str(Year))
    page = GetPageData(Url)
    return BeautifulSoup(page.content, 'html.parser')

#Scrape the team's opponent, score, game result, and +/- for every game that year (includes postseason)
def ScrapOpponent(Soup,year,team):
    """Extract one row per game from a parsed team page and write it to CSV.

    Parameters
    ----------
    Soup : bs4.BeautifulSoup
        Parsed team page (as returned by ``GetTeamStats``).
    year : int or str
        Season; used only to build the output path.
    team : str
        Team name; used only to build the output path.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    IndexError
        If the table rows run out before as many scores as +/- values have
        been collected.
    ValueError
        From pandas, if the collected columns end up with different lengths.

    Side effects
    ------------
    Writes 'Games\\<year>\\<team>_<year>_Opponent.csv' with the columns
    Opponent, Results, Score and PlusMinus.
    """
    PlusMinus=[]
    Opponent=[]
    Results=[]
    Score=[]

    #Opponent and +/- are read positionally from the 'mobileout' cells:
    #first opponent at index 23, one game every 21 cells, +/- 18 cells after
    #the opponent. NOTE(review): these offsets depend on the page layout.
    GameStats = Soup.find_all(class_='mobileout')
    i = 23
    while i < len(GameStats):
        try:
            PlusMinus.append(GameStats[i+18].text)
            Opponent.append(GameStats[i].text)
        except IndexError:
            #Incomplete trailing group of cells: no +/- cell, so skip the game
            pass
        i+=21

    #Get the Score: walk the table rows from index 34 until one score has
    #been collected for every +/- value.
    rows = Soup.find_all('tr')
    index = 34
    while len(Score) != len(PlusMinus):
        links = rows[index].find_all('a')
        index+=1
        try:
            #The fourth link of a row holds text of the form '<W|L>,<score>'
            score = links[3].text
            parts = score.split(',')
            if parts[0] == 'W' or parts[0] =='L':
                Results.append(parts[0])
                Score.append(parts[1].replace('-','_'))
            else:
                #Not a finished game: drop the last collected opponent and +/-
                PlusMinus.pop()
                Opponent.pop()
        except IndexError:
            #Row with fewer than four links (or nothing left to pop): ignore it
            pass

    Stats = pd.DataFrame(data = {
        'Opponent': Opponent,
        'Results': Results,
        'Score': Score,
        'PlusMinus': PlusMinus,
    })
    filename = 'Games\\'+str(year)+'\\'+team+'_'+str(year)+'_Opponent.csv'
    Stats.to_csv(filename, index = False)
    return(0)

In [501]:
#Testing
#Scratch check: which of a few candidate names exist in the loaded stats frame.
#Relies on `df` having been loaded by another cell.
year = 2010
team = 'Grambling St.'
bean = df.Name
#print(bean)
tem = ['Gonzaga','Vdog',"Houston"]
known_names = list(df.Name)   #built once instead of on every iteration
for teams in tem:
    if teams not in known_names:
        continue
    print(teams)
#TeamData = GetTeamStats(team,year)
#ScrapOpponent(TeamData,year,team)

Gonzaga
Houston


In [30]:
#Sample on scraping regular season stats for a team
#Downloads one team page and writes that team's game-by-game CSV.
sample_team, sample_year = 'Michigan', 2018
TeamData = GetTeamStats(sample_team, sample_year)
ScrapOpponent(TeamData, sample_year, sample_team)

0

In [19]:
#Iterates every year, every team, and scrapes their season's games
years = list(range(2021,2022))
for year in years:
    #Get the relevant year's full stats
    print("Starting Year: "+str(year))
    #Backslash is escaped explicitly; '\F' is not a valid escape sequence
    filename = 'FullStats\\FullSeasonStats_'+str(year)+'.csv'
    with open(filename,'r') as f:
        df = pd.read_csv(f)
    for name in df.Name:
        #Optional resume guard (disabled): only scrape teams whose CSV is missing
        #if not path.exists('Games\\'+str(year)+'\\'+name+'_'+str(year)+'_Opponent.csv'):
        try:
            TeamData = GetTeamStats(name,year)
            ScrapOpponent(TeamData,year,name)
        except Exception:
            #Best effort: report the team and carry on with the next one.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel
            #interrupt able to stop the long-running loop.
            print("Bad Team: "+str(name))

Starting Year: 2021
Bad Team: Tulsa


In [None]:
#Gives every game for the season with game stats in JSON
dataMich = GetTeamStats('Michigan','2019')
relevant = dataMich.find_all(class_='teamOne')
dataStuff = relevant[0].find('script')
rawj = str(dataStuff)
#Keep the text between the 'var alldata = ' assignment and the last 'var gdata'
j = rawj.split('var alldata = ')[1]
dataArray = j.rsplit('var gdata',1)[0]
#raw_decode parses the JSON value at the start of the string and ignores
#whatever follows it, so no page-specific character count is needed.
tup = json.JSONDecoder().raw_decode(dataArray.lstrip())[0]
for keys in tup:
    print(keys + ': ' + str(tup[keys]))