In [30]:
import string
import re
import warnings
warnings.filterwarnings('ignore')
import urllib.request
from itertools import compress

import lxml.html as LH
import requests
from bs4 import BeautifulSoup as bs

import pandas as pd

from utils import get_request

In [69]:
class bbref_scrape:
    def __init__(self, year, sport_type, url):
        self.year = year
        self.sport_type = sport_type
        self.url = url.replace("YYYY", str(year))

    def get_player_ids(self):
        r = get_request(self.url)
        all_tags = bs(r.content, "html.parser")
        tmp = [x for x in all_tags.find_all("td", class_ = "left")]
        ids = []
        for x in tmp:
            try:
                ids.append(x["data-append-csv"])
            except:
                next
        return(list(set(ids)))

    def get_player_links(self):
        r = get_request(self.url)
        all_tags = bs(r.content, "html.parser")
        ids_bool = [bool(re.search(pattern="players/\w/.+", string = x["href"])) for x in all_tags.find_all("a")]
        ids = list(compress([x["href"] for x in all_tags.find_all("a")], ids_bool))
        return(list(set([re.sub(pattern="[.](html)",string=x, repl="") for x in ids])))
    
    def get_player_gamelogs(self, link):
        def text(elt):
            return elt.text_content().replace(u'\xa0', u' ')
        
        if(self.sport_type == "basketball"):
            ref_link = "basketball-reference.com/"
            n = 30
            tbl_xpath = '//*[@id="pgl_basic"]'
            game_log_cols = ['bbrefID', 'G', 'Date', 'Age', 'Tm', 'is_away', 'Opp', 'game_outcome', 'GS', 'MP', 'FG',
                               'FGA', 'FG_pct', '3P', '3PA', '3P_pct', 'FT', 'FTA', 'FT_pct', 'ORB', 'DRB',
                                   'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'plus_minus']
        else:
            ref_link = "hockey-reference.com/"
            n = 29
            tbl_xpath = '//*[@id="gamelog"]'

        bbrefID = re.findall(string=link, pattern="(?<=[/])\w+|\d+")[2]
        url = "https://www."+ ref_link + link + "/gamelog/" + str(self.year)
        r = get_request(url)
        all_tags = LH.fromstring(r.content)
    
        for table in all_tags.xpath(tbl_xpath):
            header = [text(th) for th in table.xpath('//th')][1:n]
            data = [[text(td) for td in tr.xpath('td')]  
                    for tr in table.xpath('//tr')][1:]
            data = [row for row in data if len(row)==len(header)]
            data = pd.DataFrame(data, columns = header)
            df = pd.concat([pd.DataFrame({"bbrefID":[bbrefID for bbref in range(len(data))]}), data], axis=1)
            
            if df is not None and self.sport_type == "basketball":
                df.columns = game_log_cols
                df[["FG", "3P", "TRB", "AST", "STL", "BLK", "TOV"]] = df[["FG", "3P", "TRB", "AST", "STL", "BLK", "TOV"]].astype(float)
                df["dk"] = (1*df["FG"]) + ((1/2)*df["3P"]) + ((5/4)*df["TRB"]) + ((3/2)*df["AST"]) + (2*df["STL"]) + (2*df["BLK"]) + ((1/2)*df["TOV"])
                double_double = pd.Series(df[["FG", "TRB", "AST", "STL", "BLK", "TOV"]].apply(lambda x: sum(x>=10), axis = 1) > 1)
                df["dk"][double_double] += 1.5
            return(df)
    
    def run(self):
        player_ids = self.get_player_links()
        player_gamelog_list = [self.get_player_gamelogs(link = x) for x in player_ids[0:5]]
        return pd.concat([x for x in player_gamelog_list if x is not None], axis=0, ignore_index=True)

In [70]:
player_pg = "https://www.basketball-reference.com/leagues/NBA_YYYY_per_game.html"
scraper = bbref_scrape(sport_type="basketball", year=2017, url=player_pg)

In [71]:
tmp = scraper.run()

0 0 0 0 0 0 

In [72]:
tmp.to_gbq(project_id="scarlet-labs", destination_table="basketball.gamelogs_2017")

ContextualVersionConflict: (pyasn1 0.1.9 (c:\users\neste\anaconda3\lib\site-packages), Requirement.parse('pyasn1<0.5.0,>=0.4.1'), {'pyasn1-modules'})

In [65]:
tmp

Unnamed: 0,bbrefID,G,Date,Age,Tm,is_away,Opp,game_outcome,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,plus_minus,dk
0,mcrobjo01,1,2016-11-12,29-258,MIA,,UTA,L (-11),0,12:50,...,0.0,1.0,0.0,0.0,1.0,2,4,0.9,+3,4.00
1,mcrobjo01,2,2016-11-14,29-260,MIA,@,SAS,L (-4),0,13:59,...,3.0,3.0,2.0,0.0,0.0,2,0,0.0,-4,12.25
2,mcrobjo01,3,2016-11-15,29-261,MIA,,ATL,L (-3),0,8:59,...,5.0,1.0,0.0,0.0,0.0,2,0,-0.3,-3,7.75
3,mcrobjo01,4,2016-11-21,29-267,MIA,@,PHI,L (-7),0,11:41,...,4.0,0.0,0.0,1.0,1.0,0,2,1.7,-7,8.50
4,mcrobjo01,5,2016-11-23,29-269,MIA,@,DET,L (-23),0,15:39,...,7.0,2.0,1.0,0.0,0.0,0,2,3.4,+5,14.75
5,mcrobjo01,6,2016-11-25,29-271,MIA,@,MEM,W (+9),0,12:04,...,5.0,3.0,0.0,1.0,0.0,2,5,4.4,-4,14.75
6,mcrobjo01,7,2016-11-26,29-272,MIA,,MEM,L (-3),0,12:55,...,2.0,1.0,2.0,0.0,0.0,1,2,4.6,-4,9.00
7,mcrobjo01,8,2016-11-28,29-274,MIA,,BOS,L (-8),0,6:37,...,2.0,2.0,0.0,0.0,1.0,1,2,2.0,-4,7.00
8,mcrobjo01,9,2016-11-30,29-276,MIA,@,DEN,W (+8),1,16:28,...,3.0,3.0,0.0,1.0,1.0,3,1,-0.3,-6,10.75
9,mcrobjo01,10,2016-12-01,29-277,MIA,@,UTA,W (+1),1,23:32,...,5.0,1.0,0.0,0.0,2.0,0,10,6.0,-8,13.75


In [4]:
hp = HTMLTableParser()
#NBA
player_pg = "https://www.basketball-reference.com/leagues/NBA_2017_per_game.html"
player_p100 = "https://www.basketball-reference.com/leagues/NBA_2017_per_poss.html"
player_p36 = "https://www.basketball-reference.com/leagues/NBA_2017_per_minute.html"
player_advanced = "https://www.basketball-reference.com/leagues/NBA_2017_advanced.html"
#NHL
skater_basic = "https://www.hockey-reference.com/leagues/NHL_2018_skaters.html"
skater_advanced = "https://www.hockey-reference.com/leagues/NHL_2018_skaters-advanced.html"

In [24]:
#NHL Skater Game Logs
year = 2018
game_log_cols = ['bbrefID', 'Date', 'G', 'Age', 'Team', 'is_away', 'Opp', 'win_loss', 'goals', 'assists', 'pts',
       'plus_minus', 'pim', 'g_ev', 'g_pp', 'g_sh', 'g_gw', 'a_ev', 'a_pp', 'a_sh', 'shots', 'shooting_pct', 'shifts', 'toi',
       'hits', 'blocks', 'fow', 'fol', 'fo_pct']
ids = bbref_scrape.get_player_links(skater_basic)
game_logs = [bbref_scrape.get_player_gamelogs(sport_type = "hockey", link = x, year = year) for x in ids]
game_logs_new = list(compress(game_logs, [x is not None for x in game_logs]))
game_logs_df = pd.concat(list(compress(game_logs_new, ["Goalie Stats" not in x.columns for x in game_logs_new])))
game_logs_df.columns = game_log_cols
game_logs_df.to_csv("C:\\Users\\neste\\Google Drive\\NBA-NHL\\game_logs\\nhl_game_logs_%s.csv" % str(year))