In [76]:
from os import path
import argparse
from datetime import datetime
import string
import re
import warnings
warnings.filterwarnings('ignore')
import urllib.request
from itertools import compress

import lxml.html as LH
import requests
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np

from utils import get_request

In [6]:
def get_player_info(url):
        r = get_request(url)
        all_tags = bs(r.content, "html.parser")

        tbl = all_tags.find("table", attrs={"class" : "sortable stats_table"})
        tbl_rows = tbl.find_all('tr')
        df_columns = ["player", "pos", "age", "tm", "g", "gs", "mp"]
        line = []
        for tr in tbl_rows:
            td = tr.find_all('td')
            row = [tr.text for tr in td]
            line.append(row)
        df = pd.DataFrame(line).iloc[1:,0:7]
        df.columns = df_columns
        df = df.set_index("player")

        ids_raw = [x for x in all_tags.find_all("td", class_ = "left")]
        ids = []
        player_name = []
        for x in ids_raw:
            try:
                player_name.append(x.get_text().strip())
                ids.append(x["data-append-csv"])
            except:
                next
        player_id_df = pd.DataFrame({"player":player_name[::2], "bbrefID":ids}).set_index("player")

        df_combined = df.join(player_id_df, how="inner").reset_index().drop_duplicates()
        return df_combined

In [7]:
test_url = 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html'
tmp = get_player_info(test_url)

In [25]:
tmp['mp'] = tmp['mp'].apply(lambda x: (int(x.split(".")[0])*60) + int((x.split(".")[1])))

In [26]:
tmp

Unnamed: 0,player,pos,age,tm,g,gs,mp,bbrefID
0,Aaron Brooks,PG,33,MIN,32,1,309,brookaa01
1,Aaron Gordon,PF,22,ORL,58,57,1929,gordoaa01
2,Aaron Harrison,SG,23,DAL,9,3,1509,harriaa01
3,Aaron Jackson,PG,31,HOU,1,0,2100,jacksaa01
4,Abdel Nader,SF,24,BOS,48,1,609,naderab01
5,Adreian Payne,PF,26,ORL,5,0,486,paynead01
6,Al Horford,C,31,BOS,72,72,1866,horfoal01
7,Al Jefferson,C,33,IND,36,1,784,jeffeal01
8,Al-Farouq Aminu,PF,27,POR,69,67,1800,aminual01
9,Alan Williams,PF,25,PHO,5,0,840,willial03


In [77]:
url = 'https://www.basketball-reference.com/players/a/arizatr01/gamelog/2018'

r = get_request(url)
all_tags = bs(r.content, "html.parser")

tbl = all_tags.find("table", attrs={"class" : "row_summable sortable stats_table"})
tbl_rows = tbl.find_all('tr')
line = []
for tr in tbl_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    line.append(row)

game_log_cols = ['G', 'date', 'age', 'tm', 'is_away', 'opp', 'game_outcome', 'GS', 'MP', 'FG',
                           'FGA', 'FG_pct', 'ThreeP', 'ThreePA', 'ThreeP_pct', 'FT', 'FTA', 'FT_pct', 'ORB', 'DRB',
                               'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'plus_minus']
df = pd.DataFrame(line).iloc[1:,:]
df.columns = game_log_cols
df = df[df['G'] != ''].dropna(axis=0).reset_index().iloc[:,1:]

df['secs_played'] = df['MP'].apply(lambda x: (int(x.split(":")[0])*60) + int((x.split(":")[1])))
df['is_away'] = np.where(df['is_away'] == '@', 'away', 'home')
df[["FG", "ThreeP", "TRB", "AST", "STL", "BLK", "TOV"]] = df[["FG", "ThreeP", "TRB", "AST", "STL", "BLK", "TOV"]].astype(float)
df["dk"] = (1*df["FG"]) + ((1/2)*df["ThreeP"]) + ((5/4)*df["TRB"]) + ((3/2)*df["AST"]) + (2*df["STL"]) + (2*df["BLK"]) + ((1/2)*df["TOV"])
double_double = pd.Series(df[["FG", "TRB", "AST", "STL", "BLK", "TOV"]].apply(lambda x: sum(x>=10), axis = 1) > 1)
df["dk"][double_double] += 1.5

In [102]:
url = 'https://www.basketball-reference.com/players/{first_letter}/{bbrefID}/gamelog-advanced/2018/'.format(first_letter = bbrefID[0], bbrefID=bbrefID)

r = get_request(url)
all_tags = bs(r.content, "html.parser")

tbl = all_tags.find("table", attrs={"class" : "row_summable sortable stats_table"})
tbl_rows = tbl.find_all('tr')
line = []
for tr in tbl_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    line.append(row)
    
df = pd.DataFrame(line).iloc[1:,:]
game_log_cols = ['G', 'date', 'age', 'tm', 'is_away', 'opp', 'game_outcome', 'GS', 'MP', "TS_pct", "eFG_pct", "ORB_pct",
                "DRB_pct", "TRB_pct", "AST_pct", "STL_pct", "BLK_pct", "TOV_pct", "USG_pct", "ORtg", "DRtg", "GmSc"]
df.columns = game_log_cols
df = df[df['G'] != ''].dropna(axis=0).reset_index().iloc[:,1:]

df['is_away'] = np.where(df['is_away'] == '@', 'away', 'home')

In [79]:
bbrefID = 'arizatr01'

In [110]:
class bbrefGameLogs:
    def __init__(self, year):
        self.year = year
    
    def get_player_game_logs(self, bbrefID, game_log_type):
        if game_log_type == "standard":
            url = 'https://www.basketball-reference.com/players/{first_letter}/{bbrefID}/gamelog/{year}/'.format(first_letter = bbrefID[0], 
                                                                                                                 bbrefID=bbrefID,
                                                                                                                 year=self.year)
            game_log_cols = ['G', 'date', 'age', 'tm', 'venue', 'opp', 'game_outcome', 'GS', 'MP', 'FG',
                               'FGA', 'FG_pct', 'ThreeP', 'ThreePA', 'ThreeP_pct', 'FT', 'FTA', 'FT_pct', 'ORB', 'DRB',
                                   'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'plus_minus']
        
        elif game_log_type == "advanced":
            url = url = 'https://www.basketball-reference.com/players/{first_letter}/{bbrefID}/gamelog-advanced/{year}/'.format(first_letter = bbrefID[0], 
                                                                                                                                bbrefID=bbrefID, 
                                                                                                                                year=self.year)
            game_log_cols = ['G', 'date', 'age', 'tm', 'venue', 'opp', 'game_outcome', 'GS', 'MP', "TS_pct", "eFG_pct", "ORB_pct",
                    "DRB_pct", "TRB_pct", "AST_pct", "STL_pct", "BLK_pct", "TOV_pct", "USG_pct", "ORtg", "DRtg", "GmSc"]
        
        #make request for table
        r = get_request(url)
        all_tags = bs(r.content, "html.parser")

        tbl = all_tags.find("table", attrs={"class" : "row_summable sortable stats_table"})
        tbl_rows = tbl.find_all('tr')
        line = []
        for tr in tbl_rows:
            td = tr.find_all('td')
            row = [tr.text for tr in td]
            line.append(row)
            
        #clean the table
        df = pd.DataFrame(line).iloc[1:,:]
        df.columns = game_log_cols
        df = df[df['G'] != ''].dropna(axis=0).reset_index().iloc[:,1:]
        
        if game_log_type == "standard":
            df['secs_played'] = df['MP'].apply(lambda x: (int(x.split(":")[0])*60) + int((x.split(":")[1])))
            df['venue'] = np.where(df['venue'] == '@', 'away', 'home')
            df[["FG", "ThreeP", "TRB", "AST", "STL", "BLK", "TOV"]] = df[["FG", "ThreeP", "TRB", "AST", "STL", "BLK", "TOV"]].astype(float)
            df["dk"] = (1*df["FG"]) + ((1/2)*df["ThreeP"]) + ((5/4)*df["TRB"]) + ((3/2)*df["AST"]) + (2*df["STL"]) + (2*df["BLK"]) + ((1/2)*df["TOV"])
            double_double = pd.Series(df[["FG", "TRB", "AST", "STL", "BLK", "TOV"]].apply(lambda x: sum(x>=10), axis = 1) > 1)
            df["dk"][double_double] += 1.5
        
        elif game_log_type =="advanced":
            df['venue'] = np.where(df['venue'] == '@', 'away', 'home')
        
        return df

In [111]:
game_log_scrape = bbrefGameLogs(year=2018)

In [114]:
tmp = game_log_scrape.get_player_game_logs(bbrefID='jamesle01', game_log_type='advanced')

In [115]:
tmp

Unnamed: 0,G,date,age,tm,venue,opp,game_outcome,GS,MP,TS_pct,...,DRB_pct,TRB_pct,AST_pct,STL_pct,BLK_pct,TOV_pct,USG_pct,ORtg,DRtg,GmSc
0,1,2017-10-17,32-291,CLE,home,BOS,W (+3),1,41:12,.698,...,35.0,19.4,43.7,0.0,4.2,16.2,26.0,126,94,28.2
1,2,2017-10-20,32-294,CLE,away,MIL,W (+19),1,37:25,.711,...,11.9,8.7,32.9,1.4,2.7,22.9,26.6,123,106,20.2
2,3,2017-10-21,32-295,CLE,home,ORL,L (-21),1,31:12,.624,...,13.7,6.5,14.9,1.6,2.8,5.4,25.2,125,116,17.6
3,4,2017-10-24,32-298,CLE,home,CHI,W (+7),1,37:15,.766,...,6.6,3.3,63.8,4.2,0.0,8.3,30.7,155,118,35.1
4,5,2017-10-25,32-299,CLE,away,BRK,L (-5),1,41:22,.616,...,21.8,13.7,60.2,0.0,9.9,25.4,32.2,110,107,25.1
5,6,2017-10-28,32-302,CLE,away,NOP,L (-22),1,30:46,.660,...,11.1,5.6,46.9,0.0,0.0,22.7,25.9,117,128,13.6
6,7,2017-10-29,32-303,CLE,home,NYK,L (-19),1,39:28,.518,...,24.3,13.2,37.9,2.5,0.0,20.6,21.9,94,111,13.8
7,8,2017-11-01,32-306,CLE,home,IND,L (-17),1,37:28,.682,...,15.6,9.2,56.2,2.6,2.0,24.8,37.2,111,121,27.0
8,9,2017-11-03,32-308,CLE,away,WAS,W (+8),1,42:41,.751,...,19.3,17.4,39.1,3.4,4.0,7.3,42.5,152,117,53.2
9,10,2017-11-05,32-310,CLE,home,ATL,L (-2),1,40:35,.662,...,8.3,6.5,51.0,1.2,3.9,23.4,25.5,121,115,24.3
