In [1]:
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import datetime
import json
import random

import bs4 as bs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

import utils

In [50]:
def GetTableFromSoup(table_html, headers_html, include_href = True):
    """Turn a parsed HTML table into a DataFrame of stripped cell text.

    Parameters
    ----------
    table_html : object exposing ``find_all`` (e.g. a BeautifulSoup tag)
        Searched for ``tr`` rows; the ``td`` cells of each row supply the values.
    headers_html : object exposing ``find_all``
        Searched for ``th`` cells; their raw ``.text`` becomes the column names.
    include_href : bool, default True
        When True, an extra ``href`` column is appended holding the ``href``
        attribute of the first ``a`` found in each row, or None when the row
        contains no ``a`` element.

    Returns
    -------
    pandas.DataFrame
        One row per ``tr`` found in ``table_html``, including rows that have
        no ``td`` cells; callers that do not want those rows must drop them.
    """
    colnames = [header.text for header in headers_html.find_all("th")]
    if include_href:
        colnames.append("href")

    data = []
    for row in table_html.find_all("tr"):
        cell_data = [ele.text.strip() for ele in row.find_all("td")]
        if include_href:
            # Check for a missing anchor explicitly instead of wrapping the
            # lookup in a bare except, so unrelated errors are not hidden.
            anchor = row.find("a")
            cell_data.append(anchor.get("href") if anchor is not None else None)
        data.append(cell_data)

    return pd.DataFrame(data, columns = colnames)

In [None]:
# Open a visible Chrome window through splinter (switch headless on when deployed).
# The driver location passed to Browser is whatever ChromeDriverManager().install() returns.
url = "https://www.espn.com/golf/stats/player"
executable_path = dict(executable_path = ChromeDriverManager().install())
browser = Browser('chrome', headless = False, **executable_path)

In [115]:
# Scrape the ESPN player stats page for each season and stack the results
# into a single frame, all_seasons, tagged with a 'season' column.
season_frames = []
for season in range(2015,2024):
    url = f"https://www.espn.com/golf/stats/player/_/season/{season}"
    browser.visit(url)
    time.sleep(2.5)

    # Keep clicking "Show More" until the control can no longer be found or
    # clicked. Catch Exception rather than using a bare except so that a
    # KeyboardInterrupt still stops the scrape.
    while True:
        try:
            browser.find_by_text('Show More').click()
            time.sleep(1)
        except Exception:
            break

    soup = BeautifulSoup(browser.html, 'html.parser')

    # The page is read as two side-by-side tables: the first is parsed with
    # the player link, the second without.
    tabs = soup.find_all("table")
    headers = soup.find_all("thead")
    a = GetTableFromSoup(tabs[0], headers[0], include_href=True)
    b = GetTableFromSoup(tabs[1], headers[1], include_href=False)

    # combine a and b column-wise, drop every row containing a missing value
    # (this also removes rows that had no <td> cells), then tag the season
    df = pd.concat([a, b], axis = 1).dropna()
    df['season'] = season

    season_frames.append(df)

# One concat at the end instead of re-copying the growing frame every iteration.
all_seasons = pd.concat(season_frames, axis = 0)


In [116]:
# rename all to snake case FIRST: the lookup below uses the lower-case name
# 'earnings', which only exists on a fresh kernel once the scraped headers
# have been normalised.
all_seasons.columns = [col.lower().replace(' ', '_') for col in all_seasons.columns]
# convert earnings to numeric; skip when it is already numeric so the cell can
# be re-run, and use regex=False so '$' is always treated as a literal
# character regardless of the pandas version's default.
if not pd.api.types.is_numeric_dtype(all_seasons['earnings']):
    all_seasons['earnings'] = (
        all_seasons['earnings']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
all_seasons

Unnamed: 0,rk,name,age,href,earnings,cup,evnts,rnds,cuts,top10,wins,score,ddis,dacc,gir,putts,sand,birds,season
1,1,Jordan Spieth,29,https://www.espn.com/golf/player/_/id/5467/jor...,"$12,030,465",6392,26,92,22,16,6,68.9,291.8,62.9,64.9,1.699,58.1,4.620,2015
2,2,Jason Day,35,https://www.espn.com/golf/player/_/id/1680/jas...,"$9,403,330",6970,21,76,19,12,5,68.9,313.7,55.9,67.1,1.712,61.1,4.711,2015
3,3,Bubba Watson,44,https://www.espn.com/golf/player/_/id/780/bubb...,"$6,876,797",4009,20,72,18,10,2,69.3,315.2,56.6,64.5,1.756,46.9,4.278,2015
4,4,Rickie Fowler,34,https://www.espn.com/golf/player/_/id/3702/ric...,"$5,773,430",4196,22,76,18,8,2,70.3,296.8,62.1,61.5,1.734,55.8,4.053,2015
5,5,Dustin Johnson,39,https://www.espn.com/golf/player/_/id/3448/dus...,"$5,509,467",2854,21,73,18,11,1,68.9,317.7,55.5,67.1,1.715,38.6,4.164,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,235,D.A. Points,46,https://www.espn.com/golf/player/_/id/1196/da-...,"$15,390",9,8,0,1,0,0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,2023
236,236,Scott Gutschewski,46,https://www.espn.com/golf/player/_/id/1195/sco...,"$14,690",5,1,0,1,0,0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,2023
237,237,D.J. Trahan,42,https://www.espn.com/golf/player/_/id/431/dj-t...,"$12,350",7,6,0,1,0,0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,2023
238,238,Kevin Stadler,43,https://www.espn.com/golf/player/_/id/860/kevi...,"$8,968",4,6,0,1,0,0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,2023


In [80]:
# Visit every distinct player page once and collect the label/value pairs
# from the player header bio list into player_bio (one row per player).
# unique hrefs:
hrefs = all_seasons['href'].unique()

bio_records = []
failed_hrefs = []  # (href, error) for pages that could not be parsed
for href in hrefs:
    browser.visit(href)
    try:
        time.sleep(0.2)
        soup = BeautifulSoup(browser.html, 'html.parser')
        # find ul with player header in the class
        bio_elems = soup.find('ul', class_ = re.compile('PlayerHeader__Bio')).find_all('li')
        record = {}
        for elem in bio_elems:
            # each <li> is read as [label div, value div]
            divs = elem.find_all('div')
            record[divs[0].text.strip()] = divs[1].text.strip()
        if not record:
            raise ValueError("no bio fields found")

        record['href'] = href
        bio_records.append(record)

    except Exception as exc:
        # Best effort: skip pages that cannot be parsed, but remember them
        # instead of silently discarding the failure.
        failed_hrefs.append((href, repr(exc)))

# Build the frame once from the collected records rather than concatenating
# inside the loop.
player_bio = pd.DataFrame(bio_records)
if failed_hrefs:
    print(f"Skipped {len(failed_hrefs)} of {len(hrefs)} player pages; see failed_hrefs")

In [101]:
# clean up
# rename all to snake case first so every step below addresses the final
# column names and the cell can be re-run without a KeyError
player_bio.columns = [col.lower().replace(' ', '_') for col in player_bio.columns]
# keep only the text before the first space of the birthdate field;
# missing values stay missing
player_bio['birthdate'] = player_bio['birthdate'].str.split(' ').str[0]
# extract id from href (the second-to-last path segment)
player_bio['espn_id'] = player_bio['href'].str.split('/').str[-2]
# drop height and weight; errors='ignore' because a given scrape may not
# produce all three spellings of these columns
player_bio = player_bio.drop(columns = ['height', 'weight', 'ht/wt'], errors = 'ignore')


In [117]:
# Left-join the espn_id from player_bio onto the season stats using the player
# link as the key, then discard the link column since it is no longer needed.
all_seasons = pd.merge(
    all_seasons,
    player_bio[['href','espn_id']],
    how = 'left',
    on = 'href',
).drop(columns = 'href')
all_seasons.head()


Unnamed: 0,rk,name,age,earnings,cup,evnts,rnds,cuts,top10,wins,score,ddis,dacc,gir,putts,sand,birds,season,espn_id
0,1,Jordan Spieth,29,"$12,030,465",6392,26,92,22,16,6,68.9,291.8,62.9,64.9,1.699,58.1,4.62,2015,5467
1,2,Jason Day,35,"$9,403,330",6970,21,76,19,12,5,68.9,313.7,55.9,67.1,1.712,61.1,4.711,2015,1680
2,3,Bubba Watson,44,"$6,876,797",4009,20,72,18,10,2,69.3,315.2,56.6,64.5,1.756,46.9,4.278,2015,780
3,4,Rickie Fowler,34,"$5,773,430",4196,22,76,18,8,2,70.3,296.8,62.1,61.5,1.734,55.8,4.053,2015,3702
4,5,Dustin Johnson,39,"$5,509,467",2854,21,73,18,11,1,68.9,317.7,55.5,67.1,1.715,38.6,4.164,2015,3448


In [119]:
# Hand the season stats frame to the project helper under the name "espn_stats".
# append=False presumably replaces rather than extends an existing table — confirm in utils.write_to_db.
# NOTE(review): the CREATE TABLE statement logged below types the stat columns it
# lists as TEXT (only earnings is FLOAT); cast them upstream if numeric columns are wanted.
utils.write_to_db(all_seasons, "espn_stats", append=False)

2023-07-15 22:56:54,698 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-07-15 22:56:54,700 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:56:54,753 INFO sqlalchemy.engine.Engine select current_schema()
2023-07-15 22:56:54,754 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:56:54,803 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-07-15 22:56:54,804 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:56:54,862 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-15 22:56:54,863 INFO sqlalchemy.engine.Engine [generated in 0.00062s] {'name': 'espn_stats'}
2023-07-15 22:56:54,922 INFO sqlalchemy.engine.Engine 
CREATE TABLE espn_stats (
	rk TEXT, 
	name TEXT, 
	age TEXT, 
	earnings FLOAT(53), 
	cup TEXT, 
	evnts TEXT, 
	rnds TEXT, 
	cuts TEXT, 
	top10 TEXT, 
	wins TEXT, 
	score TEXT, 
	ddis TEXT, 
	dacc TEXT, 

In [120]:
# Hand the cleaned player bio frame to the project helper under the name "espn_bio".
# append=False presumably replaces rather than extends an existing table — confirm in utils.write_to_db.
# NOTE(review): the CREATE TABLE statement logged below types every column as TEXT,
# including birthdate and turned_pro.
utils.write_to_db(player_bio, "espn_bio", append=False)

2023-07-15 22:57:30,840 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-07-15 22:57:30,841 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:57:30,977 INFO sqlalchemy.engine.Engine select current_schema()
2023-07-15 22:57:30,978 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:57:31,066 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-07-15 22:57:31,066 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-15 22:57:31,138 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-15 22:57:31,139 INFO sqlalchemy.engine.Engine [generated in 0.00060s] {'name': 'espn_bio'}
2023-07-15 22:57:31,253 INFO sqlalchemy.engine.Engine 
CREATE TABLE espn_bio (
	birthdate TEXT, 
	birthplace TEXT, 
	college TEXT, 
	swing TEXT, 
	turned_pro TEXT, 
	href TEXT, 
	espn_id TEXT
)


2023-07-15 22:57:31,254 INFO sqlalchemy.engine.Engine [no key 0