In [186]:
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import datetime
import json
import random

import bs4 as bs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

import utils

In [172]:
# Browser setup: a visible Chrome window is opened here; pass headless=True
# instead when this is deployed.
url = "https://www.pgatour.com/stats/detail/109"

# The result of ChromeDriverManager().install() is handed to splinter as the
# chromedriver executable path.
executable_path = dict(executable_path=ChromeDriverManager().install())

# Optional browser extension, currently disabled:
# chop = webdriver.ChromeOptions()
# chop.add_extension('adblock.crx')
browser = Browser("chrome", headless=False, **executable_path)

In [173]:
# Open the stats page, then click the "Time Period" control followed by the
# "Tournament Only" option, pausing one second after every interaction.
browser.visit(url)
time.sleep(1)
for control_value in ('Time Period', 'Tournament Only'):
    browser.find_by_value(control_value).click()
    time.sleep(1)

# Parse the rendered page and collect every <div> whose class contains
# "menu-list". `menu_lists` is reused by the scraping loop further down.
soup = BeautifulSoup(browser.html, 'html.parser')
menu_lists = soup.find_all("div", class_=lambda css: css and "menu-list" in css)

# The buttons of the first menu list are read as the available seasons.
season_list = menu_lists[0].find_all("button")
seasons = [button.text for button in season_list]

In [174]:
def _dropdown_value(soup, label):
    """Return the text shown next to the filter dropdown titled `label`.

    The label is located as a <p> whose string equals `label`. The second <p>
    found under that label's grandparent <div> is returned as the value.

    Raises:
        ValueError: if no <p> with that label exists in `soup`.
    """
    label_tag = soup.find("p", text=lambda text: text and text == label)
    if label_tag is None:
        raise ValueError(f"could not find a '{label}' dropdown in the page")
    paragraphs = label_tag.find_parent("div").find_parent("div").find_all("p")
    return paragraphs[1].text


def GetTourneyMoneyFromSoup(soup):
    """Extract the first table of a parsed stats page into a DataFrame.

    Args:
        soup: parsed page (BeautifulSoup-like object) containing a <table>
            with a <thead>, plus "Tournament" and "Season" dropdown labels.

    Returns:
        pandas.DataFrame with one row per table row and one column per <th>
        header (cell text is kept as-is, i.e. strings), plus constant
        "Tournament" and "Season" columns read from the page's dropdowns.

    Raises:
        ValueError: if the page has no <table> or a dropdown label is missing.
    """
    tab = soup.find("table")
    if tab is None:
        raise ValueError("no <table> found in the page")

    # column names come from the header cells, in document order
    cols = [header.text for header in tab.find("thead").find_all("th")]

    all_data = []
    for row in tab.find_all("tr"):
        tds = row.find_all("td")
        # Header rows have no <td>; rows with fewer cells than headers cannot
        # be mapped onto the columns. Both are skipped explicitly instead of
        # being hidden behind a catch-all exception handler.
        if not tds or len(tds) < len(cols):
            continue
        # zip stops at len(cols), so surplus cells are ignored
        all_data.append(dict(zip(cols, (td.text for td in tds))))

    df = pd.DataFrame(all_data)

    df["Tournament"] = _dropdown_value(soup, "Tournament")
    df["Season"] = _dropdown_value(soup, "Season")

    return df

In [175]:
# Scrape every tournament table for the first 13 entries of the season menu.
# Failures are reported and skipped so one bad page does not abort the run.
# Only `Exception` is caught, so interrupting the kernel (KeyboardInterrupt)
# still stops the loop.
seasons = seasons[:13]
all_dfs = []
for counter, season in enumerate(seasons):
    print(season)

    try:
        # The first season is already displayed (and `menu_lists` already
        # parsed) by the previous cell, so only later seasons need a click.
        if counter > 0:
            # click the dropdown for the desired season
            browser.find_by_value('Season').click()
            browser.find_by_value(season).click()
            time.sleep(5)

            # re-parse the page so the menu lists reflect the new season
            soup = BeautifulSoup(browser.html, 'html.parser')
            menu_lists = soup.find_all("div", class_ = lambda class_: class_ and "menu-list" in class_)

        # the third menu list is read as the tournaments of this season
        tourney_list = menu_lists[2].find_all("button")
        tournaments = [element.text for element in tourney_list]

        # loop through each tournament and scrape the table
        for tourney in tournaments:
            try:
                browser.find_by_value('Tournament').click()
                browser.find_by_value(tourney).click()
                time.sleep(5)

                soup = BeautifulSoup(browser.html, 'html.parser')
                df = GetTourneyMoneyFromSoup(soup)
                all_dfs.append(df)
            except Exception as exc:
                # keep going, but say why this tournament was skipped
                print(f"ERROR: could not get data for {tourney.upper()} ({type(exc).__name__}: {exc})")
    except Exception as exc:
        print(f"ERROR: could not scrape {season} ({type(exc).__name__}: {exc})")

2022-2023
2021-2022
2020-2021
2019-2020
2018-2019
2017-2018
2016-2017
2015-2016
2014-2015
2013-2014
2013
2012
ERROR: could not get data for THE HONDA CLASSIC
2011


In [178]:
# Stack the per-tournament frames. ignore_index=True gives the result one
# continuous row index instead of repeating 0..n for every tournament.
final_df = pd.concat(all_dfs, ignore_index=True)
# turn money column into numeric
# regex=False: "$" is a regex end-of-string anchor, so the replacement must be
# a literal one; the default of `regex` differs between pandas versions.
final_df["Money"] = (
    final_df["Money"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)
# final_df.to_csv("earnings.csv")
# final_df.head()

In [193]:
import os
import pyodbc
from dotenv import load_dotenv

from sqlalchemy.engine import URL
from sqlalchemy import create_engine

import struct

# Read the database settings from the .env file.
# override=True lets the .env values replace variables that already exist in
# the process environment. Without it load_dotenv() keeps existing values, and
# PWD / USER are normally pre-set by POSIX shells (working directory and login
# name), so the database password and user from .env would be ignored there.
load_dotenv(override=True)

PORT = os.getenv("PORT")
PWD = os.getenv("PWD")
HOST = os.getenv("HOST")
USER = os.getenv("USER")
DB = os.getenv("DB")

def db_connect(echo=True):
    """Open a new SQLAlchemy connection to the configured PostgreSQL database.

    The connection settings come from the module-level USER, PWD, HOST, PORT
    and DB values. A new engine is created on every call.

    Args:
        echo: forwarded to `create_engine`; when true SQLAlchemy logs the SQL
            it emits. Defaults to True, matching the previous behavior.

    Returns:
        The connection returned by `engine.connect()`. The caller is
        responsible for closing it.

    Raises:
        Any error raised by SQLAlchemy or the database driver while creating
        the engine or connecting is propagated unchanged.
    """
    # Build the URL from its components rather than by string formatting, so
    # credentials containing characters such as "@", "/" or ":" stay intact.
    connection_url = URL.create(
        "postgresql+psycopg2",
        username=USER,
        password=PWD,
        host=HOST,
        port=int(PORT) if PORT else None,
        database=DB,
    )
    engine = create_engine(connection_url, echo=echo)
    return engine.connect()

def write_to_db(df, table_name, append=False):
    """Write a DataFrame to a database table through a fresh connection.

    Args:
        df: the pandas DataFrame to store (its index is not written).
        table_name: name of the target table.
        append: when true, rows are appended to an existing table; otherwise
            an existing table is replaced.

    The connection obtained from `db_connect()` is closed even if the write
    raises.
    """
    conn = db_connect()
    try:
        df.to_sql(
            table_name,
            conn,
            if_exists='append' if append else 'replace',
            index=False,
        )
    finally:
        conn.close()


# Store the scraped earnings; append=False replaces any existing "earnings" table.
write_to_db(df=final_df, table_name="earnings", append=False)

2023-07-13 22:05:35,043 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-07-13 22:05:35,043 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-13 22:05:35,066 INFO sqlalchemy.engine.Engine select current_schema()
2023-07-13 22:05:35,066 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-13 22:05:35,096 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-07-13 22:05:35,097 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-13 22:05:35,137 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-13 22:05:35,137 INFO sqlalchemy.engine.Engine [generated in 0.00074s] {'name': 'earnings'}
2023-07-13 22:05:35,162 INFO sqlalchemy.engine.Engine 
CREATE TABLE earnings (
	"Rank" TEXT, 
	"Player" TEXT, 
	"Money" INTEGER, 
	"Tournament" TEXT, 
	"Season" TEXT
)


2023-07-13 22:05:35,163 INFO sqlalchemy.engine.Engine [no key 0.00079s] {}
2023-07-13 22: