In [3]:
import requests
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [93]:
# Load exported data

df = pd.read_csv('../data/pga_champ_24')
r1_scores = pd.read_csv('../data/r1_scores')
r2_scores = pd.read_csv('../data/r2_scores')
r3_scores = pd.read_csv('../data/r3_scores')
r4_scores = pd.read_csv('../data/r4_scores')
round_1 = pd.read_csv('../data/round_1_hbh')
round_2 = pd.read_csv('../data/round_2_hbh')
round_3 = pd.read_csv('../data/round_3_hbh')
round_4 = pd.read_csv('../data/round_4_hbh')

# Get data on overall results using BeautifulSoup

In [2]:
x = requests.get('https://www.pgatour.com/leaderboard/hole-by-hole')
print(x.status_code)

200


In [3]:
soup = BeautifulSoup(x.content, 'lxml')
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
#print(soup.prettify())
#soup.find_all('script')

In [4]:
# Convert JSON data into Python dictionary
all_data = soup.find_all("script", {"type": "application/json"})

for data in all_data:
    jsn = json.loads(data.string)
  ##  print(json.dumps(jsn, indent=4))

In [5]:
# Extract individual player data from dictionary 
player_dicts = jsn["props"]["pageProps"]["leaderboard"]["players"]

In [6]:
# Extract player information using list comprehension
# Add if condition to remove cut line entry mid table
names = [player_dicts[elem]['player']['displayName'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]
countries = [player_dicts[elem]['player']['country'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]
ids = [player_dicts[elem]['player']['id'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]

# Extract scoring information using list comprehension
positions = [player_dicts[elem]['scoringData']['position'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
total_scores = [player_dicts[elem]['scoringData']['total'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
rounds = [player_dicts[elem]['scoringData']['rounds'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
total_strokes = [player_dicts[elem]['scoringData']['totalStrokes'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
round_status = [player_dicts[elem]['scoringData']['roundStatus'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]

# Fill missing status values for players that withdrew
round_status[-3:] = ["Withdrew","Withdrew","Withdrew"]

# Extract individual round scores
r1 = [rounds[elem][0] for elem in range(len(rounds))]
r2 = [rounds[elem][1] for elem in range(len(rounds))]
r3 = [rounds[elem][2] for elem in range(len(rounds))]
r4 = [rounds[elem][3] for elem in range(len(rounds))]

In [7]:
player_dicts[0]

{'__typename': 'PlayerRowV3',
 'id': '48081',
 'leaderboardSortOrder': 0,
 'player': {'id': '48081',
  'firstName': 'Xander',
  'lastName': 'Schauffele',
  'amateur': False,
  'displayName': 'Xander Schauffele',
  'abbreviations': '',
  'abbreviationsAccessibilityText': '',
  'country': 'USA',
  'countryFlag': 'USA',
  'shortName': 'X. Schauffele',
  'lineColor': '#0084FF',
  'tourBound': False},
 'scoringData': {'position': '1',
  'total': '-21',
  'totalSort': -21,
  'thru': 'F',
  'thruSort': 19,
  'score': '-6',
  'scoreSort': -6,
  'courseId': '671',
  'groupNumber': 39,
  'currentRound': 4,
  'backNine': False,
  'roundHeader': 'R4',
  'rounds': ['62', '68', '68', '65'],
  'movementDirection': 'CONSTANT',
  'movementAmount': '0',
  'playerState': 'COMPLETE',
  'rankingMovement': 'CONSTANT',
  'rankingMovementAmount': '-',
  'rankingMovementAmountSort': 0,
  'rankLogoLight': 'https://res.cloudinary.com/pgatour-prod/temp/standings/profile/Comcast-Light.png',
  'rankLogoDark': 'http

# Get hole-by-hole data using Selenium

In [4]:
driver = webdriver.Edge()
driver.get('https://www.pgatour.com/tournaments/2024/pga-championship/R2024033/leaderboard/hole-by-hole')
# print("Page title is: %s" %(driver.title))

In [10]:
r1_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r1_split = pd.Series(r1_holes).apply(lambda x: x.split('\n'))

In [11]:
r2_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r2_split = pd.Series(r2_holes).apply(lambda x: x.split('\n'))

In [50]:
r3_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r3_split = pd.Series(r3_holes).apply(lambda x: x.split('\n'))

In [191]:
r4_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r4_split = pd.Series(r4_holes).apply(lambda x: x.split('\n'))

In [130]:
par_str = driver.find_elements(By.XPATH, '//span[@class="css-p11w71"]')
# par_scores = pd.Series(par_str).apply(lambda x: x.split('\n'))
holes_pars = []
for elem in par_str:
    i = elem.text
    holes_pars.append(i)

In [162]:
r3_scores[0]

['4',
 '4',
 '3',
 '4',
 '4',
 '4',
 '4',
 '3',
 '4',
 '4',
 '3',
 '4',
 '4',
 '2',
 '6',
 '4',
 '3',
 '4']

In [17]:
driver.quit()

In [13]:
print(r4_split[0][3:12],r4_split[0][14:22])

['3', '4', '3', '3', '4', '4', '4', '3', '3'] ['2', '3', '4', '3', '4', '4', '4', '4']


In [200]:
r1_scores = [r1_split[elem][3:12]+r1_split[elem][13:22] for elem in range(len(r4_split))] 
r2_scores = [r2_split[elem][3:12]+r2_split[elem][13:22] for elem in range(len(r4_split))] 
r3_scores = [r3_split[elem][3:12]+r3_split[elem][13:22] for elem in range(len(r4_split))] 
r4_scores = [r4_split[elem][3:12]+r4_split[elem][13:22] for elem in range(len(r4_split))] 


In [209]:
# Calculate par and compare scores to par scores to retrieve par/birdie/eagle tallies
par_fills = ["Par"," "]
par_scores = pd.Series(par_fills + holes_pars[21:30] + holes_pars[31:40])
par = holes_pars[21:30] + holes_pars[31:40]
par_scores = par_scores.values.reshape(1,20)

# Cast par scores and round scores as integers

# Par
# s_par = pd.Series(par).astype('int64')

# Round 1
r1_scores_int = []
z = 0
for substring in r1_scores:
    sub_2 = []
    for elem in substring:
        a = int(elem)
        sub_2.append(a)
        z += 1
    r1_scores_int.append(sub_2)

# Round 2
r2_scores_int = []
z = 0
for substring in r2_scores:
    sub_2 = []
    for elem in substring:
        if elem != '-':
            a = int(elem)
            sub_2.append(a)
            z += 1
        else:
            sub_2.append('-')
    r2_scores_int.append(sub_2)

# Round 3
r3_scores_int = []
z = 0
for substring in r3_scores:
    sub_2 = []
    for elem in substring:
        if elem != '-':
            a = int(elem)
            sub_2.append(a)
            z += 1
        else:
            sub_2.append('-')
    r3_scores_int.append(sub_2)

# Round 4
r4_scores_int = []
z = 0
for substring in r4_scores:
    sub_2 = []
    for elem in substring:
        if elem != '-':
            a = int(elem)
            sub_2.append(a)
            z += 1
        else:
            sub_2.append('-')
    r4_scores_int.append(sub_2)

In [229]:
r2_scores_int[0] - s_par

0     0
1     0
2    -1
3     0
4     0
5     0
6    -1
7     0
8    -1
9    -1
10    1
11    0
12    0
13    0
14    0
15    0
16    0
17    0
dtype: int64

In [247]:
# Subtract par score from player scores and add to new list
r1_minus_par = []
r1_minus_par = [r1_scores_int[elem] - s_par for elem in range(len(r1_scores))]
r2_minus_par = []
r2_minus_par = [r2_scores_int[elem] - s_par if '-' not in r2_scores_int[elem] else '-' for elem in range(len(r2_scores))]
r3_minus_par = []
r3_minus_par = [r3_scores_int[elem] - s_par if '-' not in r3_scores_int[elem] else '-' for elem in range(len(r3_scores))]
r4_minus_par = []
r4_minus_par = [r4_scores_int[elem] - s_par if '-' not in r4_scores_int[elem] else '-' for elem in range(len(r4_scores))]

In [254]:
# np.savetxt("r1_scores", r1_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r2_scores", r2_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r3_scores", r3_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r4_scores", r4_scores, delimiter =", ", fmt ='% s')

In [None]:
# Create dictionary of par/birdie/eagle values with players as keys



In [199]:
len(r1_minus_par)

156

In [177]:
s_r1-s_par

0     0
1    -1
2     0
3    -1
4    -1
5     0
6    -1
7     0
8     0
9     0
10   -1
11    0
12   -1
13    0
14   -1
15   -1
16    0
17   -1
dtype: int64

In [201]:
holes = [f"{elem}" for elem in range(1,19)]
holes_20 = par_fills + holes

# Get Top 10 stats using Selenium

In [18]:
driver = webdriver.Edge()
driver.get('https://www.pgatour.com/tournaments/2024/pga-championship/R2024033/leaderboard/hole-by-hole')

In [64]:
stats = driver.find_elements(By.XPATH, '//div[@class="css-1b6nfy8"]//span[@class="chakra-text css-llafun"]')

In [55]:
len(stats)

801

In [57]:
stats[18].text == ''

True

In [76]:
import re

stats_list = []
temp_list = []

for elem in stats:
    i = elem.text
    if i != '':
        if len(temp_list) == 13:  # If the current list has 13 values
            stats_list.append(temp_list)  # Add the current list to stats_list
            temp_list = []  # Start a new list
            if re.match(r"^\d+\.\d{3}$", i):  # If a number with 3 decimal places is encountered and temp_list is not empty
                temp_list.append(i)  # Add the current list to stats_list
        elif len(temp_list) == 0 and not re.match(r"^\d+\.\d{3}$", i):
            continue
        else:
            temp_list.append(i)  # Add the current value to the current list

# If there are any remaining elements in temp_list after the loop
if temp_list:
    stats_list.append(temp_list)
# Populate values in dictionary of top 10 stats one by one per Selenium scraping
# dict_individual = dict(map(lambda i,j : (i,j) , stat_name,stats_list))
# stat_individual[top_10_names[len(stat_individual)]] = dict_individual

In [77]:
stats_list

[['5.227',
  '7.811',
  '0.424',
  '4.704',
  '18.165',
  '73.21% (41/56)',
  '310.10 yds',
  '330.00 yds',
  '83.33% (60/72)',
  '100.00% (5/5)',
  '83.33% (10/12)',
  '1.63',
  '95.00'],
 ['5.890',
  '2.261',
  '5.006',
  '4.009',
  '17.165',
  '62.50% (35/56)',
  '330.50 yds',
  '360.00 yds',
  '72.22% (52/72)',
  '100.00% (6/6)',
  '90.00% (18/20)',
  '1.64',
  '72.00'],
 ['4.575',
  '6.522',
  '1.468',
  '2.601',
  '15.165',
  '67.86% (38/56)',
  '303.30 yds',
  '328.00 yds',
  '77.78% (56/72)',
  '66.67% (6/9)',
  '81.25% (13/16)',
  '1.64',
  '88.00'],
 ['1.361',
  '0.703',
  '5.989',
  '4.113',
  '12.165',
  '66.07% (37/56)',
  '305.80 yds',
  '329.00 yds',
  '65.28% (47/72)',
  '83.33% (10/12)',
  '84.00% (21/25)',
  '1.64',
  '78.00'],
 ['3.674',
  '3.177',
  '6.078',
  '-0.763',
  '12.165',
  '78.57% (44/56)',
  '294.10 yds',
  '328.00 yds',
  '70.83% (51/72)',
  '50.00% (3/6)',
  '80.95% (17/21)',
  '1.71',
  '70.00'],
 ['1.772',
  '2.483',
  '-0.702',
  '7.613',
  '11.165'

In [78]:
stat_individual

{'Xander Schauffele': {'SG: Off The Tee': '5.227',
  'SG: Approach to Green': '7.811',
  'SG: Around The Green': '0.424',
  'SG: Putting': '4.704',
  'SG: Total': '18.165',
  'Driving Accuracy': '73.21% (41/56)',
  'Driving Distance': '310.10 yds',
  'Longest Drive': '330.00 yds',
  'Greens in Regulation': '83.33% (60/72)',
  'Sand Saves': '100.00% (5/5)',
  'Scrambling': '83.33% (10/12)',
  'Putts per GIR': '1.63',
  'Feet of Putts Made': '95.00',
  'Birdies': '25',
  'Pars': '44',
  'Bogeys': '2',
  'Double Bogeys': '1'}}

In [20]:
stats_name = driver.find_elements(By.XPATH, '//span[@class="chakra-text css-6vsfej"]')

In [34]:
stat_name = []
for elem in stats_name[0:17]:
    i = elem.text
    stat_name.append(i)

In [83]:
# Assuming that stats_list, stat_name_col1, and top_10_names are already defined
# and have the correct lengths

dict_list = []
for sublist in stats_list:
    dict_list.append(dict(zip(stat_name_col1, sublist)))

final_dict = dict(zip(top_10_names, dict_list))

<function dict.items>

In [94]:
# Convert the dictionary to a DataFrame
df_stats = pd.DataFrame.from_dict(final_dict, orient='index', columns=stat_name_col1)

# If you want the names to be a column instead of the index
df_stats.reset_index(inplace=True)
df_stats.rename(columns={'index': 'Name'}, inplace=True)

In [97]:
df_stats.to_csv('pga_stats')

In [None]:
stat_name[0:13]

In [21]:
stats_rank = driver.find_elements(By.XPATH, '//span[@class="chakra-text css-10i1s4"]')
stats_rankings = []

In [96]:
# stats_rankings = []
# for a in range(1):
#     stats_sub = []
#     for elem in stats_rank[0:17]:
#         i = elem.text
#         stats_sub.append(i)
#     stats_rankings.append(stats_sub)

In [None]:
stats_list = []
for i in range(10):
    stats_sub = []
    for elem in stats:
        i = elem.text
        stats_sub.append(i)
    stats_list.append(stats_sub)

In [14]:
top_10_names = df['Name'][0:10]
top_10_stats = {}
top_10_stats.keys = top_10_names

AttributeError: 'dict' object attribute 'keys' is read-only

In [None]:
# Populate values in dictionary of top 10 stats one by one per Selenium scraping
stat_individual = {}
stat_individual.keys = stats_list
stat_invidual.values = 

    



# Create master df

In [250]:
# Create principal df using dictionary
pga_dict = {'Name': names, 'Country': countries, 'Position': positions, 'Total score': total_scores, 'R1': r1, 'R2': r2, 'R3': r3, 'R4': r4, 'Total strokes': total_strokes, 'Status' : round_status}
df = pd.DataFrame(pga_dict)
# df.to_csv('pga_champ_24')

In [6]:
df

Unnamed: 0.1,Unnamed: 0,Name,Country,Position,Total score,R1,R2,R3,R4,Total strokes,Status
0,0,Xander Schauffele,USA,1,-21,62,68,68,65,263,R4 Completed
1,1,Bryson DeChambeau,USA,2,-20,68,65,67,64,264,R4 Completed
2,2,Viktor Hovland,NOR,3,-18,68,66,66,66,266,R4 Completed
3,3,Thomas Detry,BEL,T4,-15,66,67,70,66,269,R4 Completed
4,4,Collin Morikawa,USA,T4,-15,66,65,67,71,269,R4 Completed
...,...,...,...,...,...,...,...,...,...,...,...
151,151,Jeff Kellen,USA,CUT,+17,87,72,-,-,159,R2 Completed
152,152,Rich Beem,USA,CUT,+20,79,83,-,-,162,R2 Completed
153,153,John Daly,USA,WD,-,82,-,-,-,82,Withdrew
154,154,Ben Griffin,USA,WD,-,73,65,-,-,138,Withdrew


# Hole by hole df

In [251]:
# Create dictionary with hole by hole data using nested list comprehension

# Set up column names and par series
names_countries_dict = {"Name": names, "Country": countries}
player_df = pd.DataFrame(names_countries_dict)
col_names = [elem for elem in holes]
col_names_add = ["Name","Country"]
col_names_all = col_names_add + col_names
par_df = pd.DataFrame(par_scores, columns = col_names_all)

# Round 1
holes_1_dict = {str(hole): [r1_scores[player][hole-1] for player in range(len(r1_scores))] for hole in range(1,19)}
df_holes_1 = pd.DataFrame(holes_1_dict)
round_1_hbh = pd.concat([player_df, df_holes_1], axis = 1)
round_1 = pd.concat([par_df, round_1_hbh,], axis = 0)
# round_1.to_csv('round_1_hbh')

# Round 2
holes_2_dict = {str(hole): [r2_scores[player][hole-1] for player in range(len(r2_scores))] for hole in range(1,19)}
df_holes_2 = pd.DataFrame(holes_2_dict)
round_2_hbh = pd.concat([player_df, df_holes_2], axis = 1)
round_2 = pd.concat([par_df, round_2_hbh,], axis = 0)
# round_2.to_csv('round_2_hbh')

# Round 3
holes_3_dict = {str(hole): [r3_scores[player][hole-1] for player in range(len(r3_scores))] for hole in range(1,19)}
df_holes_3 = pd.DataFrame(holes_3_dict)
round_3_hbh = pd.concat([player_df, df_holes_3], axis = 1)
round_3 = pd.concat([par_df, round_3_hbh,], axis = 0)
# round_3.to_csv('round_3_hbh')

# Round 4
holes_4_dict = {str(hole): [r4_scores[player][hole-1] for player in range(len(r4_scores))] for hole in range(1,19)}
df_holes_4 = pd.DataFrame(holes_4_dict)
round_4_hbh = pd.concat([player_df, df_holes_4], axis = 1)
round_4 = pd.concat([par_df, round_4_hbh,], axis = 0)
# round_4.to_csv('round_4_hbh')

In [7]:
round_3

Unnamed: 0.1,Unnamed: 0,Name,Country,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,18
0,0,Par,,4,4,3,4,4,4,5,...,4,5,3,4,4,3,4,4,4,5
1,0,Xander Schauffele,USA,4,4,3,4,4,4,4,...,4,4,3,4,4,2,6,4,3,4
2,1,Bryson DeChambeau,USA,3,4,3,4,4,5,4,...,4,5,3,4,3,3,4,4,4,3
3,2,Viktor Hovland,NOR,4,4,3,4,4,4,4,...,4,5,3,4,4,3,3,4,3,4
4,3,Thomas Detry,BEL,4,4,4,2,4,5,6,...,3,4,3,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,151,Jeff Kellen,USA,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
153,152,Rich Beem,USA,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
154,153,John Daly,USA,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
155,154,Ben Griffin,USA,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


# Player performance statistics

In [165]:
birdies = driver.find_elements(By.XPATH, '//span[@class="chakra-text css-llafun"]')

In [167]:
birdies_list = []
for elem in birdies:
    i = elem.text
    birdies_list.append(i)

birdies_list

['5.227',
 '7.811',
 '0.424',
 '4.704',
 '18.165',
 '73.21% (41/56)',
 '310.10 yds',
 '330.00 yds',
 '83.33% (60/72)',
 '100.00% (5/5)',
 '83.33% (10/12)',
 '1.63',
 '95.00',
 '25',
 '44',
 '2',
 '1',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']