# Notebook setup 

In [306]:
import requests
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import re

In [305]:
# YAML file 
import yaml
from yaml.loader import SafeLoader

# Open the file and load the file
with open('../PGA_Championship_project.yaml') as f:
    data = yaml.load_all(f, Loader=SafeLoader)
    for doc in data:
        print(doc)

{'Project details': [{'Author/DA': 'Matthew Asquith'}, {'Project': 'Ironhack solo project 1 (week 3)'}, {'Subject': 'PGA Championship Golf 2024 results'}]}
{'Data': [{'pga_champ_24': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\pga_champ_24'}, {'pga_stats': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\pga_stats'}, {'r1_scores': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\r1_scores'}, {'r2_scores': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\r2_scores'}, {'r3_scores': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\r3_scores'}, {'r4_scores': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\r4_scores'}, {'round_1': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\round_1_hbh'}, {'round_2': 'C:\\Users\\matth\\Documents\\Ironhack\\Week_3\\Week_3_project\\data\\round_2_hbh'}, {'round_3': 'C:\\Users\\matth\\Documents\\Ironhack\\

In [307]:
# Load exported data
df = pd.read_csv('../data/pga_champ_24')
r1_scores = pd.read_csv('../data/r1_scores')
r2_scores = pd.read_csv('../data/r2_scores')
r3_scores = pd.read_csv('../data/r3_scores')
r4_scores = pd.read_csv('../data/r4_scores')
round_1 = pd.read_csv('../data/round_1_hbh')
round_2 = pd.read_csv('../data/round_2_hbh')
round_3 = pd.read_csv('../data/round_3_hbh')
round_4 = pd.read_csv('../data/round_4_hbh')
pga_stats = pd.read_csv('../data/pga_stats')

# Get data on overall results using BeautifulSoup

In [2]:
x = requests.get('https://www.pgatour.com/leaderboard/hole-by-hole')
print(x.status_code)

200


In [3]:
soup = BeautifulSoup(x.content, 'lxml')
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
#print(soup.prettify())
#soup.find_all('script')

In [4]:
# Convert JSON data into Python dictionary
all_data = soup.find_all("script", {"type": "application/json"})

for data in all_data:
    jsn = json.loads(data.string)
  ##  print(json.dumps(jsn, indent=4))

In [5]:
# Extract individual player data from dictionary 
player_dicts = jsn["props"]["pageProps"]["leaderboard"]["players"]

In [6]:
# Extract player information using list comprehension
# Add if condition to remove cut line entry mid table
names = [player_dicts[elem]['player']['displayName'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]
countries = [player_dicts[elem]['player']['country'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]
ids = [player_dicts[elem]['player']['id'] for elem in range(len(player_dicts)) if 'player' in player_dicts[elem]]

# Extract scoring information using list comprehension
positions = [player_dicts[elem]['scoringData']['position'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
total_scores = [player_dicts[elem]['scoringData']['total'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
rounds = [player_dicts[elem]['scoringData']['rounds'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
total_strokes = [player_dicts[elem]['scoringData']['totalStrokes'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]
round_status = [player_dicts[elem]['scoringData']['roundStatus'] for elem in range(len(player_dicts)) if 'scoringData' in player_dicts[elem]]

# Fill missing status values for players that withdrew
round_status[-3:] = ["Withdrew","Withdrew","Withdrew"]

# Extract individual round scores
r1 = [rounds[elem][0] for elem in range(len(rounds))]
r2 = [rounds[elem][1] for elem in range(len(rounds))]
r3 = [rounds[elem][2] for elem in range(len(rounds))]
r4 = [rounds[elem][3] for elem in range(len(rounds))]

# Get hole-by-hole data using Selenium

In [4]:
driver = webdriver.Edge()
driver.get('https://www.pgatour.com/tournaments/2024/pga-championship/R2024033/leaderboard/hole-by-hole')
# print("Page title is: %s" %(driver.title))

In [10]:
r1_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r1_split = pd.Series(r1_holes).apply(lambda x: x.split('\n'))

In [11]:
r2_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r2_split = pd.Series(r2_holes).apply(lambda x: x.split('\n'))

In [50]:
r3_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r3_split = pd.Series(r3_holes).apply(lambda x: x.split('\n'))

In [191]:
r4_holes = [driver.find_element(By.XPATH, f'//tr[@class="player-{elem} css-paaamq"]').text for elem in ids]
r4_split = pd.Series(r4_holes).apply(lambda x: x.split('\n'))

In [130]:
par_str = driver.find_elements(By.XPATH, '//span[@class="css-p11w71"]')
# par_scores = pd.Series(par_str).apply(lambda x: x.split('\n'))
holes_pars = []
for elem in par_str:
    i = elem.text
    holes_pars.append(i)

In [280]:
driver.quit()

In [23]:
holes_pars = [4,4,3,4,4,4,5,3,4,5,3,4,4,3,4,4,4,5]

In [204]:
top_10_names

0    Xander Schauffele
1    Bryson DeChambeau
2       Viktor Hovland
3         Thomas Detry
4      Collin Morikawa
5          Justin Rose
6          Shane Lowry
7       Billy Horschel
8    Scottie Scheffler
9        Justin Thomas
Name: Name, dtype: object

# Get round 6 data for top 10 players

In [206]:
r1_6 = round_1['6'].iloc[0:10]
r2_6 = round_2['6'].iloc[0:10]
r3_6 = round_3['6'].iloc[0:10]
r4_6 = round_4['6'].iloc[0:10]

In [208]:
hole_6 = pd.concat([top_10_names,r1_6,r2_6,r3_6,r4_6], axis = 1)

In [243]:
hole_6

Unnamed: 0,Name,6,6.1,6.2,6.3
0,Xander Schauffele,4,4,4,4
1,Bryson DeChambeau,4,4,4,4
2,Viktor Hovland,4,4,5,3
3,Thomas Detry,4,3,4,3
4,Collin Morikawa,4,4,5,4
5,Justin Rose,3,3,4,4
6,Shane Lowry,4,4,4,5
7,Billy Horschel,4,4,4,4
8,Scottie Scheffler,4,4,4,4
9,Justin Thomas,4,4,4,4


In [282]:
temp_list = []
for elem in round_1['6']:
    a = int(elem)
    temp_list.append(a)
r1_6_mean = sum(temp_list)/len(temp_list)

In [283]:
temp_list = []
for elem in round_2['6']:
    if elem != '-':
        a = int(elem)
        temp_list.append(a)
r2_6_mean = sum(temp_list)/len(temp_list)

In [284]:
temp_list = []
for elem in round_3['6']:
    if elem != '-':
            a = int(elem)
            temp_list.append(a)
r3_6_mean = sum(temp_list)/len(temp_list)

In [285]:
temp_list = []
for elem in round_4['6']:
    if elem != '-':
            a = int(elem)
            temp_list.append(a)
r4_6_mean = sum(temp_list)/len(temp_list)

In [218]:
round_1['6']

0      4
1      4
2      4
3      4
4      4
      ..
152    6
153    4
154    5
155    4
156    5
Name: 6, Length: 157, dtype: int64

In [229]:
hole_6_means = ["AVERAGE",r1_6_mean,r2_6_mean,r3_6_mean,r4_6_mean]
sh6 = pd.Series(hole_6_means)

In [47]:
r1_scores.columns = range(1,19)
r2_scores.columns = range(1,19)
r3_scores.columns = range(1,19)
r4_scores.columns = range(1,19)
r1_df = pd.concat([df['Name'],r1_scores], axis = 1)
r2_df = pd.concat([df['Name'],r2_scores], axis = 1)
r3_df = pd.concat([df['Name'],r3_scores], axis = 1)
r4_df = pd.concat([df['Name'],r4_scores], axis = 1)

In [287]:
r1_6_int = pd.to_numeric(round_1['6'], errors = 'coerce')
r2_6_int = pd.to_numeric(round_2['6'], errors = 'coerce')
r3_6_int = pd.to_numeric(round_3['6'], errors = 'coerce')
r4_6_int = pd.to_numeric(round_4['6'], errors = 'coerce')

In [288]:
# Calucate mean scores on hole 6
r1_6_int.mean()
r2_6_int.mean()
r3_6_int.mean()
r4_6_int.mean()

4.012658227848101

In [13]:
print(r4_split[0][3:12],r4_split[0][14:22])

['3', '4', '3', '3', '4', '4', '4', '3', '3'] ['2', '3', '4', '3', '4', '4', '4', '4']


In [200]:
r1_scores = [r1_split[elem][3:12]+r1_split[elem][13:22] for elem in range(len(r4_split))] 
r2_scores = [r2_split[elem][3:12]+r2_split[elem][13:22] for elem in range(len(r4_split))] 
r3_scores = [r3_split[elem][3:12]+r3_split[elem][13:22] for elem in range(len(r4_split))] 
r4_scores = [r4_split[elem][3:12]+r4_split[elem][13:22] for elem in range(len(r4_split))] 


In [289]:
# Calculate par and compare scores to par scores to retrieve par/birdie/eagle tallies
par_fills = ["Par"," "]
par_scores = pd.Series(par_fills + holes_pars[21:30] + holes_pars[31:40])
par = holes_pars[21:30] + holes_pars[31:40]

In [247]:
# Subtract par score from player scores and add to new list
r1_minus_par = []
r1_minus_par = [r1_scores_int[elem] - s_par for elem in range(len(r1_scores))]
r2_minus_par = []
r2_minus_par = [r2_scores_int[elem] - s_par if '-' not in r2_scores_int[elem] else '-' for elem in range(len(r2_scores))]
r3_minus_par = []
r3_minus_par = [r3_scores_int[elem] - s_par if '-' not in r3_scores_int[elem] else '-' for elem in range(len(r3_scores))]
r4_minus_par = []
r4_minus_par = [r4_scores_int[elem] - s_par if '-' not in r4_scores_int[elem] else '-' for elem in range(len(r4_scores))]

In [254]:
# Save data into txt files
# np.savetxt("r1_scores", r1_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r2_scores", r2_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r3_scores", r3_scores, delimiter =", ", fmt ='% s')
# np.savetxt("r4_scores", r4_scores, delimiter =", ", fmt ='% s')

In [294]:
# Function to calculate par scores for each round

def pars_calculator(scores, par_scores, names, masterdf):
    q = 0
    nets = range(-3, 4, 1)
    par_dict = {}
    for elem in range(155):
        # Check if any scores contain hyphens
        if any(scores.loc[q].str.contains("-")):
            par_dict[names[q]] = "-"
        else:
            # Convert relevant columns to numeric
            scores_numeric = scores.loc[q].apply(pd.to_numeric, errors="coerce")
            net_scores = scores_numeric - par_scores
            net_dict = net_scores.value_counts().to_dict()
            par_dict[names[q]] = net_dict
        q += 1

    df = pd.DataFrame.from_dict(par_dict, orient='index', columns=nets)
    df = df.fillna("-")
    df = df.set_index('Name')
    df = df.reindex(index=masterdf['Name'])
    df = df.reset_index()
    return df

In [296]:
# r1_net = pars_calculator(r1_scores,holes_par,df['Name'])
# r2_net = pars_calculator(r2_scores,holes_par,df['Name'],df)
# r3_net = pars_calculator(r3_scores,holes_par,df['Name'])
# r4_net = pars_calculator(r4_scores,holes_par,df['Name'])


# Get Top 10 stats using Selenium

In [186]:
driver = webdriver.Edge()
driver.get('https://www.pgatour.com/tournaments/2024/pga-championship/R2024033/leaderboard/hole-by-hole')

In [64]:
stats = driver.find_elements(By.XPATH, '//div[@class="css-1b6nfy8"]//span[@class="chakra-text css-llafun"]')

In [76]:
stats_list = []
temp_list = []

for elem in stats:
    i = elem.text
    if i != '':
        if len(temp_list) == 13:  # If the current list has 13 values
            stats_list.append(temp_list)  # Add the current list to stats_list
            temp_list = []  # Start a new list
            if re.match(r"^\d+\.\d{3}$", i):  # If a number with 3 decimal places is encountered and temp_list is not empty
                temp_list.append(i)  # Add the current list to stats_list
        elif len(temp_list) == 0 and not re.match(r"^\d+\.\d{3}$", i):
            continue
        else:
            temp_list.append(i)  # Add the current value to the current list

# If there are any remaining elements in temp_list after the loop
if temp_list:
    stats_list.append(temp_list)
# Populate values in dictionary of top 10 stats one by one per Selenium scraping
dict_individual = dict(map(lambda i,j : (i,j) , stat_name,stats_list))
stat_individual[top_10_names[len(stat_individual)]] = dict_individual

In [185]:
pga_stats

Unnamed: 0.1,Unnamed: 0,Name,SG: Off The Tee,SG: Approach to Green,SG: Around The Green,SG: Putting,SG: Total,Driving Accuracy,Driving Distance,Longest Drive,Greens in Regulation,Sand Saves,Scrambling,Putts per GIR,Feet of Putts Made
0,0,Xander Schauffele,5.227,7.811,0.424,4.704,18.165,73.21% (41/56),310.10 yds,330.00 yds,83.33% (60/72),100.00% (5/5),83.33% (10/12),1.63,95.0
1,1,Bryson DeChambeau,5.89,2.261,5.006,4.009,17.165,62.50% (35/56),330.50 yds,360.00 yds,72.22% (52/72),100.00% (6/6),90.00% (18/20),1.64,72.0
2,2,Viktor Hovland,4.575,6.522,1.468,2.601,15.165,67.86% (38/56),303.30 yds,328.00 yds,77.78% (56/72),66.67% (6/9),81.25% (13/16),1.64,88.0
3,3,Thomas Detry,1.361,0.703,5.989,4.113,12.165,66.07% (37/56),305.80 yds,329.00 yds,65.28% (47/72),83.33% (10/12),84.00% (21/25),1.64,78.0
4,4,Collin Morikawa,3.674,3.177,6.078,-0.763,12.165,78.57% (44/56),294.10 yds,328.00 yds,70.83% (51/72),50.00% (3/6),80.95% (17/21),1.71,70.0
5,5,Justin Rose,1.772,2.483,-0.702,7.613,11.165,73.21% (41/56),300.40 yds,318.00 yds,65.28% (47/72),100.00% (4/4),72.00% (18/25),1.64,99.0
6,6,Shane Lowry,2.414,-1.683,1.277,9.158,11.165,69.64% (39/56),303.90 yds,325.00 yds,68.06% (49/72),66.67% (6/9),82.61% (19/23),1.65,114.0
7,7,Billy Horschel,3.132,0.821,6.643,10.165,67.86% (38/56),295.20 yds,323.00 yds,70.83% (51/72),83.33% (5/6),76.19% (16/21),1.69,101.0,1.0
8,8,Scottie Scheffler,3.584,6.618,0.061,-0.097,10.165,71.43% (40/56),306.60 yds,331.00 yds,72.22% (52/72),62.50% (5/8),65.00% (13/20),1.64,75.0
9,9,Justin Thomas,4.286,4.126,5.203,-3.449,10.165,60.71% (34/56),310.20 yds,327.00 yds,79.17% (57/72),50.00% (2/4),73.33% (11/15),1.83,65.0


In [20]:
stats_name = driver.find_elements(By.XPATH, '//span[@class="chakra-text css-6vsfej"]')

In [34]:
stat_name = []
for elem in stats_name[0:17]:
    i = elem.text
    stat_name.append(i)

In [94]:
# Convert the dictionary to a DataFrame
df_stats = pd.DataFrame.from_dict(final_dict, orient='index', columns=stat_name_col1)

# If you want the names to be a column instead of the index
df_stats.reset_index(inplace=True)
df_stats.rename(columns={'index': 'Name'}, inplace=True)

In [97]:
# df_stats.to_csv('pga_stats')

In [298]:
top_10_names = df['Name'][0:10]
top_10_stats = {}

# Create master df

In [250]:
# Create principal df using dictionary
pga_dict = {'Name': names, 'Country': countries, 'Position': positions, 'Total score': total_scores, 'R1': r1, 'R2': r2, 'R3': r3, 'R4': r4, 'Total strokes': total_strokes, 'Status' : round_status}
df = pd.DataFrame(pga_dict)
# df.to_csv('pga_champ_24')

In [6]:
df

Unnamed: 0.1,Unnamed: 0,Name,Country,Position,Total score,R1,R2,R3,R4,Total strokes,Status
0,0,Xander Schauffele,USA,1,-21,62,68,68,65,263,R4 Completed
1,1,Bryson DeChambeau,USA,2,-20,68,65,67,64,264,R4 Completed
2,2,Viktor Hovland,NOR,3,-18,68,66,66,66,266,R4 Completed
3,3,Thomas Detry,BEL,T4,-15,66,67,70,66,269,R4 Completed
4,4,Collin Morikawa,USA,T4,-15,66,65,67,71,269,R4 Completed
...,...,...,...,...,...,...,...,...,...,...,...
151,151,Jeff Kellen,USA,CUT,+17,87,72,-,-,159,R2 Completed
152,152,Rich Beem,USA,CUT,+20,79,83,-,-,162,R2 Completed
153,153,John Daly,USA,WD,-,82,-,-,-,82,Withdrew
154,154,Ben Griffin,USA,WD,-,73,65,-,-,138,Withdrew


# Create hole by hole dfs

In [251]:
# Create dictionary with hole by hole data using nested list comprehension

# Set up column names and par series
names_countries_dict = {"Name": names, "Country": countries}
player_df = pd.DataFrame(names_countries_dict)
col_names = [elem for elem in holes]
col_names_add = ["Name","Country"]
col_names_all = col_names_add + col_names
par_df = pd.DataFrame(par_scores, columns = col_names_all)

# Round 1
holes_1_dict = {str(hole): [r1_scores[player][hole-1] for player in range(len(r1_scores))] for hole in range(1,19)}
df_holes_1 = pd.DataFrame(holes_1_dict)
round_1_hbh = pd.concat([player_df, df_holes_1], axis = 1)
round_1 = pd.concat([par_df, round_1_hbh,], axis = 0)
# round_1.to_csv('round_1_hbh')

# Round 2
holes_2_dict = {str(hole): [r2_scores[player][hole-1] for player in range(len(r2_scores))] for hole in range(1,19)}
df_holes_2 = pd.DataFrame(holes_2_dict)
round_2_hbh = pd.concat([player_df, df_holes_2], axis = 1)
round_2 = pd.concat([par_df, round_2_hbh,], axis = 0)
# round_2.to_csv('round_2_hbh')

# Round 3
holes_3_dict = {str(hole): [r3_scores[player][hole-1] for player in range(len(r3_scores))] for hole in range(1,19)}
df_holes_3 = pd.DataFrame(holes_3_dict)
round_3_hbh = pd.concat([player_df, df_holes_3], axis = 1)
round_3 = pd.concat([par_df, round_3_hbh,], axis = 0)
# round_3.to_csv('round_3_hbh')

# Round 4
holes_4_dict = {str(hole): [r4_scores[player][hole-1] for player in range(len(r4_scores))] for hole in range(1,19)}
df_holes_4 = pd.DataFrame(holes_4_dict)
round_4_hbh = pd.concat([player_df, df_holes_4], axis = 1)
round_4 = pd.concat([par_df, round_4_hbh,], axis = 0)
# round_4.to_csv('round_4_hbh')