# Read Libs

In [1]:
import configparser
import datetime as dt
import io
import os
import pdb

import numpy as np
import pandas as pd
import requests as req



# --- Notebook display / output settings ---
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML

# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Inject CSS that sets `.container` elements to 85% width
# (presumably the notebook page container, to give wide tables more room).
container_css = HTML('''
<style>
    .container { width:85% !important; }
</style>
''')
display(container_css)

# Row count pandas shows when a long frame's repr is truncated.
pd.set_option('display.min_rows', 100)

## Get Golf Data - Webscrape ESPN as Source

* ESPN changed its tournament ID scheme on April 1, 2018.
* The IDs jump from 3756 (Houston Open) to 401025221 (2018 Masters).
* The RBC Heritage, played right after the 2018 Masters, has ID 401025246, which is another jump.
* The 2018 Tour Championship ID is 401025268.
* From there the IDs appear to be sequential again.
* The 2020 Masters (played in November) has a special ID, 401219478, which must be added to batch #2 manually.

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Browser-like User-Agent sent with each request; added (on ChatGPT's
# recommendation) so the website is less likely to block the scraper.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

In [4]:
# Batch 1: the old-style sequential tournament IDs, covering 2001 through April 1, 2018.
# 3756 (Houston Open) is the last ID before ESPN's numbering jumps, so it must be
# part of this batch; range() excludes its end value, hence the +1.
LAST_SEQUENTIAL_TOURNAMENT_ID = 3756
batch_1 = list(range(0, LAST_SEQUENTIAL_TOURNAMENT_ID + 1))

In [5]:
# The newer tournament IDs are not contiguous from one season to the next, so the
# first and last ID of every period were looked up by hand.
# Maps a period label -> (first_id, last_id); both ends are meant to be included.
batch_2_years = {
    "2017-2018": (401025221, 401025268),
    "2018-2019": (401056252, 401056542),
    "2019-2020": (401148233, 401148245),
    "Jan2020-Sep2020": (401155413, 401155476),
    "Sep2020-Dec2020": (401219793, 401219802),
    "masters2020": (401219478, 401219478),  # single event: first == last
    "Jan2021-Sep2021": (401242996, 401243402),
    "Sep2021-Aug2022": (401353193, 401353276),
    # NOTE(review): the next two labels look mistyped (one period ends before it
    # starts, and both begin with "Sep2022"). Labels are left unchanged here;
    # confirm the intended periods before relying on them.
    "Sep2022-Aug2022": (401465496, 401465545),
    "Sep2022-Dec2023": (401552854, 401552861),
    "Jan2024-Aug2024": (401580329, 401580366),
}

# looking at number of id's to request
# for k, v in batch_2_years.items():
#     print(f"{k}: {v[1] - v[0]}")

In [6]:
# Expand every (first_id, last_id) pair of batch_2_years into the individual IDs.
# range() stops before its end argument, so last_id + 1 keeps the last ID in.
batch_2 = [
    event_id
    for first_id, last_id in batch_2_years.values()
    for event_id in range(first_id, last_id + 1)
]

In [7]:
# Full ID list: the sequential batch 1 IDs followed by the batch 2 IDs
# (the ranges after the jump in numbering).
id_list = [*batch_1, *batch_2]

In [8]:
# For each detail scraped from a tournament page: the HTML tag to search for
# and the CSS class string used to locate it.
# Shape: output column name -> {"element": tag, "css_class": class string}
data_info = {
    "tournament_title": {
        "element": "h1",
        "css_class": "headline headline__h1 Leaderboard__Event__Title",
    },
    "event_date": {
        "element": "span",
        "css_class": "Leaderboard__Event__Date n7",
    },
    "golf_course": {
        "element": "div",
        "css_class": "Leaderboard__Course__Location n8 clr-gray-04",
    },
    "course_details": {
        "element": "div",
        "css_class": "Leaderboard__Course__Location__Detail n8 clr-gray-04",
    },
    "purse": {
        "element": "div",
        "css_class": "n7 clr-gray-04",
    },
}


In [14]:
# Results frame: one row per tournament ID, with an (initially empty) column
# for every detail listed in data_info.
df_tournament_details_df = (
    pd.DataFrame(columns=['event_id', *data_info])
    .assign(event_id=id_list)
)

In [15]:
# Scrape the header details listed in data_info for every tournament ID in the
# selected batch, store them in df_tournament_details_df, then write the frame
# to a timestamped folder.
timestamp_run = dt.datetime.now().strftime('%Y-%m-%d %Hh%Mm%Ss')

# for tournament_id in batch_1:
for tournament_id in batch_2:

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        res = req.get(
            f"https://www.espn.com/golf/leaderboard/_/tournamentId/{tournament_id}",
            headers=headers,
            timeout=30,
        )
        soup = BeautifulSoup(res.content, 'lxml')
    except Exception as err:
        # `Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) still stops the loop; the run stays best-effort.
        print(f"Could not request page for tournament ID: {tournament_id} ({err!r})")
        continue

    # The row selector is the same for every field of this tournament: build it once.
    row_mask = df_tournament_details_df['event_id'] == tournament_id

    for col, selector in data_info.items():
        node = soup.find(selector['element'], class_=selector['css_class'])
        if node is None:
            # The element is not on the page: record a placeholder and report it.
            value = "No details found"
            print(f"No details found for tournament ID: {tournament_id} for field: {col}")
        else:
            value = node.get_text()

        # Insert the value into the row of the matching tournament ID.
        df_tournament_details_df.loc[row_mask, col] = value

### Save Data to Folder

# NOTE(review): the frame is initialized with every ID in id_list, so rows for
# IDs this loop did not visit are written to the CSV with empty detail columns.

# output_path = "/Users/Mishaun_Bhakta/Documents/Python & Projects/Projects/PGA Golf Data/data/raw_data/tournament_info/batch_1"
output_path = "/Users/Mishaun_Bhakta/Documents/Python & Projects/Projects/PGA Golf Data/data/raw_data/tournament_info/batch_2"

# makedirs also creates missing parent folders and tolerates an existing folder.
run_dir = os.path.join(output_path, timestamp_run)
os.makedirs(run_dir, exist_ok=True)
df_tournament_details_df.to_csv(os.path.join(run_dir, "tournament_info.csv"), index=False)

No details found for tournament ID: 401025222 for field: golf_course
No details found for tournament ID: 401025222 for field: course_details
No details found for tournament ID: 401025223 for field: golf_course
No details found for tournament ID: 401025223 for field: course_details
No details found for tournament ID: 401025224 for field: golf_course
No details found for tournament ID: 401025224 for field: course_details
No details found for tournament ID: 401025225 for field: golf_course
No details found for tournament ID: 401025225 for field: course_details
No details found for tournament ID: 401025226 for field: golf_course
No details found for tournament ID: 401025226 for field: course_details
No details found for tournament ID: 401025227 for field: golf_course
No details found for tournament ID: 401025227 for field: course_details
No details found for tournament ID: 401025228 for field: golf_course
No details found for tournament ID: 401025228 for field: course_details
No details fo

In [171]:
# test code for detecting elements on page

# try:
#     tournament_title = soup.find('h1', class_='headline headline__h1 Leaderboard__Event__Title').get_text()
#     event_date = soup.find('span', 'Leaderboard__Event__Date n7').get_text()

#     golf_course = soup.find('div', class_='Leaderboard__Course__Location n8 clr-gray-04').get_text()
#     course_details = soup.find('div', class_ = 'Leaderboard__Course__Location__Detail n8 clr-gray-04').get_text()
#     purse = soup.find('div', class_ = 'n7 clr-gray-04').get_text()

#     tournament_title
#     event_date
#     golf_course
#     course_details
#     purse
# except:
#     print(f"No details found for tournament ID = {tournament_id}")

## Gather Scores

In [None]:
# Collect the leaderboard (scores) table of every tournament ID in the selected
# batch, tag each table with its event_id, then write all of them to one CSV in
# a timestamped folder.
timestamp_run = dt.datetime.now().strftime('%Y-%m-%d %Hh%Mm%Ss')
tournament_scores_df_list = []

# When a page holds several tables (e.g. a playoff table next to the
# leaderboard), a table needs at least this many rows to count as the scores.
min_leaderboard_rows = 5

# for tournament_id in batch_2:
for tournament_id in batch_1:

    print(f'running tournament id: {tournament_id}')

    try:
        url = f"https://www.espn.com/golf/leaderboard/_/tournamentId/{tournament_id}"
        # Fetch with the same browser-like headers as the details scrape (to avoid
        # being blocked) and with a timeout so a stalled request cannot hang the run.
        page = req.get(url, headers=headers, timeout=30)
        page.raise_for_status()
        html_tables = pd.read_html(io.StringIO(page.text))

        if len(html_tables) == 1:
            # A single table on the page is assumed to be the scores.
            scores = html_tables[0]
        else:
            # Several tables: keep the last one that is large enough. Previously a
            # small trailing table replaced a valid leaderboard with a placeholder
            # string, and the tournament was then silently dropped.
            candidates = [table for table in html_tables if len(table) >= min_leaderboard_rows]
            if not candidates:
                raise ValueError("no table with enough rows to be a leaderboard")
            scores = candidates[-1]

        scores['event_id'] = tournament_id
        tournament_scores_df_list.append(scores)

    except Exception:
        # `Exception` rather than a bare `except`, so a kernel interrupt still
        # stops the loop; any scraping/parsing failure just skips this ID.
        print(f"tournament scores not available for ID: {tournament_id}")

### Save Data to Folder

output_path = "/Users/Mishaun_Bhakta/Documents/Python & Projects/Projects/PGA Golf Data/data/raw_data/tournament_scores/batch_1"
# output_path = "/Users/Mishaun_Bhakta/Documents/Python & Projects/Projects/PGA Golf Data/data/raw_data/tournament_scores/batch_2"

if tournament_scores_df_list:
    tournament_scores_df = pd.concat(tournament_scores_df_list)

    # makedirs also creates missing parent folders and tolerates an existing folder.
    run_dir = os.path.join(output_path, timestamp_run)
    os.makedirs(run_dir, exist_ok=True)
    tournament_scores_df.to_csv(os.path.join(run_dir, "tournament_scores.csv"), index=False)
else:
    # pd.concat raises on an empty list; report the empty run instead of crashing.
    tournament_scores_df = pd.DataFrame()
    print("No tournament scores were collected; nothing was saved.")