# Read Libs

In [35]:
import pandas as pd
import datetime as dt
import numpy as np
import requests as req
import os
from bs4 import BeautifulSoup
import html5lib
import time
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# display and output settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML




display(HTML('''
<style>
    .container { width:85% !important; }
</style>
'''))

pd.set_option('display.min_rows', 100)

# Read in Tournament & Weather URL Details

In [36]:
path_tournament_detail = "../data/transformed_data/tournament_info/2024-10-06 16h20m45s/tournament_details_transformed.csv"
path_tournament_scores = "../data/transformed_data/tournament_scores/2024-10-06 16h20m45s/tournament_scores_transformed.csv"
path_weather_urls = "../data/raw_data/weather/weather_city_base_urls/2024-10-26 12h17m18s/weather_city_base_urls.csv"

df_tournament_details = pd.read_csv(path_tournament_detail)
df_tournament_scores = pd.read_csv(path_tournament_scores)
df_weather_urls = pd.read_csv(path_weather_urls)

In [37]:
df_tournament_details.head()
df_weather_urls.head()
df_tournament_scores.head()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14
3,4,Sony Open in Hawaii,"January 18 - 21, 2001",4000000.0,"Honolulu, HI",7044.0,70.0,Waialae Country Club,2001-01-18,2001-01-21
4,5,Phoenix Open,"January 25 - 28, 2001",4000000.0,"Scottsdale, AZ",,71.0,TPC Scottsdale (Stadium Course),2001-01-25,2001-01-28


Unnamed: 0,City,base_url,is_golf_club
0,"Kapalua, HI",https://www.wunderground.com/history/daily/RJOA/date/2024-10-26,False
1,"Tucson, AZ",https://www.wunderground.com/history/daily/KTUS/date/2024-10-26,False
2,"Honolulu, HI",https://www.wunderground.com/history/daily/PHNL/date/2024-10-26,False
3,"Scottsdale, AZ",https://www.wunderground.com/history/daily/KPHX/date/2024-10-26,False
4,"Pacific Palisades, CA",https://www.wunderground.com/history/daily/KLAX/date/2024-10-26,False


Unnamed: 0,event_id,POS,PLAYER,SCORE,R1,R2,R3,R4,TOT,EARNINGS,FEDEX PTS,par
0,2,1,Jim Furyk,-18,69,69,69,67,274,"$630,000",,73.0
1,2,2,Rory Sabbatini,-17,69,69,65,72,275,"$380,000",,73.0
2,2,T3,Ernie Els,-16,68,66,73,69,276,"$203,000",,73.0
3,2,T3,Vijay Singh,-16,71,67,67,71,276,"$203,000",,73.0
4,2,5,John Huston,-15,74,67,69,67,277,"$140,000",,73.0


# Clean Weather Urls

In [38]:
df_weather_urls['base_url'].apply(lambda x: len(str(x))).value_counts()


base_url
63    266
3     140
36      7
Name: count, dtype: int64

In [39]:
# replace urls if they are not valid by rules of length being less than 62
# takes away urls that could not find a city and are null
df_weather_urls['base_url'] = df_weather_urls['base_url'].apply(lambda x: None if len(str(x)) < 62 else x)

# Create tournament calendar | Round by Round Dates

In [40]:
df_tournament_details.head(3)
df_tournament_details.info()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2201 entries, 0 to 2200
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   event_id          2201 non-null   int64  
 1   tournament_title  2201 non-null   object 
 2   event_date        2201 non-null   object 
 3   purse             1804 non-null   float64
 4   city              1947 non-null   object 
 5   Yards             1891 non-null   float64
 6   par               2160 non-null   float64
 7   golf_course       1562 non-null   object 
 8   start_date        2199 non-null   object 
 9   end_date          2199 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 172.1+ KB


In [41]:
def create_array_of_dates(start_date, end_date):
    """
    creates an array of dates between start_date and end_date
    """
    try:

        start_date = dt.datetime.strptime(start_date, "%Y-%m-%d")
        end_date = dt.datetime.strptime(end_date, "%Y-%m-%d")
        end_date = end_date + dt.timedelta(days=1)

        date_generated = [start_date + dt.timedelta(days=x) for x in range(0, (end_date-start_date).days)]
        date_generated = [date.strftime("%Y-%m-%d") for date in date_generated]
        return date_generated
    except:
        return None


In [42]:
df_tournament_details['round_dates']=\
df_tournament_details.apply(lambda x: create_array_of_dates(x['start_date'], x['end_date']), axis=1)


In [43]:
# explode the dates to have one row per date
df_tournament_details = df_tournament_details.explode('round_dates')

In [44]:
# create a column for the round of the tournament
df_tournament_details['round'] = df_tournament_details.groupby('event_id').cumcount() + 1

## Merge dataframes

In [47]:
df_merged_tournament_weather = pd.merge(df_tournament_details, df_weather_urls, left_on="city", right_on="City", how='left')

In [48]:
def get_url_weather_date(base_url, date):
    """
    creates a url for the weather for a specific date
    """
    try:
        match_date = re.findall("\d{4}-\d{1,2}-\d{1,2}", base_url)[0]

        return base_url.replace(match_date, date)
    except:
        return None    
    

In [49]:
df_merged_tournament_weather.head(3)

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date,round_dates,round,City,base_url,is_golf_club
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-04,1,,,
1,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-05,2,,,
2,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-06,3,,,


In [50]:
df_merged_tournament_weather['weather_url'] = df_merged_tournament_weather\
    .apply(lambda x: get_url_weather_date(x['base_url'], x['round_dates']), axis=1)


# Get Table from URLs

In [116]:
# selenium chromedriver
# Specify the path to the ChromeDriver executable
chrome_driver_path = '../drivers/chromedriver_arm'
# firefox_driver_path = '../drivers/geckodriver'

#link for solution to cannot open chromedriver
# https://stackoverflow.com/questions/60362018/macos-catalinav-10-15-3-error-chromedriver-cannot-be-opened-because-the-de
# xattr -d com.apple.quarantine

#open the browser
driver = webdriver.Chrome(executable_path=chrome_driver_path)

  driver = webdriver.Chrome(executable_path=chrome_driver_path)


In [141]:
len(df_merged_tournament_weather[~df_merged_tournament_weather['weather_url'].isnull()])

6200

In [None]:
# import warnings filter to suppress warnings
import warnings
warnings.filterwarnings('ignore')

#initalize the list of dataframes to store the hourly weather data
df_list_hourly_weather_data = []
counter = 0

# loop through data framewith valid weather urls
for index, row in df_merged_tournament_weather[~df_merged_tournament_weather['weather_url'].isnull()].iloc[4:9].iterrows():
   
    # for each row in dataframe that has a weather url get the event id, round and weather url
    weather_url = row['weather_url']
    event_id = row['event_id']
    round = row['round']
    tournament_city = row['city']
    counter += 1

    print(f"{counter} | Event ID: {event_id}, Round: {round}, Weather URL: {weather_url}")

    ### Use selenium to load the page and extract the hourly weather data

    # URL of the page with dynamically loaded tables
    url = weather_url

    # Load the page
    driver.get(url)
    driver.set_page_load_timeout(10)

    # Xpath to the table with the hourly weather data
    x_path_observation = "/html/body/app-root/app-history/one-column-layout/wu-header/sidenav/mat-sidenav-container/mat-sidenav-content/div[2]/section/div[2]/div[1]/div[5]/div[1]/div/lib-city-history-observation/div/div[2]/table"

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, x_path_observation))
        )
        
        # Extract the page source after javascript has loaded
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Find table within the div with the class 'observation-table'
        table_observation = soup.find('div', class_='observation-table').find('table')

        # Find the actual city name from the page
        weather_history_span = soup.find('span', string=re.compile("Weather History"))
        actual_city = weather_history_span.text.replace("Weather History", "").strip()
    

        # Convert the hourly weather data html table to a pandas DataFrame
        df_hourly_weather = pd.read_html(str(table_observation))[0]

        #clean up rows that have nulls in the Time column
        df_hourly_weather.dropna(subset=['Time'], inplace=True)

        # adding key columns to map back to the tournament details dataframe
        df_hourly_weather['event_id'] = event_id
        df_hourly_weather['round'] = round
        df_hourly_weather['weather_url'] = weather_url

        #adding city columns to help compare later if that city pulled for the weather is near the tournament city
        df_hourly_weather['tournament_city'] = tournament_city
        df_hourly_weather['actual_weather_city'] = actual_city

        # Append the hourly weather data to the list of dataframes
        df_list_hourly_weather_data.append(df_hourly_weather)

    except:
        print("page did not load in time for see hourly weather table")

# Concatenate the list of hourly weather dataframes into a single dataframe
df_hourly_weather_data_unioned = pd.concat(df_list_hourly_weather_data)



1 | Event ID: 3, Round: 1, Weather URL: https://www.wunderground.com/history/daily/KTUS/date/2001-01-11


<selenium.webdriver.remote.webelement.WebElement (session="0bedc025b500f7c668e32e00a57e625f", element="f.1A7B9D02980001EDD28566461C22C8A2.d.6060179D6A947EFB435A1F95BB18C754.e.134602")>

2 | Event ID: 3, Round: 2, Weather URL: https://www.wunderground.com/history/daily/KTUS/date/2001-01-12


<selenium.webdriver.remote.webelement.WebElement (session="0bedc025b500f7c668e32e00a57e625f", element="f.1A7B9D02980001EDD28566461C22C8A2.d.AC2C513E58D0D6BFA4F4BE3C16DF99DB.e.137027")>

3 | Event ID: 3, Round: 3, Weather URL: https://www.wunderground.com/history/daily/KTUS/date/2001-01-13


# Write Hourly Weather to File

In [146]:
output_path = "../data/raw_data/weather/hourly_weather/"
timestamp_run = dt.datetime.now().strftime('%Y-%m-%d %Hh%Mm%Ss')

os.mkdir(os.path.join(output_path, timestamp_run))

df_hourly_weather_data_unioned.to_csv(f'{output_path}/{timestamp_run}/weather_city_base_urls.csv', index=False)