# Read Libs

In [26]:
import pandas as pd
import datetime as dt
import numpy as np
import requests as req
import os
from bs4 import BeautifulSoup
import html5lib
import time
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# display and output settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML




display(HTML('''
<style>
    .container { width:85% !important; }
</style>
'''))

pd.set_option('display.min_rows', 100)

# Read in Tournament & Weather URL Details

In [2]:
path_tournament_detail = "../data/transformed_data/tournament_info/2024-10-06 16h20m45s/tournament_details_transformed.csv"
path_tournament_scores = "../data/transformed_data/tournament_scores/2024-10-06 16h20m45s/tournament_scores_transformed.csv"
path_weather_urls = "../data/raw_data/weather/weather_city_base_urls/2024-10-26 12h17m18s/weather_city_base_urls.csv"

df_tournament_details = pd.read_csv(path_tournament_detail)
df_tournament_scores = pd.read_csv(path_tournament_scores)
df_weather_urls = pd.read_csv(path_weather_urls)

In [3]:
df_tournament_details.head()
df_weather_urls.head()
df_tournament_scores.head()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14
3,4,Sony Open in Hawaii,"January 18 - 21, 2001",4000000.0,"Honolulu, HI",7044.0,70.0,Waialae Country Club,2001-01-18,2001-01-21
4,5,Phoenix Open,"January 25 - 28, 2001",4000000.0,"Scottsdale, AZ",,71.0,TPC Scottsdale (Stadium Course),2001-01-25,2001-01-28


Unnamed: 0,City,base_url,is_golf_club
0,"Kapalua, HI",https://www.wunderground.com/history/daily/RJO...,False
1,"Tucson, AZ",https://www.wunderground.com/history/daily/KTU...,False
2,"Honolulu, HI",https://www.wunderground.com/history/daily/PHN...,False
3,"Scottsdale, AZ",https://www.wunderground.com/history/daily/KPH...,False
4,"Pacific Palisades, CA",https://www.wunderground.com/history/daily/KLA...,False


Unnamed: 0,event_id,POS,PLAYER,SCORE,R1,R2,R3,R4,TOT,EARNINGS,FEDEX PTS,par
0,2,1,Jim Furyk,-18,69,69,69,67,274,"$630,000",,73.0
1,2,2,Rory Sabbatini,-17,69,69,65,72,275,"$380,000",,73.0
2,2,T3,Ernie Els,-16,68,66,73,69,276,"$203,000",,73.0
3,2,T3,Vijay Singh,-16,71,67,67,71,276,"$203,000",,73.0
4,2,5,John Huston,-15,74,67,69,67,277,"$140,000",,73.0


# Clean Weather Urls

In [4]:
df_weather_urls['base_url'].apply(lambda x: len(str(x))).value_counts()


base_url
63    266
3     140
36      7
Name: count, dtype: int64

In [5]:
# replace urls if they are not valid by rules of length being less than 62
# takes away urls that could not find a city and are null
df_weather_urls['base_url'] = df_weather_urls['base_url'].apply(lambda x: None if len(str(x)) < 62 else x)

# Create tournament calendar | Round by Round Dates

In [6]:
df_tournament_details.head(3)
df_tournament_details.info()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2201 entries, 0 to 2200
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   event_id          2201 non-null   int64  
 1   tournament_title  2201 non-null   object 
 2   event_date        2201 non-null   object 
 3   purse             1804 non-null   float64
 4   city              1947 non-null   object 
 5   Yards             1891 non-null   float64
 6   par               2160 non-null   float64
 7   golf_course       1562 non-null   object 
 8   start_date        2199 non-null   object 
 9   end_date          2199 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 172.1+ KB


In [7]:
def create_array_of_dates(start_date, end_date):
    """
    creates an array of dates between start_date and end_date
    """
    try:

        start_date = dt.datetime.strptime(start_date, "%Y-%m-%d")
        end_date = dt.datetime.strptime(end_date, "%Y-%m-%d")
        end_date = end_date + dt.timedelta(days=1)

        date_generated = [start_date + dt.timedelta(days=x) for x in range(0, (end_date-start_date).days)]
        date_generated = [date.strftime("%Y-%m-%d") for date in date_generated]
        return date_generated
    except:
        return None


In [8]:
df_tournament_details['round_dates']=\
df_tournament_details.apply(lambda x: create_array_of_dates(x['start_date'], x['end_date']), axis=1)


In [9]:
# explode the dates to have one row per date
df_tournament_details = df_tournament_details.explode('round_dates')

In [10]:
# create a column for the round of the tournament
df_tournament_details['round'] = df_tournament_details.groupby('event_id').cumcount() + 1

## Merge dataframes

In [11]:
df_merged_tournament_weather = pd.merge(df_tournament_details, df_weather_urls, left_on="city", right_on="City", how='left')

In [12]:
def get_url_weather_date(base_url, date):
    """
    creates a url for the weather for a specific date
    """
    try:
        match_date = re.findall("\d{4}-\d{1,2}-\d{1,2}", base_url)[0]

        return base_url.replace(match_date, date)
    except:
        return None    
    

In [13]:
df_merged_tournament_weather.head(3)

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date,round_dates,round,City,base_url,is_golf_club
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-04,1,,,
1,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-05,2,,,
2,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-06,3,,,


In [14]:
df_merged_tournament_weather['weather_url'] = df_merged_tournament_weather\
    .apply(lambda x: get_url_weather_date(x['base_url'], x['round_dates']), axis=1)


In [15]:
df_merged_tournament_weather.head(5)

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date,round_dates,round,City,base_url,is_golf_club,weather_url
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-04,1,,,,
1,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-05,2,,,,
2,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-06,3,,,,
3,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-07,4,,,,
4,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14,2001-01-11,1,"Kapalua, HI",https://www.wunderground.com/history/daily/RJO...,False,https://www.wunderground.com/history/daily/RJO...


# Get Table from URLs

In [16]:
for url in df_merged_tournament_weather['weather_url'].unique()[:12]:
    print(url)

None
https://www.wunderground.com/history/daily/RJOA/date/2001-01-11
https://www.wunderground.com/history/daily/RJOA/date/2001-01-12
https://www.wunderground.com/history/daily/RJOA/date/2001-01-13
https://www.wunderground.com/history/daily/RJOA/date/2001-01-14
https://www.wunderground.com/history/daily/KTUS/date/2001-01-11
https://www.wunderground.com/history/daily/KTUS/date/2001-01-12
https://www.wunderground.com/history/daily/KTUS/date/2001-01-13
https://www.wunderground.com/history/daily/KTUS/date/2001-01-14
https://www.wunderground.com/history/daily/PHNL/date/2001-01-18
https://www.wunderground.com/history/daily/PHNL/date/2001-01-19
https://www.wunderground.com/history/daily/PHNL/date/2001-01-20


In [33]:

# Specify the path to the ChromeDriver executable
chrome_driver_path = '../drivers/chromedriver_arm'
# firefox_driver_path = '../drivers/geckodriver'

#link for solution to cannot open chromedriver
# https://stackoverflow.com/questions/60362018/macos-catalinav-10-15-3-error-chromedriver-cannot-be-opened-because-the-de
# xattr -d com.apple.quarantine

driver = webdriver.Chrome(executable_path=chrome_driver_path)

# URL of the page with dynamically loaded tables
url = 'https://www.wunderground.com/history/daily/KTUS/date/2001-01-13'

# Load the page
driver.get(url)
driver.set_page_load_timeout(10)


x_path_observation = "/html/body/app-root/app-history/one-column-layout/wu-header/sidenav/mat-sidenav-container/mat-sidenav-content/div[2]/section/div[2]/div[1]/div[5]/div[1]/div/lib-city-history-observation/div/div[2]/table"
# Wait for the table to load (adjust the selector as needed)
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, x_path_observation))
)

# Extract the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

driver.quit()

# Find all tables
tables = soup.find_all('table')

table_observation = soup.select_one('table[xpath="{}"]'.format(x_path_observation))

# Convert the first table to a DataFrame (adjust as needed)
df = pd.read_html(str(tables[0]))[0]



  driver = webdriver.Chrome(executable_path=chrome_driver_path)


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=130.0.6723.70)
Stacktrace:
0   chromedriver_arm                    0x0000000102a84500 cxxbridge1$str$ptr + 1917112
1   chromedriver_arm                    0x0000000102a7c890 cxxbridge1$str$ptr + 1885256
2   chromedriver_arm                    0x000000010268c538 cxxbridge1$string$len + 89424
3   chromedriver_arm                    0x0000000102667980 core::str::slice_error_fail::hbaf5d05fe3921cd2 + 3776
4   chromedriver_arm                    0x00000001026f7234 cxxbridge1$string$len + 526924
5   chromedriver_arm                    0x000000010270a5c4 cxxbridge1$string$len + 605660
6   chromedriver_arm                    0x00000001026c5374 cxxbridge1$string$len + 322444
7   chromedriver_arm                    0x00000001026c5fc4 cxxbridge1$string$len + 325596
8   chromedriver_arm                    0x0000000102a4bd2c cxxbridge1$str$ptr + 1685732
9   chromedriver_arm                    0x0000000102a50530 cxxbridge1$str$ptr + 1704168
10  chromedriver_arm                    0x0000000102a30e08 cxxbridge1$str$ptr + 1575360
11  chromedriver_arm                    0x0000000102a50e00 cxxbridge1$str$ptr + 1706424
12  chromedriver_arm                    0x0000000102a21f94 cxxbridge1$str$ptr + 1514316
13  chromedriver_arm                    0x0000000102a6d62c cxxbridge1$str$ptr + 1823204
14  chromedriver_arm                    0x0000000102a6d7ac cxxbridge1$str$ptr + 1823588
15  chromedriver_arm                    0x0000000102a7c530 cxxbridge1$str$ptr + 1884392
16  libsystem_pthread.dylib             0x00000001805def94 _pthread_start + 136
17  libsystem_pthread.dylib             0x00000001805d9d34 thread_start + 8


# Write Hourly Weather to File

In [43]:
# output_path = "../data/raw_data/weather/hourly_weather/"
# timestamp_run = dt.datetime.now().strftime('%Y-%m-%d %Hh%Mm%Ss')

# os.mkdir(os.path.join(output_path, timestamp_run))

# df_unique_cities.to_csv(f'{output_path}/{timestamp_run}/weather_city_base_urls.csv', index=False)