# Import Libraries

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import os
import re
import shutil

# display and output settings
# Show the result of every expression statement in a cell, not only the last one;
# the preview cells further down list several `.head()` calls and rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML


# Inject CSS that widens the notebook container to 85% of the browser width
display(HTML('''
<style>
    .container { width:85% !important; }
</style>
'''))

# Number of rows pandas shows when it truncates a long frame in its repr
pd.set_option('display.min_rows', 100)

# Read Files In

In [2]:
# Folders that hold the transformed tournament info, scores and cleaned weather data
path_transform_data_tournament_info = "../data/transformed_data/tournament_info"
path_transform_data_tournament_scores = "../data/transformed_data/tournament_scores"
path_transform_data_tournament_weather = "../data/transformed_data/weather_data/clean_weather_values"

# Order matters: the loading cell reads the resolved files back by position
# (0 = info, 1 = scores, 2 = weather)
list_paths_data = [
    path_transform_data_tournament_info,
    path_transform_data_tournament_scores,
    path_transform_data_tournament_weather,
]

In [3]:
# getting latest file in each directory

def _visible_entries(directory):
    """Return the entries of `directory` sorted by name, skipping macOS `.DS_Store` files."""
    return sorted(entry for entry in os.listdir(directory) if ".DS_Store" not in entry)


list_file_path_data = []
for directory in list_paths_data:

    batch_runs = _visible_entries(directory)
    if not batch_runs:
        raise FileNotFoundError(f"No batch runs found in {directory}")

    # run folder names are compared as strings; the greatest name is treated as the latest run
    latest_run = batch_runs[-1]

    file_path = _visible_entries(f"{directory}/{latest_run}")
    if not file_path:
        raise FileNotFoundError(f"No files found in {directory}/{latest_run}")

    # os.listdir gives no ordering guarantee, so pick the greatest name instead of
    # whatever happens to come first; with a single file per run the result is unchanged
    latest_file = file_path[-1]

    latest_file_path = f"{directory}/{latest_run}/{latest_file}"

    list_file_path_data.append(latest_file_path)


In [4]:
# list_file_path_data follows the order of list_paths_data: info, scores, weather
details, scores, weather = (
    pd.read_csv(list_file_path_data[position]) for position in range(3)
)

## Preview

In [5]:
# Preview the first rows of each loaded table.
# All three results are rendered because ast_node_interactivity is set to "all" in the first cell.
details.head()
scores.head()
weather.head()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14
3,4,Sony Open in Hawaii,"January 18 - 21, 2001",4000000.0,"Honolulu, HI",7044.0,70.0,Waialae Country Club,2001-01-18,2001-01-21
4,5,Phoenix Open,"January 25 - 28, 2001",4000000.0,"Scottsdale, AZ",,71.0,TPC Scottsdale (Stadium Course),2001-01-25,2001-01-28


Unnamed: 0,event_id,POS,PLAYER,SCORE,R1,R2,R3,R4,TOT,EARNINGS,FEDEX PTS,par
0,2,1,Jim Furyk,-18,69,69,69,67,274,"$630,000",,73.0
1,2,2,Rory Sabbatini,-17,69,69,65,72,275,"$380,000",,73.0
2,2,T3,Ernie Els,-16,68,66,73,69,276,"$203,000",,73.0
3,2,T3,Vijay Singh,-16,71,67,67,71,276,"$203,000",,73.0
4,2,5,John Huston,-15,74,67,69,67,277,"$140,000",,73.0


Unnamed: 0,Time,Temperature (°F),Dew Point,Humidity,Wind,Wind Speed (mph),Wind Gust (mph),Pressure,Rain (in.),Condition,event_id,round,weather_url,tournament_city,actual_weather_city,distance_between_cities,flag_valid_weather,Hour,Flag Hour In Play
0,11:55 PM,44.0,38.0,79.0,ESE,7.0,0.0,27.36,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,23,0
1,12:55 AM,42.0,39.0,89.0,ESE,7.0,0.0,27.35,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,0,0
2,1:55 AM,43.0,38.0,82.0,ESE,5.0,0.0,27.32,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,1,0
3,2:55 AM,39.0,37.0,93.0,CALM,0.0,0.0,27.33,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,2,0
4,3:55 AM,41.0,37.0,86.0,S,5.0,0.0,27.33,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,3,0


# Transformations

### Create Round Dates from Details | Taking Logic from Notebook 4

In [6]:
def create_array_of_dates(start_date, end_date):
    """
    creates an array of dates between start_date and end_date (both inclusive)

    Args:
        start_date (str): first day, formatted as "%Y-%m-%d".
        end_date (str): last day, formatted as "%Y-%m-%d".

    Returns:
        list[str] | None: one "%Y-%m-%d" string per calendar day, in order
        (empty when end_date is before start_date), or None when either
        argument is missing or is not a valid date string.
    """
    try:
        first_day = dt.datetime.strptime(start_date, "%Y-%m-%d")
        last_day = dt.datetime.strptime(end_date, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN/None; ValueError: text that is not a date.
        # Only these are treated as "no dates"; anything else should surface.
        return None

    # +1 so that the end date itself is part of the result
    day_count = (last_day - first_day).days + 1
    return [
        (first_day + dt.timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(day_count)
    ]


In [7]:
# Work on a copy so the round columns added in the next cell do not end up on `details`
dates_by_round_details = details.copy()

In [8]:
dates_by_round_details = (
    dates_by_round_details
    # one list of dates per tournament, built from its start and end date
    .assign(
        round_dates=lambda frame: frame.apply(
            lambda row: create_array_of_dates(row['start_date'], row['end_date']),
            axis=1,
        )
    )
    # one row per date instead of one list per tournament
    .explode('round_dates')
)

# number the dates within each event: first date -> round 1, second -> round 2, ...
dates_by_round_details['round'] = 1 + dates_by_round_details.groupby('event_id').cumcount()

In [9]:
# Spot-check the exploded frame: each tournament row is repeated once per round date
dates_by_round_details.head()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date,round_dates,round
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-04,1
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-05,2
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-06,3
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,2001-01-07,4
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14,2001-01-11,1


#### Create Key for Joining on Round and Event

In [10]:
# Join key in the form "E<event_id>R<round>", e.g. "E1R1"
dates_by_round_details['Key_Event_Rd'] = (
    "E"
    + dates_by_round_details['event_id'].astype(str)
    + "R"
    + dates_by_round_details['round'].astype(str)
)

#### Clean up Columns Needed

In [11]:
# Only the identifiers and the date are needed downstream; drop the tournament attributes
cols_to_keep = ['event_id', 'round', 'Key_Event_Rd', 'round_dates']
dates_by_round_details = dates_by_round_details[cols_to_keep]

dates_by_round_details.head()


Unnamed: 0,event_id,round,Key_Event_Rd,round_dates
0,1,1,E1R1,2001-01-04
0,1,2,E1R2,2001-01-05
0,1,3,E1R3,2001-01-06
0,1,4,E1R4,2001-01-07
1,2,1,E2R1,2001-01-11


### Melt Scores to Have Event, Round #, and Score

In [12]:
scores_by_round = (
    scores
    # wide -> long: one row per event, player and round instead of one column per round
    .melt(
        id_vars=['event_id', 'PLAYER'],
        value_vars=['R1', 'R2', 'R3', 'R4'],
        var_name='round',
        value_name='round_score',
    )
    # join par to scores by round
    .merge(details[['event_id', 'par']], on='event_id', how='left')
    # 'R1'..'R4' -> 1..4
    .assign(round=lambda frame: frame['round'].str.replace('R', '').astype(int))
)



In [13]:
# Sanity check: the melt emits one row per original score row for each of R1-R4,
# so all four rounds should show the same count
scores_by_round['round'].value_counts()
scores_by_round.head()


round
1    260888
2    260888
3    260888
4    260888
Name: count, dtype: int64

Unnamed: 0,event_id,PLAYER,round,round_score,par
0,2,Jim Furyk,1,69,73.0
1,2,Rory Sabbatini,1,69,73.0
2,2,Ernie Els,1,68,73.0
3,2,Vijay Singh,1,71,73.0
4,2,John Huston,1,74,73.0


In [14]:
def to_par(round_score, par):
    """
    Score of a single round relative to the course par.

    Args:
        round_score: strokes for the round; anything `int()` accepts (number or numeric string).
        par: par of the course; anything `int()` accepts.

    Returns:
        int: round_score minus par (negative means under par), or
        np.nan when either value is missing or cannot be converted to an integer.
    """
    try:
        return int(round_score) - int(par)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text; OverflowError: infinity.
        # Only conversion failures map to NaN; unrelated errors are no longer hidden.
        return np.nan

# Pair each round score with the par of its event and compute the difference row by row
scores_by_round['to_par'] = [
    to_par(strokes, course_par)
    for strokes, course_par in zip(scores_by_round['round_score'], scores_by_round['par'])
]


#### Adding Key

In [15]:

# Join key in the form "E<event_id>R<round>", e.g. "E2R1"
scores_by_round['Key_Event_Rd'] = (
    "E"
    + scores_by_round['event_id'].astype(str)
    + "R"
    + scores_by_round['round'].astype(str)
)

In [16]:
# Confirm the new to_par and Key_Event_Rd columns are present
scores_by_round.head()

Unnamed: 0,event_id,PLAYER,round,round_score,par,to_par,Key_Event_Rd
0,2,Jim Furyk,1,69,73.0,-4.0,E2R1
1,2,Rory Sabbatini,1,69,73.0,-4.0,E2R1
2,2,Ernie Els,1,68,73.0,-5.0,E2R1
3,2,Vijay Singh,1,71,73.0,-2.0,E2R1
4,2,John Huston,1,74,73.0,1.0,E2R1


### Adding Key to Weather Data

In [17]:
# Join key in the form "E<event_id>R<round>", e.g. "E3R1"
weather["Key_Event_Rd"] = (
    "E"
    + weather['event_id'].astype(str)
    + "R"
    + weather['round'].astype(str)
)

### Create Major Tournament Flag on Details

In [18]:
from fuzzywuzzy import fuzz, process

In [19]:
def create_major_flag(name):
    """
    returns 1 if the tournament is a major, 0 otherwise

    The title is lower-cased and fuzzy-matched (score cutoff 90) against the
    names of the four majors. Titles on a manual exclusion list are never
    counted, even when the fuzzy match succeeds.

    Args:
        name (str): tournament title.

    Returns:
        int: 1 for a major, 0 for everything else (including excluded titles
        and missing / non-string titles).
    """

    # a missing title (e.g. NaN) cannot be a major and has no .lower()
    if not isinstance(name, str):
        return 0

    # manual list to not count some tournaments as majors to aid fuzzy matches
    # (compared against the title exactly as given, including case and trailing spaces)
    list_do_not_count = [
    'Senior PGA Championship',
    'Sportsbet Australian Masters',
    'Commercialbank Qatar Masters',
    'BMW PGA Championship',
    'SAS Masters',
    'Omega European Masters',
    'Madrid Masters',
    'Portugal Masters',
    'CASTELLO MASTERS Costa Azahar',
    'JBWere Masters',
    'Commercial Bank Qatar Masters presented by Dolphin',
    'Avantha Masters',
    'BMW PGA CHAMPIONSHIP',
    'Nordea Scandinavian Masters',
    'Andalucia Masters',
    'Commercialbank Qatar Masters presented by Dolphin ',
    'Nordea Scandanavian Masters',
    'Bankia Madrid Masters',
    'Commercialbank Qatar Masters presented by Doplhin ',
    'Nordea Masters',
    'BMW Masters',
    'Commercial Bank Qatar Masters',
    'Wegmans LPGA Championship',
    'D+D REAL Czech Masters ',
    'AfrAsia Bank Mauritius Open',
    'D+D Real Czech Masters',
    'British Masters',
    "KPMG Women's PGA Championship",
    'Australian PGA Championship',
    'British Masters supported by Sky Sports',
    'KitchenAid Senior PGA Championship',
    'British Masters Supported by Sky Sports',
    'Andalucia Valderrama Masters hosted by the Sergio ',
    'DAP Championship',
    'SAS Championship',
    'HSBC New Zealand PGA Championship',
    'Jacob\'s Creek Open Championship',
    'U.S. Senior Open Championship',
    'SA Open Championship',
    'The Senior Open Championship'
    ]

    # excluded titles are decided without running the fuzzy match at all
    if name in list_do_not_count:
        return 0

    major_tournaments = ["Masters", "PGA Championship", "The Open", "The Open Championship", "U.S. Open Championship", "U.S. Open"]
    major_tournaments = [x.lower() for x in major_tournaments]

    # extractOne yields the best candidate, or None when nothing reaches the cutoff
    match = process.extractOne(query= name.lower(),
                   choices = major_tournaments,
                   score_cutoff= 90)

    # every path returns an explicit 0 or 1; previously an excluded title that
    # fuzzy-matched fell through and produced None
    return 1 if match else 0

In [20]:
# Flag every tournament by running its title through create_major_flag
details['is_major'] = details['tournament_title'].map(create_major_flag)


In [21]:

def normalize_major_name(major_flag, name):
    """
    Map the title of a major to one canonical name; leave all other titles untouched.

    Args:
        major_flag: 1 when the tournament was flagged as a major; any other value
            (0, None, NaN) means "not a major".
        name (str): tournament title.

    Returns:
        str: "The Masters", "PGA Championship", "U.S. Open" or "The Open Championship"
        for a flagged title containing the corresponding phrase; otherwise `name` unchanged.
    """
    if major_flag != 1:
        return name

    lowered = name.lower()

    # "u.s. open" is tested before "the open" so a title containing both
    # resolves to the U.S. Open
    if "masters" in lowered:
        return "The Masters"
    if "pga championship" in lowered:
        return "PGA Championship"
    if "u.s. open" in lowered:
        return "U.S. Open"
    if "the open" in lowered:
        return "The Open Championship"

    # flagged as a major but none of the known phrases is present: keep the
    # original title instead of falling through and returning None
    return name

# Rewrite the titles of flagged majors to their canonical names
details['tournament_title'] = [
    normalize_major_name(flag, title)
    for flag, title in zip(details['is_major'], details['tournament_title'])
]

# How many editions of each major are in the data
details.loc[details['is_major'] == 1, 'tournament_title'].value_counts()

tournament_title
The Masters              25
U.S. Open                21
PGA Championship         21
The Open Championship    12
Name: count, dtype: int64

#### Strip whitespace from city names

In [22]:
# Remove leading/trailing whitespace before the cities are grouped and merged on below
details['city'] = details['city'].str.strip()


#### Add lat/long for Viz Software 

In [23]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

def geocode_city(city_name):
    """
    Geocode a city name to obtain its geographical location.
    This function uses the Nominatim geocoding service to convert a city name into its corresponding geographical location (latitude and longitude).
    Args:
        city_name (str): The name of the city to geocode.
    Returns:
        location (Location or None): The result of `Nominatim.geocode` for the city
        (per the geopy documentation, None when nothing is found), or None if the
        geocoding request times out.
    Note:
        A timeout is caught here: a message is printed and None is returned.
        Other geopy errors are not handled and propagate to the caller.
    """
    # Imported locally: the notebook's geopy imports only bring in Nominatim and
    # geodesic, so the exception name would otherwise be undefined (NameError)
    # exactly when a timeout occurs.
    from geopy.exc import GeocoderTimedOut

    geolocator = Nominatim(user_agent="my_geocoder")
    try:
        return geolocator.geocode(city_name, timeout=10)
    except GeocoderTimedOut:
        print(f"Geocoding timed out for city: {city_name}")
        return None

In [24]:
# compressing dataframe to just cities to get 400 rows instead of 2000
city_agg = details[['city', 'event_id']]\
    .groupby('city').count().reset_index()

# Geocode every distinct city exactly once and read both coordinates from the
# same result; calling geocode_city separately for latitude and longitude
# doubles the number of requests sent to the geocoding service.
city_locations = city_agg['city'].apply(geocode_city)

city_agg['latitude'] = city_locations.apply(lambda x: x.latitude if x else None)
city_agg['longitude'] = city_locations.apply(lambda x: x.longitude if x else None)

# Drop coordinates left over from an earlier run of this cell so that
# re-running it does not produce latitude_x / latitude_y columns.
details = details\
    .drop(columns=['latitude', 'longitude'], errors='ignore')\
    .merge(city_agg[['city', 'latitude', 'longitude']], on='city', how='left')

# Preview All Data Tables

In [25]:
# Final look at the four tables before they are written out.
# How they relate:
# tournament detail will have many rounds and dates
# each round will have many players and scores
# each round will have hourly weather details
# The round-level tables carry the Key_Event_Rd column created above.

details.head()
dates_by_round_details.head()
scores_by_round.head()
weather.head()

Unnamed: 0,event_id,tournament_title,event_date,purse,city,Yards,par,golf_course,start_date,end_date,is_major,latitude,longitude
0,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",5000000.0,,,,,2001-01-04,2001-01-07,0.0,,
1,2,Mercedes Championships,"January 11 - 14, 2001",3500000.0,"Kapalua, HI",7411.0,73.0,Kapalua Resort (Plantation Course),2001-01-11,2001-01-14,0.0,20.994904,-156.656234
2,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",3000000.0,"Tucson, AZ",7213.0,72.0,Omni Tucson National Golf Resort and Spa,2001-01-11,2001-01-14,0.0,32.222876,-110.974847
3,4,Sony Open in Hawaii,"January 18 - 21, 2001",4000000.0,"Honolulu, HI",7044.0,70.0,Waialae Country Club,2001-01-18,2001-01-21,0.0,21.304547,-157.855676
4,5,Phoenix Open,"January 25 - 28, 2001",4000000.0,"Scottsdale, AZ",,71.0,TPC Scottsdale (Stadium Course),2001-01-25,2001-01-28,0.0,33.494219,-111.926018


Unnamed: 0,event_id,round,Key_Event_Rd,round_dates
0,1,1,E1R1,2001-01-04
0,1,2,E1R2,2001-01-05
0,1,3,E1R3,2001-01-06
0,1,4,E1R4,2001-01-07
1,2,1,E2R1,2001-01-11


Unnamed: 0,event_id,PLAYER,round,round_score,par,to_par,Key_Event_Rd
0,2,Jim Furyk,1,69,73.0,-4.0,E2R1
1,2,Rory Sabbatini,1,69,73.0,-4.0,E2R1
2,2,Ernie Els,1,68,73.0,-5.0,E2R1
3,2,Vijay Singh,1,71,73.0,-2.0,E2R1
4,2,John Huston,1,74,73.0,1.0,E2R1


Unnamed: 0,Time,Temperature (°F),Dew Point,Humidity,Wind,Wind Speed (mph),Wind Gust (mph),Pressure,Rain (in.),Condition,event_id,round,weather_url,tournament_city,actual_weather_city,distance_between_cities,flag_valid_weather,Hour,Flag Hour In Play,Key_Event_Rd
0,11:55 PM,44.0,38.0,79.0,ESE,7.0,0.0,27.36,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,23,0,E3R1
1,12:55 AM,42.0,39.0,89.0,ESE,7.0,0.0,27.35,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,0,0,E3R1
2,1:55 AM,43.0,38.0,82.0,ESE,5.0,0.0,27.32,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,1,0,E3R1
3,2:55 AM,39.0,37.0,93.0,CALM,0.0,0.0,27.33,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,2,0,E3R1
4,3:55 AM,41.0,37.0,86.0,S,5.0,0.0,27.33,0.0,Fair,3,1,https://www.wunderground.com/history/daily/KTU...,"Tucson, AZ","Tucson, AZ",0.0,Y,3,0,E3R1


# Store to Final Folder

In [26]:

final_data_folder = "../data/final_data"

# Create the output folder when it is missing so the notebook also runs on a
# fresh checkout; to_csv does not create parent directories itself.
os.makedirs(final_data_folder, exist_ok=True)

# write to final folder: output file name -> table
# (`scores` is the original per-player summary, `scores_by_round` its melted form)
final_tables = {
    "tournament_info.csv": details,
    "dates_event_round_dates.csv": dates_by_round_details,
    "player_scores_by_round.csv": scores_by_round,
    "scores_summary_final.csv": scores,
    "weather_data.csv": weather,
}

for file_name, table in final_tables.items():
    table.to_csv(f"{final_data_folder}/{file_name}", index=False)
