# Read Libs

In [1]:
import pandas as pd
import datetime as dt
import pdb
import numpy as np
import configparser
import os
import regex as re

# display and output settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

display(HTML('''
<style>
    .container { width:85% !important; }
</style>
'''))

pd.set_option('display.min_rows', 20)

# Transform Data

## Clean Scores Data

### Reading In Data and Selecting Columns

In [2]:
scores_df_1 = pd.read_csv('../data/raw_data/tournament_scores/batch_1/2024-09-22 15h28m07s/tournament_scores.csv')
scores_df_2 = pd.read_csv('../data/raw_data/tournament_scores/batch_2/2024-09-22 14h03m51s/tournament_scores.csv')
scores_df = pd.concat([scores_df_1, scores_df_2])


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
cols_scores = [
    "event_id",
    "POS",
    "PLAYER",
    "SCORE",
    "R1",
    "R2",
    "R3",
    "R4",
    "TOT",
    "EARNINGS",
    "FEDEX PTS"
]

In [4]:
scores_df = scores_df[cols_scores]

### identifying events that did not have complete score information

In [5]:
# creatiing a dataframe with the number of players per event
scores_df_agg = scores_df.groupby('event_id').count()


* will remove events that had less than 5 players in the scores table

In [6]:
# getting event ids that had more than 5 player threshold 
players_threshold_minimum = 5
keep_eventids = list(scores_df_agg[scores_df_agg['POS']>players_threshold_minimum]\
                     .reset_index()['event_id'])

In [7]:
scores_df = scores_df[scores_df['event_id'].isin(keep_eventids)]

## Clean Tournament Details

In [8]:
details_df_1 = pd.read_csv('../data/raw_data/tournament_info/batch_1/2024-09-15 15h13m10s/tournament_info.csv')
details_df_2 = pd.read_csv('../data/raw_data/tournament_info/batch_2/2024-09-22 16h43m40s/tournament_info.csv')

#filtering batch 1 details 
details_df_1 = details_df_1[details_df_1['event_id'] <= 3756]

#filtering batch 2 details to be event id's greater than batch 1's last id
details_df_2 = details_df_2[details_df_2['event_id'] > 3756]

details_df = pd.concat([details_df_1, details_df_2])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# filtering out null or no detail titles
details_df = details_df[~details_df['tournament_title'].isnull()]
details_df = details_df[details_df['tournament_title'] != "No details found"]

### Adding Fields to Details

In [10]:
# adding city to details 
details_df['city'] = details_df['golf_course'].apply(lambda x: x.split("-")[-1])

In [11]:

def clean_purse(row_value):
    """
    Cleans the purse value from a given row.

    This function takes a row value, removes any commas, and extracts the first 
    dollar amount found. If no dollar amount is found, it returns NaN.

    Parameters:
    row_value (str): The value from the row which may contain a dollar amount.

    Returns:
    str or float: The cleaned dollar amount as a string without the dollar sign, 
                  or NaN if no dollar amount is found.
    """

    row_value_clean = str(row_value).replace(",", "")

    matches = re.findall("\$\d+", row_value_clean)

    if matches:
        return matches[0].replace("$", "") 
    else:
        return np.nan


# details_df['purse'].apply(lambda x: re.findall("\$\d+", str(x).replace(",", "")))
details_df['purse'] = details_df['purse'].apply(clean_purse)


In [12]:

def clean_course_yards(row_value):

    matches = re.findall("Yards.*", row_value)

    if matches:
        return int(matches[0].replace("Yards", "") )
    else:
        return np.nan
    

def clean_course_par(row_value):

    matches = re.findall("Par\d{2}", row_value)

    if matches:
        return int(matches[0].replace("Par", "") )
    else:
        return np.nan


In [13]:
# applying functions to create par and yards fields
details_df['Yards'] = details_df['course_details'].apply(clean_course_yards)
details_df['Par'] = details_df['course_details'].apply(clean_course_par)


In [14]:
details_df

Unnamed: 0,event_id,tournament_title,event_date,golf_course,course_details,purse,city,Yards,Par
1,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",No details found,No details found,5000000,No details found,,
2,2,Mercedes Championships,"January 11 - 14, 2001","Kapalua Resort (Plantation Course) - Kapalua, HI",Par73Yards7411,3500000,"Kapalua, HI",7411.0,73.0
3,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",Omni Tucson National Golf Resort and Spa - Tuc...,Par72Yards7213,3000000,"Tucson, AZ",7213.0,72.0
4,4,Sony Open in Hawaii,"January 18 - 21, 2001","Waialae Country Club - Honolulu, HI",Par70Yards7044,4000000,"Honolulu, HI",7044.0,70.0
5,5,Phoenix Open,"January 25 - 28, 2001","TPC Scottsdale (Stadium Course) - Scottsdale, AZ",No details found,4000000,"Scottsdale, AZ",,
6,6,AT&T Pebble Beach National Pro-Am,"February 1 - 4, 2001",No details found,Par72Yards6822,4000000,No details found,6822.0,72.0
7,7,Buick Invitational,"February 8 - 11, 2001",No details found,No details found,3500000,No details found,,
8,8,Bob Hope Chrysler Classic,"February 14 - 18, 2001",No details found,Par71Yards6997,3500000,No details found,6997.0,71.0
9,9,Nissan Open,"February 22 - 25, 2001","Riviera Country Club - Pacific Palisades, CA",No details found,3400000,"Pacific Palisades, CA",,
10,10,Genuity Championship,"March 1 - 4, 2001","Trump National Doral Golf Course - Doral, FL",No details found,4500000,"Doral, FL",,


In [127]:
# # logic to build out selecitng start and end dates of events
# # has to handle 2 scenarios , same month tournaments or tournaments overalaping months

# ### need to create dates start and finish for event date
# test_date = "January 4 - 7, 2001"
# test_date2 = "Mar 29 - April 1, 2001"
# event_date = test_date2

# pattern = r"\w+ \d+ - \w+ \d+, \d{4}"

# if len(re.findall(pattern, event_date)) == 0:
#     parts = test_date.split(",")
#     year = parts[-1].strip()
#     month = parts[0].split(" ")[0]

#     start_day = re.findall("\d", parts[0])[0]
#     end_day = re.findall("\d", parts[0])[1]

#     start_date = f"{month} {start_day}, {year}"
#     end_date = f"{month} {end_day}, {year}"

#     print(start_date, end_date)

# elif len(re.findall(pattern, event_date)) > 0:
#     parts = event_date.split(",")
#     year = parts[-1].strip()

#     start_month = parts[0].split(" - ")[0].split(' ')[0]
#     start_day = parts[0].split(" - ")[0].split(' ')[1]

#     end_month = parts[0].split(" - ")[1].split(' ')[0]
#     end_day = parts[0].split(" - ")[1].split(' ')[1]

#     start_date = f"{start_month} {start_day}, {year}"
#     end_date = f"{end_month} {end_day}, {year}"
    
#     print(start_date, end_date)

# else:
#     print('event_date not valid')

# # Clean up global variables used for testing
# # del test_date, test_date2, event_date, pattern, parts, year, month, start_day, end_day, start_date, end_date, start_month, end_month

Mar 29, 2001 April 1, 2001


In [117]:
def extract_start_end_dates(event_date, is_start):

    """
    Extracts the start or end date from a given event date string.
    The function handles two formats of event date strings:
    1. "Month Day - Month Day, Year" (e.g., "January 1 - January 3, 2023")
    2. "Month Day, Year" (e.g., "January 1-4, 2023")
    Args:
        event_date (str): The event date string to extract the start or end date from.
        is_start (bool): If True, returns the start date; if False, returns the end date.
    Returns:
        str: The extracted start or end date in the format "Month Day, Year".
    Raises:
        ValueError: If the event_date format is not recognized or parsing fails.
    """

    # pattern to determine which format the event date string is in
    pattern = r"\w+ \d+ - \w+ \d+, \d{4}"

    # if the event date string does not overlap months, then extract start and end dates
    if len(re.findall(pattern, event_date)) == 0:
        try:
            #split the passed date on comma to get the year and month/day sections
            parts = event_date.split(",")
            year = parts[-1].strip()
            month = parts[0].split(" ")[0]

            #extract the start and end days from the month/day section using regex by finding the digits
            start_day = re.findall("\d", parts[0])[0]
            end_day = re.findall("\d", parts[0])[1]

            start_date = f"{month} {start_day}, {year}"
            end_date = f"{month} {end_day}, {year}"
            
            #return the start or end date based on the is_start flag
            if is_start:
                return start_date
            else:
                return end_date
        except:
            print(f"this date failed {event_date}")
    
    # if the event date string overlaps months, then extract start and end dates        
    elif len(re.findall(pattern, event_date)) > 0:
        try:
            parts = event_date.split(",")
            year = parts[-1].strip()
            
            # extract the start and end months and days from the month/day section
            start_month = parts[0].split(" - ")[0].split(' ')[0]
            start_day = parts[0].split(" - ")[0].split(' ')[1]

            end_month = parts[0].split(" - ")[1].split(' ')[0]
            end_day = parts[0].split(" - ")[1].split(' ')[1]

            start_date = f"{start_month} {start_day}, {year}"
            end_date = f"{end_month} {end_day}, {year}"
            
            if is_start:
                return start_date
            else:
                return end_date
        except:
            print(f"this date failed {event_date}")
        



In [None]:
details_df['start_date'] = details_df['event_date'].apply(lambda x: extract_start_end_dates(x, True))
details_df['end_date'] = details_df['event_date'].apply(lambda x: extract_start_end_dates(x, False))

In [122]:
details_df['event_date'].apply(lambda x: extract_start_end_dates(x, False))

this date failed April 3, 1999


1         January 7, 2001
2         January 1, 2001
3         January 1, 2001
4         January 8, 2001
5         January 5, 2001
6        February 4, 2001
7        February 1, 2001
8        February 4, 2001
9        February 2, 2001
10          March 4, 2001
              ...        
4760         June 7, 2024
4761         July 7, 2024
4762         July 1, 2024
4763         July 8, 2024
4764         July 8, 2024
4765         July 5, 2024
4766       August 1, 2024
4767       August 5, 2024
4768       August 2, 2024
4769    September 1, 2024
Name: event_date, Length: 2201, dtype: object