# Read Libs

In [15]:
import pandas as pd
import datetime as dt
import pdb
import numpy as np
import configparser
import os
import regex as re

# display and output settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

display(HTML('''
<style>
    .container { width:85% !important; }
</style>
'''))

pd.set_option('display.min_rows', 20)

# Transform Data

## Clean Scores Data

### Reading In Data and Selecting Columns

In [16]:
scores_df_1 = pd.read_csv('../data/raw_data/tournament_scores/batch_1/2024-09-22 15h28m07s/tournament_scores.csv')
scores_df_2 = pd.read_csv('../data/raw_data/tournament_scores/batch_2/2024-09-22 14h03m51s/tournament_scores.csv')
scores_df = pd.concat([scores_df_1, scores_df_2])


  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
cols_scores = [
    "event_id",
    "POS",
    "PLAYER",
    "SCORE",
    "R1",
    "R2",
    "R3",
    "R4",
    "TOT",
    "EARNINGS",
    "FEDEX PTS"
]

In [18]:
scores_df = scores_df[cols_scores]

### identifying events that did not have complete score information

In [19]:
# creatiing a dataframe with the number of players per event
scores_df_agg = scores_df.groupby('event_id').count()


* will remove events that had less than 5 players in the scores table

In [20]:
# getting event ids that had more than 5 player threshold 
players_threshold_minimum = 5
keep_eventids = list(scores_df_agg[scores_df_agg['POS']>players_threshold_minimum]\
                     .reset_index()['event_id'])

In [21]:
scores_df = scores_df[scores_df['event_id'].isin(keep_eventids)]

## Clean Tournament Details

In [22]:
details_df_1 = pd.read_csv('../data/raw_data/tournament_info/batch_1/2024-09-15 15h13m10s/tournament_info.csv')
details_df_2 = pd.read_csv('../data/raw_data/tournament_info/batch_2/2024-09-22 16h43m40s/tournament_info.csv')


#filtering batch 1 details 
details_df_1 = details_df_1[details_df_1['event_id'] <= 3756]

#filtering batch 2 details to be event id's greater than batch 1's last id
details_df_2 = details_df_2[details_df_2['event_id'] > 3756]

details_df = pd.concat([details_df_1, details_df_2])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [23]:
# filtering out null or no detail titles
details_df = details_df[~details_df['tournament_title'].isnull()]
details_df = details_df[details_df['tournament_title'] != "No details found"]

### Adding Fields to Details

In [24]:
# adding city to details 
details_df['city'] = details_df['golf_course'].apply(lambda x: x.split("-")[-1])

In [25]:

def clean_purse(row_value):
    """
    Cleans the purse value from a given row.

    This function takes a row value, removes any commas, and extracts the first 
    dollar amount found. If no dollar amount is found, it returns NaN.

    Parameters:
    row_value (str): The value from the row which may contain a dollar amount.

    Returns:
    str or float: The cleaned dollar amount as a string without the dollar sign, 
                  or NaN if no dollar amount is found.
    """

    row_value_clean = str(row_value).replace(",", "")

    matches = re.findall("\$\d+", row_value_clean)

    if matches:
        return matches[0].replace("$", "") 
    else:
        return np.nan


# details_df['purse'].apply(lambda x: re.findall("\$\d+", str(x).replace(",", "")))
details_df['purse'] = details_df['purse'].apply(clean_purse)


In [26]:

def clean_course_yards(row_value):

    matches = re.findall("Yards.*", row_value)

    if matches:
        return int(matches[0].replace("Yards", "") )
    else:
        return np.nan
    

def clean_course_par(row_value):

    matches = re.findall("Par\d{2}", row_value)

    if matches:
        return int(matches[0].replace("Par", "") )
    else:
        return np.nan


In [27]:
# applying functions to create par and yards fields
details_df['Yards'] = details_df['course_details'].apply(clean_course_yards)
details_df['Par'] = details_df['course_details'].apply(clean_course_par)


### need to create dates start and finish for event date

In [28]:
details_df
def extract_start_end_dates(event_date):
    """
    Extracts the start and end dates from the event_date column.

    Parameters:
    event_date (str): The event date string in the format 'Month Day - Day, Year'.

    Returns:
    tuple: A tuple containing the start date and end date as datetime objects.
    """
    try:
        # Split the event_date string
        parts = event_date.split(',')
        year = parts[1].strip()
        days = parts[0].split('-')
        
        # Extract start and end dates
        start_date_str = f"{days[0].strip()} {year}"
        end_date_str = f"{days[1].strip()} {year}"
        
        # Convert to datetime objects
        start_date = dt.datetime.strptime(start_date_str, '%B %d %Y')
        end_date = dt.datetime.strptime(end_date_str, '%B %d %Y')
        
        return start_date, end_date
    except Exception as e:
        print(f"Error processing date: {event_date}, Error: {e}")
        return np.nan, np.nan

details_df[['start_date', 'end_date']] = details_df['event_date'].apply(lambda x: pd.Series(extract_start_end_dates(x)))

Unnamed: 0,event_id,tournament_title,event_date,golf_course,course_details,purse,city,Yards,Par
1,1,WGC-Accenture Match Play Championship,"January 4 - 7, 2001",No details found,No details found,5000000,No details found,,
2,2,Mercedes Championships,"January 11 - 14, 2001","Kapalua Resort (Plantation Course) - Kapalua, HI",Par73Yards7411,3500000,"Kapalua, HI",7411.0,73.0
3,3,Touchstone Energy Tucson Open,"January 11 - 14, 2001",Omni Tucson National Golf Resort and Spa - Tuc...,Par72Yards7213,3000000,"Tucson, AZ",7213.0,72.0
4,4,Sony Open in Hawaii,"January 18 - 21, 2001","Waialae Country Club - Honolulu, HI",Par70Yards7044,4000000,"Honolulu, HI",7044.0,70.0
5,5,Phoenix Open,"January 25 - 28, 2001","TPC Scottsdale (Stadium Course) - Scottsdale, AZ",No details found,4000000,"Scottsdale, AZ",,
6,6,AT&T Pebble Beach National Pro-Am,"February 1 - 4, 2001",No details found,Par72Yards6822,4000000,No details found,6822.0,72.0
7,7,Buick Invitational,"February 8 - 11, 2001",No details found,No details found,3500000,No details found,,
8,8,Bob Hope Chrysler Classic,"February 14 - 18, 2001",No details found,Par71Yards6997,3500000,No details found,6997.0,71.0
9,9,Nissan Open,"February 22 - 25, 2001","Riviera Country Club - Pacific Palisades, CA",No details found,3400000,"Pacific Palisades, CA",,
10,10,Genuity Championship,"March 1 - 4, 2001","Trump National Doral Golf Course - Doral, FL",No details found,4500000,"Doral, FL",,


Error processing date: January 4 - 7, 2001, Error: time data '7 2001' does not match format '%B %d %Y'
Error processing date: January 11 - 14, 2001, Error: time data '14 2001' does not match format '%B %d %Y'
Error processing date: January 11 - 14, 2001, Error: time data '14 2001' does not match format '%B %d %Y'
Error processing date: January 18 - 21, 2001, Error: time data '21 2001' does not match format '%B %d %Y'
Error processing date: January 25 - 28, 2001, Error: time data '28 2001' does not match format '%B %d %Y'
Error processing date: February 1 - 4, 2001, Error: time data '4 2001' does not match format '%B %d %Y'
Error processing date: February 8 - 11, 2001, Error: time data '11 2001' does not match format '%B %d %Y'
Error processing date: February 14 - 18, 2001, Error: time data '18 2001' does not match format '%B %d %Y'
Error processing date: February 22 - 25, 2001, Error: time data '25 2001' does not match format '%B %d %Y'
Error processing date: March 1 - 4, 2001, Error: 

TypeError: The DTypes <class 'numpy.dtype[datetime64]'> and <class 'numpy.dtype[float64]'> do not have a common DType. For example they cannot be stored in a single array unless the dtype is `object`.