## PGA Draftkings Notebook
Use [PGA Website](https://www.pgatour.com/tournaments/schedule.html) to look up tournament info and fill out first USER INPUT block below. (Fetch/XHR from Network tab, Paylod sub-tab)

Looks like the tournament ID is also in the address bar.

### User Input

In [1]:
# === USER INPUTS ===
# Old Tournament
old_tournament_name = "Hero World Challenge"
tournament_date = "12/7/2025"  # Ending date of tournament
old_course = "Albany GC"
tournament_id = "R2025478"  # Tournament ID from the PGA Tour API - Also visible in the URL of the tournament page

# New Tournament
new_tournament_name = "Sony Open in Hawaii"
new_ending_date = "1/15/2026"
new_course = "Waialae Country Club"
new_season = 2026


# === LIBRARIES AND VARIABLES ===
# Import necessary libraries
import requests
import pandas as pd
from datetime import datetime
import sqlite3 as sql
import numpy as np
from numpy import nan
import os
import importlib
import utils.db_utils
from utils.db_utils import TOURNAMENT_NAME_MAP, PLAYER_NAME_MAP

# === TOURNAMENT CONFIG ===
tournament_config = {
    "old": {
        "name": old_tournament_name,
        "date": tournament_date,
        "course": old_course,
        "id": tournament_id
    },
    "new": {
        "name": new_tournament_name,
        "course": new_course,
        "season": new_season,
        "ending_date": pd.to_datetime(new_ending_date, format="%m/%d/%Y"),
        "quoted_course": f'"{new_course}"',
        "quoted_name": f"'{new_tournament_name}'"
    }
}

this_week_key = str(tournament_config["new"]["ending_date"].date())

# Create a minimal one-row history DataFrame
this_week_history = pd.DataFrame([{
    "SEASON": tournament_config["new"]["season"],
    "TOURNAMENT": tournament_config["new"]["name"],
    "ENDING_DATE": tournament_config["new"]["ending_date"],
    "COURSE": tournament_config["new"]["course"],
    "TOURN_ID": None
}])

### Update Database

#### Old Tournament

In [2]:
importlib.reload(utils.db_utils)  # Only needed if you're actively editing db_utils.py
from utils.db_utils import update_tournament_results

# Change these each year!!
season = 2025
year = 20250  # Unique GraphQL year distinguishing number in case of multiple per year

# Run the update
db_path = "data/golf.db"  # Or use os.path.join("data", "golf.db")
tournDf = update_tournament_results(tournament_config, db_path, season, year)

# Show just the most recent tournament added for confirmation
from sqlalchemy import create_engine

engine = create_engine(f"sqlite:///{db_path}")

query = f"""
SELECT *
FROM tournaments
WHERE TOURN_ID = '{tournament_config['old']['id']}'
  AND ENDING_DATE = '{datetime.strptime(tournament_config['old']['date'], '%m/%d/%Y').date()}'
"""

recent = pd.read_sql(query, engine)
engine.dispose()
recent.sort_values(by='FINAL_POS').head()

üì¶ Fetching results for tournament ID R2025478 (Hero World Challenge), year: 20250
‚ÑπÔ∏è Tournament 'Hero World Challenge' already exists ‚Äî no new data inserted.


Unnamed: 0,SEASON,ENDING_DATE,TOURN_ID,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,ROUNDS:1,ROUNDS:2,ROUNDS:3,ROUNDS:4,OFFICIAL_MONEY,FEDEX_CUP_POINTS
0,2025,2025-12-07,R2025478,Hero World Challenge,Albany GC,Hideki Matsuyama,1,1,-4,-6,-4,-8,"$1,000,000.00",0.0
1,2025,2025-12-07,R2025478,Hero World Challenge,Albany GC,Alex Noren,2,2,-3,-6,-5,-8,"$450,000.00",0.0
2,2025,2025-12-07,R2025478,Hero World Challenge,Albany GC,Sepp Straka,3,3,-6,-3,-8,-4,"$300,000.00",0.0
3,2025,2025-12-07,R2025478,Hero World Challenge,Albany GC,J.J. Spaun,T4,4,-6,-4,-3,-7,"$237,500.00",0.0
4,2025,2025-12-07,R2025478,Hero World Challenge,Albany GC,Scottie Scheffler,T4,4,-6,-3,-7,-4,"$237,500.00",0.0


#### Stats

In [None]:
importlib.reload(utils.db_utils)
from utils.db_utils import update_season_stats  # <- This line is essential

# Change these each year!! 
# NOT WORKING FOR 2026 YET WHICH IS BREAKING THE WAY THIS WORKBOOK WORKS... MAY NEED TO WAIT UNTIL STATS ARE LOADED FOR 2026
statsYear = 2025

stats_df = update_season_stats(statsYear, db_path)
stats_df.head()

‚úÖ Overwrote stats for season 2025 with 998 rows.


Unnamed: 0,PLAYER,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,SEASON
0,A.J. Ewart,,,,,,,,,,,,,,,,,,,,,,,,,,,,,495,0.2794,2025
1,Aaron Baddeley,,,,,,,,,,,,,,,,,,,,,,,,,,,,,752,0.1525,2025
2,Aaron Cockerill,,,,,,,,,,,,,,,,,,,,,,,,,,,,,414,0.3446,2025
3,Aaron Rai,12.0,0.942,22.0,0.391,21.0,0.476,68.0,0.077,132.0,-0.134,104.0,21.99%,179.0,3.16,16.0,3.97,28.0,4.54,74.0,173.0,171.0,289.6,2.0,73.85%,18.0,70.52%,101.0,59.16%,23,2.9153,2025
4,Abraham Ancer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,600,0.2022,2025


#### Odds
Not usually needed for weekly routine.

**Manual Fix! Odds name cleanup (only needed when joins fail)**

Make sure to update the dictionaries in db_utils.py if new names need to be added.

In [17]:
importlib.reload(utils.db_utils)
from utils.db_utils import clean_odds_names, PLAYER_NAME_MAP, TOURNAMENT_NAME_MAP

db_path = "data/golf.db" 
updated_odds = clean_odds_names(db_path, TOURNAMENT_NAME_MAP, PLAYER_NAME_MAP)
updated_odds.head()

‚úÖ Cleaned and updated 516 rows in 'odds' table.


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS,TOURNAMENT_ORIG,PLAYER_ORIG
8275,2024,World Wide Technology Championship,2024-11-10,Max Greyserman,16/1,16.0,World Wide Tech Champ,Max Greyserman
8276,2024,World Wide Technology Championship,2024-11-10,Doug Ghim,18/1,18.0,World Wide Tech Champ,Doug Ghim
8277,2024,World Wide Technology Championship,2024-11-10,J.J. Spaun,25/1,25.0,World Wide Tech Champ,J.J. Spaun
8278,2024,World Wide Technology Championship,2024-11-10,Beau Hossler,25/1,25.0,World Wide Tech Champ,Beau Hossler
8279,2024,World Wide Technology Championship,2024-11-10,Ben Griffin,25/1,25.0,World Wide Tech Champ,Ben Griffin


**Historical Odds Updates**

Only run this when loading in entire year odds at the start of each year or if corrections need to be made.  This will load in the entire year into the database and update it using the dictionary in db_utils.py.

In [15]:
importlib.reload(utils.db_utils)
from utils.db_utils import import_historical_odds

oddsYear = "2022-2023"    # URL segment
season = 2023             # PGA Tour season
db_path = "data/golf.db"

odds_df = import_historical_odds(oddsYear, season, db_path)
odds_df.head()

‚úÖ Inserting 120 new rows into odds table...


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS
4,2023,Fortinet Championship,2022-09-18,Hideki Matsuyama,18/1,18.0
5,2023,Fortinet Championship,2022-09-18,Max Homa,18/1,18.0
6,2023,Fortinet Championship,2022-09-18,Corey Conners,20/1,20.0
7,2023,Fortinet Championship,2022-09-18,Maverick McNealy,25/1,25.0
8,2023,Fortinet Championship,2022-09-18,Taylor Pendrith,25/1,25.0


**Not normally needed**

The code below is a way to troubleshoot the odds function that is in db_utils.py.  If it does not pull the stats correctly, we can run it outside of the db_utils.py file and explore what might be wrong with it, iterating until it works, and then using that fix to update db_utils.py.  This is not normally needed, but wanted to leave this in place as a reference because this odds website can be tricky.

In [16]:
import pandas as pd
import numpy as np
import requests
import re
from datetime import datetime
from io import StringIO

# === USER INPUT ===
oddsYear = "2020-2021"    # URL segment
season = 2021        # PGA Tour season

url = f"http://golfodds.com/archives-{oddsYear}.html"
response = requests.get(url)
tables = pd.read_html(StringIO(response.text))
# raw_df = tables[5]  # the actual table of interest
# Find the largest 2-column table that contains at least some odds-like strings
raw_df = None
for tbl in tables:
    if tbl.shape[1] == 2 and tbl.shape[0] > 50:  # Rough filter
        sample = tbl.iloc[:, 1].astype(str).str.contains(r"\d+/\d+").sum()
        if sample > 5:
            raw_df = tbl
            break

if raw_df is None:
    raise ValueError("‚ùå Could not find valid odds table on the page.")

# === STEP 1: Initial clean-up ===
df = raw_df.dropna(how="all").reset_index(drop=True)
df.columns = ["PLAYER", "ODDS"]

# üîß Clean up non-breaking spaces and extra whitespace
df["PLAYER"] = (
    df["PLAYER"]
    .astype(str)
    .str.replace("\xa0", " ", regex=False)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

df.insert(loc=0, column="SEASON", value=season)
df.insert(loc=1, column="TOURNAMENT", value=np.nan)
df.insert(loc=2, column="ENDING_DATE", value=np.nan)

# === STEP 2: Helper function for parsing date strings ===
def parse_ending_date(text):
    import re
    from datetime import datetime

    # Normalize whitespace and symbols
    text = (
        text.replace("\u2013", "-")
            .replace("‚Äì", "-")
            .replace("\xa0", " ")
    )
    text = re.sub(r"\bSept(?!ember)\b", "Sep", text)

    # ‚úÖ Fix typo: "Match" ‚Üí "March" only when used in a date context
    text = re.sub(r"\bMatch(?=\s+\d{1,2}\s*[-‚Äì]\s*\d{1,2},\s*\d{4})", "March", text)

    # Pattern 1: "July 30 - August 2, 2015" or "Oct 29 - Nov 1, 2015"
    match = re.search(r"(\w+)\s\d+\s*-\s*(\w+)\s(\d+),\s(\d{4})", text)
    if match:
        month2, day2, year = match.group(2), match.group(3), match.group(4)
        for fmt in ["%B %d, %Y", "%b %d, %Y"]:
            try:
                return datetime.strptime(f"{month2} {day2}, {year}", fmt).date()
            except ValueError:
                continue

    # Pattern 2: "November 21-24, 2024"
    match = re.search(r"(\w+)\s\d+-\d+,\s(\d{4})", text)
    if match:
        month, year = match.group(1), match.group(2)
        day = re.search(r"(\d+)-(\d+)", text).group(2)
        for fmt in ["%B %d, %Y", "%b %d, %Y"]:
            try:
                return datetime.strptime(f"{month} {day}, {year}", fmt).date()
            except ValueError:
                continue

    # Pattern 3: "Sunday, October 20, 2019"
    try:
        return datetime.strptime(text.strip(), "%A, %B %d, %Y").date()
    except ValueError:
        pass

    # Pattern 4: "October 20, 2019"
    try:
        return datetime.strptime(text.strip(), "%B %d, %Y").date()
    except ValueError:
        pass

    return None

last_tourn_name = None
last_date = None

# === STEP 3: Iterate block by block ===
final_rows = []
i = 0
last_tourn_name = None
last_end_date = None

while i < len(df) - 4:
    player_i = str(df.loc[i, "PLAYER"])
    player_i2 = str(df.loc[i + 2, "PLAYER"])
    player_i3 = str(df.loc[i + 3, "PLAYER"]).lower()

    # Detect start of a new tournament block
    is_header = (
        pd.isna(df.loc[i, "ODDS"]) and
        pd.isna(df.loc[i + 1, "ODDS"]) and (
            re.search(r"\w+\s\d+\s*[-‚Äì]\s*(\w+\s)?\d+,\s\d{4}", player_i2) or
            re.search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s+\w+\s\d{1,2},\s\d{4}", player_i2)
        )
    )

    if is_header:
        tourn_name = player_i.strip()
        end_date = parse_ending_date(player_i2)

        # Skip cancelled or empty blocks
        if "cancelled" in player_i3:
            print(f"‚ö†Ô∏è Skipping cancelled tournament: {tourn_name} ‚Äî {end_date}")
            i += 4
            continue

        # Avoid duplicate block processing
        if tourn_name == last_tourn_name and end_date == last_end_date:
            i += 1
            continue

        print(f"üìç Detected: {tourn_name} ‚Äî Ending: {end_date}")
        last_tourn_name = tourn_name
        last_end_date = end_date
        i += 4  # Skip header lines

        # Collect all player rows until next header block
        while i < len(df) - 2:
            next_i2 = str(df.loc[i + 2, "PLAYER"])
            is_next_header = (
                pd.isna(df.loc[i, "ODDS"]) and
                pd.isna(df.loc[i + 1, "ODDS"]) and (
                    re.search(r"\w+\s\d+\s*[-‚Äì]\s*(\w+\s)?\d+,\s\d{4}", next_i2) or
                    re.search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s+\w+\s\d{1,2},\s\d{4}", next_i2)
                )
            )
            if is_next_header:
                break

            if pd.notna(df.loc[i, "ODDS"]):
                row = df.loc[i].copy()
                row["TOURNAMENT"] = tourn_name
                row["ENDING_DATE"] = end_date
                final_rows.append(row)
            i += 1
    else:
        i += 1

# === STEP 4: Create cleaned DataFrame ===
clean_df = pd.DataFrame(final_rows)

# ‚úÖ Prevent crash if nothing was parsed
if clean_df.empty or "PLAYER" not in clean_df.columns:
    print(f"‚ö†Ô∏è No valid tournament blocks detected for season {season} ({oddsYear})")
    final_df = pd.DataFrame()  # Safe fallback
else:
    # Remove winner tag
    clean_df["PLAYER"] = clean_df["PLAYER"].str.replace(r"\s\*Winner\*", "", regex=True)

    # Clean odds to numeric
    clean_df["VEGAS_ODDS"] = (
        clean_df["ODDS"]
        .str.replace(",", "")
        .str.extract(r"(\d+)/(\d+)")
        .astype(float)
        .apply(lambda x: x[0] / x[1], axis=1)
    )

    # Final output with source index for debugging
    final_df = clean_df[
        ["SEASON", "TOURNAMENT", "ENDING_DATE", "PLAYER", "ODDS", "VEGAS_ODDS"]
    ].reset_index(drop=True)

    # Drop non-standard team events (e.g., Presidents Cup, Ryder Cup)
    drop_terms = ["Presidents Cup", "Ryder Cup"]
    final_df = final_df[~final_df["TOURNAMENT"].str.contains("|".join(drop_terms), case=False, na=False)]

    display(final_df.head())



üìç Detected: Safeway Open ‚Äî Ending: 2020-09-13
üìç Detected: US Open ‚Äî Ending: 2020-09-20
üìç Detected: R & C Championship ‚Äî Ending: 2020-09-27
üìç Detected: at Big Cedar Lodge - ‚Äî Ending: 2020-09-22
üìç Detected: Sanderson Farms Champ ‚Äî Ending: 2020-10-04
üìç Detected: Shriners H for C Open ‚Äî Ending: 2020-10-11
üìç Detected: The CJ Cup ‚Äî Ending: 2020-10-18
üìç Detected: ZOZO CHAMPIONSHIP ‚Äî Ending: 2020-10-25
üìç Detected: Bermuda Championship ‚Äî Ending: 2020-11-01
üìç Detected: Vivint Houston Open ‚Äî Ending: 2020-11-08
üìç Detected: The Masters ‚Äî Ending: 2020-11-15
üìç Detected: The RSM Classic ‚Äî Ending: 2020-11-22
üìç Detected: Champions for Change ‚Äî Ending: 2020-11-27
üìç Detected: Mayakoba Golf Classic ‚Äî Ending: 2020-12-06
üìç Detected: QBE Shootout ‚Äî Ending: 2020-12-13
üìç Detected: Sentry Tourn of Champions ‚Äî Ending: 2021-01-10
üìç Detected: Sony Open in Hawaii ‚Äî Ending: 2021-01-17
üìç Detected: The American Express ‚Äî Ending: 2

Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS
0,2021,Safeway Open,2020-09-13,Phil Mickelson,20/1,20.0
1,2021,Safeway Open,2020-09-13,Si Woo Kim,20/1,20.0
2,2021,Safeway Open,2020-09-13,Brendan Steele,20/1,20.0
3,2021,Safeway Open,2020-09-13,Shane Lowry,25/1,25.0
4,2021,Safeway Open,2020-09-13,Sergio Garcia,30/1,30.0


In [17]:
from datetime import datetime, date
# ‚úÖ Check for non-date types in ENDING_DATE
non_dates = final_df[~final_df["ENDING_DATE"].apply(lambda x: isinstance(x, date))]

print(f"üß™ Rows with invalid ENDING_DATE values: {len(non_dates)}")
display(non_dates.head(10))



üß™ Rows with invalid ENDING_DATE values: 0


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS


In [18]:
dupes = final_df.duplicated(subset=["SEASON", "TOURNAMENT", "ENDING_DATE", "PLAYER"], keep=False)

print(f"üö® Duplicate primary keys in final_df: {dupes.sum()}")
display(final_df[dupes].sort_values(by=["SEASON", "TOURNAMENT", "PLAYER"]))

üö® Duplicate primary keys in final_df: 0


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS


#### Misc. Cleaning
Not normally needed

In [26]:
importlib.reload(utils.db_utils)
from utils.db_utils import clean_player_names_in_table, PLAYER_NAME_MAP

db_path = "data/golf.db"

# Run for all relevant tables
clean_player_names_in_table(db_path, "tournaments", PLAYER_NAME_MAP)
clean_player_names_in_table(db_path, "stats", PLAYER_NAME_MAP)
clean_player_names_in_table(db_path, "odds", PLAYER_NAME_MAP)

‚ÑπÔ∏è No player names needed updates in 'tournaments'.
‚ÑπÔ∏è No player names needed updates in 'stats'.
‚ÑπÔ∏è No player names needed updates in 'odds'.


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS,PLAYER_ORIG


## Historical Data

### Pull Relevant Seasons
Do a check to see when this course or tournament have been historically played.

In [3]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_combined_history_seasons

# === USER INPUT ===
seasons = list(range(2016, 2025))  # Adjust as needed
db_path = "data/golf.db"

# Pull course and tournament from config
n_course = tournament_config["new"]["course"]
n_tourn = tournament_config["new"]["name"]

# Fetch relevant history
history_df = get_combined_history_seasons(db_path, course=n_course, tournament=n_tourn, allowed_seasons=seasons)
history_df.head(20)


‚ÑπÔ∏è Found 9 relevant tournaments from course or tournament name.


Unnamed: 0,SEASON,COURSE,TOURN_ID,TOURNAMENT,ENDING_DATE
720,2016,Waialae Country Club,006,Sony Open in Hawaii,2016-01-17
576,2017,Waialae Country Club,006,Sony Open in Hawaii,2017-01-15
432,2018,Waialae Country Club,006,Sony Open in Hawaii,2018-01-14
288,2019,Waialae Country Club,006,Sony Open in Hawaii,2019-01-13
144,2020,Waialae Country Club,006,Sony Open in Hawaii,2020-01-12
0,2021,Waialae Country Club,006,Sony Open in Hawaii,2021-01-17
1008,2022,Waialae Country Club,006,Sony Open in Hawaii,2022-01-16
1152,2023,Waialae Country Club,006,Sony Open in Hawaii,2023-01-15
1296,2024,Waialae Country Club,R2024006,Sony Open in Hawaii,2024-01-14


### Cut Percentage and FedEx Points
Use a rolling-window approach to look at the most recent cut percentage and how many FedEx cup points have been accumulated recently. This will intentionally not match the PGA Tour stats that start over every year, but will have the same amount of data all the time.  We also add a new feature called Form Density which divides the FedEx Cup Points by the Total Events.

In [4]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_cut_and_fedex_history

cuts = get_cut_and_fedex_history("data/golf.db", history_df, window_months=9)
# cuts["2024-05-12"].head(20)

for end_date, df in cuts.items():
    print(f"\nüìÜ {end_date} ‚Äî {df['TOURNAMENT'].iloc[0]} ({len(df)} players)")
    display(df.head(3))


üìÜ 2016-01-17 ‚Äî Sony Open in Hawaii (538 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,13,6,138.0,46.2,10.62,0,2016-01-17,Sony Open in Hawaii
1,Aaron Wise,1,0,0.0,0.0,0.0,0,2016-01-17,Sony Open in Hawaii
2,Abraham Ancer,5,0,0.0,0.0,0.0,0,2016-01-17,Sony Open in Hawaii



üìÜ 2017-01-15 ‚Äî Sony Open in Hawaii (524 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,12,8,249.5,66.7,20.79,1,2017-01-15,Sony Open in Hawaii
1,Aaron Wise,6,3,0.0,50.0,0.0,2,2017-01-15,Sony Open in Hawaii
2,Abraham Ancer,7,5,96.5,71.4,13.79,1,2017-01-15,Sony Open in Hawaii



üìÜ 2018-01-14 ‚Äî Sony Open in Hawaii (534 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,A.J. McInerney,1,1,0.0,100.0,0.0,1,2018-01-14,Sony Open in Hawaii
1,Aaron Baddeley,12,7,237.78,58.3,19.82,0,2018-01-14,Sony Open in Hawaii
2,Aaron Rai,1,0,0.0,0.0,0.0,0,2018-01-14,Sony Open in Hawaii



üìÜ 2019-01-13 ‚Äî Sony Open in Hawaii (563 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,14,7,230.09,50.0,16.43,1,2019-01-13,Sony Open in Hawaii
1,Aaron Wise,13,7,1185.65,53.8,91.2,4,2019-01-13,Sony Open in Hawaii
2,Abraham Ancer,13,10,353.47,76.9,27.19,6,2019-01-13,Sony Open in Hawaii



üìÜ 2020-01-12 ‚Äî Sony Open in Hawaii (594 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,10,5,112.37,50.0,11.24,0,2020-01-12,Sony Open in Hawaii
1,Aaron Rai,1,1,0.0,100.0,0.0,1,2020-01-12,Sony Open in Hawaii
2,Aaron Wise,15,10,275.82,66.7,18.39,0,2020-01-12,Sony Open in Hawaii



üìÜ 2021-01-17 ‚Äî Sony Open in Hawaii (424 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,10,2,42.67,20.0,4.27,0,2021-01-17,Sony Open in Hawaii
1,Aaron Crawford,1,0,0.0,0.0,0.0,0,2021-01-17,Sony Open in Hawaii
2,Aaron Terrazas,1,0,0.0,0.0,0.0,0,2021-01-17,Sony Open in Hawaii



üìÜ 2022-01-16 ‚Äî Sony Open in Hawaii (547 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,8,3,35.09,37.5,4.39,1,2022-01-16,Sony Open in Hawaii
1,Aaron Pike,1,0,0.0,0.0,0.0,0,2022-01-16,Sony Open in Hawaii
2,Aaron Rai,10,6,142.47,60.0,14.25,4,2022-01-16,Sony Open in Hawaii



üìÜ 2023-01-15 ‚Äî Sony Open in Hawaii (588 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,A.J. Ewart,1,0,0.0,0.0,0.0,0,2023-01-15,Sony Open in Hawaii
1,Aaron Baddeley,9,5,140.98,55.6,15.66,1,2023-01-15,Sony Open in Hawaii
2,Aaron Cockerill,1,1,0.0,100.0,0.0,1,2023-01-15,Sony Open in Hawaii



üìÜ 2024-01-14 ‚Äî Sony Open in Hawaii (564 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,12,7,232.896,58.3,19.41,2,2024-01-14,Sony Open in Hawaii
1,Aaron Cockerill,1,0,0.0,0.0,0.0,0,2024-01-14,Sony Open in Hawaii
2,Aaron Rai,14,8,443.014,57.1,31.64,3,2024-01-14,Sony Open in Hawaii


### Recent Form

In [5]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_recent_avg_finish

recent_form = get_recent_avg_finish("data/golf.db", history_df, window_months=9)

# Example preview
for date, df in recent_form.items(): 
    print(f"\nüìÜ {date} ‚Äî {df['TOURNAMENT'].iloc[0]} ({len(df)} players)")
    display(df.head(3))


üìÜ 2016-01-17 ‚Äî Sony Open in Hawaii (538 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Jordan Niebrugge,1,6.0,8.66,2016-01-17,Sony Open in Hawaii
1,Rory McIlroy,6,10.8,5.55,2016-01-17,Sony Open in Hawaii
2,Anthony Wall,1,12.0,17.31,2016-01-17,Sony Open in Hawaii



üìÜ 2017-01-15 ‚Äî Sony Open in Hawaii (524 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Pat Perez,3,3.7,2.67,2017-01-15,Sony Open in Hawaii
1,Jared du Toit,1,9.0,12.98,2017-01-15,Sony Open in Hawaii
2,Matthew Southgate,1,12.0,17.31,2017-01-15,Sony Open in Hawaii



üìÜ 2018-01-14 ‚Äî Sony Open in Hawaii (534 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Matthew Southgate,1,6.0,8.66,2018-01-14,Sony Open in Hawaii
1,Jordan L Smith,1,9.0,12.98,2018-01-14,Sony Open in Hawaii
2,A.J. McInerney,1,10.0,14.43,2018-01-14,Sony Open in Hawaii



üìÜ 2019-01-13 ‚Äî Sony Open in Hawaii (563 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Justin Rose,7,9.0,4.33,2019-01-13,Sony Open in Hawaii
1,Chase Seiffert,1,9.0,12.98,2019-01-13,Sony Open in Hawaii
2,Armando Favela,1,16.0,23.08,2019-01-13,Sony Open in Hawaii



üìÜ 2020-01-12 ‚Äî Sony Open in Hawaii (594 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Robert MacIntyre,1,6.0,8.66,2020-01-12,Sony Open in Hawaii
1,Aaron Rai,1,12.0,17.31,2020-01-12,Sony Open in Hawaii
2,Patrick Cantlay,12,12.8,4.99,2020-01-12,Sony Open in Hawaii



üìÜ 2021-01-17 ‚Äî Sony Open in Hawaii (424 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Austin Eckroat,1,12.0,17.31,2021-01-17,Sony Open in Hawaii
1,Xander Schauffele,12,15.6,6.08,2021-01-17,Sony Open in Hawaii
2,Justin Thomas,13,17.7,6.71,2021-01-17,Sony Open in Hawaii



üìÜ 2022-01-16 ‚Äî Sony Open in Hawaii (547 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Collin Morikawa,10,13.5,5.63,2022-01-16,Sony Open in Hawaii
1,Marcel Siem,1,15.0,21.64,2022-01-16,Sony Open in Hawaii
2,Jordan Spieth,9,16.2,7.04,2022-01-16,Sony Open in Hawaii



üìÜ 2023-01-15 ‚Äî Sony Open in Hawaii (588 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Ryo Hisatsune,1,12.0,17.31,2023-01-15,Sony Open in Hawaii
1,Tom Kim,4,12.8,7.95,2023-01-15,Sony Open in Hawaii
2,Jon Rahm,10,13.1,5.46,2023-01-15,Sony Open in Hawaii



üìÜ 2024-01-14 ‚Äî Sony Open in Hawaii (564 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Ben Kohles,1,5.0,7.21,2024-01-14,Sony Open in Hawaii
1,Ryo Hisatsune,1,6.0,8.66,2024-01-14,Sony Open in Hawaii
2,Scottie Scheffler,12,7.1,2.77,2024-01-14,Sony Open in Hawaii


### Course History

In [6]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_course_history

# Filter history_df for only the course we're targeting
target_course = tournament_config["new"]["course"]
course_df = history_df[history_df["COURSE"] == target_course]
course_hist = get_course_history("data/golf.db", course_df)

# View example
for date, df in course_hist.items():
    if not df.empty:
        print(f"\nüèåÔ∏è‚Äç‚ôÇÔ∏è Course history for {df['TOURNAMENT'].iloc[0]} on {date}")
        display(df.head(3))


üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2016-01-17


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Adam Hadwin,1,90.0,129.84,2016-01-17,Waialae Country Club,Sony Open in Hawaii
1,Alex Cejka,1,90.0,129.84,2016-01-17,Waialae Country Club,Sony Open in Hawaii
2,Alex Prugh,1,90.0,129.84,2016-01-17,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2017-01-15


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Adam Hadwin,2,78.5,71.45,2017-01-15,Waialae Country Club,Sony Open in Hawaii
1,Adam Scott,1,56.0,80.79,2017-01-15,Waialae Country Club,Sony Open in Hawaii
2,Alex Cejka,2,90.0,81.92,2017-01-15,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2018-01-14


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Adam Hadwin,2,78.5,71.45,2018-01-14,Waialae Country Club,Sony Open in Hawaii
1,Adam Scott,1,56.0,80.79,2018-01-14,Waialae Country Club,Sony Open in Hawaii
2,Alex Cejka,2,90.0,81.92,2018-01-14,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2019-01-13


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,1,90.0,129.84,2019-01-13,Waialae Country Club,Sony Open in Hawaii
1,Aaron Wise,1,90.0,129.84,2019-01-13,Waialae Country Club,Sony Open in Hawaii
2,Abraham Ancer,1,90.0,129.84,2019-01-13,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2020-01-12


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,1,90.0,129.84,2020-01-12,Waialae Country Club,Sony Open in Hawaii
1,Aaron Wise,1,90.0,129.84,2020-01-12,Waialae Country Club,Sony Open in Hawaii
2,Abraham Ancer,2,59.5,54.16,2020-01-12,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2021-01-17


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,1,90.0,129.84,2021-01-17,Waialae Country Club,Sony Open in Hawaii
1,Aaron Wise,2,90.0,81.92,2021-01-17,Waialae Country Club,Sony Open in Hawaii
2,Abraham Ancer,3,52.3,37.73,2021-01-17,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2022-01-16


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,2,65.5,59.62,2022-01-16,Waialae Country Club,Sony Open in Hawaii
1,Aaron Wise,2,90.0,81.92,2022-01-16,Waialae Country Club,Sony Open in Hawaii
2,Abraham Ancer,4,61.8,38.4,2022-01-16,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2023-01-15


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,2,65.5,59.62,2023-01-15,Waialae Country Club,Sony Open in Hawaii
1,Aaron Rai,1,90.0,129.84,2023-01-15,Waialae Country Club,Sony Open in Hawaii
2,Aaron Wise,2,90.0,81.92,2023-01-15,Waialae Country Club,Sony Open in Hawaii



üèåÔ∏è‚Äç‚ôÇÔ∏è Course history for Sony Open in Hawaii on 2024-01-14


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,3,46.0,33.18,2024-01-14,Waialae Country Club,Sony Open in Hawaii
1,Aaron Rai,2,75.5,68.72,2024-01-14,Waialae Country Club,Sony Open in Hawaii
2,Aaron Wise,2,90.0,81.92,2024-01-14,Waialae Country Club,Sony Open in Hawaii


In [7]:
#  DEBUGGING: Check for empty DataFrames or missing columns
for date_key, df in course_hist.items():
    if df.empty:
        print(f"‚ÑπÔ∏è Empty course_hist for date {date_key}")
    elif "PLAYER" not in df.columns:
        print(f"‚ùå Missing 'PLAYER' column in course_hist[{date_key}]. Columns present: {df.columns.tolist()}")
        display(df.head())

## Training Dataset

In [8]:
pd.set_option("display.max_columns", None)   # Show all columns
importlib.reload(utils.db_utils)
from utils.db_utils import build_training_rows
training_df = build_training_rows(
    db_path,
    history_df,
    cuts,
    recent_form,
    course_hist,
)
training_df.head(10)
# training_df.info()
# training_df.isna().sum().sort_values(ascending=False)

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Fabian Gomez,1,1,136.0,-0.249,102.0,0.009,157.0,-0.289,88.0,0.03,49.0,0.213,110.0,19.69,50.0,3.04,172.0,4.1,68.0,4.66,81.0,178.0,105.0,289.0,73.0,61.44,141.0,63.71,132.0,56.59,74.0,1.95,80.0,58.3,387.17,32.26,5.0,55.5,21.64,71.0,102.43,1
1,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Brandt Snedeker,2,2,44.0,0.573,116.0,-0.059,78.0,0.164,18.0,0.32,25.0,0.395,32.0,21.41,78.0,3.06,2.0,3.98,100.0,4.69,99.0,188.0,75.0,292.6,113.0,58.69,90.0,65.67,16.0,62.73,20.0,4.08,20.0,73.3,866.03,57.74,1.0,39.1,14.1,,,1
2,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Zac Blair,3,3,165.0,-0.565,119.0,-0.068,173.0,-0.498,102.0,0.001,56.0,0.195,169.0,18.01,78.0,3.06,114.0,4.06,169.0,4.76,91.0,185.0,180.0,275.3,5.0,69.86,169.0,62.34,16.0,62.73,243.0,0.73,,57.1,252.56,18.04,2.0,63.5,23.45,6.0,8.66,1
3,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Si Woo Kim,4,4,42.0,0.605,53.0,0.257,140.0,-0.168,9.0,0.403,118.0,-0.102,46.0,21.0,78.0,3.06,34.0,4.02,17.0,4.59,18.0,126.0,69.0,293.5,57.0,62.84,112.0,64.83,20.0,62.32,52.0,2.56,,60.0,142.0,28.4,2.0,48.0,26.79,,,1
4,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Kevin Kisner,T5,5,127.0,-0.124,38.0,0.348,137.0,-0.142,173.0,-0.331,3.0,0.672,36.0,21.34,78.0,3.06,34.0,4.02,80.0,4.67,30.0,138.0,105.0,289.0,33.0,64.92,104.0,65.19,129.0,56.88,36.0,3.08,20.0,85.7,1643.0,117.36,4.0,28.2,10.41,84.0,121.19,1
5,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Greg Owen,T5,5,51.0,0.534,64.0,0.177,25.0,0.475,170.0,-0.279,185.0,-0.987,117.0,19.55,102.0,3.07,114.0,4.06,169.0,4.76,40.0,146.0,39.0,297.7,107.0,59.22,8.0,69.75,181.0,51.7,266.0,0.67,,44.4,93.36,10.37,0.0,70.3,30.53,,,1
6,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Graham DeLaet,T7,7,67.0,0.347,26.0,0.444,8.0,0.662,183.0,-0.567,145.0,-0.227,27.0,21.55,146.0,3.1,34.0,4.02,138.0,4.72,21.0,132.0,37.0,297.9,95.0,60.11,28.0,68.35,179.0,52.39,160.0,1.06,60.0,72.7,308.23,28.02,1.0,51.3,20.64,90.0,129.84,1
7,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Jamie Lovemark,T7,7,65.0,0.373,9.0,0.609,170.0,-0.459,21.0,0.294,72.0,0.131,27.0,21.55,168.0,3.12,34.0,4.02,28.0,4.61,40.0,146.0,11.0,304.8,135.0,57.32,99.0,65.32,36.0,61.44,94.0,1.51,,66.7,141.23,23.54,1.0,44.7,22.97,,,1
8,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Jason Dufner,T9,9,22.0,0.84,48.0,0.29,26.0,0.469,70.0,0.081,165.0,-0.353,32.0,21.41,102.0,3.07,14.0,4.01,50.0,4.64,20.0,128.0,70.0,293.2,58.0,62.83,6.0,70.15,139.0,56.39,60.0,2.28,60.0,83.3,355.99,29.67,2.0,44.7,17.43,,,1
9,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Zach Johnson,T9,9,81.0,0.214,147.0,-0.269,79.0,0.163,17.0,0.321,25.0,0.395,64.0,20.54,50.0,3.04,48.0,4.03,60.0,4.65,121.0,199.0,167.0,280.1,32.0,65.23,122.0,64.53,24.0,62.11,29.0,3.5,20.0,84.6,1396.12,107.39,3.0,33.9,12.85,64.0,92.33,1


In [10]:
# Play with different years to see if I notice anything wrong (check Odds for tournament name mismatches)
training_df[training_df["SEASON"] == 2024].head()

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
1152,2024,2024-01-14,Sony Open in Hawaii,Waialae Country Club,Grayson Murray,1,1,62.0,0.344,57.0,0.199,85.0,0.087,77.0,0.058,108.0,-0.038,39.0,23.31%,166.0,3.13,140.0,4.05,35.0,4.53,6.0,99.0,51.0,303.5,48.0,63.72%,162.0,62.43%,135.0,57.20%,,,400.0,33.3,85.914,14.32,0.0,70.5,36.23,90.0,129.84,1
1153,2024,2024-01-14,Sony Open in Hawaii,Waialae Country Club,Keegan Bradley,T2,2,46.0,0.482,62.0,0.174,39.0,0.301,100.0,0.007,127.0,-0.141,86.0,22.36%,142.0,3.08,107.0,4.02,77.0,4.57,15.0,116.0,56.0,305.5,60.0,62.55%,89.0,67.09%,144.0,56.41%,12.0,4.0174,50.0,83.3,787.039,65.59,5.0,37.1,14.46,51.7,24.86,1
1154,2024,2024-01-14,Sony Open in Hawaii,Waialae Country Club,Byeong Hun An,T2,2,34.0,0.579,21.0,0.404,73.0,0.181,106.0,-0.006,113.0,-0.079,25.0,24.07%,38.0,3.01,107.0,4.02,53.0,4.55,75.0,170.0,4.0,317.1,166.0,53.33%,72.0,67.63%,114.0,58.35%,36.0,2.6376,35.0,78.6,1023.731,73.12,7.0,37.5,13.85,12.0,17.31,1
1155,2024,2024-01-14,Sony Open in Hawaii,Waialae Country Club,Carl Yuan,T4,4,133.0,-0.179,55.0,0.207,151.0,-0.354,113.0,-0.032,173.0,-0.704,142.0,21.00%,142.0,3.08,162.0,4.07,147.0,4.65,32.0,135.0,19.0,312.1,116.0,58.53%,167.0,62.60%,103.0,58.89%,211.0,0.679,300.0,64.3,351.567,25.11,2.0,57.7,21.31,21.0,30.3,1
1156,2024,2024-01-14,Sony Open in Hawaii,Waialae Country Club,Russell Henley,T4,4,26.0,0.646,103.0,0.014,34.0,0.343,20.0,0.289,40.0,0.304,114.0,21.69%,21.0,3.0,26.0,3.98,129.0,4.63,70.0,167.0,160.0,291.3,7.0,69.49%,136.0,64.99%,13.0,65.00%,15.0,3.7122,20.0,83.3,1131.365,94.28,5.0,30.1,11.74,43.4,20.87,1


### Normalization
***Fix Historical Odds***

Many names do not have historical odds but rather are part of the "field." So giving these "NaN" values the average odds (as I would for missing stats) isn't a good approximation of reality ‚Äì they are typically the "field" because they are not notable and have poor odds individually.

There are also some crazy odds numbers occasionally (1000/1 or 3000/1) that don't happen all that much and are making that tail too long.

This cell assigns odds of 1000/1 for anyone missing and clips anything larger to 1000/1 to normalize the data better and assign the missing odds more appropriately where they belong.

***OWGR Adjustment***

The NaN's for OWGR are similarly bad players.  So we should assign these the worst ranking instead of the mean and clip it at 1000 just in case there are ever outliers.

For the OWGR score, it should match the lowest score in the dataset to assign that person the same as the worst.

***Recent Form Adjustment***

The NaN's for Recent Form mean that this player has not played any tournaments in the lookback period (9 months).  Similar to Odds and OWGR, I want to punish those that don't play often in my model.  Set these to 90 (i.e. like they miss a lot of cuts - because if they aren't playing they aren't good enough to make it to these tournaments very frequently).  The adj_form feature will also need to be updated here with the new data.

***FedEx Cup Point Adjustment***

The NaN's for FedEx Cup Points means there is no data.  Therefore they should be 0.

In [11]:
# === VEGAS ODDS ===
training_df["VEGAS_ODDS"] = training_df["VEGAS_ODDS"].fillna(1000).clip(upper=1000)

# === OWGR and OWGR_RANK ===
owgr_min = training_df["OWGR"].min(skipna=True)
training_df["OWGR"] = training_df["OWGR"].fillna(owgr_min)
training_df["OWGR_RANK"] = training_df["OWGR_RANK"].fillna(1000).astype(float).clip(upper=1000)

# === RECENT FORM and adj_form ===
training_df["RECENT_FORM"] = training_df["RECENT_FORM"].fillna(90)
if "TOTAL_EVENTS_PLAYED" in training_df.columns:
    training_df["adj_form"] = (
        training_df["RECENT_FORM"] / np.log1p(training_df["TOTAL_EVENTS_PLAYED"])
    ).round(2)

# === FEDEX CUP POINTS ===
training_df["FEDEX_CUP_POINTS"] = training_df["FEDEX_CUP_POINTS"].fillna(0)

training_df.head(5)


Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Fabian Gomez,1,1,136.0,-0.249,102.0,0.009,157.0,-0.289,88.0,0.03,49.0,0.213,110.0,19.69,50.0,3.04,172.0,4.1,68.0,4.66,81.0,178.0,105.0,289.0,73.0,61.44,141.0,63.71,132.0,56.59,74.0,1.95,80.0,58.3,387.17,32.26,5.0,55.5,21.64,71.0,102.43,1
1,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Brandt Snedeker,2,2,44.0,0.573,116.0,-0.059,78.0,0.164,18.0,0.32,25.0,0.395,32.0,21.41,78.0,3.06,2.0,3.98,100.0,4.69,99.0,188.0,75.0,292.6,113.0,58.69,90.0,65.67,16.0,62.73,20.0,4.08,20.0,73.3,866.03,57.74,1.0,39.1,14.1,,,1
2,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Zac Blair,3,3,165.0,-0.565,119.0,-0.068,173.0,-0.498,102.0,0.001,56.0,0.195,169.0,18.01,78.0,3.06,114.0,4.06,169.0,4.76,91.0,185.0,180.0,275.3,5.0,69.86,169.0,62.34,16.0,62.73,243.0,0.73,1000.0,57.1,252.56,18.04,2.0,63.5,23.45,6.0,8.66,1
3,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Si Woo Kim,4,4,42.0,0.605,53.0,0.257,140.0,-0.168,9.0,0.403,118.0,-0.102,46.0,21.0,78.0,3.06,34.0,4.02,17.0,4.59,18.0,126.0,69.0,293.5,57.0,62.84,112.0,64.83,20.0,62.32,52.0,2.56,1000.0,60.0,142.0,28.4,2.0,48.0,26.79,,,1
4,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Kevin Kisner,T5,5,127.0,-0.124,38.0,0.348,137.0,-0.142,173.0,-0.331,3.0,0.672,36.0,21.34,78.0,3.06,34.0,4.02,80.0,4.67,30.0,138.0,105.0,289.0,33.0,64.92,104.0,65.19,129.0,56.88,36.0,3.08,20.0,85.7,1643.0,117.36,4.0,28.2,10.41,84.0,121.19,1


#### Average the NaNs
At this point, the intentional offsetting of NaNs for poor players is over.  I don't want to penalize statistics or course history because you never know where they might land on the spectrum, so for these NaN values, we will take the mean.

In [12]:
# === Clean percentage stats stored as strings like '62.5%' ===
percent_stats = ["SCRAMBLING", "DRIVING_ACCURACY", "BIRDIES", "GIR"]

for col in percent_stats:
    if col in training_df.columns:
        training_df[col] = (
            training_df[col]
            .astype(str)
            .str.replace('%', '', regex=False)
            .replace(['None', 'nan', 'NaN', '--', 'DNP', ''], np.nan)
            .astype(float)
        )

# Select only the numeric columns
numeric_columns = training_df.select_dtypes(include=['float64', 'int64'])

# Fill NaN values with the mean of each column
numeric_columns = numeric_columns.fillna(numeric_columns.mean())

# Update the original dataframe with the filled numeric columns
training_df.update(numeric_columns)

training_df.head()

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Fabian Gomez,1,1,136.0,-0.249,102.0,0.009,157.0,-0.289,88.0,0.03,49.0,0.213,110.0,19.69,50.0,3.04,172.0,4.1,68.0,4.66,81.0,178.0,105.0,289.0,73.0,61.44,141.0,63.71,132.0,56.59,74.0,1.95,80.0,58.3,387.17,32.26,5.0,55.5,21.64,71.0,102.43,1
1,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Brandt Snedeker,2,2,44.0,0.573,116.0,-0.059,78.0,0.164,18.0,0.32,25.0,0.395,32.0,21.41,78.0,3.06,2.0,3.98,100.0,4.69,99.0,188.0,75.0,292.6,113.0,58.69,90.0,65.67,16.0,62.73,20.0,4.08,20.0,73.3,866.03,57.74,1.0,39.1,14.1,56.233299,55.19796,1
2,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Zac Blair,3,3,165.0,-0.565,119.0,-0.068,173.0,-0.498,102.0,0.001,56.0,0.195,169.0,18.01,78.0,3.06,114.0,4.06,169.0,4.76,91.0,185.0,180.0,275.3,5.0,69.86,169.0,62.34,16.0,62.73,243.0,0.73,1000.0,57.1,252.56,18.04,2.0,63.5,23.45,6.0,8.66,1
3,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Si Woo Kim,4,4,42.0,0.605,53.0,0.257,140.0,-0.168,9.0,0.403,118.0,-0.102,46.0,21.0,78.0,3.06,34.0,4.02,17.0,4.59,18.0,126.0,69.0,293.5,57.0,62.84,112.0,64.83,20.0,62.32,52.0,2.56,1000.0,60.0,142.0,28.4,2.0,48.0,26.79,56.233299,55.19796,1
4,2016,2016-01-17,Sony Open in Hawaii,Waialae Country Club,Kevin Kisner,T5,5,127.0,-0.124,38.0,0.348,137.0,-0.142,173.0,-0.331,3.0,0.672,36.0,21.34,78.0,3.06,34.0,4.02,80.0,4.67,30.0,138.0,105.0,289.0,33.0,64.92,104.0,65.19,129.0,56.88,36.0,3.08,20.0,85.7,1643.0,117.36,4.0,28.2,10.41,84.0,121.19,1


In [13]:
# Check for NaN values
assert training_df.isna().sum().sum() == 0, "üö® Still missing values!"

In [14]:
# Inspect which columns have NaN values
training_df.isna().sum().sort_values(ascending=False).head(10)


SEASON         0
ENDING_DATE    0
TOURNAMENT     0
COURSE         0
PLAYER         0
POS            0
FINAL_POS      0
SGTTG_RANK     0
SGTTG          0
SGOTT_RANK     0
dtype: int64

### Check Features for Outliers and General Health (Histograms)
Now that all the data is cleaned, let's look at the distribution graphs.

In [15]:
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Select numeric columns with >1 unique value, excluding label
df_numeric = training_df.select_dtypes(include=np.number)
df_numeric = df_numeric.loc[:, df_numeric.nunique() > 1]
df_numeric = df_numeric.drop(columns=["TOP_20"], errors="ignore")

# Create subplots
n_cols = 3
n_rows = int(np.ceil(len(df_numeric.columns) / n_cols))
fig = sp.make_subplots(rows=n_rows, cols=n_cols, subplot_titles=df_numeric.columns)

# Add one histogram per feature with smart binning
for i, column in enumerate(df_numeric.columns):
    row = i // n_cols + 1
    col = i % n_cols + 1

    # Compute dynamic bin count
    col_range = df_numeric[column].max() - df_numeric[column].min()
    n_unique = df_numeric[column].nunique()
    nbins = min(50, max(5, int(n_unique / 2))) if col_range < 10 else 50  # tighter for short-range

    fig.add_trace(
        go.Histogram(
            x=df_numeric[column],
            nbinsx=nbins,
            histnorm="density",
            marker=dict(color="#636EFA"),
            opacity=0.8
        ),
        row=row,
        col=col
    )

# Style the figure
fig.update_layout(
    height=300 * n_rows,
    width=1000,
    title_text="Feature Distributions (with Adaptive Binning)",
    template="plotly_dark",
    showlegend=False
)

fig.show()

# Current Week Data

## Import 2026 Field
**Important!**  Save curren't week's `DKSalaries.csv` into Data folder before this step.

In [16]:
importlib.reload(utils.db_utils)
from utils.db_utils import DK_PLAYER_NAME_MAP
from utils.db_utils import standardize_player_names

# Load DraftKings player list
dk = pd.read_csv("data/DKSalaries.csv", usecols=["Name", "Salary"])

# Normalize DK names to PGA naming convention
dk["Name"] = dk["Name"].replace(DK_PLAYER_NAME_MAP)

# Rename for consistency
dk = dk.rename(columns={"Name": "PLAYER", "Salary": "SALARY"})

dk = standardize_player_names(dk)  # ‚úÖ Normalize to match DB

dk.head(10)

Unnamed: 0,PLAYER,SALARY
0,Russell Henley,10300
1,Ben Griffin,10100
2,Collin Morikawa,9800
3,Hideki Matsuyama,9700
4,Robert MacIntyre,9600
5,J.J. Spaun,9500
6,Si Woo Kim,9400
7,Keegan Bradley,9300
8,Maverick McNealy,9200
9,Harry Hall,9100


## Vegas Odds
Confirm the [Vegas Odds](http://golfodds.com/weekly-odds.html) page is updated with current tournament.

Note: sometimes this site can give errors and not load correctly.  Just have to do it again later.
Another trick to try is change the url try to run it and then put the url back the way it was.  Worked once, maybe random coincidence.

In [17]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_current_week_odds

odds_current = get_current_week_odds(season=2025, tournament_name=new_tournament_name)
odds_current.head(10)


Unnamed: 0,SEASON,TOURNAMENT,PLAYER,ODDS,VEGAS_ODDS
1,2025,Sony Open in Hawaii,Russell Henley,11/1,11.0
2,2025,Sony Open in Hawaii,Ben Griffin,18/1,18.0
3,2025,Sony Open in Hawaii,J.J. Spaun,18/1,18.0
4,2025,Sony Open in Hawaii,Hideki Matsuyama,18/1,18.0
5,2025,Sony Open in Hawaii,Collin Morikawa,20/1,20.0
6,2025,Sony Open in Hawaii,Si Woo Kim,20/1,20.0
7,2025,Sony Open in Hawaii,Robert MacIntyre,20/1,20.0
8,2025,Sony Open in Hawaii,Keegan Bradley,20/1,20.0
9,2025,Sony Open in Hawaii,Corey Conners,25/1,25.0
10,2025,Sony Open in Hawaii,Maverick McNealy,30/1,30.0


## Cut Percentage and FedEx Points

In [18]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_cut_and_fedex_history

cuts_rolling = get_cut_and_fedex_history(db_path, this_week_history, window_months=9)
cuts_current = cuts_rolling[this_week_key].copy()
cuts_current.head(10)

Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,A.J. Ewart,1,0,0.0,0.0,0.0,0,2026-01-15,Sony Open in Hawaii
1,Aaron Baddeley,6,1,2.75,16.7,0.46,1,2026-01-15,Sony Open in Hawaii
2,Aaron Cockerill,1,0,0.0,0.0,0.0,0,2026-01-15,Sony Open in Hawaii
3,Aaron Rai,12,10,476.855,83.3,39.74,7,2026-01-15,Sony Open in Hawaii
4,Aaron Wise,6,1,6.333,16.7,1.06,0,2026-01-15,Sony Open in Hawaii
5,Adam Hadwin,17,10,151.225,58.8,8.9,3,2026-01-15,Sony Open in Hawaii
6,Adam Schenk,15,8,673.425,53.3,44.9,0,2026-01-15,Sony Open in Hawaii
7,Adam Scott,11,10,328.938,90.9,29.9,3,2026-01-15,Sony Open in Hawaii
8,Adam Svensson,14,5,114.15,35.7,8.15,0,2026-01-15,Sony Open in Hawaii
9,Adrian Otaegui,1,0,0.0,0.0,0.0,0,2026-01-15,Sony Open in Hawaii


## Recent Form

In [19]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_recent_avg_finish

recent_form = get_recent_avg_finish(db_path, this_week_history, window_months=9)
recent_form_current = recent_form[this_week_key].copy()

recent_form_current.head(10)

Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Scottie Scheffler,13,3.5,1.33,2026-01-15,Sony Open in Hawaii
1,Jason Scrivener,1,8.0,11.54,2026-01-15,Sony Open in Hawaii
2,Doc Redman,3,15.7,11.33,2026-01-15,Sony Open in Hawaii
3,Jon Rahm,3,16.3,11.76,2026-01-15,Sony Open in Hawaii
4,Robin Williams,1,17.0,24.53,2026-01-15,Sony Open in Hawaii
5,Andy Sullivan,1,17.0,24.53,2026-01-15,Sony Open in Hawaii
6,Harry Hall,11,17.5,7.04,2026-01-15,Sony Open in Hawaii
7,Kazuki Higa,1,18.0,25.97,2026-01-15,Sony Open in Hawaii
8,Matt Fitzpatrick,12,19.0,7.41,2026-01-15,Sony Open in Hawaii
9,Ren Yonezawa,1,20.0,28.85,2026-01-15,Sony Open in Hawaii


## Course History

In [20]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_course_history

course_hist = get_course_history(db_path, this_week_history, lookback_years=7)
course_hist_current = course_hist[this_week_key].copy()

course_hist_current.head(10)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,3,41.3,29.79,2026-01-15,Waialae Country Club,Sony Open in Hawaii
1,Aaron Rai,3,69.3,49.99,2026-01-15,Waialae Country Club,Sony Open in Hawaii
2,Aaron Wise,1,90.0,129.84,2026-01-15,Waialae Country Club,Sony Open in Hawaii
3,Abraham Ancer,3,72.7,52.44,2026-01-15,Waialae Country Club,Sony Open in Hawaii
4,Adam Hadwin,2,74.5,67.81,2026-01-15,Waialae Country Club,Sony Open in Hawaii
5,Adam Long,3,76.0,54.82,2026-01-15,Waialae Country Club,Sony Open in Hawaii
6,Adam Schenk,5,68.6,38.29,2026-01-15,Waialae Country Club,Sony Open in Hawaii
7,Adam Scott,2,31.0,28.22,2026-01-15,Waialae Country Club,Sony Open in Hawaii
8,Adam Svensson,4,27.0,16.78,2026-01-15,Waialae Country Club,Sony Open in Hawaii
9,Adrien Dumont de Chassart,1,90.0,129.84,2026-01-15,Waialae Country Club,Sony Open in Hawaii


## Merged Dataframe

In [23]:
importlib.reload(utils.db_utils)
from utils.db_utils import build_test_rows

this_week = build_test_rows(
    db_path=db_path,
    stats_df=stats_df,
    odds_df=odds_current,
    cuts_df=cuts_current,
    recent_form_df=recent_form_current,
    course_hist_df=course_hist_current,
    dk_df=dk,
    season=tournament_config["new"]["season"]
)

this_week.head()

Unnamed: 0,PLAYER,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,SEASON,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,SALARY
0,A.J. Ewart,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6000
1,Aaron Rai,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8200
2,Adam Schenk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6400
3,Adam Scott,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8400
4,Adam Svensson,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6700


### Dataframe Normalization

Run the same normalization techniques that I ran on the historical data:
- Any NaN Odds go to 1000/1 and clip all values at 1000/1
- OWGR NaN values go to 1000
- Recent form NaN values act like a MC at 90
- Average everything else.

In [24]:
# === 1. Clean percentage stats stored as strings like '62.5%' ===
percent_stats = ["SCRAMBLING", "DRIVING_ACCURACY", "BIRDIES", "GIR"]

for col in percent_stats:
    if col in this_week.columns:
        this_week[col] = (
            this_week[col]
            .astype(str)
            .str.replace('%', '', regex=False)
            .replace(['None', 'nan', 'NaN', '--', 'DNP', ''], np.nan)
        )
        this_week[col] = pd.to_numeric(this_week[col], errors="coerce")

# === 2. Normalize & assign fallback values for key fields ===
this_week["VEGAS_ODDS"] = pd.to_numeric(this_week["VEGAS_ODDS"], errors="coerce").fillna(1000).clip(upper=1000)

if "OWGR" in this_week.columns:
    this_week["OWGR"] = pd.to_numeric(this_week["OWGR"], errors="coerce")
    max_owgr = this_week["OWGR"].dropna().max()
    this_week["OWGR"] = this_week["OWGR"].fillna(max_owgr).clip(upper=1000)

if "OWGR_RANK" in this_week.columns:
    this_week["OWGR_RANK"] = pd.to_numeric(this_week["OWGR_RANK"], errors="coerce").fillna(1000).clip(upper=1000)

this_week["RECENT_FORM"] = pd.to_numeric(this_week["RECENT_FORM"], errors="coerce").fillna(90)
this_week["FEDEX_CUP_POINTS"] = pd.to_numeric(this_week["FEDEX_CUP_POINTS"], errors="coerce").fillna(0)

if "COURSE_HISTORY" in this_week.columns:
    this_week["COURSE_HISTORY"] = pd.to_numeric(this_week["COURSE_HISTORY"], errors="coerce")
    ch_mean = this_week["COURSE_HISTORY"].mean()
    this_week["COURSE_HISTORY"] = this_week["COURSE_HISTORY"].fillna(ch_mean)

# === 3. Fill all remaining NaNs in numeric columns ===
# Recast everything that might look numeric to be sure
for col in this_week.columns:
    if col not in ["PLAYER", "SALARY", "TOURNAMENT", "SEASON"]:
        try:
            this_week[col] = pd.to_numeric(this_week[col], errors="coerce")
        except Exception:
            continue

numeric_cols = this_week.select_dtypes(include=["number"]).columns.tolist()

for col in numeric_cols:
    if this_week[col].isna().any():
        col_mean = this_week[col].mean()
        this_week[col] = this_week[col].fillna(col_mean if not np.isnan(col_mean) else 0)



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd

In [23]:
# === Final Check: No NaNs Should Remain ===
assert this_week.isna().sum().sum() == 0, "üö® Still missing values in prediction set!"

In [25]:
# === Check for Remaining Missing Values ===
missing_summary = this_week.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

print("üßπ Columns still containing NaN values:")
display(missing_summary)

üßπ Columns still containing NaN values:


Series([], dtype: int64)

In [26]:
this_week.head(10)

Unnamed: 0,PLAYER,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,SEASON,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,SALARY
0,A.J. Ewart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6000
1,Aaron Rai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,8200
2,Adam Schenk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6400
3,Adam Scott,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,8400
4,Adam Svensson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6700
5,Adrien Dumont De Chassart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6800
6,Adrien Saddier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6600
7,Alejandro Tosti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6400
8,Alex Smalley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,7200
9,Anson Cabello,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,6000


## Correlations
**Last 4 Years**

In [26]:
import plotly.express as px

# Get the 4 most recent seasons (or fewer if not available)
available_seasons = sorted(training_df["SEASON"].unique())[-4:]

# Dictionary to store correlation results
correlation_frames = {}

# Calculate correlations for each season
for season in available_seasons:
    df = training_df[training_df["SEASON"] == season].copy()

    # Numeric features only, drop final result
    numeric = df.select_dtypes(include=["float64", "int64"]).drop(columns=["FINAL_POS", "SEASON"], errors="ignore")

    # Skip if TOP_20 is not present
    if "TOP_20" not in numeric.columns:
        continue

    # Correlation with TOP_20
    cor = numeric.corr().abs()["TOP_20"].drop("TOP_20", errors="ignore").sort_values(ascending=False)
    cor_df = cor.reset_index()
    cor_df.columns = ["Feature", "Correlation"]
    correlation_frames[season] = cor_df

# Plot one bar chart per season
for season, cor_df in correlation_frames.items():
    fig = px.bar(
        cor_df,
        x="Feature",
        y="Correlation",
        title=f"üîé Correlation to TOP_20 ‚Äî Season {int(season)}",
        height=500,
        width=1000
    )

    fig.update_layout(
        xaxis_title="Feature",
        yaxis_title="Absolute Correlation to TOP_20",
        template="plotly_dark",
        xaxis_tickangle=-45
    )

    fig.show()


**Average of Last 4 Years**

In [27]:
import pandas as pd
import numpy as np
import plotly.express as px

# === Rebuild correlation matrices for the 4 most recent seasons ===
cor = {}
recent_seasons = sorted(training_df["SEASON"].unique())[-4:]

for season in recent_seasons:
    df_season = training_df[training_df["SEASON"] == season].copy()
    numeric_cols = df_season.select_dtypes(include=["float64", "int64"])
    cor[int(season)] = numeric_cols.corr().abs()

# === Combine into one summary DataFrame ===
cor_df = pd.DataFrame()

for season in recent_seasons:
    df = cor[season].reset_index()
    if "TOP_20" not in df.columns:
        continue
    temp = df[["index", "TOP_20"]].copy()
    temp = temp.rename(columns={"TOP_20": str(season)})
    if cor_df.empty:
        cor_df = temp
    else:
        cor_df = cor_df.merge(temp, on="index", how="outer")

# Drop target and metadata rows
cor_df = cor_df[~cor_df["index"].isin(["TOP_20", "FINAL_POS", "SEASON"])]

# Compute average correlation across seasons
cor_df["AVERAGE"] = cor_df.drop(columns=["index"]).mean(axis=1)
cor_df = cor_df.sort_values(by="AVERAGE", ascending=False)

# === Plotly Bar Chart ===
fig = px.bar(
    cor_df,
    x="index",
    y="AVERAGE",
    title="Average Correlation to TOP_20 (Last 4 Seasons)",
    labels={"index": "Feature", "AVERAGE": "Avg Correlation"},
    template="plotly_dark"
)

fig.update_layout(
    xaxis_tickangle=-45,
    height=600,
    width=1000,
    showlegend=False
)

fig.show()


# ML Model

## Predictors

In [28]:
# === Define Features and Target ===
target_col = "TOP_20"

# Columns to exclude from features
exclude = [
    "PLAYER", "TOURNAMENT", "COURSE", "ENDING_DATE", "SEASON", "TOURN_ID",
    "TOP_20", "FINAL_POS"  
]

# Select numeric feature columns
feature_cols = [
    col for col in training_df.columns
    if col not in exclude and training_df[col].dtype in [np.float64, np.int64]
]

print(f"Selected {len(feature_cols)} features:\n", feature_cols)

Selected 39 features:
 ['SGTTG_RANK', 'SGTTG', 'SGOTT_RANK', 'SGOTT', 'SGAPR_RANK', 'SGAPR', 'SGATG_RANK', 'SGATG', 'SGP_RANK', 'SGP', 'BIRDIES_RANK', 'BIRDIES', 'PAR_3_RANK', 'PAR_3', 'PAR_4_RANK', 'PAR_4', 'PAR_5_RANK', 'PAR_5', 'TOTAL_DRIVING_RANK', 'TOTAL_DRIVING', 'DRIVING_DISTANCE_RANK', 'DRIVING_DISTANCE', 'DRIVING_ACCURACY_RANK', 'DRIVING_ACCURACY', 'GIR_RANK', 'GIR', 'SCRAMBLING_RANK', 'SCRAMBLING', 'OWGR_RANK', 'OWGR', 'VEGAS_ODDS', 'CUT_PERCENTAGE', 'FEDEX_CUP_POINTS', 'form_density', 'CONSECUTIVE_CUTS', 'RECENT_FORM', 'adj_form', 'COURSE_HISTORY', 'adj_ch']


### Random Forest
Initial default model before hyperparameter tuning.

This one actually performed really well (ROC AUC above 0.85).  Strangely doing hyperparameter tuning hurt the model (less than 0.7), so I removed that step and we will just use this default model.  This does cross-validation testing on 5 folds to ensure robust train/test splits.

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# === Standardize features ===
scaler = StandardScaler()
X_rf_std = scaler.fit_transform(training_df[feature_cols])
y_rf = training_df["TOP_20"]

# === Balance the dataset with SMOTE and undersampling ===
over = SMOTE(sampling_strategy=0.5, k_neighbors=3)
under = RandomUnderSampler(sampling_strategy=0.5)
pipeline = Pipeline([("o", over), ("u", under)])
X_rf_bal, y_rf_bal = pipeline.fit_resample(X_rf_std, y_rf)

# === Define and evaluate Random Forest ===
rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, X_rf_bal, y_rf_bal, cv=cv, scoring="roc_auc")

print("üìä Random Forest (All Features)")
print("Cross-validation scores:", scores)
print("Average ROC AUC score: {:.2f}".format(scores.mean()))


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

Random Forest Training & Feature Importance

Now we train on all the data (no longer cross-validation folds) and extract the feature importance to see how it compares with linear correlations.


In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import plotly.express as px

# === Resample the full dataset ===
over = SMOTE(sampling_strategy=0.5, k_neighbors=3)
under = RandomUnderSampler(sampling_strategy=0.5)
resample_pipeline = Pipeline([("o", over), ("u", under)])
X_all_resampled, y_all_resampled = resample_pipeline.fit_resample(
    training_df[feature_cols], training_df["TOP_20"]
)

# === Standardize ===
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all_resampled)

# === Train Calibrated Random Forest ===
base_rf = RandomForestClassifier(random_state=42)
final_model = CalibratedClassifierCV(estimator=base_rf, method="sigmoid", cv=5)
final_model.fit(X_all_scaled, y_all_resampled)

# === Aggregate feature importances across all CV folds ===
all_importances = np.array([
    est.estimator.feature_importances_ for est in final_model.calibrated_classifiers_
])
mean_importance = all_importances.mean(axis=0)

importance_df = pd.DataFrame({
    "Feature": feature_cols,
    "Importance": mean_importance
}).sort_values(by="Importance", ascending=False)

# === Plot feature importances ===
fig = px.bar(
    importance_df,
    x="Feature",
    y="Importance",
    title="Random Forest Feature Importances",
    template="plotly_dark"
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()


Use this model on this week's data to predict the best performers.

In [75]:
# === Prepare test data (this week's players) ===

# Select and standardize features using the same scaler as training
X_test = this_week[feature_cols]
X_test_scaled = scaler.transform(X_test)

# === Predict probabilities for Top 20 finish ===
# CalibratedClassifierCV returns well-calibrated probabilities
this_week["PROBABILITY"] = final_model.predict_proba(X_test_scaled)[:, 1]

# === Sort results by highest predicted probability ===
this_week_sorted = this_week.sort_values(by="PROBABILITY", ascending=False).reset_index(drop=True)

# === Define final column order explicitly ===
columns_to_show = [
    "PLAYER", "SALARY", "PROBABILITY",
    "RECENT_FORM", "COURSE_HISTORY", "VEGAS_ODDS", "CUT_PERCENTAGE", 
    "OWGR_RANK", "FEDEX_CUP_POINTS",

    # SG stats
    "SGTTG_RANK", "SGOTT_RANK", "SGAPR_RANK", "SGATG_RANK", "SGP_RANK",

    # Scoring/Performance
    "BIRDIES_RANK", "PAR_3_RANK", "PAR_4_RANK", "PAR_5_RANK",

    # Driving / Ball Striking
    "TOTAL_DRIVING_RANK", "DRIVING_DISTANCE_RANK", "DRIVING_ACCURACY_RANK",
    "GIR_RANK", "SCRAMBLING_RANK"
]

# Keep only columns that exist in the dataframe
columns_to_show = [col for col in columns_to_show if col in this_week_sorted.columns]

# Final dataframe for display/export
export_df = this_week_sorted[columns_to_show].copy()

# Preview top 20
export_df.head(20)


Unnamed: 0,PLAYER,SALARY,PROBABILITY,RECENT_FORM,COURSE_HISTORY,VEGAS_ODDS,CUT_PERCENTAGE,OWGR_RANK,FEDEX_CUP_POINTS,SGTTG_RANK,SGOTT_RANK,SGAPR_RANK,SGATG_RANK,SGP_RANK,BIRDIES_RANK,PAR_3_RANK,PAR_4_RANK,PAR_5_RANK,TOTAL_DRIVING_RANK,DRIVING_DISTANCE_RANK,DRIVING_ACCURACY_RANK,GIR_RANK,SCRAMBLING_RANK
0,Jacob Bridgeman,7900,0.947764,48.2,72.0,60.0,69.6,69.0,1393.989,123.0,125.0,125.0,59.0,18.0,62.0,34.0,81.0,114.0,110.0,95.0,94.0,146.0,28.0
1,Lee Hodges,7100,0.916152,58.4,50.7,100.0,58.8,151.0,251.2,38.0,78.0,29.0,79.0,99.0,76.0,34.0,23.0,33.0,43.0,118.0,33.0,16.0,21.0
2,Doug Ghim,7400,0.868698,50.7,51.0,65.0,68.4,156.0,353.849,13.0,43.0,17.0,54.0,176.0,103.0,62.0,57.0,59.0,28.0,91.0,41.0,20.0,98.0
3,Mac Meissner,7500,0.793211,51.4,59.050505,60.0,66.7,104.0,565.714,52.0,140.0,37.0,24.0,115.0,129.0,34.0,57.0,127.0,106.0,83.0,102.0,95.0,8.0
4,Vince Whaley,8700,0.777363,46.9,47.8,35.0,75.0,114.0,564.108,86.0,101.0,115.0,38.0,34.0,64.0,2.0,57.0,11.0,121.0,48.0,148.0,60.0,7.0
5,Alex Smalley,9000,0.696225,55.4,57.3,35.0,50.0,134.0,558.756,34.0,18.0,89.0,61.0,77.0,23.0,107.0,4.0,48.0,12.0,61.0,46.0,56.0,26.0
6,Kevin Roy,7100,0.678862,49.6,68.0,100.0,65.0,139.0,513.037,59.0,39.0,96.0,86.0,63.0,57.0,81.0,8.0,41.0,9.0,63.0,37.0,48.0,49.0
7,Jesper Svensson,7500,0.674506,45.5,59.050505,70.0,78.9,137.0,405.028,46.0,9.0,138.0,66.0,89.0,12.0,107.0,57.0,9.0,38.0,3.0,143.0,79.0,126.0
8,Bud Cauley,8000,0.642066,40.2,25.0,50.0,82.4,65.0,1064.164,33.0,57.0,31.0,95.0,67.0,114.0,107.0,81.0,68.0,106.0,101.0,84.0,120.0,67.0
9,Matt Kuchar,8300,0.619062,39.9,66.2,50.0,86.7,129.0,414.183,92.0,152.0,74.0,25.0,15.0,69.0,62.0,4.0,143.0,136.0,176.0,34.0,122.0,4.0


### Save to CSV

In [76]:
# Ensure required columns for spreadsheet export
if "COURSE_HISTORY" not in export_df.columns:
    export_df["COURSE_HISTORY"] = "-"

# Define columns to round (FedEx + stats)
columns_to_round = [
    "FEDEX_CUP_POINTS",
    "SGTTG_RANK", "SGOTT_RANK", "SGAPR_RANK", "SGATG_RANK", "SGP_RANK",
    "BIRDIES_RANK", "PAR_3_RANK", "PAR_4_RANK", "PAR_5_RANK",
    "TOTAL_DRIVING_RANK", "DRIVING_DISTANCE_RANK", "DRIVING_ACCURACY_RANK",
    "GIR_RANK", "SCRAMBLING_RANK"
]

columns_to_round_1 = [
    "COURSE_HISTORY", "CUT_PERCENTAGE"
]

# Round only the selected columns to 0 decimal places
export_df[columns_to_round] = export_df[columns_to_round].round(0).astype("Int64")
# Round only the selected columns to 1 decimal places
export_df[columns_to_round_1] = export_df[columns_to_round_1].round(1)

# Explicit column order for Excel integration
column_order = [
    "PLAYER", "SALARY", "PROBABILITY", "RECENT_FORM", "COURSE_HISTORY", "VEGAS_ODDS", 
    "CUT_PERCENTAGE", "OWGR_RANK", "FEDEX_CUP_POINTS", 
    "SGTTG_RANK", "SGOTT_RANK", "SGAPR_RANK", "SGATG_RANK", "SGP_RANK",
    "BIRDIES_RANK", "PAR_3_RANK", "PAR_4_RANK", "PAR_5_RANK",
    "TOTAL_DRIVING_RANK", "DRIVING_DISTANCE_RANK", "DRIVING_ACCURACY_RANK",
    "GIR_RANK", "SCRAMBLING_RANK"
]

# Ensure all expected columns exist
for col in column_order:
    if col not in export_df.columns:
        export_df[col] = "-" if col == "COURSE_HISTORY" else np.nan

# Reorder columns to match Excel layout
export_df = export_df[column_order]

# Export to CSV
filename = f"data/current_week_export.csv"
export_df.to_csv(filename, index=False)
print(f"‚úÖ Exported to {filename}")

‚úÖ Exported to data/current_week_export.csv
