## PGA Draftkings Notebook
Use [PGA Website](https://www.pgatour.com/tournaments/schedule.html) to look up tournament info and fill out first USER INPUT block below. (Fetch/XHR from Network tab, Paylod sub-tab)

Looks like the tournament ID is also in the address bar.

### User Input

In [None]:
# === USER INPUTS ===
# Old Tournament
old_tournament_name = "PGA Championship"
tournament_date = "5/18/2025"
old_course = "Quail Hollow Club"
tournament_id = "R2025033"

# New Tournament
new_tournament_name = "Charles Schwab Challenge"
new_course = "Colonial Country Club"
new_season = 2025
new_ending_date = "2025-05-25"


# === LIBRARIES AND VARIABLES ===
# Import necessary libraries
import requests
import pandas as pd
from datetime import datetime
import sqlite3 as sql
import numpy as np
from numpy import nan
import os
import importlib
import utils.db_utils
from utils.db_utils import TOURNAMENT_NAME_MAP, PLAYER_NAME_MAP

# === TOURNAMENT CONFIG ===
tournament_config = {
    "old": {
        "name": old_tournament_name,
        "date": tournament_date,
        "course": old_course,
        "id": tournament_id
    },
    "new": {
        "name": new_tournament_name,
        "course": new_course,
        "season": new_season,
        "ending_date": pd.to_datetime(new_ending_date),
        "quoted_course": f'"{new_course}"',
        "quoted_name": f"'{new_tournament_name}'"
    }
}

this_week_key = str(tournament_config["new"]["ending_date"].date())

# Create a minimal one-row history DataFrame
this_week_history = pd.DataFrame([{
    "SEASON": tournament_config["new"]["season"],
    "TOURNAMENT": tournament_config["new"]["name"],
    "ENDING_DATE": tournament_config["new"]["ending_date"],
    "COURSE": tournament_config["new"]["course"],
    "TOURN_ID": None
}])

### Update Database

#### Old Tournament

In [21]:
importlib.reload(utils.db_utils)  # Only needed if you're actively editing db_utils.py
from utils.db_utils import update_tournament_results

# Change these each year!!
season = 2025
year = 20250  # Unique GraphQL year distinguishing number in case of multiple per year

# Run the update
db_path = "data/golf.db"  # Or use os.path.join("data", "golf.db")
tournDf = update_tournament_results(tournament_config, db_path, season, year)

# Show just the most recent tournament added for confirmation
from sqlalchemy import create_engine

engine = create_engine(f"sqlite:///{db_path}")

query = f"""
SELECT *
FROM tournaments
WHERE TOURN_ID = '{tournament_config['old']['id']}'
  AND ENDING_DATE = '{datetime.strptime(tournament_config['old']['date'], '%m/%d/%Y').date()}'
"""

recent = pd.read_sql(query, engine)
engine.dispose()
recent.sort_values(by='FINAL_POS').head()

📦 Fetching results for tournament ID R2025033 (PGA Championship), year: 20250
ℹ️ Tournament 'PGA Championship' already exists — no new data inserted.


Unnamed: 0,SEASON,ENDING_DATE,TOURN_ID,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,ROUNDS:1,ROUNDS:2,ROUNDS:3,ROUNDS:4,OFFICIAL_MONEY,FEDEX_CUP_POINTS
127,2025,2025-05-18,R2025033,PGA Championship,Quail Hollow Club,Scottie Scheffler,1,1,-2,-3,-6,E,"$3,420,000.00",750.0
31,2025,2025-05-18,R2025033,PGA Championship,Quail Hollow Club,Davis Riley,T2,2,E,-3,-4,+1,"$1,418,666.67",391.667
45,2025,2025-05-18,R2025033,PGA Championship,Quail Hollow Club,Harris English,T2,2,+1,-1,E,-6,"$1,418,666.67",391.667
18,2025,2025-05-18,R2025033,PGA Championship,Quail Hollow Club,Bryson DeChambeau,T2,2,E,-3,-2,-1,"$1,418,666.67",0.0
55,2025,2025-05-18,R2025033,PGA Championship,Quail Hollow Club,Jhonattan Vegas,T5,5,-7,-1,+2,+1,"$694,700.00",275.0


#### Stats

In [41]:
importlib.reload(utils.db_utils)
from utils.db_utils import update_season_stats  # <- This line is essential

# Change these each year!!
statsYear = 2025

stats_df = update_season_stats(statsYear, db_path)
stats_df.head()

✅ Overwrote stats for season 2025 with 999 rows.


Unnamed: 0,PLAYER,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,SEASON
0,A.J. Ewart,,,,,,,,,,,,,,,,,,,,,,,,,,,,,656,0.1687,2025
1,Aaron Baddeley,166.0,-0.965,179.0,-1.366,133.0,-0.203,3.0,0.604,158.0,-0.47,178.0,18.01%,111.0,3.08,155.0,4.07,179.0,4.78,176.0,290.0,167.0,288.9,123.0,56.64%,173.0,60.61%,87.0,60.68%,417,0.3112,2025
2,Aaron Cockerill,,,,,,,,,,,,,,,,,,,,,,,,,,,,,358,0.3667,2025
3,Aaron Rai,21.0,0.831,35.0,0.33,20.0,0.513,107.0,-0.012,96.0,0.004,27.0,23.86%,179.0,3.19,6.0,3.95,25.0,4.53,75.0,170.0,169.0,286.7,1.0,73.13%,16.0,69.57%,131.0,58.51%,28,2.7748,2025
4,Aaron Wilkin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,744,0.1321,2025


#### Odds
Not usually needed for weekly routine.

**Manual Fix! Odds name cleanup (only needed when joins fail)**

Make sure to update the dictionaries in db_utils.py if new names need to be added.

In [62]:
importlib.reload(utils.db_utils)
from utils.db_utils import clean_odds_names, PLAYER_NAME_MAP, TOURNAMENT_NAME_MAP

db_path = "data/golf.db" 
updated_odds = clean_odds_names(db_path, TOURNAMENT_NAME_MAP, PLAYER_NAME_MAP)
updated_odds.head()

ℹ️ No odds rows required name cleanup.


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS,TOURNAMENT_ORIG,PLAYER_ORIG


**Historical Odds Updates**

Only run this when loading in entire year odds at the start of each year or if corrections need to be made.  This will load in the entire year into the database and update it using the dictionary in db_utils.py.

In [8]:
importlib.reload(utils.db_utils)
from utils.db_utils import import_historical_odds

oddsYear = "2022-2023"    # URL segment
season = 2023             # PGA Tour season
db_path = "data/golf.db"

odds_df = import_historical_odds(oddsYear, season, db_path)
odds_df.head()

✅ Inserting 10769 new rows into odds table...


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS
4,2023,Fortinet Championship,2022-09-18,Hideki Matsuyama,18/1,18.0
5,2023,Fortinet Championship,2022-09-18,Max Homa,18/1,18.0
6,2023,Fortinet Championship,2022-09-18,Corey Conners,20/1,20.0
7,2023,Fortinet Championship,2022-09-18,Maverick McNealy,25/1,25.0
8,2023,Fortinet Championship,2022-09-18,Taylor Pendrith,25/1,25.0


**Not normally needed**

The code below is a way to troubleshoot the odds function that is in db_utils.py.  If it does not pull the stats correctly, we can run it outside of the db_utils.py file and explore what might be wrong with it, iterating until it works, and then using that fix to update db_utils.py.  This is not normally needed, but wanted to leave this in place as a reference because this odds website can be tricky.

In [3]:
import pandas as pd
import numpy as np
import requests
import re
from datetime import datetime
from io import StringIO

# === USER INPUT ===
oddsYear = "2020-2021"    # URL segment
season = 2021        # PGA Tour season

url = f"http://golfodds.com/archives-{oddsYear}.html"
response = requests.get(url)
tables = pd.read_html(StringIO(response.text))
# raw_df = tables[5]  # the actual table of interest
# Find the largest 2-column table that contains at least some odds-like strings
raw_df = None
for tbl in tables:
    if tbl.shape[1] == 2 and tbl.shape[0] > 50:  # Rough filter
        sample = tbl.iloc[:, 1].astype(str).str.contains(r"\d+/\d+").sum()
        if sample > 5:
            raw_df = tbl
            break

if raw_df is None:
    raise ValueError("❌ Could not find valid odds table on the page.")

# === STEP 1: Initial clean-up ===
df = raw_df.dropna(how="all").reset_index(drop=True)
df.columns = ["PLAYER", "ODDS"]

# 🔧 Clean up non-breaking spaces and extra whitespace
df["PLAYER"] = (
    df["PLAYER"]
    .astype(str)
    .str.replace("\xa0", " ", regex=False)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

df.insert(loc=0, column="SEASON", value=season)
df.insert(loc=1, column="TOURNAMENT", value=np.nan)
df.insert(loc=2, column="ENDING_DATE", value=np.nan)

# === STEP 2: Helper function for parsing date strings ===
def parse_ending_date(text):
    import re
    from datetime import datetime

    # Normalize whitespace and symbols
    text = (
        text.replace("\u2013", "-")
            .replace("–", "-")
            .replace("\xa0", " ")
    )
    text = re.sub(r"\bSept(?!ember)\b", "Sep", text)

    # ✅ Fix typo: "Match" → "March" only when used in a date context
    text = re.sub(r"\bMatch(?=\s+\d{1,2}\s*[-–]\s*\d{1,2},\s*\d{4})", "March", text)

    # Pattern 1: "July 30 - August 2, 2015" or "Oct 29 - Nov 1, 2015"
    match = re.search(r"(\w+)\s\d+\s*-\s*(\w+)\s(\d+),\s(\d{4})", text)
    if match:
        month2, day2, year = match.group(2), match.group(3), match.group(4)
        for fmt in ["%B %d, %Y", "%b %d, %Y"]:
            try:
                return datetime.strptime(f"{month2} {day2}, {year}", fmt).date()
            except ValueError:
                continue

    # Pattern 2: "November 21-24, 2024"
    match = re.search(r"(\w+)\s\d+-\d+,\s(\d{4})", text)
    if match:
        month, year = match.group(1), match.group(2)
        day = re.search(r"(\d+)-(\d+)", text).group(2)
        for fmt in ["%B %d, %Y", "%b %d, %Y"]:
            try:
                return datetime.strptime(f"{month} {day}, {year}", fmt).date()
            except ValueError:
                continue

    # Pattern 3: "Sunday, October 20, 2019"
    try:
        return datetime.strptime(text.strip(), "%A, %B %d, %Y").date()
    except ValueError:
        pass

    # Pattern 4: "October 20, 2019"
    try:
        return datetime.strptime(text.strip(), "%B %d, %Y").date()
    except ValueError:
        pass

    return None

last_tourn_name = None
last_date = None

# === STEP 3: Iterate block by block ===
final_rows = []
i = 0
last_tourn_name = None
last_end_date = None

while i < len(df) - 4:
    player_i = str(df.loc[i, "PLAYER"])
    player_i2 = str(df.loc[i + 2, "PLAYER"])
    player_i3 = str(df.loc[i + 3, "PLAYER"]).lower()

    # Detect start of a new tournament block
    is_header = (
        pd.isna(df.loc[i, "ODDS"]) and
        pd.isna(df.loc[i + 1, "ODDS"]) and (
            re.search(r"\w+\s\d+\s*[-–]\s*(\w+\s)?\d+,\s\d{4}", player_i2) or
            re.search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s+\w+\s\d{1,2},\s\d{4}", player_i2)
        )
    )

    if is_header:
        tourn_name = player_i.strip()
        end_date = parse_ending_date(player_i2)

        # Skip cancelled or empty blocks
        if "cancelled" in player_i3:
            print(f"⚠️ Skipping cancelled tournament: {tourn_name} — {end_date}")
            i += 4
            continue

        # Avoid duplicate block processing
        if tourn_name == last_tourn_name and end_date == last_end_date:
            i += 1
            continue

        print(f"📍 Detected: {tourn_name} — Ending: {end_date}")
        last_tourn_name = tourn_name
        last_end_date = end_date
        i += 4  # Skip header lines

        # Collect all player rows until next header block
        while i < len(df) - 2:
            next_i2 = str(df.loc[i + 2, "PLAYER"])
            is_next_header = (
                pd.isna(df.loc[i, "ODDS"]) and
                pd.isna(df.loc[i + 1, "ODDS"]) and (
                    re.search(r"\w+\s\d+\s*[-–]\s*(\w+\s)?\d+,\s\d{4}", next_i2) or
                    re.search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s+\w+\s\d{1,2},\s\d{4}", next_i2)
                )
            )
            if is_next_header:
                break

            if pd.notna(df.loc[i, "ODDS"]):
                row = df.loc[i].copy()
                row["TOURNAMENT"] = tourn_name
                row["ENDING_DATE"] = end_date
                final_rows.append(row)
            i += 1
    else:
        i += 1

# === STEP 4: Create cleaned DataFrame ===
clean_df = pd.DataFrame(final_rows)

# ✅ Prevent crash if nothing was parsed
if clean_df.empty or "PLAYER" not in clean_df.columns:
    print(f"⚠️ No valid tournament blocks detected for season {season} ({oddsYear})")
    final_df = pd.DataFrame()  # Safe fallback
else:
    # Remove winner tag
    clean_df["PLAYER"] = clean_df["PLAYER"].str.replace(r"\s\*Winner\*", "", regex=True)

    # Clean odds to numeric
    clean_df["VEGAS_ODDS"] = (
        clean_df["ODDS"]
        .str.replace(",", "")
        .str.extract(r"(\d+)/(\d+)")
        .astype(float)
        .apply(lambda x: x[0] / x[1], axis=1)
    )

    # Final output with source index for debugging
    final_df = clean_df[
        ["SEASON", "TOURNAMENT", "ENDING_DATE", "PLAYER", "ODDS", "VEGAS_ODDS"]
    ].reset_index(drop=True)

    # Drop non-standard team events (e.g., Presidents Cup, Ryder Cup)
    drop_terms = ["Presidents Cup", "Ryder Cup"]
    final_df = final_df[~final_df["TOURNAMENT"].str.contains("|".join(drop_terms), case=False, na=False)]

    display(final_df.head())



📍 Detected: Safeway Open — Ending: 2020-09-13
📍 Detected: US Open — Ending: 2020-09-20
📍 Detected: R & C Championship — Ending: 2020-09-27
📍 Detected: at Big Cedar Lodge - — Ending: 2020-09-22
📍 Detected: Sanderson Farms Champ — Ending: 2020-10-04
📍 Detected: Shriners H for C Open — Ending: 2020-10-11
📍 Detected: The CJ Cup — Ending: 2020-10-18
📍 Detected: ZOZO CHAMPIONSHIP — Ending: 2020-10-25
📍 Detected: Bermuda Championship — Ending: 2020-11-01
📍 Detected: Vivint Houston Open — Ending: 2020-11-08
📍 Detected: The Masters — Ending: 2020-11-15
📍 Detected: The RSM Classic — Ending: 2020-11-22
📍 Detected: Champions for Change — Ending: 2020-11-27
📍 Detected: Mayakoba Golf Classic — Ending: 2020-12-06
📍 Detected: QBE Shootout — Ending: 2020-12-13
📍 Detected: Sentry Tourn of Champions — Ending: 2021-01-10
📍 Detected: Sony Open in Hawaii — Ending: 2021-01-17
📍 Detected: The American Express — Ending: 2021-01-24
📍 Detected: Abu Dhabi HSBC Champ — Ending: 2021-01-24
📍 Detected: Farmers Insura

Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS
0,2021,Safeway Open,2020-09-13,Phil Mickelson,20/1,20.0
1,2021,Safeway Open,2020-09-13,Si Woo Kim,20/1,20.0
2,2021,Safeway Open,2020-09-13,Brendan Steele,20/1,20.0
3,2021,Safeway Open,2020-09-13,Shane Lowry,25/1,25.0
4,2021,Safeway Open,2020-09-13,Sergio Garcia,30/1,30.0


In [4]:
from datetime import datetime, date
# ✅ Check for non-date types in ENDING_DATE
non_dates = final_df[~final_df["ENDING_DATE"].apply(lambda x: isinstance(x, date))]

print(f"🧪 Rows with invalid ENDING_DATE values: {len(non_dates)}")
display(non_dates.head(10))



🧪 Rows with invalid ENDING_DATE values: 0


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS


In [5]:
dupes = final_df.duplicated(subset=["SEASON", "TOURNAMENT", "ENDING_DATE", "PLAYER"], keep=False)

print(f"🚨 Duplicate primary keys in final_df: {dupes.sum()}")
display(final_df[dupes].sort_values(by=["SEASON", "TOURNAMENT", "PLAYER"]))

🚨 Duplicate primary keys in final_df: 0


Unnamed: 0,SEASON,TOURNAMENT,ENDING_DATE,PLAYER,ODDS,VEGAS_ODDS


## Historical Data

### Pull Relevant Seasons
Do a check to see when this course or tournament have been historically played.

In [3]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_combined_history_seasons

# === USER INPUT ===
seasons = list(range(2016, 2025))  # Adjust as needed
db_path = "data/golf.db"

# Pull course and tournament from config
n_course = tournament_config["new"]["course"]
n_tourn = tournament_config["new"]["name"]

# Fetch relevant history
history_df = get_combined_history_seasons(db_path, course=n_course, tournament=n_tourn, allowed_seasons=seasons)
history_df.head(20)


ℹ️ Found 9 relevant tournaments from course or tournament name.


Unnamed: 0,SEASON,COURSE,TOURN_ID,TOURNAMENT,ENDING_DATE
632,2016,Colonial Country Club,021,DEAN & DELUCA Invitational,2016-05-29
511,2017,Colonial Country Club,021,DEAN & DELUCA Invitational,2017-05-28
390,2018,Colonial Country Club,021,Fort Worth Invitational,2018-05-27
269,2019,Colonial Country Club,021,Charles Schwab Challenge,2019-05-26
121,2020,Colonial Country Club,021,Charles Schwab Challenge,2020-06-14
0,2021,Colonial Country Club,021,Charles Schwab Challenge,2021-05-30
875,2022,Colonial Country Club,021,Charles Schwab Challenge,2022-05-29
995,2023,Colonial Country Club,R2023021,Charles Schwab Challenge,2023-05-28
1115,2024,Colonial Country Club,R2024021,Charles Schwab Challenge,2024-05-26


### Cut Percentage and FedEx Points
Use a rolling-window approach to look at the most recent cut percentage and how many FedEx cup points have been accumulated recently. This will intentionally not match the PGA Tour stats that start over every year, but will have the same amount of data all the time.  We also add a new feature called Form Density which divides the FedEx Cup Points by the Total Events.

In [4]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_cut_and_fedex_history

cuts = get_cut_and_fedex_history("data/golf.db", history_df, window_months=9)
# cuts["2024-05-12"].head(20)

for end_date, df in cuts.items():
    print(f"\n📆 {end_date} — {df['TOURNAMENT'].iloc[0]} ({len(df)} players)")
    display(df.head(3))


📆 2016-05-29 — DEAN & DELUCA Invitational (415 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,15,9,365.0,60.0,24.33,0,2016-05-29,DEAN & DELUCA Invitational
1,Aaron Wise,1,0,0.0,0.0,0.0,0,2016-05-29,DEAN & DELUCA Invitational
2,Abraham Ancer,10,2,43.0,20.0,4.3,2,2016-05-29,DEAN & DELUCA Invitational



📆 2017-05-28 — DEAN & DELUCA Invitational (400 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,14,8,264.29,57.1,18.88,2,2017-05-28,DEAN & DELUCA Invitational
1,Aaron Wise,6,4,0.0,66.7,0.0,4,2017-05-28,DEAN & DELUCA Invitational
2,Abraham Ancer,1,1,0.0,100.0,0.0,1,2017-05-28,DEAN & DELUCA Invitational



📆 2018-05-27 — Fort Worth Invitational (407 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,A.J. McInerney,2,1,0.0,50.0,0.0,0,2018-05-27,Fort Worth Invitational
1,Aaron Baddeley,16,10,239.12,62.5,14.94,0,2018-05-27,Fort Worth Invitational
2,Aaron Wise,17,11,992.76,64.7,58.4,2,2018-05-27,Fort Worth Invitational



📆 2019-05-26 — Charles Schwab Challenge (428 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,11,6,228.08,54.5,20.73,0,2019-05-26,Charles Schwab Challenge
1,Aaron Wise,12,9,485.89,75.0,40.49,4,2019-05-26,Charles Schwab Challenge
2,Abraham Ancer,14,11,398.44,78.6,28.46,2,2019-05-26,Charles Schwab Challenge



📆 2020-06-14 — Charles Schwab Challenge (393 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,10,6,115.47,60.0,11.55,0,2020-06-14,Charles Schwab Challenge
1,Aaron Wise,11,4,116.5,36.4,10.59,1,2020-06-14,Charles Schwab Challenge
2,Abraham Ancer,9,7,430.33,77.8,47.81,7,2020-06-14,Charles Schwab Challenge



📆 2021-05-30 — Charles Schwab Challenge (471 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,11,2,18.08,18.2,1.64,0,2021-05-30,Charles Schwab Challenge
1,Aaron Rai,1,0,0.0,0.0,0.0,0,2021-05-30,Charles Schwab Challenge
2,Aaron Terrazas,1,0,0.0,0.0,0.0,0,2021-05-30,Charles Schwab Challenge



📆 2022-05-29 — Charles Schwab Challenge (475 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,5,2,56.96,40.0,11.39,0,2022-05-29,Charles Schwab Challenge
1,Aaron Beverly,1,0,0.0,0.0,0.0,0,2022-05-29,Charles Schwab Challenge
2,Aaron Jarvis,1,0,0.0,0.0,0.0,0,2022-05-29,Charles Schwab Challenge



📆 2023-05-28 — Charles Schwab Challenge (487 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,13,10,305.661,76.9,23.51,4,2023-05-28,Charles Schwab Challenge
1,Aaron Beverly,1,1,0.0,100.0,0.0,1,2023-05-28,Charles Schwab Challenge
2,Aaron Jarvis,1,0,0.0,0.0,0.0,0,2023-05-28,Charles Schwab Challenge



📆 2024-05-26 — Charles Schwab Challenge (448 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,13,9,181.975,69.2,14.0,4,2024-05-26,Charles Schwab Challenge
1,Aaron Rai,13,10,368.707,76.9,28.36,4,2024-05-26,Charles Schwab Challenge
2,Adam Hadwin,14,11,679.292,78.6,48.52,5,2024-05-26,Charles Schwab Challenge


### Recent Form

In [5]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_recent_avg_finish

recent_form = get_recent_avg_finish("data/golf.db", history_df, window_months=9)

# Example preview
for date, df in recent_form.items(): 
    print(f"\n📆 {date} — {df['TOURNAMENT'].iloc[0]} ({len(df)} players)")
    display(df.head(3))


📆 2016-05-29 — DEAN & DELUCA Invitational (415 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Lee McCoy,1,4.0,5.77,2016-05-29,DEAN & DELUCA Invitational
1,Cody Gribble,1,8.0,11.54,2016-05-29,DEAN & DELUCA Invitational
2,Jon Rahm,2,12.5,11.38,2016-05-29,DEAN & DELUCA Invitational



📆 2017-05-28 — DEAN & DELUCA Invitational (400 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Keith Mitchell,1,11.0,15.87,2017-05-28,DEAN & DELUCA Invitational
1,Oscar Fraustro,1,13.0,18.76,2017-05-28,DEAN & DELUCA Invitational
2,Cam Davis,1,15.0,21.64,2017-05-28,DEAN & DELUCA Invitational



📆 2018-05-27 — Fort Worth Invitational (407 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Jason Day,8,8.3,3.78,2018-05-27,Fort Worth Invitational
1,Dustin Johnson,7,13.6,6.54,2018-05-27,Fort Worth Invitational
2,Sam Horsfield,1,14.0,20.2,2018-05-27,Fort Worth Invitational



📆 2019-05-26 — Charles Schwab Challenge (428 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Rory McIlroy,8,7.1,3.23,2019-05-26,Charles Schwab Challenge
1,Erik van Rooyen,1,8.0,11.54,2019-05-26,Charles Schwab Challenge
2,Dustin Johnson,9,13.9,6.04,2019-05-26,Charles Schwab Challenge



📆 2020-06-14 — Charles Schwab Challenge (393 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Webb Simpson,4,3.3,2.05,2020-06-14,Charles Schwab Challenge
1,Tyrrell Hatton,2,3.5,3.19,2020-06-14,Charles Schwab Challenge
2,Rory McIlroy,4,4.0,2.49,2020-06-14,Charles Schwab Challenge



📆 2021-05-30 — Charles Schwab Challenge (471 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Bud Cauley,1,14.0,20.2,2021-05-30,Charles Schwab Challenge
1,Jon Rahm,14,16.3,6.02,2021-05-30,Charles Schwab Challenge
2,Dawie van der Walt,1,20.0,28.85,2021-05-30,Charles Schwab Challenge



📆 2022-05-29 — Charles Schwab Challenge (475 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Taylor Montgomery,1,11.0,15.87,2022-05-29,Charles Schwab Challenge
1,Haotong Li,1,12.0,17.31,2022-05-29,Charles Schwab Challenge
2,Justin Thomas,13,12.8,4.85,2022-05-29,Charles Schwab Challenge



📆 2023-05-28 — Charles Schwab Challenge (487 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Joohyung Kim,1,1.0,1.44,2023-05-28,Charles Schwab Challenge
1,Brooks Koepka,2,1.5,1.37,2023-05-28,Charles Schwab Challenge
2,Scottie Scheffler,14,8.8,3.25,2023-05-28,Charles Schwab Challenge



📆 2024-05-26 — Charles Schwab Challenge (448 players)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Bryson DeChambeau,2,4.0,3.64,2024-05-26,Charles Schwab Challenge
1,Scottie Scheffler,12,4.7,1.83,2024-05-26,Charles Schwab Challenge
2,Dean Burmester,1,12.0,17.31,2024-05-26,Charles Schwab Challenge


### Course History

In [6]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_course_history

# Filter history_df for only the course we're targeting
target_course = tournament_config["new"]["course"]
course_df = history_df[history_df["COURSE"] == target_course]
course_hist = get_course_history("data/golf.db", course_df)

# View example
for date, df in course_hist.items():
    if not df.empty:
        print(f"\n🏌️‍♂️ Course history for {df['TOURNAMENT'].iloc[0]} on {date}")
        display(df.head(3))


🏌️‍♂️ Course history for DEAN & DELUCA Invitational on 2016-05-29


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,1,90.0,129.84,2016-05-29,Colonial Country Club,DEAN & DELUCA Invitational
1,Adam Hadwin,1,5.0,7.21,2016-05-29,Colonial Country Club,DEAN & DELUCA Invitational
2,Adam Scott,1,24.0,34.62,2016-05-29,Colonial Country Club,DEAN & DELUCA Invitational



🏌️‍♂️ Course history for DEAN & DELUCA Invitational on 2017-05-28


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,2,72.5,65.99,2017-05-28,Colonial Country Club,DEAN & DELUCA Invitational
1,Adam Hadwin,2,13.5,12.29,2017-05-28,Colonial Country Club,DEAN & DELUCA Invitational
2,Adam Scott,2,39.5,35.95,2017-05-28,Colonial Country Club,DEAN & DELUCA Invitational



🏌️‍♂️ Course history for Fort Worth Invitational on 2018-05-27


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,3,78.3,56.48,2018-05-27,Colonial Country Club,Fort Worth Invitational
1,Adam Hadwin,3,26.7,19.26,2018-05-27,Colonial Country Club,Fort Worth Invitational
2,Adam Scott,2,39.5,35.95,2018-05-27,Colonial Country Club,Fort Worth Invitational



🏌️‍♂️ Course history for Charles Schwab Challenge on 2019-05-26


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,4,81.3,50.51,2019-05-26,Colonial Country Club,Charles Schwab Challenge
1,Aaron Wise,1,90.0,129.84,2019-05-26,Colonial Country Club,Charles Schwab Challenge
2,Abraham Ancer,1,52.0,75.02,2019-05-26,Colonial Country Club,Charles Schwab Challenge



🏌️‍♂️ Course history for Charles Schwab Challenge on 2020-06-14


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,5,74.6,41.64,2020-06-14,Colonial Country Club,Charles Schwab Challenge
1,Aaron Wise,1,90.0,129.84,2020-06-14,Colonial Country Club,Charles Schwab Challenge
2,Abraham Ancer,2,55.0,50.06,2020-06-14,Colonial Country Club,Charles Schwab Challenge



🏌️‍♂️ Course history for Charles Schwab Challenge on 2021-05-30


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,5,74.6,41.64,2021-05-30,Colonial Country Club,Charles Schwab Challenge
1,Aaron Wise,1,90.0,129.84,2021-05-30,Colonial Country Club,Charles Schwab Challenge
2,Abraham Ancer,3,41.3,29.79,2021-05-30,Colonial Country Club,Charles Schwab Challenge



🏌️‍♂️ Course history for Charles Schwab Challenge on 2022-05-29


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,4,70.8,43.99,2022-05-29,Colonial Country Club,Charles Schwab Challenge
1,Aaron Wise,1,90.0,129.84,2022-05-29,Colonial Country Club,Charles Schwab Challenge
2,Abraham Ancer,4,34.5,21.44,2022-05-29,Colonial Country Club,Charles Schwab Challenge



🏌️‍♂️ Course history for Charles Schwab Challenge on 2023-05-28


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,4,70.8,43.99,2023-05-28,Colonial Country Club,Charles Schwab Challenge
1,Aaron Rai,1,68.0,98.1,2023-05-28,Colonial Country Club,Charles Schwab Challenge
2,Aaron Wise,1,90.0,129.84,2023-05-28,Colonial Country Club,Charles Schwab Challenge



🏌️‍♂️ Course history for Charles Schwab Challenge on 2024-05-26


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,4,62.3,38.71,2024-05-26,Colonial Country Club,Charles Schwab Challenge
1,Aaron Rai,2,40.0,36.41,2024-05-26,Colonial Country Club,Charles Schwab Challenge
2,Aaron Wise,1,90.0,129.84,2024-05-26,Colonial Country Club,Charles Schwab Challenge


## Training Dataset

In [7]:
pd.set_option("display.max_columns", None)   # Show all columns
importlib.reload(utils.db_utils)
from utils.db_utils import build_training_rows
training_df = build_training_rows(
    db_path,
    history_df,
    cuts,
    recent_form,
    course_hist,
)
training_df.head(10)
# training_df.info()
# training_df.isna().sum().sort_values(ascending=False)

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Aaron Baddeley,T55,55,146.0,-0.33,159.0,-0.366,164.0,-0.362,10.0,0.402,8.0,0.586,59.0,20.72,78.0,3.06,14.0,4.01,50.0,4.64,159.0,237.0,82.0,291.7,155.0,55.42,154.0,63.11,5.0,64.3,143.0,1.14,,60.0,365.0,24.33,0.0,54.0,19.48,90.0,129.84,0
1,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Hadwin,T22,22,111.0,-0.011,55.0,0.223,121.0,-0.055,132.0,-0.081,11.0,0.53,22.0,21.74,16.0,3.01,48.0,4.03,68.0,4.66,35.0,141.0,92.0,290.1,49.0,63.62,83.0,66.07,97.0,58.27,193.0,0.91,,70.6,378.67,22.27,4.0,53.6,18.54,5.0,7.21,0
2,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Scott,T55,55,1.0,2.062,20.0,0.522,1.0,1.491,83.0,0.049,129.0,-0.168,9.0,22.88,10.0,3.0,8.0,4.0,13.0,4.58,64.0,166.0,13.0,304.6,153.0,55.55,3.0,70.76,74.0,59.5,6.0,7.07,12.0,100.0,958.35,136.91,7.0,20.3,9.76,24.0,34.62,0
3,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Andrew Loupe,CUT,90,150.0,-0.395,105.0,-0.005,154.0,-0.281,158.0,-0.172,38.0,0.28,12.0,22.74,78.0,3.06,177.0,4.11,13.0,4.58,101.0,189.0,5.0,309.3,184.0,45.56,148.0,63.41,175.0,52.83,194.0,0.91,,62.5,482.82,30.18,2.0,53.9,19.02,,,0
4,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Anirban Lahiri,T6,6,97.0,0.088,95.0,0.025,90.0,0.143,130.0,-0.08,99.0,-0.02,107.0,19.73,136.0,3.09,97.0,4.05,155.0,4.74,154.0,232.0,82.0,291.7,150.0,56.31,161.0,62.88,138.0,56.4,92.0,1.63,100.0,66.7,181.0,20.11,3.0,55.4,24.06,,,1
5,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Ben Crane,T29,29,115.0,-0.028,117.0,-0.063,99.0,0.101,49.0,0.173,46.0,0.23,128.0,19.23,78.0,3.06,48.0,4.03,100.0,4.69,138.0,216.0,162.0,280.3,54.0,63.18,143.0,63.6,31.0,61.65,371.0,0.5,,53.3,282.0,18.8,1.0,59.5,21.46,71.0,102.43,0
6,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Ben Martin,T29,29,126.0,-0.117,79.0,0.079,101.0,0.081,169.0,-0.277,70.0,0.14,29.0,21.53,146.0,3.1,97.0,4.05,28.0,4.61,71.0,174.0,108.0,288.8,66.0,62.14,64.0,66.73,160.0,54.84,123.0,1.27,,64.3,257.05,18.36,0.0,58.9,21.75,10.0,14.43,0
7,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Bill Haas,T47,47,34.0,0.693,81.0,0.068,54.0,0.272,14.0,0.353,130.0,-0.174,172.0,17.89,22.0,3.02,68.0,4.04,100.0,4.69,55.0,159.0,97.0,289.6,62.0,62.34,57.0,67.14,12.0,63.3,44.0,2.87,40.0,76.9,912.0,70.15,1.0,35.4,13.41,,,0
8,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Blayne Barber,T34,34,97.0,0.088,98.0,0.018,94.0,0.127,121.0,-0.056,90.0,0.01,88.0,20.07,35.0,3.03,114.0,4.06,60.0,4.65,53.0,157.0,97.0,289.6,60.0,62.37,63.0,66.74,115.0,57.62,249.0,0.72,,37.5,260.34,16.27,0.0,71.1,25.1,,,0
9,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Boo Weekley,CUT,90,69.0,0.307,14.0,0.571,127.0,-0.09,159.0,-0.174,144.0,-0.225,52.0,20.88,102.0,3.07,97.0,4.05,24.0,4.6,11.0,100.0,84.0,291.5,16.0,66.98,41.0,67.87,94.0,58.33,202.0,0.88,80.0,58.8,398.55,23.44,5.0,56.5,19.55,33.0,47.61,0


In [43]:
# Play with different years to see if I notice anything wrong
training_df[training_df["SEASON"] == 2024]

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
993,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Aaron Rai,T32,32,9.0,1.131,36.0,0.303,8.0,0.676,56.0,0.152,83.0,0.054,99.0,22.10%,38.0,3.01,10.0,3.97,44.0,4.54,53.0,153.0,152.0,293.8,1.0,72.02%,7.0,71.58%,33.0,63.18%,21.0,3.2294,50.0,76.9,368.707,28.36,4.0,43.9,16.63,40.0,36.41,0
994,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Adam Schenk,W/D,90,146.0,-0.421,53.0,0.212,166.0,-0.520,130.0,-0.114,85.0,0.051,132.0,21.24%,152.0,3.09,131.0,4.04,116.0,4.61,102.0,180.0,82.0,302.4,98.0,59.93%,149.0,64.50%,74.0,60.50%,82.0,1.4461,80.0,73.7,568.417,29.92,0.0,46.8,15.62,63.3,32.53,0
995,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Adam Scott,T12,12,38.0,0.540,33.0,0.322,87.0,0.114,67.0,0.104,29.0,0.398,79.0,22.54%,38.0,3.01,57.0,4.00,44.0,4.54,75.0,170.0,49.0,307.1,121.0,58.28%,102.0,66.35%,45.0,62.09%,20.0,3.2584,60.0,83.3,577.232,48.10,0.0,34.4,13.41,52.0,75.02,1
996,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Adam Svensson,T24,24,33.0,0.592,99.0,0.019,28.0,0.380,39.0,0.193,164.0,-0.453,153.0,20.41%,142.0,3.08,85.0,4.01,121.0,4.62,62.0,158.0,118.0,298.3,40.0,64.18%,88.0,67.13%,60.0,61.39%,112.0,1.1671,100.0,75.0,275.228,13.76,6.0,49.3,16.19,40.0,57.71,0
997,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Akshay Bhatia,CUT,90,56.0,0.403,58.0,0.188,51.0,0.264,119.0,-0.048,34.0,0.345,87.0,22.35%,52.0,3.02,85.0,4.01,77.0,4.57,28.0,133.0,104.0,299.8,29.0,65.50%,114.0,66.02%,54.0,61.79%,29.0,2.7339,60.0,71.4,1143.376,54.45,0.0,41.5,13.43,56.0,80.79,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Vincent Norrman,T70,70,145.0,-0.420,52.0,0.214,142.0,-0.313,162.0,-0.322,177.0,-0.954,177.0,16.89%,114.0,3.06,177.0,4.14,147.0,4.65,42.0,144.0,22.0,311.0,122.0,58.27%,56.0,68.25%,175.0,51.79%,193.0,0.7219,300.0,46.2,50.800,3.91,0.0,73.7,27.93,48.0,69.25,0
1121,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Webb Simpson,T50,50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,305.0,0.4596,125.0,72.7,172.393,15.67,4.0,56.0,22.54,53.0,32.93,0
1122,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Wesley Bryan,CUT,90,,,,,,,,,,,,,,,,,,,,,,,,,,,,,264.0,0.5372,500.0,60.0,29.456,5.89,0.0,64.6,36.05,59.5,54.16,0
1123,2024,2024-05-26,Charles Schwab Challenge,Colonial Country Club,Zac Blair,CUT,90,105.0,-0.018,151.0,-0.282,93.0,0.083,42.0,0.181,118.0,-0.087,163.0,19.97%,52.0,3.02,85.0,4.01,163.0,4.69,118.0,190.0,174.0,283.6,16.0,67.21%,125.0,65.56%,36.0,62.67%,144.0,0.9649,600.0,52.9,91.811,5.40,1.0,69.0,23.87,78.3,56.48,0


### Normalization
***Fix Historical Odds***

Many names do not have historical odds but rather are part of the "field." So giving these "NaN" values the average odds (as I would for missing stats) isn't a good approximation of reality – they are typically the "field" because they are not notable and have poor odds individually.

There are also some crazy odds numbers occasionally (1000/1 or 3000/1) that don't happen all that much and are making that tail too long.

This cell assigns odds of 1000/1 for anyone missing and clips anything larger to 1000/1 to normalize the data better and assign the missing odds more appropriately where they belong.

***OWGR Adjustment***

The NaN's for OWGR are similarly bad players.  So we should assign these the worst ranking instead of the mean and clip it at 1000 just in case there are ever outliers.

For the OWGR score, it should match the lowest score in the dataset to assign that person the same as the worst.

***Recent Form Adjustment***

The NaN's for Recent Form mean that this player has not played any tournaments in the lookback period (9 months).  Similar to Odds and OWGR, I want to punish those that don't play often in my model.  Set these to 90 (i.e. like they miss a lot of cuts - because if they aren't playing they aren't good enough to make it to these tournaments very frequently).  The adj_form feature will also need to be updated here with the new data.

***FedEx Cup Point Adjustment***

The NaN's for FedEx Cup Points means there is no data.  Therefore they should be 0.

In [8]:
# === VEGAS ODDS ===
training_df["VEGAS_ODDS"] = training_df["VEGAS_ODDS"].fillna(1000).clip(upper=1000)

# === OWGR and OWGR_RANK ===
owgr_min = training_df["OWGR"].min(skipna=True)
training_df["OWGR"] = training_df["OWGR"].fillna(owgr_min)
training_df["OWGR_RANK"] = training_df["OWGR_RANK"].fillna(1000).astype(float).clip(upper=1000)

# === RECENT FORM and adj_form ===
training_df["RECENT_FORM"] = training_df["RECENT_FORM"].fillna(90)
if "TOTAL_EVENTS_PLAYED" in training_df.columns:
    training_df["adj_form"] = (
        training_df["RECENT_FORM"] / np.log1p(training_df["TOTAL_EVENTS_PLAYED"])
    ).round(2)

# === FEDEX CUP POINTS ===
training_df["FEDEX_CUP_POINTS"] = training_df["FEDEX_CUP_POINTS"].fillna(0)

training_df.head(10)


Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Aaron Baddeley,T55,55,146.0,-0.33,159.0,-0.366,164.0,-0.362,10.0,0.402,8.0,0.586,59.0,20.72,78.0,3.06,14.0,4.01,50.0,4.64,159.0,237.0,82.0,291.7,155.0,55.42,154.0,63.11,5.0,64.3,143.0,1.14,1000.0,60.0,365.0,24.33,0.0,54.0,19.48,90.0,129.84,0
1,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Hadwin,T22,22,111.0,-0.011,55.0,0.223,121.0,-0.055,132.0,-0.081,11.0,0.53,22.0,21.74,16.0,3.01,48.0,4.03,68.0,4.66,35.0,141.0,92.0,290.1,49.0,63.62,83.0,66.07,97.0,58.27,193.0,0.91,1000.0,70.6,378.67,22.27,4.0,53.6,18.54,5.0,7.21,0
2,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Scott,T55,55,1.0,2.062,20.0,0.522,1.0,1.491,83.0,0.049,129.0,-0.168,9.0,22.88,10.0,3.0,8.0,4.0,13.0,4.58,64.0,166.0,13.0,304.6,153.0,55.55,3.0,70.76,74.0,59.5,6.0,7.07,12.0,100.0,958.35,136.91,7.0,20.3,9.76,24.0,34.62,0
3,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Andrew Loupe,CUT,90,150.0,-0.395,105.0,-0.005,154.0,-0.281,158.0,-0.172,38.0,0.28,12.0,22.74,78.0,3.06,177.0,4.11,13.0,4.58,101.0,189.0,5.0,309.3,184.0,45.56,148.0,63.41,175.0,52.83,194.0,0.91,1000.0,62.5,482.82,30.18,2.0,53.9,19.02,,,0
4,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Anirban Lahiri,T6,6,97.0,0.088,95.0,0.025,90.0,0.143,130.0,-0.08,99.0,-0.02,107.0,19.73,136.0,3.09,97.0,4.05,155.0,4.74,154.0,232.0,82.0,291.7,150.0,56.31,161.0,62.88,138.0,56.4,92.0,1.63,100.0,66.7,181.0,20.11,3.0,55.4,24.06,,,1
5,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Ben Crane,T29,29,115.0,-0.028,117.0,-0.063,99.0,0.101,49.0,0.173,46.0,0.23,128.0,19.23,78.0,3.06,48.0,4.03,100.0,4.69,138.0,216.0,162.0,280.3,54.0,63.18,143.0,63.6,31.0,61.65,371.0,0.5,1000.0,53.3,282.0,18.8,1.0,59.5,21.46,71.0,102.43,0
6,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Ben Martin,T29,29,126.0,-0.117,79.0,0.079,101.0,0.081,169.0,-0.277,70.0,0.14,29.0,21.53,146.0,3.1,97.0,4.05,28.0,4.61,71.0,174.0,108.0,288.8,66.0,62.14,64.0,66.73,160.0,54.84,123.0,1.27,1000.0,64.3,257.05,18.36,0.0,58.9,21.75,10.0,14.43,0
7,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Bill Haas,T47,47,34.0,0.693,81.0,0.068,54.0,0.272,14.0,0.353,130.0,-0.174,172.0,17.89,22.0,3.02,68.0,4.04,100.0,4.69,55.0,159.0,97.0,289.6,62.0,62.34,57.0,67.14,12.0,63.3,44.0,2.87,40.0,76.9,912.0,70.15,1.0,35.4,13.41,,,0
8,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Blayne Barber,T34,34,97.0,0.088,98.0,0.018,94.0,0.127,121.0,-0.056,90.0,0.01,88.0,20.07,35.0,3.03,114.0,4.06,60.0,4.65,53.0,157.0,97.0,289.6,60.0,62.37,63.0,66.74,115.0,57.62,249.0,0.72,1000.0,37.5,260.34,16.27,0.0,71.1,25.1,,,0
9,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Boo Weekley,CUT,90,69.0,0.307,14.0,0.571,127.0,-0.09,159.0,-0.174,144.0,-0.225,52.0,20.88,102.0,3.07,97.0,4.05,24.0,4.6,11.0,100.0,84.0,291.5,16.0,66.98,41.0,67.87,94.0,58.33,202.0,0.88,80.0,58.8,398.55,23.44,5.0,56.5,19.55,33.0,47.61,0


#### Average the NaNs
At this point, the intentional offsetting of NaNs for poor players is over.  I don't want to penalize statistics or course history because you never know where they might land on the spectrum, so for these NaN values, we will take the mean.

In [9]:
# Some stats were not averaging correctly - try to force these to be numbers
stats_to_fix = ["SCRAMBLING", "DRIVING_ACCURACY", "BIRDIES", "GIR"]

for col in stats_to_fix:
    training_df[col] = pd.to_numeric(training_df[col], errors="coerce")

# Select only the numeric columns
numeric_columns = training_df.select_dtypes(include=['float64', 'int64'])

# Fill NaN values with the mean of each column
numeric_columns = numeric_columns.fillna(numeric_columns.mean()).round(decimals=0)

# Update the original dataframe with the filled numeric columns
training_df.update(numeric_columns)

training_df.head()

Unnamed: 0,SEASON,ENDING_DATE,TOURNAMENT,COURSE,PLAYER,POS,FINAL_POS,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,TOP_20
0,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Aaron Baddeley,T55,55,146.0,-0.0,159.0,-0.0,164.0,-0.0,10.0,0.0,8.0,1.0,59.0,21.0,78.0,3.0,14.0,4.0,50.0,5.0,159.0,237.0,82.0,292.0,155.0,55.0,154.0,63.0,5.0,64.0,143.0,1.0,1000.0,60.0,365.0,24.0,0.0,54.0,19.0,90.0,130.0,0
1,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Hadwin,T22,22,111.0,-0.0,55.0,0.0,121.0,-0.0,132.0,-0.0,11.0,1.0,22.0,22.0,16.0,3.0,48.0,4.0,68.0,5.0,35.0,141.0,92.0,290.0,49.0,64.0,83.0,66.0,97.0,58.0,193.0,1.0,1000.0,71.0,379.0,22.0,4.0,54.0,19.0,5.0,7.0,0
2,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Adam Scott,T55,55,1.0,2.0,20.0,1.0,1.0,1.0,83.0,0.0,129.0,-0.0,9.0,23.0,10.0,3.0,8.0,4.0,13.0,5.0,64.0,166.0,13.0,305.0,153.0,56.0,3.0,71.0,74.0,60.0,6.0,7.0,12.0,100.0,958.0,137.0,7.0,20.0,10.0,24.0,35.0,0
3,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Andrew Loupe,CUT,90,150.0,-0.0,105.0,-0.0,154.0,-0.0,158.0,-0.0,38.0,0.0,12.0,23.0,78.0,3.0,177.0,4.0,13.0,5.0,101.0,189.0,5.0,309.0,184.0,46.0,148.0,63.0,175.0,53.0,194.0,1.0,1000.0,62.0,483.0,30.0,2.0,54.0,19.0,53.0,52.0,0
4,2016,2016-05-29,DEAN & DELUCA Invitational,Colonial Country Club,Anirban Lahiri,T6,6,97.0,0.0,95.0,0.0,90.0,0.0,130.0,-0.0,99.0,-0.0,107.0,20.0,136.0,3.0,97.0,4.0,155.0,5.0,154.0,232.0,82.0,292.0,150.0,56.0,161.0,63.0,138.0,56.0,92.0,2.0,100.0,67.0,181.0,20.0,3.0,55.0,24.0,53.0,52.0,1


In [10]:
# Check for NaN values
assert training_df.isna().sum().sum() == 0, "🚨 Still missing values!"

In [11]:
# Inspect which columns have NaN values
training_df.isna().sum().sort_values(ascending=False).head(10)


SEASON         0
ENDING_DATE    0
TOURNAMENT     0
COURSE         0
PLAYER         0
POS            0
FINAL_POS      0
SGTTG_RANK     0
SGTTG          0
SGOTT_RANK     0
dtype: int64

### Check Features for Outliers and General Health (Histograms)
Now that all the data is cleaned, let's look at the distribution graphs.

In [17]:
import plotly.express as px
import plotly.subplots as sp
import pandas as pd
import numpy as np

# Select numeric columns with >1 unique value, excluding label
df_numeric = training_df.select_dtypes(include=np.number)
df_numeric = df_numeric.loc[:, df_numeric.nunique() > 1]
df_numeric = df_numeric.drop(columns=["TOP_20"], errors="ignore")

# Create subplots
n_cols = 3
n_rows = int(np.ceil(len(df_numeric.columns) / n_cols))
fig = sp.make_subplots(rows=n_rows, cols=n_cols, subplot_titles=df_numeric.columns)

# Add one histogram per feature
for i, column in enumerate(df_numeric.columns):
    row = i // n_cols + 1
    col = i % n_cols + 1
    fig.add_trace(
        px.histogram(df_numeric, x=column, nbins=1000, histnorm="density", marginal="rug").data[0],
        row=row,
        col=col
    )

# Style the figure
fig.update_layout(
    height=300 * n_rows,
    width=1000,
    title_text="Feature Distributions",
    template="plotly_dark",  # Automatically dark and high-DPI
    showlegend=False
)

fig.show()

# Current Week Data

## Import 2025 Field
1. **Important:**  Save curren't week's `DKSalaries.csv` into Data.

In [20]:
importlib.reload(utils.db_utils)
from utils.db_utils import DK_PLAYER_NAME_MAP

# Load DraftKings player list
dk = pd.read_csv("data/DKSalaries.csv", usecols=["Name", "Salary"])

# Normalize DK names to PGA naming convention
dk["Name"] = dk["Name"].replace(DK_PLAYER_NAME_MAP)

# Rename for consistency
dk = dk.rename(columns={"Name": "PLAYER", "Salary": "SALARY"})

dk.head(10)

Unnamed: 0,PLAYER,SALARY
0,Scottie Scheffler,13700
1,Tommy Fleetwood,10000
2,Daniel Berger,9900
3,Jordan Spieth,9800
4,Hideki Matsuyama,9600
5,Maverick McNealy,9500
6,J.T. Poston,9400
7,Robert Macintyre,9300
8,Harris English,9200
9,Aaron Rai,9100


## Vegas Odds
Confirm the [Vegas Odds](http://golfodds.com/weekly-odds.html) page is updated with current tournament.

Note: sometimes this site can give errors and not load correctly.  Just have to do it again later.
Another trick to try is change the url try to run it and then put the url back the way it was.  Worked once, maybe random coincidence.

In [25]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_current_week_odds

odds_current = get_current_week_odds(season=2025, tournament_name=new_tournament_name)
odds_current.head(10)


Unnamed: 0,SEASON,TOURNAMENT,PLAYER,ODDS,VEGAS_ODDS
1,2025,Charles Schwab Challenge,Scottie Scheffler,9/4,2.25
2,2025,Charles Schwab Challenge,Daniel Berger,20/1,20.0
3,2025,Charles Schwab Challenge,Tommy Fleetwood,25/1,25.0
4,2025,Charles Schwab Challenge,Hideki Matsuyama,25/1,25.0
5,2025,Charles Schwab Challenge,Jordan Spieth,25/1,25.0
6,2025,Charles Schwab Challenge,Maverick McNealy,30/1,30.0
7,2025,Charles Schwab Challenge,Aaron Rai,30/1,30.0
8,2025,Charles Schwab Challenge,Harris English,35/1,35.0
9,2025,Charles Schwab Challenge,Si Woo Kim,40/1,40.0
10,2025,Charles Schwab Challenge,J.T. Poston,40/1,40.0


## Cut Percentage and FedEx Points

In [34]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_cut_and_fedex_history

cuts_rolling = get_cut_and_fedex_history(db_path, this_week_history, window_months=9)
cuts_current = cuts_rolling[this_week_key].copy()
cuts_current.head(10)

Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,CUTS_MADE,FEDEX_CUP_POINTS,CUT_PERCENTAGE,form_density,CONSECUTIVE_CUTS,ENDING_DATE,TOURNAMENT
0,Aaron Baddeley,15,7,55.951,46.7,3.73,0,2025-05-25,Charles Schwab Challenge
1,Aaron Rai,14,12,785.48,85.7,56.11,4,2025-05-25,Charles Schwab Challenge
2,Adam Hadwin,16,10,192.713,62.5,12.04,0,2025-05-25,Charles Schwab Challenge
3,Adam Long,1,0,0.0,0.0,0.0,0,2025-05-25,Charles Schwab Challenge
4,Adam Schenk,17,7,253.954,41.2,14.94,1,2025-05-25,Charles Schwab Challenge
5,Adam Scott,11,9,422.096,81.8,38.37,3,2025-05-25,Charles Schwab Challenge
6,Adam Svensson,18,10,164.158,55.6,9.12,0,2025-05-25,Charles Schwab Challenge
7,Adrien Dumont de Chassart,8,3,200.5,37.5,25.06,0,2025-05-25,Charles Schwab Challenge
8,Akshay Bhatia,14,11,931.99,78.6,66.57,0,2025-05-25,Charles Schwab Challenge
9,Aldrich Potgieter,10,3,355.0,30.0,35.5,0,2025-05-25,Charles Schwab Challenge


## Recent Form

In [35]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_recent_avg_finish

recent_form = get_recent_avg_finish(db_path, this_week_history, window_months=9)
recent_form_current = recent_form[this_week_key].copy()

recent_form_current.head(10)

Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,RECENT_FORM,adj_form,ENDING_DATE,TOURNAMENT
0,Bryson DeChambeau,2,3.5,3.19,2025-05-25,Charles Schwab Challenge
1,Scottie Scheffler,12,7.4,2.89,2025-05-25,Charles Schwab Challenge
2,Jon Rahm,2,11.0,10.01,2025-05-25,Charles Schwab Challenge
3,John Keefer,1,13.0,18.76,2025-05-25,Charles Schwab Challenge
4,Bubba Watson,1,14.0,20.2,2025-05-25,Charles Schwab Challenge
5,Ian Gilligan,1,16.0,23.08,2025-05-25,Charles Schwab Challenge
6,Taisei Shimizu,1,16.0,23.08,2025-05-25,Charles Schwab Challenge
7,Rory McIlroy,9,18.0,7.82,2025-05-25,Charles Schwab Challenge
8,Joaquin Niemann,2,18.5,16.84,2025-05-25,Charles Schwab Challenge
9,Tommy Fleetwood,11,20.5,8.25,2025-05-25,Charles Schwab Challenge


## Course History

In [36]:
importlib.reload(utils.db_utils)
from utils.db_utils import get_course_history

course_hist = get_course_history(db_path, this_week_history, lookback_years=7)
course_hist_current = course_hist[this_week_key].copy()

course_hist_current.head(10)


Unnamed: 0,PLAYER,TOTAL_EVENTS_PLAYED,COURSE_HISTORY,adj_ch,ENDING_DATE,COURSE,TOURNAMENT
0,Aaron Baddeley,3,53.0,38.23,2025-05-25,Colonial Country Club,Charles Schwab Challenge
1,Aaron Rai,3,37.3,26.91,2025-05-25,Colonial Country Club,Charles Schwab Challenge
2,Aaron Wise,1,90.0,129.84,2025-05-25,Colonial Country Club,Charles Schwab Challenge
3,Abraham Ancer,5,45.6,25.45,2025-05-25,Colonial Country Club,Charles Schwab Challenge
4,Adam Hadwin,3,34.3,24.74,2025-05-25,Colonial Country Club,Charles Schwab Challenge
5,Adam Long,5,50.8,28.35,2025-05-25,Colonial Country Club,Charles Schwab Challenge
6,Adam Schenk,7,67.1,32.27,2025-05-25,Colonial Country Club,Charles Schwab Challenge
7,Adam Scott,2,32.0,29.13,2025-05-25,Colonial Country Club,Charles Schwab Challenge
8,Adam Svensson,2,32.0,29.13,2025-05-25,Colonial Country Club,Charles Schwab Challenge
9,Akshay Bhatia,2,73.0,66.45,2025-05-25,Colonial Country Club,Charles Schwab Challenge


## Merged Dataframe

In [40]:
importlib.reload(utils.db_utils)
from utils.db_utils import load_all_stats  # New helper function

# === Load current season stats directly from database ===
season = tournament_config["new"]["season"]
stats_df = load_all_stats(db_path)
stats_df = stats_df[stats_df["SEASON"] == season].copy()

# === Standardize PLAYER columns for consistent merges ===
for df in [stats_df, odds_current, cuts_current, recent_form_current, course_hist_current, dk]:
    df["PLAYER"] = df["PLAYER"].astype(str).str.strip()

# === Merge all engineered and DK features ===
this_week = stats_df.copy()
this_week = this_week.merge(odds_current[["PLAYER", "VEGAS_ODDS"]], on="PLAYER", how="left")
this_week = this_week.merge(cuts_current[["PLAYER", "CUT_PERCENTAGE", "FEDEX_CUP_POINTS", "form_density", "CONSECUTIVE_CUTS"]], on="PLAYER", how="left")
this_week = this_week.merge(recent_form_current[["PLAYER", "RECENT_FORM", "adj_form"]], on="PLAYER", how="left")
this_week = this_week.merge(course_hist_current[["PLAYER", "COURSE_HISTORY", "adj_ch"]], on="PLAYER", how="left")
this_week = this_week.merge(dk[["PLAYER", "SALARY"]], on="PLAYER", how="right")  # keep only DK players

# === Final cleanup ===
this_week = this_week.sort_values("PLAYER").reset_index(drop=True)
this_week["SEASON"] = this_week["SEASON"].astype(float)  # allows averaging if needed

this_week.head()

Unnamed: 0,SEASON,PLAYER,SGTTG_RANK,SGTTG,SGOTT_RANK,SGOTT,SGAPR_RANK,SGAPR,SGATG_RANK,SGATG,SGP_RANK,SGP,BIRDIES_RANK,BIRDIES,PAR_3_RANK,PAR_3,PAR_4_RANK,PAR_4,PAR_5_RANK,PAR_5,TOTAL_DRIVING_RANK,TOTAL_DRIVING,DRIVING_DISTANCE_RANK,DRIVING_DISTANCE,DRIVING_ACCURACY_RANK,DRIVING_ACCURACY,GIR_RANK,GIR,SCRAMBLING_RANK,SCRAMBLING,OWGR_RANK,OWGR,VEGAS_ODDS,CUT_PERCENTAGE,FEDEX_CUP_POINTS,form_density,CONSECUTIVE_CUTS,RECENT_FORM,adj_form,COURSE_HISTORY,adj_ch,SALARY
0,2025.0,Aaron Rai,21.0,0.831,35.0,0.33,20.0,0.513,107.0,-0.012,96.0,0.004,27.0,23.86%,179.0,3.19,6.0,3.95,25.0,4.53,75.0,170.0,169.0,286.7,1.0,73.13%,16.0,69.57%,131.0,58.51%,28.0,2.7748,30.0,85.7,785.48,56.11,4.0,31.3,11.56,37.3,26.91,9100
1,2025.0,Adam Schenk,79.0,0.135,49.0,0.24,90.0,0.017,129.0,-0.122,104.0,-0.045,93.0,21.90%,61.0,3.05,127.0,4.05,40.0,4.55,129.0,207.0,41.0,306.7,166.0,52.06%,162.0,62.38%,66.0,61.60%,143.0,0.9231,250.0,41.2,253.954,14.94,1.0,66.3,22.94,67.1,32.27,6500
2,2025.0,Adam Svensson,70.0,0.191,110.0,-0.021,108.0,-0.049,32.0,0.261,159.0,-0.472,160.0,19.51%,85.0,3.06,115.0,4.04,96.0,4.61,102.0,189.0,160.0,291.5,29.0,64.70%,50.0,67.31%,91.0,60.47%,185.0,0.7574,200.0,55.6,164.158,9.12,0.0,63.1,21.43,32.0,29.13,6600
3,2025.0,Akshay Bhatia,96.0,0.012,118.0,-0.047,46.0,0.326,153.0,-0.267,9.0,0.614,6.0,25.66%,10.0,2.98,22.0,3.98,160.0,4.69,140.0,212.0,157.0,292.9,55.0,61.56%,54.0,67.20%,156.0,56.05%,30.0,2.694,60.0,78.6,931.99,66.57,0.0,37.4,13.81,73.0,66.45,8000
4,2025.0,Aldrich Potgieter,136.0,-0.325,8.0,0.544,151.0,-0.399,172.0,-0.47,58.0,0.188,108.0,21.33%,61.0,3.05,166.0,4.08,170.0,4.71,69.0,166.0,1.0,323.9,165.0,52.11%,144.0,63.44%,174.0,52.94%,142.0,0.9265,250.0,30.0,355.0,35.5,0.0,69.4,28.94,,,6400


### Dataframe Normalization

Run the same normalization techniques that I ran on the historical data:
- Any NaN Odds go to 1000/1 and clip all values at 1000/1
- OWGR NaN values go to 1000
- Recent form NaN values act like a MC at 90
- Average everything else.

In [46]:
# === Force critical stat columns to numeric (some may be strings) ===
stats_to_fix = ["SCRAMBLING", "DRIVING_ACCURACY", "BIRDIES", "GIR"]

for col in stats_to_fix:
    if col in this_week.columns:
        this_week[col] = pd.to_numeric(this_week[col], errors="coerce")

# === Normalize & Assign Odds for Poor or Unknown Players ===
this_week["VEGAS_ODDS"] = this_week["VEGAS_ODDS"].fillna(1000).clip(upper=1000)

# === Normalize & Assign OWGR & OWGR_RANK for Unranked Players ===
# OWGR (raw) — assign worst actual rank in dataset
if "OWGR" in this_week.columns:
    max_owgr = this_week["OWGR"].dropna().max()
    this_week["OWGR"] = this_week["OWGR"].fillna(max_owgr).astype(float).clip(upper=1000)

# OWGR_RANK — if present, use 1000 for missing (very poor)
if "OWGR_RANK" in this_week.columns:
    this_week["OWGR_RANK"] = this_week["OWGR_RANK"].fillna(1000).clip(upper=1000)

# === Assign RECENT FORM Score for Players Who Haven’t Played Recently ===
this_week["RECENT_FORM"] = this_week["RECENT_FORM"].fillna(90)

# === Assign 0 to FEDEX_CUP_POINTS Where No Data ===
this_week["FEDEX_CUP_POINTS"] = this_week["FEDEX_CUP_POINTS"].fillna(0)

# === Course History NaNs: Fill with Mean (not 0 — 0 implies bad, not missing) ===
if "COURSE_HISTORY" in this_week.columns:
    this_week["COURSE_HISTORY"] = this_week["COURSE_HISTORY"].fillna(this_week["COURSE_HISTORY"].mean())

# === Fill Remaining NaNs in Numeric Columns with Column Mean ===
numeric_cols = this_week.select_dtypes(include=["float64", "int64"])
numeric_filled = numeric_cols.fillna(numeric_cols.mean()).round(decimals=0)
this_week.update(numeric_filled)



In [47]:
# === Final Check: No NaNs Should Remain ===
assert this_week.isna().sum().sum() == 0, "🚨 Still missing values in prediction set!"

AssertionError: 🚨 Still missing values in prediction set!

In [48]:
# === Check for Remaining Missing Values ===
missing_summary = this_week.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

print("🧹 Columns still containing NaN values:")
display(missing_summary)

🧹 Columns still containing NaN values:


BIRDIES             136
DRIVING_ACCURACY    136
GIR                 136
SCRAMBLING          136
dtype: int64