# Place Classification

- Goal: Identify PGA golfers who have the best chance of winning or of finishing in the top 5, 10, or 20 of a particular tournament, based on past performance, the course, the field, and weather data.

In [19]:
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pandas as pd
import numpy as np

# Location of the SQLite database file read by this notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DB_PATH = '/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Data/PGA_SQL_DB/PGA.db'

# Connection handle passed to pandas when running SQL queries.
conn = sqlite3.connect(DB_PATH)

## Historical Data

In [20]:
# Load every row of RAW_TOURNAMENT_ROUNDS_V5 plus two columns derived in SQL:
#   SG_TOTAL - sum of the four strokes-gained components. It is NULL whenever
#              any component is NULL, which shows up as NaN in pandas.
#   FINISH   - finish class label derived from POSITION. A NULL POSITION fails
#              every WHEN comparison and therefore lands in the ELSE branch
#              ('CUT').
# String literals are single-quoted: in SQLite double quotes denote identifiers
# and are only treated as strings through a legacy fallback that can be turned
# off, and that would silently resolve to a column if one had a matching name.
tournament_query = """SELECT *,
    (SG_PUTT + SG_AROUND_THE_GREEN + SG_APPROACH + SG_OFF_THE_TEE) AS SG_TOTAL,
    (CASE
        WHEN POSITION = 1 THEN 'WIN'
        WHEN POSITION <= 5 THEN 'TOP 5'
        WHEN POSITION <= 10 THEN 'TOP 10'
        WHEN POSITION <= 20 THEN 'TOP 20'
        WHEN POSITION IS NOT NULL THEN 'MADE CUT'
        ELSE 'CUT'
    END) AS FINISH
FROM RAW_TOURNAMENT_ROUNDS_V5
ORDER BY TOURNAMENT_NAME DESC, TOTAL_SCORE"""

# Rows come back sorted by tournament name (descending), then by total score.
tournament_df = pd.read_sql_query(tournament_query, conn)
tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,R1_SCORE,R2_SCORE,R3_SCORE,R4_SCORE,TOTAL_SCORE,POSITION,EARNINGS,FEDEX_PTS,SG_TOTAL,FINISH
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,64,64,65.0,67.0,260,1.0,1530000.0,,3.63700,WIN
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,66,64,65.0,66.0,261,2.0,748000.0,,3.38650,TOP 5
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,64,68,64.0,65.0,261,2.0,748000.0,,3.38575,TOP 5
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,68,64,67.0,64.0,263,4.0,408000.0,,2.88600,TOP 5
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,69,68,63.0,65.0,265,5.0,340000.0,,2.38525,TOP 5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11768,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,78,72,73.0,76.0,299,50.0,30140.0,9.0,,MADE CUT
11769,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,79,70,73.0,78.0,300,51.0,28600.0,9.0,,MADE CUT
11770,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,74,76,79.0,76.0,305,52.0,27720.0,8.0,,MADE CUT
11771,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,72,75,83.0,78.0,308,53.0,27060.0,7.0,,MADE CUT


## Distribution of classes

In [21]:
# Number of rows in the full data set; used as the denominator for the
# percentage column below.
total_rows = tournament_df.shape[0]

# Rows per FINISH class, most frequent first. The series and its index are
# named explicitly so the 'count' column lookup below does not depend on the
# pandas version (pandas < 2.0 names the value_counts() result after the
# source column instead of 'count').
individual_counts = (
    tournament_df['FINISH']
    .value_counts()
    .rename('count')
    .rename_axis('FINISH')
)

# One row per class: raw count plus its share of all rows, in percent.
class_distribution_df = individual_counts.to_frame()
class_distribution_df['Percent (%)'] = (class_distribution_df['count'] / total_rows) * 100
class_distribution_df

Unnamed: 0_level_0,count,Percent (%)
FINISH,Unnamed: 1_level_1,Unnamed: 2_level_1
CUT,5452,46.309352
MADE CUT,4332,36.796059
TOP 20,958,8.137263
TOP 10,517,4.391404
TOP 5,426,3.618449
WIN,88,0.747473


- Classes are extremely imbalanced, as expected.

## Feature Engineering

- Historical Performance: 
    - Strokes Gained & Round Scores
    - Exponentially weighted moving average (Same as from regression feature engineering)
    - Should find a way to normalize these values by the field (subtract the average for the tournament)
- EARNINGS/FEDEX_PTS:
    - Amateurs will have null earnings and 0 fedex_pts
    - Winnings over a specific time period
    - Winnings per tournament
- Number of cuts made
    - Strokes gained data and round 3/4 scores are only available if the golfer made the cut
    - Include other classes as well?
- Field:
    - Average performance of the field
    - All golfers input into the field?
- Weather:
    - Use the same round scores as previously. (May need to generalize to the tournament level by averaging if correlation is a problem)