In [2]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Resolve the dataset location relative to the directory the notebook runs from.
# os.path.join builds the path with the platform's separator instead of a hard-coded '/'.
current_directory = os.getcwd()
data_path = os.path.join(current_directory, 'datasets', 'df_prediction.csv')

# Load the player / month sentiment table used in the rest of the notebook.
df = pd.read_csv(data_path)

In [4]:
# Preview the first rows (up to 40) of the raw dataset to inspect its columns and gaps
df.head(40)

Unnamed: 0,player,year_month,sent_pga,pga_lag1,pga_lag2,pga_lag3,sent_liv,sent_both,LIV_CHANGE
0,brooks koepka,2020-11-01,0.9584,,,,0.6517,,0
1,brooks koepka,2021-02-01,0.7906,0.9584,,,0.83925,,0
2,brooks koepka,2021-04-01,0.670867,0.7906,0.9584,,,,0
3,brooks koepka,2021-06-01,0.851413,0.670867,0.7906,0.9584,0.1026,0.9623,0
4,brooks koepka,2021-07-01,0.423,0.851413,0.670867,0.7906,,,0
5,brooks koepka,2021-09-01,,,,,0.8658,,0
6,brooks koepka,2021-10-01,0.972567,0.423,0.851413,0.670867,0.6325,,0
7,brooks koepka,2022-02-01,0.7532,0.972567,0.423,0.851413,0.78875,,0
8,brooks koepka,2022-06-01,0.466633,0.7532,0.972567,0.423,0.89225,,1
9,brooks koepka,2022-09-01,,,,,0.7579,,0


In [7]:
# Fraction of missing values in each column
df.isna().mean()

player        0.000000
year_month    0.000000
sent_pga      0.200772
pga_lag1      0.243243
pga_lag2      0.285714
pga_lag3      0.328185
sent_liv      0.281853
sent_both     0.679537
LIV_CHANGE    0.000000
dtype: float64

In [5]:
# Fill gaps in the sentiment columns one player at a time, so that values are
# never borrowed from a different player's rows.

# Current-month sentiment: linear interpolation between a player's surrounding observations.
for col in ('sent_pga', 'sent_liv'):
    df[col] = df.groupby('player')[col].transform(lambda s: s.interpolate(method='linear'))

# Lagged PGA sentiment: carry the last known value forward, then back-fill what is still missing.
for col in ('pga_lag1', 'pga_lag2'):
    df[col] = df.groupby('player')[col].transform(lambda s: s.ffill().bfill())

# NOTE(review): the lag columns are filled independently of the interpolated sent_pga,
# so a filled lag may no longer equal the previous row's sent_pga, and the back-fill
# copies later values into earlier rows — confirm this is intended.


In [6]:
# Fill whatever is still missing in the sentiment columns: within each player,
# forward-fill first and then back-fill the remaining gaps.
for col in ('sent_pga', 'sent_liv'):
    df[col] = df.groupby('player')[col].transform(lambda s: s.ffill().bfill())

In [9]:
# Remove sent_both and pga_lag3 from the feature table.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and the call is a no-op instead of raising KeyError.
df = df.drop(columns=['sent_both', 'pga_lag3'], errors='ignore')

# Idea: mark all LIV players with 1, then try to predict with a random forest and a logistic regression whether a player is LIV.

In [10]:
# Display the table after the sent_both and pga_lag3 columns have been dropped
df

Unnamed: 0,player,year_month,sent_pga,pga_lag1,pga_lag2,sent_liv,LIV_CHANGE
0,brooks koepka,2020-11-01,0.958400,0.958400,0.958400,0.651700,0
1,brooks koepka,2021-02-01,0.790600,0.958400,0.958400,0.839250,0
2,brooks koepka,2021-04-01,0.670867,0.790600,0.958400,0.470925,0
3,brooks koepka,2021-06-01,0.851413,0.670867,0.790600,0.102600,0
4,brooks koepka,2021-07-01,0.423000,0.851413,0.670867,0.484200,0
...,...,...,...,...,...,...,...
254,sergio garcia,2023-04-01,0.714000,0.732300,0.375300,0.607133,0
255,sergio garcia,2023-05-01,0.870600,0.557400,0.732300,0.992900,0
256,sergio garcia,2023-06-01,0.870600,0.557400,0.732300,0.517933,0
257,sergio garcia,2023-08-01,0.870600,0.557400,0.732300,0.970775,0


In [13]:
# Separate the target (LIV_CHANGE) from the remaining columns.
target_column = 'LIV_CHANGE'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [18]:
# One-hot encode the player name; every indicator column is named "player_<name>".
player_column = 'player'
dummies = pd.get_dummies(X[player_column], prefix=player_column)

# Append the indicator columns to the right of the existing columns
# (the original 'player' column is kept as well).
X_with_dummies = pd.concat((X, dummies), axis='columns')

## Split Data

In [20]:
# Hold out 20% of the rows for validation.
# Stratify on Y itself (the label vector being split) rather than re-reading the
# column from df, so the stratification labels cannot drift from the split labels.
# The fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_with_dummies,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=42,
)

In [21]:
# Column groups: identifier columns that are not scaled vs. the numeric sentiment features.
# The one-hot player columns belong to neither group.
id_columns = ['player', 'year_month']
feature_columns = ['sent_pga', 'pga_lag1', 'pga_lag2', 'sent_liv']

train_categorical, train_numerical = X_train[id_columns], X_train[feature_columns]
val_categorical, val_numerical = X_val[id_columns], X_val[feature_columns]


# Need to separate Player and Date for the scaling

In [16]:
## SCALING
# StandardScaler is imported with the other dependencies at the top of the notebook.

# Fit the scaler on the training features only and transform them in the same step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_numerical)

# Apply the statistics learned from the training rows to the validation features,
# so nothing from the validation split influences the scaling.
X_val_scaled = scaler.transform(val_numerical)


In [27]:
# Fit a logistic regression with default settings on the scaled training features.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Per-class probabilities for the validation rows.
test_predictions_prob = model.predict_proba(X_val_scaled)

# Hard class labels for the same validation rows.
test_predictions = model.predict(X_val_scaled)

# NOTE(review): the displayed probabilities for the positive class are all very small,
# which suggests a strongly imbalanced target — consider class weighting and
# evaluating with recall / ROC AUC rather than accuracy; confirm against the label counts.


In [29]:
# Inspect the predicted class probabilities for the validation rows (one row per sample)
test_predictions_prob

array([[0.98418905, 0.01581095],
       [0.97367155, 0.02632845],
       [0.98240136, 0.01759864],
       [0.97851568, 0.02148432],
       [0.99018218, 0.00981782],
       [0.97738981, 0.02261019],
       [0.97597776, 0.02402224],
       [0.98976746, 0.01023254],
       [0.98218514, 0.01781486],
       [0.91361923, 0.08638077],
       [0.98746171, 0.01253829],
       [0.97936133, 0.02063867],
       [0.9777406 , 0.0222594 ],
       [0.989352  , 0.010648  ],
       [0.97806749, 0.02193251],
       [0.98733863, 0.01266137],
       [0.98186341, 0.01813659],
       [0.98095062, 0.01904938],
       [0.98694754, 0.01305246],
       [0.98187841, 0.01812159],
       [0.98052652, 0.01947348],
       [0.98048114, 0.01951886],
       [0.9806401 , 0.0193599 ],
       [0.97731878, 0.02268122],
       [0.98268334, 0.01731666],
       [0.98947625, 0.01052375],
       [0.98609901, 0.01390099],
       [0.98518518, 0.01481482],
       [0.97421738, 0.02578262],
       [0.98988115, 0.01011885],
       [0.

In [30]:
# Next: fit a Random Forest classifier and inspect the feature importances.
