In [47]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

In [48]:
# Locate the dataset relative to the directory the notebook kernel runs in
current_directory = os.getcwd()
dataset_path = current_directory + '/datasets/df_prediction.csv'

# Load the prediction dataset into the working frame
df = pd.read_csv(dataset_path)

In [49]:
# Review the dataset exactly as loaded from the CSV; the notebook renders a
# truncated view (first and last rows) of the frame.
df

Unnamed: 0,player,year_month,sent_pga,pga_lag1,pga_lag2,pga_lag3,sent_liv,sent_both,LIV_CHANGE
0,brooks koepka,2020-11-01,0.958400,,,,0.651700,,0
1,brooks koepka,2021-02-01,0.790600,0.958400,,,0.839250,,0
2,brooks koepka,2021-04-01,0.670867,0.790600,0.958400,,,,0
3,brooks koepka,2021-06-01,0.851413,0.670867,0.790600,0.9584,0.102600,0.96230,0
4,brooks koepka,2021-07-01,0.423000,0.851413,0.670867,0.7906,,,0
...,...,...,...,...,...,...,...,...,...
254,sergio garcia,2023-04-01,,,,,0.607133,0.83260,0
255,sergio garcia,2023-05-01,0.870600,0.557400,0.732300,0.3753,0.992900,0.99065,0
256,sergio garcia,2023-06-01,,,,,0.517933,0.90610,0
257,sergio garcia,2023-08-01,,,,,0.970775,0.92170,0


In [50]:
# Fraction of missing values in each column (0.0 = complete, 1.0 = all missing)
na_share_raw = df.isna().mean()
na_share_raw

player        0.000000
year_month    0.000000
sent_pga      0.200772
pga_lag1      0.243243
pga_lag2      0.285714
pga_lag3      0.328185
sent_liv      0.281853
sent_both     0.679537
LIV_CHANGE    0.000000
dtype: float64

In [51]:
# Impute missing values separately for each player so that no value is
# carried over from one player's rows into another's.

# Sentiment columns: linear interpolation between the player's observed values.
for sentiment_col in ('sent_pga', 'sent_liv'):
    df[sentiment_col] = (
        df.groupby('player')[sentiment_col]
          .transform(lambda series: series.interpolate(method='linear'))
    )

# Lag columns: forward-fill, then backward-fill, within each player.
# NOTE(review): the lag columns are filled independently of `sent_pga`, so after
# imputation a row's `pga_lag1` need not equal the previous row's `sent_pga`
# -- confirm this is intended.
for lag_col in ('pga_lag1', 'pga_lag2', 'pga_lag3'):
    df[lag_col] = (
        df.groupby('player')[lag_col]
          .transform(lambda series: series.ffill().bfill())
    )


In [60]:
# Fill whatever is still missing in the sentiment columns, per player:
# forward-fill first, then backward-fill for gaps at the start of a player's rows.
for sentiment_col in ('sent_pga', 'sent_liv'):
    df[sentiment_col] = (
        df.groupby('player')[sentiment_col]
          .transform(lambda series: series.ffill().bfill())
    )

In [65]:
# Inspect the frame after the per-player imputation steps.
# NOTE(review): the rendered output no longer contains `sent_both`, yet no cell
# shown in this notebook drops it -- presumably removed by a cell that was
# deleted or run out of order; confirm and make the drop explicit.
df

Unnamed: 0,player,year_month,sent_pga,pga_lag1,pga_lag2,pga_lag3,sent_liv,LIV_CHANGE
0,brooks koepka,2020-11-01,0.958400,0.958400,0.958400,0.9584,0.651700,0
1,brooks koepka,2021-02-01,0.790600,0.958400,0.958400,0.9584,0.839250,0
2,brooks koepka,2021-04-01,0.670867,0.790600,0.958400,0.9584,0.470925,0
3,brooks koepka,2021-06-01,0.851413,0.670867,0.790600,0.9584,0.102600,0
4,brooks koepka,2021-07-01,0.423000,0.851413,0.670867,0.7906,0.484200,0
...,...,...,...,...,...,...,...,...
254,sergio garcia,2023-04-01,0.714000,0.732300,0.375300,0.8704,0.607133,0
255,sergio garcia,2023-05-01,0.870600,0.557400,0.732300,0.3753,0.992900,0
256,sergio garcia,2023-06-01,0.870600,0.557400,0.732300,0.3753,0.517933,0
257,sergio garcia,2023-08-01,0.870600,0.557400,0.732300,0.3753,0.970775,0


In [63]:
# Re-check the fraction of missing values per column; every entry should be 0.0
# before the data is handed to the model.
na_share_imputed = df.isna().mean()
na_share_imputed

player        0.0
year_month    0.0
sent_pga      0.0
pga_lag1      0.0
pga_lag2      0.0
pga_lag3      0.0
sent_liv      0.0
LIV_CHANGE    0.0
dtype: float64

## Split Data

In [81]:
# Build the feature matrix from the numeric predictors only. `player` and
# `year_month` are string identifiers in the frame; StandardScaler and
# LogisticRegression cannot convert them to float, so leaving them in makes the
# scaling/fitting cells fail. The row index is preserved, so identifiers can
# still be recovered later with `df.loc[X_val.index, ['player', 'year_month']]`.
target = df['LIV_CHANGE']
features = df.drop(columns=['LIV_CHANGE']).select_dtypes(include='number')

# 80/20 split, stratified on the target so both parts keep the same class
# balance; the fixed random_state makes the split reproducible.
# NOTE(review): rows of the same player can land in both parts -- if LIV_CHANGE
# is (near-)constant per player, consider a group-aware split; confirm.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.20,
    stratify=target,
    random_state=42,
)

In [85]:
## SCALING
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# so that no statistics from the validation split influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the training-set statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [91]:
# Fit a logistic-regression classifier on the standardised training features
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict on the *scaled* matrices. The model was fitted on standardised inputs,
# so passing the raw X_train / X_val would apply the learned coefficients to
# features on a different scale and give meaningless predictions.
# (`test_*` names are kept for compatibility; they hold validation-split results.)
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_val_scaled)

# Class-membership probabilities, one column per class in `model.classes_` order
train_predictions_prob = model.predict_proba(X_train_scaled)
test_predictions_prob = model.predict_proba(X_val_scaled)


