In [1124]:
# Execute the shared utilities notebook inside this kernel's namespace.
# NOTE(review): `project_dir` (used below to build `data_dir`) is never defined in
# this notebook — presumably it is created by utils.ipynb; confirm.
%run ../utils.ipynb

In [1125]:
import os
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [1126]:
simplefilter(action='ignore', category=FutureWarning)

In [1127]:
# Directory holding the matchup summary CSVs. Built with os.path.join so the
# separators match the host OS instead of being hard-coded Windows backslashes.
# The trailing "" keeps a trailing separator, because later cells append the
# file name with plain string concatenation (`data_dir + '<file>.csv'`).
data_dir = os.path.join(project_dir, "data", "matchup_summary", "")

In [1128]:
# Load the weekly position-scoring table that also carries each matchup's result.
matchup_csv = data_dir + 'weekly_position_scoring_with_matchup_result.csv'
df = pd.read_csv(matchup_csv)

### Select the season

In [1129]:
# Keep only rows whose `season_week` label mentions 2023.
# `.copy()` detaches the result from the full-history frame: the following cells
# add new columns to `df`, and assigning into a filtered slice would otherwise
# raise pandas' SettingWithCopyWarning.
df = df[df['season_week'].str.contains('2023')].copy()

### Manufacture new columns adding the guaranteed WR and RB position points

In [1130]:
# Combined score of the two guaranteed wide-receiver slots.
df['WR1WR2_points'] = df['WR1_points'].add(df['WR2_points'])

In [1131]:
# Combined score of the two guaranteed running-back slots.
df['RB1RB2_points'] = df['RB1_points'].add(df['RB2_points'])

### Create W/R/T position dummies

In this case, I wanted to interpret the LogReg coefficients relative to starting a Tight End (TE) in the W/R/T slot. To make that happen with `pd.get_dummies(drop_first=True)`, I temporarily renamed "TE" to "AA" and sorted, ensuring it was the category dropped first.

Then, after creating the dummies, I set the label back to "TE" and shuffled the dataset.

In [1132]:
# Temporarily relabel 'TE' as 'AA' so it becomes the category that
# get_dummies(drop_first=True) drops in a later cell.
flex_pos = df['W/R/T position']
df['W/R/T position'] = np.where(flex_pos.eq('TE'), 'AA', flex_pos)

In [1133]:
# Order rows by the (temporarily relabelled) flex position.
df = df.sort_values('W/R/T position')


In [1134]:
# One-hot encode the flex position, dropping the first category as the reference level.
flex_dummies = df['W/R/T position'].pipe(pd.get_dummies, drop_first=True)

In [1135]:
# Attach the dummy columns to the main frame, aligned on the row index.
df = df.join(other=flex_dummies, how='left')

In [1136]:
# Undo the temporary relabelling: 'AA' goes back to 'TE'.
flex_pos = df['W/R/T position']
df['W/R/T position'] = np.where(flex_pos.eq('AA'), 'TE', flex_pos)

In [1137]:
# Shuffle all rows (undoing the sort used for the dummy encoding).
# The previous `df.sample(X.shape[0])` read `X`, which is not defined until a
# later cell: on a fresh kernel that is a NameError, and with a stale `X` left
# over from an earlier run the sample size is whatever that old `X` had rather
# than the number of rows in `df`. `frac=1` always keeps every row, and the
# fixed seed makes the shuffle (and the cross-validation folds) reproducible.
df = df.sample(frac=1, random_state=2024)

### Some baseline models

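Without knowing anything about the data, baseline accuracy would be a coin flip (win or lose), i.e. an accuracy of 0.5. A slightly better baseline, computed below, predicts a win whenever a team's `points_for` is above the mean.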

In [1138]:
# 1 when the team's points_for beat the mean points_for of the frame, else 0.
mean_points_for = df['points_for'].mean()
df['score > mean'] = np.where(df['points_for'].gt(mean_points_for), 1, 0)

In [1139]:
# True where the "scored above the mean" guess matches the recorded result.
df['baseline_pred'] = df['score > mean'].eq(df['win'])

In [1140]:
# Share of rows where the baseline guess was right, i.e. the baseline accuracy.
df.loc[:, 'baseline_pred'].mean()

0.780373831775701

### Set the index and select features/label

In [1141]:
# Use the season_week label as the row index.
df = df.set_index('season_week')

In [1142]:
# Feature matrix: six position-score columns followed by the RB / WR flex dummies.
feature_cols = [
    'QB_points',
    'WR1WR2_points',
    'RB1RB2_points',
    'TE_points',
    'DEF_points',
    'W/R/T_points',
    'RB',
    'WR',
]
X = df[feature_cols]

In [1143]:
# Target: the matchup result column.
y = df.loc[:, 'win']

In [1144]:
# Display the feature matrix (pandas elides the middle rows) as a sanity check
# before fitting.
X

Unnamed: 0_level_0,QB_points,WR1WR2_points,RB1RB2_points,TE_points,DEF_points,W/R/T_points,RB,WR
season_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Buddy|2023|2,25.22,26.1,6.5,10.6,5.0,8.8,False,True
Tim|2023|11,16.88,26.0,26.2,4.8,17.0,12.8,False,True
Kyle|2023|15,25.68,10.9,29.6,4.8,34.0,10.8,True,False
Jake|2023|13,9.88,48.2,26.8,1.0,24.0,14.7,False,True
Buddy|2023|7,23.26,6.8,19.6,29.9,15.0,9.5,False,True
...,...,...,...,...,...,...,...,...
Eric|2023|3,10.66,37.2,13.1,6.2,21.0,21.6,False,True
Eric|2023|1,14.54,24.9,26.7,5.4,16.0,6.5,False,True
Kyle|2023|14,21.42,31.0,33.2,27.0,17.0,7.6,True,False
Nick|2023|16,23.96,7.2,36.0,4.2,8.0,7.0,True,False


### Instantiate the Logistic Regression (with Cross Validation)

In [1145]:
# Logistic regression with built-in 5-fold cross-validation; seeded for repeatability.
lr1 = LogisticRegressionCV(random_state=2024, cv=5)

### Fit the model, using raw scores

In [1146]:
# Fit on the raw (unscaled) feature values.
lr1.fit(X=X, y=y)

### Evaluate the model

In [1147]:
# Mean score across the cross_val_score folds for the raw-feature model.
cross_val_score(lr1, X, y).mean()

0.7478405315614618

### Interpret coefficients

In [1148]:
# Exponentiate the fitted coefficients. Despite the variable name, exp(coef) is
# an odds ratio (the multiplicative change in the odds of a win per one-unit
# increase in the feature), not a log-odds value.
log_odds1 = np.exp(lr1.coef_)

In [1149]:
# Label each exponentiated coefficient with its feature name for display.
pd.DataFrame(data=log_odds1, columns=X.columns)

Unnamed: 0,QB_points,WR1WR2_points,RB1RB2_points,TE_points,DEF_points,W/R/T_points,RB,WR
0,1.039857,1.051263,1.064851,1.029535,1.043304,1.059242,0.9988,1.000359


### Same data, but with numeric features scaled

In [1150]:
# Standardizer (centre to zero mean, scale to unit variance) for the numeric features.
sc = StandardScaler(with_mean=True, with_std=True)

Grab only the numeric columns and scale them

In [1151]:
# Scale only the first six columns (the point totals); the dummies are left out.
X_sc = sc.fit_transform(X.iloc[:, :6])

In [1152]:
# Wrap the scaled array in a DataFrame (default integer row and column labels).
X_sc_df = pd.DataFrame(data=X_sc)

In [1153]:
# Display the scaled numeric features; at this point the frame still has the
# default integer row index and integer column labels.
X_sc_df

Unnamed: 0,0,1,2,3,4,5
0,0.997228,0.214377,-1.678101,0.350656,-1.500558,-0.267899
1,-0.054944,0.205188,0.336714,-0.560677,0.155408,0.274489
2,1.055261,-1.182284,0.684448,-0.560677,2.501360,0.003295
3,-0.938062,2.245048,0.398079,-1.157757,1.121388,0.532123
4,0.749955,-1.559015,-0.338300,3.383195,-0.120586,-0.172981
...,...,...,...,...,...,...
209,-0.839658,1.234307,-1.003087,-0.340700,0.707397,1.467743
210,-0.350158,0.104114,0.387851,-0.466401,0.017411,-0.579772
211,0.517821,0.664616,1.052638,2.927529,0.155408,-0.430616
212,0.838266,-1.522261,1.339007,-0.654953,-1.086567,-0.511974


Reset back to the common index

In [1154]:
# Give the scaled frame the same season_week index as X so the two align row for row.
X_sc_df.index = X.index

Grab the dummied columns and join them to the end of the scaled numeric features

In [1155]:
# Columns 6 and 7 of X hold the unscaled flex-position dummies.
dummy_cols = slice(6, 8)
dummies = X.iloc[:, dummy_cols]

In [1156]:
# Append the dummy columns to the right of the scaled numeric columns.
X_sc_df = pd.concat([X_sc_df, dummies], axis='columns')

Cast all feature names to string so the LogReg fit method doesn't complain

In [1157]:
# Column labels are a mix of integers and strings after the concat; make them all strings.
X_sc_df = X_sc_df.rename(columns=str)

### Fit the model with the new data

In [1158]:
# Same estimator configuration as before, this time for the scaled features.
lr2 = LogisticRegressionCV(random_state=2024, cv=5)

In [1159]:
# Fit on the scaled numeric features plus the unscaled dummies.
lr2.fit(X=X_sc_df, y=y)

### Evaluate the model

In [1160]:
# Mean score across the cross_val_score folds for the scaled-feature model.
cross_val_score(lr2, X_sc_df, y).mean()

0.7620155038759691

In [1161]:
# Regularization parameter C that the estimator's internal cross-validation selected.
lr2.C_

array([0.35938137])

In [1162]:
# Internal cross-validation scores, keyed by class label. In the output below
# the array has one row per fold (cv=5) and one column per candidate C value.
lr2.scores_

{1: array([[0.55813953, 0.60465116, 0.65116279, 0.65116279, 0.65116279,
         0.65116279, 0.65116279, 0.65116279, 0.65116279, 0.65116279],
        [0.51162791, 0.72093023, 0.6744186 , 0.72093023, 0.74418605,
         0.74418605, 0.74418605, 0.74418605, 0.74418605, 0.74418605],
        [0.53488372, 0.76744186, 0.8372093 , 0.79069767, 0.81395349,
         0.76744186, 0.74418605, 0.74418605, 0.74418605, 0.74418605],
        [0.48837209, 0.58139535, 0.76744186, 0.76744186, 0.79069767,
         0.79069767, 0.79069767, 0.79069767, 0.79069767, 0.79069767],
        [0.5       , 0.80952381, 0.78571429, 0.83333333, 0.80952381,
         0.78571429, 0.80952381, 0.80952381, 0.80952381, 0.80952381]])}

In [1163]:
# Exponentiate the coefficients of the scaled model. Despite the name, these
# are odds ratios: for the six scaled columns the ratio applies per unit of the
# standardized feature; for the RB / WR dummies it applies to the dummy being set.
log_odds2 = np.exp(lr2.coef_)

In [1164]:
# Label each exponentiated coefficient with the original feature name
# (X still has the same eight columns, in the same order, as X_sc_df).
pd.DataFrame(data=log_odds2, columns=X.columns)

Unnamed: 0,QB_points,WR1WR2_points,RB1RB2_points,TE_points,DEF_points,W/R/T_points,RB,WR
0,1.659347,2.079883,2.463104,1.434176,1.630769,1.99128,0.75558,0.902308


### Retry a scaled attempt with no TE in Flex position

In [1165]:
# Start over from the raw CSV for the "no TE in the flex slot" experiment.
matchup_csv = data_dir + 'weekly_position_scoring_with_matchup_result.csv'
df = pd.read_csv(matchup_csv)

In [1166]:
# Combined score of the two guaranteed wide-receiver slots.
df['WR1WR2_points'] = df['WR1_points'].add(df['WR2_points'])

In [1167]:
# Combined score of the two guaranteed running-back slots.
df['RB1RB2_points'] = df['RB1_points'].add(df['RB2_points'])

In [1168]:
# Keep 2023 rows only, then drop rows where a TE filled the W/R/T slot.
# `.copy()` detaches the result from the frame it was sliced from: a later cell
# adds the `is_RB` column, and assigning into a filtered slice would otherwise
# raise pandas' SettingWithCopyWarning.
df = df[df['season_week'].str.contains('2023')]
df = df[df['W/R/T position'] != 'TE'].copy()
df.shape

(214, 27)

In [1169]:
# 1 when the flex slot was filled by a running back, 0 otherwise.
rb_in_flex = df['W/R/T position'].eq('RB')
df['is_RB'] = np.where(rb_in_flex, 1, 0)

In [1170]:
# Fraction of the remaining rows that started a running back in the flex slot.
df.loc[:, 'is_RB'].mean()

0.34579439252336447

In [1171]:
# Use the season_week label as the row index.
df = df.set_index('season_week')

In [1172]:
# Feature matrix: six position-score columns plus the single is_RB indicator.
feature_cols = [
    'QB_points',
    'WR1WR2_points',
    'RB1RB2_points',
    'TE_points',
    'DEF_points',
    'W/R/T_points',
    'is_RB',
]
X = df[feature_cols]

In [1173]:
# Target: the matchup result column.
y = df.loc[:, 'win']

In [1174]:
# NOTE(review): X_sc has not been recomputed for this section yet — this still
# shows the scaled array produced in the previous section. The next cell refits
# the scaler on the TE-free feature matrix and overwrites it.
X_sc

array([[ 0.99722755,  0.2143768 , -1.67810112,  0.35065615, -1.50055818,
        -0.26789925],
       [-0.05494432,  0.20518824,  0.3367137 , -0.56067684,  0.15540805,
         0.27448901],
       [ 1.05526101, -1.18228412,  0.68444824, -0.56067684,  2.5013602 ,
         0.00329488],
       ...,
       [ 0.51782071,  0.66461618,  1.05263775,  2.92752874,  0.15540805,
        -0.43061573],
       [ 0.83826633, -1.52226079,  1.33900737, -0.65495267, -1.08656662,
        -0.51197397],
       [-0.44351618,  0.87595303, -0.75762734,  0.0835413 ,  1.67337708,
        -0.41705602]])

In [1175]:
# Refit the scaler on every feature except the binary is_RB indicator.
X_sc = sc.fit_transform(X.drop(columns='is_RB'))

In [1176]:
# Wrap the scaled array in a DataFrame that shares X's season_week index.
X_sc_df = pd.DataFrame(X_sc, index=X.index)

In [1177]:
# Add the unscaled is_RB indicator back, aligned on the shared index.
X_sc_df = X_sc_df.join(X.loc[:, 'is_RB'], how='left')

In [1178]:
# Column labels are a mix of integers and a string after the join; make them all strings.
X_sc_df = X_sc_df.rename(columns=str)

In [1179]:
# Same estimator configuration, for the TE-free data.
lr3 = LogisticRegressionCV(random_state=2024, cv=5)

In [1180]:
# Fit on the scaled features plus the is_RB indicator.
lr3.fit(X=X_sc_df, y=y)

In [1181]:
# Mean score across the cross_val_score folds for the TE-free model.
cross_val_score(lr3, X_sc_df, y).mean()

0.7572535991140643

In [1182]:
# Exponentiate the coefficients of the TE-free model. Despite the variable
# name, exp(coef) is an odds ratio rather than a log-odds value.
log_odds3 = np.exp(lr3.coef_)

In [1183]:
# Show each feature's standard deviation (in raw units) next to its
# exponentiated coefficient.
# The second row is labelled 'Odds Ratio' because exp(coef) is an odds ratio,
# not a log-odds value (the log-odds is the coefficient itself). The model was
# fit on standardized features, so for the six scaled columns the ratio is per
# one standard deviation; `is_RB` was not scaled, so its ratio compares
# is_RB = 1 against is_RB = 0.
data = [X.std().tolist(), log_odds3[0].tolist()]

pd.DataFrame(data, columns=X.columns, index=['Std', 'Odds Ratio'])

Unnamed: 0,QB_points,WR1WR2_points,RB1RB2_points,TE_points,DEF_points,W/R/T_points,is_RB
Std,8.062564,11.018079,9.92732,6.186331,7.450537,7.60255,0.476742
Log Odds,1.372124,1.605067,1.691854,1.277638,1.419977,1.547093,0.950726
