# Modeling

## From the end of EDA:

### Conclusion

So the moral of the story currently is that we have at the minimum a couple of heuristics for choosing players:

- Choose value players, ie players with moderate price tags but good matchups
- Choose players based on Def they play
- Avoid expensive players, since statistically they are unable to produce high scores consistently.

With these guidelines, week 1 will be a total gamble, since we won't have any real data besides salaries. Week 2 will be the first time we can use any defensive data to help with our decision making.

## Goal for this notebook:

Based on the conclusions from the EDA, we want to see if we can find a model that confirms these ideas across seasons, and also has a high enough (cross-validated) accuracy to warrant trying to use this with real money.

### Note:
Sci-kit Learn says, according to https://scikit-learn.org/stable/tutorial/machine_learning_map/, that the model to use should be either Lasso or Elastic net, but we are going to try many different models to see what produces the best result.

## Logic

The idea behind this notebook is that player performances follow a predictable pattern, and therefore output should be directly predictable. The benefit of this would be to predict high performance players across each position and draft high scoring lineups. 

Obviously we want to get as many high performers as possible, but getting 100% accuracy on that seems implausible. 

That being said, if we can come up with a model that correctly guesses players scoring more than 15 points over 50% of the time, that'd be an impressive edge for competitions where we only need to score better than 50% of the other competition (Double ups). 

If we can get a model that has, say 70% or more, that could potentially be used to create lineups that might be in the running for a $1 million.

## Import Libraries

In [1]:
from collections import defaultdict
from datetime import datetime
import pickle
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # to remove some warnings
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

from xgboost import XGBRegressor

## Helper Functions

In [2]:
def get_weekly_data(week, year):
    file_path = f"./csv's/{year}/year-{year}-week-{week}-DK-player_data.csv"
    df = pd.read_csv(file_path)
    return df

def get_ytd_season_data(year, current_week):
    df = get_weekly_data(1,year)
    for week in range(2,current_week+1):
        try:
            df = df.append(get_weekly_data(week, year), ignore_index=True)
        except:
            print("No data for week: "+str(week))
    df = df.drop(['Unnamed: 0', 'Year'], axis=1)
    return df

def get_season_data(year, drop_year=True):
    df = get_weekly_data(1,year)
    for week in range(2,17):
        try:
            df = df.append(get_weekly_data(week, year), ignore_index=True)
        except:
            print("No data for week: "+str(week))
    if drop_year:
        df = df.drop(['Unnamed: 0', 'Year'], axis=1)
    else:
        df = df.drop(['Unnamed: 0'], axis=1)
    return df

def get_all_seasons(drop_year=False):
    df = get_season_data(2014, drop_year)
    for year in range(2015,datetime.today().year+1):
        try:
            df = df.append(get_season_data(year, drop_year), ignore_index=True)
        except:
            print("No data for year: "+str(year))
    return df

def scale_features(sc, X_train, X_test):
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test

def handle_nulls(df):
    # players that have nulls for any of the columns are 
    # extremely likely to be under performing or going into a bye.
    # the one caveat is that some are possibly coming off a bye.
    # to handle this later, probably will drop them, save those
    # as a variable, and then re-merge after getting rid of the other
    # null values.
    df = df.dropna()
    return df

def eval_model(df):
    df['score_ratio'] = round(df['actual_points'] / df['pred'],4)
    return df

def remove_outliers_btwn_ij(df, i=-1, j=5):
    s = df.loc[(df.score_ratio > i) & (df.score_ratio < j)]
    return s, i, j

def get_RMSE(y_true, y_pred):
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    return RMSE

def summarize_df(df, o_u_thresh=15):
    df = eval_model(df)
    RMSE = get_RMSE(df['actual_points'], df['pred'])
    print(f"Total entries analyzed: {len(df)}")
    s, i, j = remove_outliers_btwn_ij(df)
    print(f"Total entries after outliers removed: {len(s)}. Left boundary: {i}x Right Boundary: {j}x")
    correct_preds_over_thresh = s[(s.pred >= o_u_thresh)&(s.actual_points>=o_u_thresh)]
    correct_preds_under_thresh = s[(s.pred <= o_u_thresh)&(s.actual_points<=o_u_thresh)]
    incorrect_preds_under_thresh = s[(s.pred <= o_u_thresh)&(s.actual_points>=o_u_thresh)]
    incorrect_preds_over_thresh = s[(s.pred >= o_u_thresh)&(s.actual_points<=o_u_thresh)]
    print(f"Correct predictions of over {o_u_thresh} pts: {len(correct_preds_over_thresh)}. Percent: {round(len(correct_preds_over_thresh)/len(s)*100,2)}") # True Positive
    print(f"Correct predictions of under {o_u_thresh} pts: {len(correct_preds_under_thresh)}. Percent: {round(len(correct_preds_under_thresh)/len(s)*100,2)}") # True Negative
    print(f"Incorrect predictions of over {o_u_thresh} pts: {len(incorrect_preds_over_thresh)}. Percent: {round(len(incorrect_preds_over_thresh)/len(s)*100,2)}") # False Positive
    print(f"Incorrect predictions of under {o_u_thresh} pts: {len(incorrect_preds_under_thresh)}. Percent: {round(len(incorrect_preds_under_thresh)/len(s)*100,2)}") # False Negative
    print(f"RMSE: {RMSE}")

## Import Data

In [3]:
season = 2020
week = 6
next_week = week + 1
dataset = get_season_data(season)
# dataset

In [4]:
df = handle_nulls(dataset)
df

Unnamed: 0,Week,Name,Pos,Team,h/a,Oppt,DK points,DK salary
0,1,"Wilson, Russell",QB,sea,a,atl,34.78,7000.0
1,1,"Rodgers, Aaron",QB,gnb,a,min,33.76,6300.0
2,1,"Allen, Josh",QB,buf,h,nyj,33.18,6500.0
3,1,"Ryan, Matt",QB,atl,h,sea,27.90,6700.0
4,1,"Jackson, Lamar",QB,bal,h,cle,27.50,8100.0
...,...,...,...,...,...,...,...,...
6548,16,Indianapolis,Def,ind,a,pit,0.00,3200.0
6549,16,Jacksonville,Def,jac,h,chi,-1.00,2200.0
6550,16,Tennessee,Def,ten,a,gnb,-1.00,2600.0
6551,16,Houston,Def,hou,h,cin,-4.00,2800.0


In [5]:
def_df = df.loc[df.Pos == 'Def']
def_df

Unnamed: 0,Week,Name,Pos,Team,h/a,Oppt,DK points,DK salary
410,1,New Orleans,Def,nor,h,tam,17.0,2400.0
411,1,Washington,Def,was,h,phi,15.0,2000.0
412,1,Baltimore,Def,bal,h,cle,15.0,3100.0
413,1,New England,Def,nwe,h,mia,11.0,3200.0
414,1,LA Chargers,Def,lac,a,cin,11.0,2800.0
...,...,...,...,...,...,...,...,...
6548,16,Indianapolis,Def,ind,a,pit,0.0,3200.0
6549,16,Jacksonville,Def,jac,h,chi,-1.0,2200.0
6550,16,Tennessee,Def,ten,a,gnb,-1.0,2600.0
6551,16,Houston,Def,hou,h,cin,-4.0,2800.0


In [6]:
def_df['fantasy_points_allowed_lw'] = 0
df['Oppt_pts_allowed_lw'] = 0
def_teams = [x for x in def_df['Team'].unique()]

for week in range(1,17):
    for team in def_teams:
        try:
            offense_df1 = df.loc[(df['Oppt']==team)&(df['Week']==week)]
            offense_df2 = df.loc[(df['Oppt']==team)&(df['Week']==week+1)]
            sum_ = offense_df1['DK points'].sum()
            def_df.loc[(df['Team']==team)&(df['Week']==week+1), 'fantasy_points_allowed_lw'] = sum_
            df.loc[(df['Oppt']==team)&(df['Week']==week+1), 'Oppt_pts_allowed_lw'] = sum_
        except:
            print('couldnt append data')
            pass

def_df

Unnamed: 0,Week,Name,Pos,Team,h/a,Oppt,DK points,DK salary,fantasy_points_allowed_lw
410,1,New Orleans,Def,nor,h,tam,17.0,2400.0,0.00
411,1,Washington,Def,was,h,phi,15.0,2000.0,0.00
412,1,Baltimore,Def,bal,h,cle,15.0,3100.0,0.00
413,1,New England,Def,nwe,h,mia,11.0,3200.0,0.00
414,1,LA Chargers,Def,lac,a,cin,11.0,2800.0,0.00
...,...,...,...,...,...,...,...,...,...
6548,16,Indianapolis,Def,ind,a,pit,0.0,3200.0,118.52
6549,16,Jacksonville,Def,jac,h,chi,-1.0,2200.0,120.90
6550,16,Tennessee,Def,ten,a,gnb,-1.0,2600.0,102.98
6551,16,Houston,Def,hou,h,cin,-4.0,2800.0,102.62


In [7]:
# drop week 1 as there won't be any data there
# and that also means this model won't be really
# of any use until week 2
df = df[df.Week != 1] 

In [8]:
X = df.drop(labels='DK points', axis=1)
y = df['DK points']

In [9]:
X

Unnamed: 0,Week,Name,Pos,Team,h/a,Oppt,DK salary,Oppt_pts_allowed_lw
442,2,"Prescott, Dak",QB,dal,h,atl,6800.0,139.48
443,2,"Newton, Cam",QB,nwe,a,sea,6400.0,143.00
444,2,"Allen, Josh",QB,buf,a,mia,6700.0,89.70
445,2,"Wilson, Russell",QB,sea,h,nwe,6500.0,61.14
446,2,"Murray, Kyler",QB,ari,h,was,6100.0,90.50
...,...,...,...,...,...,...,...,...
6548,16,Indianapolis,Def,ind,a,pit,3200.0,64.66
6549,16,Jacksonville,Def,jac,h,chi,2200.0,110.74
6550,16,Tennessee,Def,ten,a,gnb,2600.0,81.62
6551,16,Houston,Def,hou,h,cin,2800.0,67.40


In [10]:
y

442     43.80
443     38.58
444     37.48
445     34.42
446     33.14
        ...  
6548     0.00
6549    -1.00
6550    -1.00
6551    -4.00
6552    -4.00
Name: DK points, Length: 6110, dtype: float64

In [11]:
# Encode data - label encoding, because one hot encoding was 
# creating huge amounts of unbalanced data
# borrowed from https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
# d = defaultdict(LabelEncoder)
# X_le = X.apply(LabelEncoder().fit_transform)

In [12]:
X = pd.get_dummies(X)

In [13]:
print(X)

      Week  DK salary  Oppt_pts_allowed_lw  Name_Abdullah, Ameer  \
442      2     6800.0               139.48                     0   
443      2     6400.0               143.00                     0   
444      2     6700.0                89.70                     0   
445      2     6500.0                61.14                     0   
446      2     6100.0                90.50                     0   
...    ...        ...                  ...                   ...   
6548    16     3200.0                64.66                     0   
6549    16     2200.0               110.74                     0   
6550    16     2600.0                81.62                     0   
6551    16     2800.0                67.40                     0   
6552    16     2900.0                72.48                     0   

      Name_Adams, Davante  Name_Adams, Josh  Name_Agholor, Nelson  \
442                     0                 0                     0   
443                     0                 0  

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
xtr_cols = X_train.columns
xte_cols = X_test.columns

In [15]:
data_to_use = 'scaled'
# data_to_use = 'un-scaled' # comment out this line for using scaled data

In [16]:
if data_to_use == 'scaled':
    sc = StandardScaler()
    sc = MinMaxScaler()
    X_train['DK salary'] = sc.fit_transform(X_train['DK salary'].values.reshape(-1,1))
    X_test['DK salary'] = sc.fit_transform(X_test['DK salary'].values.reshape(-1,1))

In [17]:
X_train

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante","Name_Adams, Josh","Name_Agholor, Nelson","Name_Agnew, Jamal","Name_Ahmed, Salvon","Name_Aiyuk, Brandon",...,Oppt_nwe,Oppt_nyg,Oppt_nyj,Oppt_phi,Oppt_pit,Oppt_sea,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was
5608,14,0.263158,57.26,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4081,11,0.473684,82.08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
888,3,0.578947,133.96,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5473,14,0.315789,107.38,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4139,11,0.421053,148.74,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5374,14,0.421053,80.22,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3706,10,0.421053,78.80,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2095,5,0.242105,103.86,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3049,8,0.778947,109.98,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Non-Boost Methods

#### Linear Regression

In [18]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [19]:
y_pred = lin_reg.predict(X_test)

In [20]:
for x in range(0, len(y_pred)):
    y_pred[x] = float(round(y_pred[x],2))
y_pred

array([19.51,  8.11,  4.41, ...,  1.63, 17.78,  6.37])

In [21]:
df_results = X_test.copy()
df_results

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante","Name_Adams, Josh","Name_Agholor, Nelson","Name_Agnew, Jamal","Name_Ahmed, Salvon","Name_Aiyuk, Brandon",...,Oppt_nwe,Oppt_nyg,Oppt_nyj,Oppt_phi,Oppt_pit,Oppt_sea,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was
5006,13,0.82,55.34,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
689,2,0.43,116.14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1949,5,0.36,119.50,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1761,5,0.55,126.20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2043,5,0.28,118.92,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3159,8,0.32,157.16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2275,6,0.82,86.98,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5186,13,0.25,56.60,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
922,3,0.69,112.04,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# how to decode one hot columns: 
# https://stackoverflow.com/questions/49372640/python-pandas-how-to-reverse-one-hot-encoding-back-to-categorical
# https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
one_hot_columns = (df_results.iloc[:, 3:] == 1).idxmax(1)
df_results['player_name'] = one_hot_columns
df_results['pred'] = y_pred
df_results['actual_points'] = y_test
df_results['player_name'] = df_results['player_name'].str.replace("Name_", "")

In [23]:
pd.set_option("display.max_rows", None, "display.max_columns", 10)
# df_results

In [24]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_linear = df_results[subset_cols]
df_results_linear = df_results_linear.sort_values(by='Week')
df_results_linear

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",9.14,24.4
576,2,0.4,"Gillaspia, Cullen",-0.19,0.0
669,2,0.3,"McKenzie, Isaiah",4.66,6.7
818,2,0.39,"Olsen, Greg",4.29,0.0
750,2,0.35,"Arcega-Whiteside, JJ",2.38,0.0
446,2,0.61,"Murray, Kyler",23.91,33.14
486,2,0.44,"Robinson, James",21.91,24.0
471,2,0.65,"Brady, Tom",22.96,10.68
532,2,0.45,"Murray, Latavius",10.82,5.3
636,2,0.64,"Woods, Robert",19.06,11.3


### Lasso

In [25]:
lasso_reg = LassoCV()
lasso_reg.fit(X_train, y_train)

LassoCV()

In [26]:
y_pred2 = lasso_reg.predict(X_test)

In [27]:
for x in range(0, len(y_pred2)):
    y_pred2[x] = float(round(y_pred2[x],2))
y_pred2

array([19.85,  7.81,  5.47, ...,  1.94, 12.77,  9.68])

In [28]:
df_results['pred'] = y_pred2

In [29]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_lasso = df_results[subset_cols]
df_results_lasso = df_results_lasso.sort_values(by='Week')
df_results_lasso

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",7.53,24.4
576,2,0.4,"Gillaspia, Cullen",4.07,0.0
669,2,0.3,"McKenzie, Isaiah",3.47,6.7
818,2,0.39,"Olsen, Greg",6.4,0.0
750,2,0.35,"Arcega-Whiteside, JJ",5.07,0.0
446,2,0.61,"Murray, Kyler",15.13,33.14
486,2,0.44,"Robinson, James",5.38,24.0
471,2,0.65,"Brady, Tom",17.06,10.68
532,2,0.45,"Murray, Latavius",5.76,5.3
636,2,0.64,"Woods, Robert",14.25,11.3


### Elastic Net

In [30]:
elastic_net_reg = ElasticNetCV()
elastic_net_reg.fit(X_train, y_train)

ElasticNetCV()

In [31]:
y_pred3 = elastic_net_reg.predict(X_test)

In [32]:
for x in range(0, len(y_pred3)):
    y_pred3[x] = float(round(y_pred3[x],2))
y_pred3

array([16.76,  7.98,  6.36, ...,  2.3 , 11.04,  9.09])

In [33]:
df_results['pred'] = y_pred3

In [34]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_elastic = df_results[subset_cols]
df_results_elastic = df_results_elastic.sort_values(by='Week')
df_results_elastic

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",7.14,24.4
576,2,0.4,"Gillaspia, Cullen",5.2,0.0
669,2,0.3,"McKenzie, Isaiah",4.81,6.7
818,2,0.39,"Olsen, Greg",6.29,0.0
750,2,0.35,"Arcega-Whiteside, JJ",4.77,0.0
446,2,0.61,"Murray, Kyler",15.07,33.14
486,2,0.44,"Robinson, James",6.93,24.0
471,2,0.65,"Brady, Tom",17.71,10.68
532,2,0.45,"Murray, Latavius",6.25,5.3
636,2,0.64,"Woods, Robert",13.24,11.3


### Ridge

In [35]:
ridge_reg = RidgeCV()
ridge_reg.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [36]:
y_pred4 = ridge_reg.predict(X_test)

In [37]:
for x in range(0, len(y_pred4)):
    y_pred4[x] = float(round(y_pred4[x],2))
y_pred4

array([19.38,  8.14,  4.47, ...,  1.59, 17.38,  6.46])

In [38]:
df_results['pred'] = y_pred4

In [39]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_ridge = df_results[subset_cols]
df_results_ridge = df_results_ridge.sort_values(by='Week')
df_results_ridge

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",8.94,24.4
576,2,0.4,"Gillaspia, Cullen",0.01,0.0
669,2,0.3,"McKenzie, Isaiah",4.64,6.7
818,2,0.39,"Olsen, Greg",4.34,0.0
750,2,0.35,"Arcega-Whiteside, JJ",2.53,0.0
446,2,0.61,"Murray, Kyler",23.11,33.14
486,2,0.44,"Robinson, James",20.91,24.0
471,2,0.65,"Brady, Tom",22.78,10.68
532,2,0.45,"Murray, Latavius",10.49,5.3
636,2,0.64,"Woods, Robert",18.84,11.3


### SVR (linear)

In [40]:
svr1_reg = SVR(kernel='linear')
svr1_reg.fit(X_train, y_train)

SVR(kernel='linear')

In [41]:
y_pred44 = svr1_reg.predict(X_test)

In [42]:
for x in range(0, len(y_pred44)):
    y_pred44[x] = float(round(y_pred44[x],2))
y_pred44

array([16.64,  4.9 ,  3.22, ...,  0.77, 11.94,  7.  ])

In [43]:
df_results['pred'] = y_pred44

In [44]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_svr1 = df_results[subset_cols]
df_results_svr1 = df_results_svr1.sort_values(by='Week')
df_results_svr1

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",6.54,24.4
576,2,0.4,"Gillaspia, Cullen",0.14,0.0
669,2,0.3,"McKenzie, Isaiah",2.17,6.7
818,2,0.39,"Olsen, Greg",4.91,0.0
750,2,0.35,"Arcega-Whiteside, JJ",3.14,0.0
446,2,0.61,"Murray, Kyler",14.58,33.14
486,2,0.44,"Robinson, James",7.03,24.0
471,2,0.65,"Brady, Tom",16.64,10.68
532,2,0.45,"Murray, Latavius",6.02,5.3
636,2,0.64,"Woods, Robert",13.56,11.3


### SVR (rbf)

In [45]:
svr2_reg = SVR(kernel='rbf')
svr2_reg.fit(X_train, y_train)

SVR()

In [46]:
y_pred45 = svr2_reg.predict(X_test)

In [47]:
for x in range(0, len(y_pred45)):
    y_pred45[x] = float(round(y_pred45[x],2))
y_pred45

array([3.87, 4.85, 4.82, ..., 3.79, 4.76, 5.  ])

In [48]:
df_results['pred'] = y_pred45

In [49]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_svr2 = df_results[subset_cols]
df_results_svr2 = df_results_svr2.sort_values(by='Week')
df_results_svr2

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",4.76,24.4
576,2,0.4,"Gillaspia, Cullen",4.07,0.0
669,2,0.3,"McKenzie, Isaiah",4.44,6.7
818,2,0.39,"Olsen, Greg",3.92,0.0
750,2,0.35,"Arcega-Whiteside, JJ",4.53,0.0
446,2,0.61,"Murray, Kyler",4.49,33.14
486,2,0.44,"Robinson, James",4.28,24.0
471,2,0.65,"Brady, Tom",4.6,10.68
532,2,0.45,"Murray, Latavius",4.6,5.3
636,2,0.64,"Woods, Robert",4.42,11.3


### Decision Tree

In [50]:
decision_tree_reg = DecisionTreeRegressor()
decision_tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [51]:
y_pred5 = decision_tree_reg.predict(X_test)

In [52]:
for x in range(0, len(y_pred5)):
    y_pred5[x] = float(round(y_pred5[x],2))
y_pred5

array([43.1,  7.8, 13.7, ...,  0. , 29.3, 11.4])

In [53]:
df_results['pred'] = y_pred5

In [54]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_dt = df_results[subset_cols]
df_results_dt = df_results_dt.sort_values(by='Week')
df_results_dt

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",3.0,24.4
576,2,0.4,"Gillaspia, Cullen",0.0,0.0
669,2,0.3,"McKenzie, Isaiah",2.0,6.7
818,2,0.39,"Olsen, Greg",8.2,0.0
750,2,0.35,"Arcega-Whiteside, JJ",1.5,0.0
446,2,0.61,"Murray, Kyler",14.66,33.14
486,2,0.44,"Robinson, James",2.8,24.0
471,2,0.65,"Brady, Tom",7.94,10.68
532,2,0.45,"Murray, Latavius",19.0,5.3
636,2,0.64,"Woods, Robert",21.4,11.3


### Random Forest

In [55]:
random_forest_reg = RandomForestRegressor()
random_forest_reg.fit(X_train, y_train)

RandomForestRegressor()

In [56]:
y_pred6 = random_forest_reg.predict(X_test)

In [57]:
for x in range(0, len(y_pred6)):
    y_pred6[x] = float(round(y_pred6[x],2))
y_pred6

array([22.18,  7.17,  8.63, ...,  0.84, 15.43,  9.02])

In [58]:
df_results['pred'] = y_pred6

In [59]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_rf = df_results[subset_cols]
df_results_rf = df_results_rf.sort_values(by='Week')
df_results_rf

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",9.64,24.4
576,2,0.4,"Gillaspia, Cullen",0.76,0.0
669,2,0.3,"McKenzie, Isaiah",2.61,6.7
818,2,0.39,"Olsen, Greg",4.96,0.0
750,2,0.35,"Arcega-Whiteside, JJ",3.4,0.0
446,2,0.61,"Murray, Kyler",16.66,33.14
486,2,0.44,"Robinson, James",9.33,24.0
471,2,0.65,"Brady, Tom",24.18,10.68
532,2,0.45,"Murray, Latavius",10.44,5.3
636,2,0.64,"Woods, Robert",17.17,11.3


## Boost Methods

### Ada Boost

In [60]:
ada_boost_reg = AdaBoostRegressor()
ada_boost_reg.fit(X_train, y_train)

AdaBoostRegressor()

In [61]:
y_pred7 = ada_boost_reg.predict(X_test)

In [62]:
for x in range(0, len(y_pred7)):
    y_pred7[x] = float(round(y_pred7[x],2))
y_pred7

array([22.88, 14.16, 13.95, ...,  9.96, 21.61, 14.25])

In [63]:
df_results['pred'] = y_pred7

In [64]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_ada = df_results[subset_cols]
df_results_ada = df_results_ada.sort_values(by='Week')
df_results_ada

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",14.06,24.4
576,2,0.4,"Gillaspia, Cullen",12.47,0.0
669,2,0.3,"McKenzie, Isaiah",11.11,6.7
818,2,0.39,"Olsen, Greg",13.71,0.0
750,2,0.35,"Arcega-Whiteside, JJ",12.21,0.0
446,2,0.61,"Murray, Kyler",19.05,33.14
486,2,0.44,"Robinson, James",13.95,24.0
471,2,0.65,"Brady, Tom",21.49,10.68
532,2,0.45,"Murray, Latavius",14.16,5.3
636,2,0.64,"Woods, Robert",18.41,11.3


### Gradient Boost

In [65]:
gradient_boost_reg = GradientBoostingRegressor()
gradient_boost_reg.fit(X_train, y_train)

GradientBoostingRegressor()

In [66]:
y_pred8 = gradient_boost_reg.predict(X_test)

In [67]:
for x in range(0, len(y_pred8)):
    y_pred8[x] = float(round(y_pred8[x],2))
y_pred8

array([19.95,  7.39,  5.91, ...,  2.41, 14.05,  9.84])

In [68]:
df_results['pred'] = y_pred8

In [69]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_grad = df_results[subset_cols]
df_results_grad = df_results_grad.sort_values(by='Week')
df_results_grad

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",5.91,24.4
576,2,0.4,"Gillaspia, Cullen",3.21,0.0
669,2,0.3,"McKenzie, Isaiah",2.94,6.7
818,2,0.39,"Olsen, Greg",5.91,0.0
750,2,0.35,"Arcega-Whiteside, JJ",5.86,0.0
446,2,0.61,"Murray, Kyler",16.92,33.14
486,2,0.44,"Robinson, James",8.82,24.0
471,2,0.65,"Brady, Tom",20.54,10.68
532,2,0.45,"Murray, Latavius",7.56,5.3
636,2,0.64,"Woods, Robert",15.3,11.3


### XG Boost

In [70]:
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [71]:
y_pred9 = xgb_reg.predict(X_test)

In [72]:
for x in range(0, len(y_pred9)):
    y_pred9[x] = float(round(y_pred9[x],2))
y_pred9

array([18.93,  7.38,  6.19, ...,  1.76, 15.26,  9.  ], dtype=float32)

In [73]:
df_results['pred'] = y_pred9

In [74]:
subset_cols = ['Week', 'DK salary', 'player_name', 'pred', 'actual_points']
df_results_xgb = df_results[subset_cols]
df_results_xgb = df_results_xgb.sort_values(by='Week')
df_results_xgb

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
759,2,0.42,"Smith, Jonnu",7.2,24.4
576,2,0.4,"Gillaspia, Cullen",2.57,0.0
669,2,0.3,"McKenzie, Isaiah",2.81,6.7
818,2,0.39,"Olsen, Greg",6.72,0.0
750,2,0.35,"Arcega-Whiteside, JJ",5.24,0.0
446,2,0.61,"Murray, Kyler",16.780001,33.14
486,2,0.44,"Robinson, James",18.34,24.0
471,2,0.65,"Brady, Tom",27.440001,10.68
532,2,0.45,"Murray, Latavius",7.48,5.3
636,2,0.64,"Woods, Robert",18.41,11.3


## Evaluate Models

In [75]:
summarize_df(df_results_linear)

Total entries analyzed: 1222
Total entries after outliers removed: 1169. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 125. Percent: 10.69
Correct predictions of under 15 pts: 853. Percent: 72.97
Incorrect predictions of over 15 pts: 88. Percent: 7.53
Incorrect predictions of under 15 pts: 106. Percent: 9.07
RMSE: 249761936.53057602


In [76]:
summarize_df(df_results_lasso)

Total entries analyzed: 1222
Total entries after outliers removed: 1198. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 84. Percent: 7.01
Correct predictions of under 15 pts: 944. Percent: 78.8
Incorrect predictions of over 15 pts: 28. Percent: 2.34
Incorrect predictions of under 15 pts: 145. Percent: 12.1
RMSE: 7.057899579986839


In [77]:
summarize_df(df_results_elastic)

Total entries analyzed: 1222
Total entries after outliers removed: 1202. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 67. Percent: 5.57
Correct predictions of under 15 pts: 955. Percent: 79.45
Incorrect predictions of over 15 pts: 20. Percent: 1.66
Incorrect predictions of under 15 pts: 165. Percent: 13.73
RMSE: 7.1013192856948315


In [78]:
summarize_df(df_results_ridge)

Total entries analyzed: 1222
Total entries after outliers removed: 1167. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 123. Percent: 10.54
Correct predictions of under 15 pts: 861. Percent: 73.78
Incorrect predictions of over 15 pts: 80. Percent: 6.86
Incorrect predictions of under 15 pts: 106. Percent: 9.08
RMSE: 6.655115287704265


In [79]:
summarize_df(df_results_svr1)

Total entries analyzed: 1222
Total entries after outliers removed: 1120. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 55. Percent: 4.91
Correct predictions of under 15 pts: 883. Percent: 78.84
Incorrect predictions of over 15 pts: 21. Percent: 1.88
Incorrect predictions of under 15 pts: 163. Percent: 14.55
RMSE: 7.191980479966737


In [80]:
summarize_df(df_results_svr2)

Total entries analyzed: 1222
Total entries after outliers removed: 1113. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 0. Percent: 0.0
Correct predictions of under 15 pts: 985. Percent: 88.5
Incorrect predictions of over 15 pts: 0. Percent: 0.0
Incorrect predictions of under 15 pts: 131. Percent: 11.77
RMSE: 9.487135613241026


In [81]:
summarize_df(df_results_dt)

Total entries analyzed: 1222
Total entries after outliers removed: 800. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 98. Percent: 12.25
Correct predictions of under 15 pts: 513. Percent: 64.12
Incorrect predictions of over 15 pts: 115. Percent: 14.37
Incorrect predictions of under 15 pts: 82. Percent: 10.25
RMSE: 9.157041399999747


In [82]:
summarize_df(df_results_rf)

Total entries analyzed: 1222
Total entries after outliers removed: 1144. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 101. Percent: 8.83
Correct predictions of under 15 pts: 870. Percent: 76.05
Incorrect predictions of over 15 pts: 58. Percent: 5.07
Incorrect predictions of under 15 pts: 118. Percent: 10.31
RMSE: 6.960951738076937


In [83]:
summarize_df(df_results_ada)

Total entries analyzed: 1222
Total entries after outliers removed: 1222. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 145. Percent: 11.87
Correct predictions of under 15 pts: 882. Percent: 72.18
Incorrect predictions of over 15 pts: 103. Percent: 8.43
Incorrect predictions of under 15 pts: 95. Percent: 7.77
RMSE: 9.501691790957214


In [84]:
summarize_df(df_results_grad)

Total entries analyzed: 1222
Total entries after outliers removed: 1196. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 95. Percent: 7.94
Correct predictions of under 15 pts: 944. Percent: 78.93
Incorrect predictions of over 15 pts: 37. Percent: 3.09
Incorrect predictions of under 15 pts: 123. Percent: 10.28
RMSE: 6.723207674931801


In [85]:
summarize_df(df_results_xgb)

Total entries analyzed: 1222
Total entries after outliers removed: 1183. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 102. Percent: 8.62
Correct predictions of under 15 pts: 908. Percent: 76.75
Incorrect predictions of over 15 pts: 54. Percent: 4.56
Incorrect predictions of under 15 pts: 123. Percent: 10.4
RMSE: 6.798967127172297


In [86]:
# filter with lasso / elastic and then run ada boost as predictor
# y_pred_filt = lasso_reg.predict(X_test)
# y_pred_filt = elastic_net_reg.predict(X_test) # just comment this line out to try lasso (in my testing, results don't change)
y_pred_filt = gradient_boost_reg.predict(X_test) # further testing showed this model actually performs a little better for filtering
new_df_results = X_test.copy()
new_df_results['pred'] = y_pred_filt
new_df_results

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante",...,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was,pred
5006,13,0.82,55.34,0,0,...,0,0,0,0,19.949114
689,2,0.43,116.14,0,0,...,0,0,0,0,7.388904
1949,5,0.36,119.5,0,0,...,0,0,0,0,5.909209
1761,5,0.55,126.2,0,0,...,0,0,0,0,13.349065
2043,5,0.28,118.92,0,0,...,0,0,0,0,2.938115
1330,4,0.58,101.7,0,0,...,0,0,0,0,16.366199
2553,7,0.53,61.48,0,0,...,0,0,0,0,8.614353
1805,5,0.4,84.8,0,0,...,0,0,0,0,3.21212
2134,6,0.54,132.9,0,0,...,0,0,0,0,13.349065
2214,6,0.4,0.0,0,0,...,0,0,0,0,3.549434


In [87]:
df_filtered = new_df_results[new_df_results['pred']>15]
df_filtered

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante",...,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was,pred
5006,13,0.82,55.34,0,0,...,0,0,0,0,19.949114
1330,4,0.58,101.7,0,0,...,0,0,0,0,16.366199
5438,14,0.67,111.84,0,0,...,0,0,0,0,15.999896
4868,13,0.77,98.9,0,0,...,0,0,1,0,17.370037
1869,5,0.71,98.6,0,0,...,0,0,0,0,15.499152
454,2,0.58,86.92,0,0,...,0,0,0,0,16.367262
456,2,0.63,105.86,0,0,...,0,0,0,0,17.719703
5838,15,0.88,100.98,0,0,...,0,0,0,0,19.451148
1313,4,0.71,131.24,0,0,...,0,0,0,0,20.359048
4556,12,0.88,0.0,0,1,...,0,0,0,0,20.737051


In [88]:
df_filtered = df_filtered.drop(labels=['pred'], axis=1)
df_filtered

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante",...,Oppt_sea,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was
5006,13,0.82,55.34,0,0,...,0,0,0,0,0
1330,4,0.58,101.7,0,0,...,0,0,0,0,0
5438,14,0.67,111.84,0,0,...,0,0,0,0,0
4868,13,0.77,98.9,0,0,...,0,0,0,1,0
1869,5,0.71,98.6,0,0,...,0,0,0,0,0
454,2,0.58,86.92,0,0,...,0,0,0,0,0
456,2,0.63,105.86,0,0,...,0,0,0,0,0
5838,15,0.88,100.98,0,0,...,0,0,0,0,0
1313,4,0.71,131.24,0,0,...,0,0,0,0,0
4556,12,0.88,0.0,0,1,...,0,0,0,0,0


In [89]:
y_pred_final = ada_boost_reg.predict(df_filtered)
final_df_results = df_filtered.copy()
final_df_results['pred'] = y_pred_final
final_df_results

Unnamed: 0,Week,DK salary,Oppt_pts_allowed_lw,"Name_Abdullah, Ameer","Name_Adams, Davante",...,Oppt_sfo,Oppt_tam,Oppt_ten,Oppt_was,pred
5006,13,0.82,55.34,0,0,...,0,0,0,0,22.877558
1330,4,0.58,101.7,0,0,...,0,0,0,0,16.325948
5438,14,0.67,111.84,0,0,...,0,0,0,0,19.24307
4868,13,0.77,98.9,0,0,...,0,0,1,0,23.372113
1869,5,0.71,98.6,0,0,...,0,0,0,0,20.81928
454,2,0.58,86.92,0,0,...,0,0,0,0,16.325948
456,2,0.63,105.86,0,0,...,0,0,0,0,19.24307
5838,15,0.88,100.98,0,0,...,0,0,0,0,23.122923
1313,4,0.71,131.24,0,0,...,0,0,0,0,21.835832
4556,12,0.88,0.0,0,1,...,0,0,0,0,22.304231


In [90]:
one_hot_columns = (final_df_results.iloc[:, 3:] == 1).idxmax(1)
final_df_results['player_name'] = one_hot_columns
subset_cols = ['Week', 'DK salary', 'player_name', 'pred']
final_df_results = final_df_results[subset_cols]
final_df_results

Unnamed: 0,Week,DK salary,player_name,pred
5006,13,0.82,"Name_Metcalf, D.K.",22.877558
1330,4,0.58,"Name_Brees, Drew",16.325948
5438,14,0.67,"Name_Johnson, Diontae",19.24307
4868,13,0.77,"Name_Chubb, Nick",23.372113
1869,5,0.71,"Name_Metcalf, D.K.",20.81928
454,2,0.58,"Name_Goff, Jared",16.325948
456,2,0.63,"Name_Roethlisberger, Ben",19.24307
5838,15,0.88,"Name_Hill, Tyreek",23.122923
1313,4,0.71,"Name_Rodgers, Aaron",21.835832
4556,12,0.88,"Name_Adams, Davante",22.304231


In [91]:
final_df_results['player_name'] = final_df_results['player_name'].str.replace("Name_", "")
final_df_results['actual_points'] = 0
final_df_results

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
5006,13,0.82,"Metcalf, D.K.",22.877558,0
1330,4,0.58,"Brees, Drew",16.325948,0
5438,14,0.67,"Johnson, Diontae",19.24307,0
4868,13,0.77,"Chubb, Nick",23.372113,0
1869,5,0.71,"Metcalf, D.K.",20.81928,0
454,2,0.58,"Goff, Jared",16.325948,0
456,2,0.63,"Roethlisberger, Ben",19.24307,0
5838,15,0.88,"Hill, Tyreek",23.122923,0
1313,4,0.71,"Rodgers, Aaron",21.835832,0
4556,12,0.88,"Adams, Davante",22.304231,0


In [92]:
week_arr = [num for num in final_df_results['Week']]
player_arr = [name for name in final_df_results['player_name']]

In [93]:
for i in range(len(final_df_results)):
    num = df_results.loc[(df_results['Week']==week_arr[i])&(df_results['player_name']==player_arr[i]), 'actual_points']
    final_df_results.loc[(final_df_results['Week']==week_arr[i])&(final_df_results['player_name']==player_arr[i]), 'actual_points'] = num

In [94]:
final_df_results

Unnamed: 0,Week,DK salary,player_name,pred,actual_points
5006,13,0.82,"Metcalf, D.K.",22.877558,13.0
1330,4,0.58,"Brees, Drew",16.325948,16.54
5438,14,0.67,"Johnson, Diontae",19.24307,8.0
4868,13,0.77,"Chubb, Nick",23.372113,17.6
1869,5,0.71,"Metcalf, D.K.",20.81928,27.3
454,2,0.58,"Goff, Jared",16.325948,23.98
456,2,0.63,"Roethlisberger, Ben",19.24307,22.24
5838,15,0.88,"Hill, Tyreek",23.122923,17.4
1313,4,0.71,"Rodgers, Aaron",21.835832,32.58
4556,12,0.88,"Adams, Davante",22.304231,18.1


In [95]:
summarize_df(final_df_results)

Total entries analyzed: 132
Total entries after outliers removed: 132. Left boundary: -1x Right Boundary: 5x
Correct predictions of over 15 pts: 94. Percent: 71.21
Correct predictions of under 15 pts: 0. Percent: 0.0
Incorrect predictions of over 15 pts: 37. Percent: 28.03
Incorrect predictions of under 15 pts: 1. Percent: 0.76
RMSE: 10.565005666810908


In [96]:
accuracies = cross_val_score(estimator = lin_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: -5.23385827642366e+18%
Standard Deviation: 9.704932141585533e+18%


In [97]:
accuracies = cross_val_score(estimator = lasso_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 37.613659651160376%
Standard Deviation: 1.6374986303377335%


In [98]:
accuracies = cross_val_score(estimator = elastic_net_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 36.7680108550433%
Standard Deviation: 2.168377449775357%


In [99]:
accuracies = cross_val_score(estimator = ridge_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 41.29346961368717%
Standard Deviation: 3.5799542339451484%


In [105]:
accuracies = cross_val_score(estimator = svr1_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 36.34010905353997%
Standard Deviation: 2.947981312281536%


In [106]:
accuracies = cross_val_score(estimator = svr2_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: -11.287398106479705%
Standard Deviation: 0.3661146604209673%


In [100]:
accuracies = cross_val_score(estimator = decision_tree_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: -3.1909660849519383%
Standard Deviation: 10.381563886891689%


In [101]:
accuracies = cross_val_score(estimator = random_forest_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 40.3930914432095%
Standard Deviation: 1.404573915080275%


In [102]:
accuracies = cross_val_score(estimator = ada_boost_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: -19.608925884655875%
Standard Deviation: 28.85440707568727%


In [103]:
accuracies = cross_val_score(estimator = gradient_boost_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 41.10223766601915%
Standard Deviation: 1.5255120705429586%


In [104]:
accuracies = cross_val_score(estimator = xgb_reg, X = X_train, y = y_train, cv = KFold(shuffle=True))
print(f"Accuracy: {accuracies.mean()*100}%")
print(f"Standard Deviation: {accuracies.std()*100}%")

Accuracy: 39.06443332617483%
Standard Deviation: 3.3967670645193024%


## Summary

With the most recent season (2020 at the time of this writing) stats, using un-scaled data, the model correctly picks players that score 15+ pts about 65% of the time.

After scaling salary data with a Min Max scaler, that percentage goes up to around 77% of the time. Standard scaled data is correct about 72% of the time.

Cross validation yields a different result, which keeps me wary, even though the initial results are promising. With cross validation, it seems that the best I could hope for is about 40-42% accuracy. In the context of correctly predicting fantasy points, I'd say that's actually pretty good, since our main metric of success is the ability to predict players that score 15+ points.

In [None]:
class Lineup:
    """ 
    takes the results of the model prediction (dataframe 
    with attached predictions) and builds out a few lineups 
    """
    def __init__(self, df):
        self.df = df
        self.current_salary = 0
        self.no_duplicates = False
        self.top_5_lineups = []
        self.qbs = []
        self.rbs = []
        self.wrs = []
        self.tes = []
        self.flex = []
        self.defs = []
    
    def find_top_10(self, position):
        arr = []
        end_of_range = len(self.df.loc[self.df['Pos']==position])
        if position == 'Flex':
            position_df = self.df.loc[(self.df['Pos']=='RB')|(self.df['Pos']=='TE')|(self.df['Pos']=='WR')]
            end_of_range = (len(self.df.loc[self.df['Pos']=='RB'])+
                            len(self.df.loc[self.df['Pos']=='WR'])+
                            len(self.df.loc[self.df['Pos']=='TE']))
        else:
            position_df = self.df.loc[self.df['Pos']==position]
        if position == 'Def':
            position_df = position_df.sort_values(by='avg_points', ascending=False)
        else:
            position_df = position_df.sort_values(by='pred_scoring_pot', ascending=False)
        # print(position_df)
        for row in range(0,end_of_range):
            player = {
                'name': position_df.iloc[row]['Name'],
                'team': position_df.iloc[row]['Team'],
                'h/a': position_df.iloc[row]['h/a'],
                'pos': position_df.iloc[row]['Pos'],
                'salary': position_df.iloc[row]['DK salary'],
                'avg_points': position_df.iloc[row]['avg_points'],
                'scoring_pot': position_df.iloc[row]['pred_scoring_pot'],
                'act_pts':position_df.iloc[row]['act_pts_scored']
            }
            if len(arr) < end_of_range:
                arr.append(player)
            else: 
                break
        return arr
    
    def get_players(self):
        top_10_qbs = self.find_top_10(position='QB')
        top_10_rbs = self.find_top_10(position='RB')
        top_10_wrs = self.find_top_10(position='WR')
        top_10_tes = self.find_top_10(position='TE')
        top_10_flex = self.find_top_10(position='Flex')
        top_10_defs = self.find_top_10(position='Def')
        return top_10_qbs, top_10_rbs, top_10_wrs, top_10_tes, top_10_flex, top_10_defs
    
    def check_salary(self, lineup):
        current_salary = 0
        for keys in lineup.keys():
            current_salary += lineup[keys]['salary']
        return current_salary
    
    def check_duplicates(self, lineup):
        rb1_name = lineup['RB1']['name']
        rb2_name = lineup['RB2']['name']
        flex_name = lineup['Flex']['name']
        wr1_name = lineup['WR1']['name']
        wr2_name = lineup['WR2']['name']
        wr3_name = lineup['WR3']['name']
        te_name = lineup['TE']['name']
        names = [flex_name, rb1_name, rb2_name, wr1_name, wr2_name, wr3_name, te_name ]
        while len(names) > 1:
            if names[0] in names[1:-1]:
                return False
            else:
                names.pop(0)   
        return True
    
    def shuffle_players(self):
        lineup = {
            'QB': self.qbs[random.randrange(len(self.df.loc[self.df['Pos']=='QB']))],
            'RB1': self.rbs[random.randrange(len(self.df.loc[self.df['Pos']=='RB']))],
            'RB2': self.rbs[random.randrange(len(self.df.loc[self.df['Pos']=='RB']))],
            'WR1': self.wrs[random.randrange(len(self.df.loc[self.df['Pos']=='WR']))],
            'WR2': self.wrs[random.randrange(len(self.df.loc[self.df['Pos']=='WR']))],
            'WR3': self.wrs[random.randrange(len(self.df.loc[self.df['Pos']=='WR']))],
            'TE': self.tes[random.randrange(len(self.df.loc[self.df['Pos']=='TE']))],
            'Flex': self.flex[random.randrange(len(self.df.loc[self.df['Pos']=='RB'])+
                                               len(self.df.loc[self.df['Pos']=='WR'])+
                                               len(self.df.loc[self.df['Pos']=='TE']))],
            'Def': self.defs[random.randrange(len(self.df.loc[self.df['Pos']=='Def']))]
        }
        return lineup
    
    def build_lineup(self):
        self.current_salary = 100*1000
        self.no_duplicates = False
        self.qbs, self.rbs, self.wrs, self.tes, self.flex, self.defs = self.get_players()
        lineup = {
            'QB': self.qbs[0],
            'RB1': self.rbs[0],
            'RB2': self.rbs[1],
            'WR1': self.wrs[0],
            'WR2': self.wrs[1],
            'WR3': self.wrs[2],
            'TE': self.tes[0],
            'Flex': self.flex[9], # started at the end of flex to avoid duplicating players
            'Def': self.defs[0]
        }
        # in theory, because of the legwork done by the algorithm,
        # any lineup should be good as long as it abides by the
        # constraints of DraftKings' team structures. So for
        # now, this will just give us the first 5 lineups that
        # fit within the salary cap and meet the other requirements
        
        while True:
            if self.current_salary < 50*1000 and self.current_salary > 48.5*1000 and self.no_duplicates:
                break
            lineup = self.shuffle_players()
            self.current_salary = self.check_salary(lineup)
            # make sure there are no duplicates
            self.no_duplicates = self.check_duplicates(lineup)
        
        self.top_5_lineups.append(lineup)
    
lineup = Lineup(df_for_lineups)