# User Inputs

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sqlalchemy import create_engine

import os

# opt in to KMP_DUPLICATE_LIB_OK for this process before the modeling libraries are used
# (the helper-import cell sets the same flag with the note "ensure lightgbm runs")
os.environ.update(KMP_DUPLICATE_LIB_OK='True')

In [None]:
#==========
# General Setting
#==========

# root folder of the project; later cells append sub-folders by plain string
# concatenation (e.g. path + 'Data/' + db_name), so the trailing slash is required
path = "/Users/Mark/Documents/Github/Fantasy_Football/"

# SQLite file, looked up under <path>Data/, that holds the per-position input tables
db_name = "Model_Inputs.sqlite3"

# position to analyze; must be one of 'RB', 'WR', 'QB', or 'TE'
set_pos = "WR"


#==========
# Postgres Database
#==========

from urllib.parse import quote

# postgres login information
# NOTE(review): the password is committed in plain text; consider loading it from an
# environment variable or a git-ignored config file and rotating this credential.
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432', 
    'DATABASE_NAME': 'fantasyfootball'
}

# create engine for connecting to database
# - the dialect must be spelled 'postgresql'; the legacy 'postgres' alias is no longer
#   accepted by SQLAlchemy 1.4+ (create_engine fails to load the dialect plugin)
# - user and password are percent-encoded so special characters ('#', '@', '/', ':' ...)
#   cannot be mis-parsed as URL delimiters; SQLAlchemy decodes them again
engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    quote(pg_log['USER'], safe=''),
    quote(pg_log['PASSWORD'], safe=''),
    pg_log['HOST'],
    pg_log['PORT'],
    pg_log['DATABASE_NAME']))

# engine and schema used when reading / writing intermediate results
table_info = {
    'engine': engine,
    'schema': 'websitedev',
}

#==========
# Data Filtering
#==========

# season being projected; the most recent season of data used is set_year - 1,
# and earliest_year is passed to features_target as the start of the history window
set_year = 2019
earliest_year = 2001

# a past season is kept only if the player exceeded both thresholds:
# games played and touches (the per-position 'touch_filter' column, e.g. pass attempts for QB)
req_games = 8
req_touch = 50


#==========
# Fantasy Point Values
#==========

# point value of one unit of each statistical category
# (the *_yd_per_pt names hold points awarded per yard, e.g. 0.04 pts per passing yard)
pass_yd_per_pt = 0.04
pass_td_pt = 5
int_pts = -2
sacks = -1
rush_yd_per_pt = 0.1
rec_yd_per_pt = 0.1
rush_rec_td = 7
ppr = 0.5
fumble = -2

# per-position list of point values; fumble is always the last entry,
# and WR / TE share the same scoring categories
pts_dict = {
    'QB': [pass_yd_per_pt, pass_td_pt, rush_yd_per_pt, rush_rec_td, int_pts, sacks, fumble],
    'RB': [rush_yd_per_pt, rec_yd_per_pt, ppr, rush_rec_td, fumble],
    'WR': [rec_yd_per_pt, ppr, rush_rec_td, fumble],
    'TE': [rec_yd_per_pt, ppr, rush_rec_td, fumble],
}


#==========
# Model Settings
#==========

# minimum correlation with the target for a feature to survive the first reduction pass
corr_cutoff = 0.2

# VIF threshold handed to the ReduceVIF transformer for the remaining features
vif_thresh = 10

# number of hyperparameter-search rounds used when training the ensemble
iter_rounds = 100

# plot feature importances once modeling has finished?
plot_importance = True

# Load Libraries

In [None]:
# core packages
import pandas as pd
import numpy as np
import os
import sqlite3

# jupyter specifications
# turn off pandas' chained-assignment warning for the whole session
pd.options.mode.chained_assignment = None
# NOTE(review): InteractiveShell is imported but not referenced in the visible cells
from IPython.core.interactiveshell import InteractiveShell
# IPython magics (notebook-only syntax): render figures inline, at retina resolution
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# plotting functions
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# switch the working directory to the helper-script folder, presumably so the two
# local modules below can be imported (note: the directory is not switched back afterwards)
helper_dir = path + 'Scripts/Analysis/Helper_Scripts'
os.chdir(helper_dir)

# apply the project's custom plot styling to matplotlib.pyplot
from my_plot import PrettyPlot
PrettyPlot(plt)

# pull every public helper into the notebook namespace, then set the flag
# the original author needed for lightgbm to run
from helper_functions import *
os.environ.update(KMP_DUPLICATE_LIB_OK='True')

# Merge and Clean Data Files

In [None]:
#==========
# Pull and clean compiled data
#==========

# open the model-input database and load the table for the chosen position / year
conn = sqlite3.connect('{}Data/{}'.format(path, db_name))
df = pd.read_sql_query('SELECT * FROM {}_{}'.format(set_pos, set_year), con=conn)

# ay = pd.read_sql_query('SELECT * FROM AirYards', con=sqlite3.connect(path + 'Data/Season_Stats.sqlite3'))
# df = pd.merge(df, ay, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# past seasons must clear the touch and games thresholds; rows from the most recent
# season (set_year - 1) are all kept so that every current player gets a prediction
last_season = set_year - 1
touch_col = pos[set_pos]['touch_filter']
enough_volume = (df[touch_col] > req_touch) & (df.games > req_games)
old = df[enough_volume & (df.year < last_season)].reset_index(drop=True)
this_year = df[df.year == last_season]

# stack the filtered history on top of the unfiltered latest season
df = pd.concat([old, this_year], axis=0)

# result holders: (player, year) keys for training rows, player only for prediction rows
df_train_results = pd.DataFrame([old.player, old.year]).T
df_test_results = pd.DataFrame([this_year.player]).T

# add fantasy points for the configured scoring system via the calculate_fp helper
df = calculate_fp(df, pts_dict, pos=set_pos).reset_index(drop=True)

In [None]:
#---------
# QB Dictionary
#---------
# Feature configuration for quarterbacks. The four *_features lists are passed to
# features_target (as med / sum / max / age feature groups) in the modeling loop.

# initialize QB dictionary
pos['QB'] = {}

# total touch filter name (column compared against req_touch when filtering past seasons)
pos['QB']['touch_filter'] = 'qb_att'

# metrics to calculate stats for (one model run per metric, in this order)
pos['QB']['metrics'] = ['qb_yd_per_game', 'pass_td_per_game','rush_yd_per_game', 
                        'rush_td_per_game' ,'int_per_game', 'sacks_per_game' ]

# median feature categories
pos['QB']['med_features'] = ['fp', 'qb_tds','qb_rating', 'qb_yds', 'pass_off', 'qb_complete_pct', 'qb_td_pct', 
                             'sack_pct', 'avg_pick', 'sacks_allowed', 'qbr', 'adj_yd_per_att', 'adj_net_yd_per_att',
                             'int', 'int_pct', 'rush_att', 'rush_yds', 'rush_td', 'rush_yd_per_att',
                             'rz_20_pass_complete', 'rz_20_pass_att',
                               'rz_20_complete_pct', 'rz_20_pass_yds', 'rz_20_pass_td', 'rz_20_int',
                               'rz_10_pass_complete', 'rz_10_pass_att', 'rz_10_complete_pct',
                               'rz_10_pass_yds', 'rz_10_pass_td', 'rz_10_int', 'rz_20_rush_att',
                               'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_rush_pct', 'rz_10_rush_att',
                               'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_rush_pct', 'rz_5_rush_att',
                               'rz_5_rush_yds', 'rz_5_rush_td', 'rz_5_rush_pct']
# max feature categories
pos['QB']['max_features'] = ['fp', 'qb_rating', 'qb_yds', 'qb_tds', 'int', 'int_pct', 'sack_pct', 'rush_yd_per_att']
# age feature categories
pos['QB']['age_features'] = ['fp', 'qb_rating', 'qb_yds', 'qb_complete_pct', 'qb_td_pct', 'sack_pct', 
                             'avg_pick', 'qbr', 'int', 'int_pct', 'rush_att', 'rush_yds', 'rush_td', 'rush_yd_per_att']
# sum feature categories
pos['QB']['sum_features'] = ['qb_tds', 'qb_yds', 'fourth_qt_comeback', 'game_winning_drives', 'fp']



# initialize RB dictionary
# (feature configuration for running backs; the four *_features lists are passed to
# features_target as the med / sum / max / age feature groups)
pos['RB'] = {}

# total touch filter name (column compared against req_touch when filtering past seasons)
pos['RB']['touch_filter'] = 'total_touches'

# metrics to be predicted for fantasy point generation
pos['RB']['metrics'] = ['rush_yd_per_game', 'rec_yd_per_game', 'rec_per_game', 'td_per_game']

# median feature categories
pos['RB']['med_features'] = ['fp', 'tgt', 'receptions', 'total_touches', 'rush_yds', 'rec_yds', 
                           'rush_yd_per_game', 'rec_yd_per_game', 'rush_td', 'games_started', 
                           'qb_rating', 'qb_yds', 'pass_off', 'tm_rush_td', 'tm_rush_yds', 
                           'tm_rush_att', 'adjust_line_yds', 'ms_rush_yd', 'ms_rec_yd', 'ms_rush_td',
                           'avg_pick', 'fp_per_touch', 'team_rush_avg_att',
                            'rz_20_rush_att', 'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                            'rz_10_rush_att', 'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rz_5_rush_att', 'rz_5_rush_yds', 'rz_5_rush_td']

# sum feature categories
pos['RB']['sum_features'] = ['total_touches', 'att', 'total_yds', 'rush_td', 'fp', 'rec_yds', 
                             'rush_yds', 'qb_yds']

# max feature categories
pos['RB']['max_features'] = ['fp', 'rush_td', 'tgt', 'rush_yds', 'rec_yds', 'total_yds', 
                             'rush_yd_per_game', 'rec_yd_per_game', 'ms_rush_yd']

# age feature categories
pos['RB']['age_features'] = ['fp', 'rush_yd_per_game', 'rec_yd_per_game', 'total_touches', 'receptions', 'tgt',
                             'ms_rush_yd', 'ms_rec_yd', 'available_rush_att', 'available_tgt', 'total_touches_sum',
                             'total_yds_sum', 'avg_pick', 'fp_per_touch', 'ms_rush_yd_per_att', 'ms_tgts',
                            'rz_20_rush_att', 'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                            'rz_10_rush_att', 'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rz_5_rush_att', 'rz_5_rush_yds', 'rz_5_rush_td']

# initialize WR dictionary
# (feature configuration for wide receivers; the four *_features lists are passed to
# features_target as the med / sum / max / age feature groups)
pos['WR'] = {}

# total touch filter name (column compared against req_touch when filtering past seasons)
pos['WR']['touch_filter'] = 'tgt'

# metrics to calculate stats for (one model run per metric, in this order)
pos['WR']['metrics'] = ['rec_yd_per_game', 'rec_per_game', 'td_per_game']

# median feature categories
pos['WR']['med_features'] = ['fp', 'tgt', 'receptions', 'rec_yds', 'rec_yd_per_game', 'rec_td', 'games_started', 
                             'qb_rating', 'qb_yds', 'pass_off', 'ms_tgts', 'ms_rec_yd', 
                             'tm_net_pass_yds', 'avg_pick',  'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]
# sum feature categories
pos['WR']['sum_features'] = ['receptions', 'rec_yds', 'tgt']

# max feature categories
# NOTE(review): unlike the TE list, no rz_10_* columns are included here — confirm this is intended
pos['WR']['max_features'] = ['fp', 'rec_td', 'tgt', 'ms_tgts', 'ms_rec_yd', 'rec_yd_per_game',
                             'rz_20_tgt', 'rz_20_receptions', 
                             'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',]

# age feature categories
pos['WR']['age_features'] = ['fp', 'rec_yd_per_game', 'receptions', 'tgt', 'ms_tgts', 'ms_rec_yd', 
                             'avg_pick', 'ms_yds_per_tgts', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]

# initialize TE dictionary
# (feature configuration for tight ends; the four *_features lists are passed to
# features_target as the med / sum / max / age feature groups)
pos['TE'] = {}

# total touch filter name (column compared against req_touch when filtering past seasons)
pos['TE']['touch_filter'] = 'tgt'

# metrics to calculate stats for (one model run per metric, in this order)
pos['TE']['metrics'] = ['rec_yd_per_game', 'rec_per_game', 'td_per_game']

# median feature categories
pos['TE']['med_features'] = ['fp', 'tgt', 'receptions', 'rec_yds', 'rec_yd_per_game', 'rec_td', 'games_started', 
                             'qb_rating', 'qb_yds', 'pass_off', 'ms_tgts', 'ms_rec_yd', 
                             'tm_net_pass_yds', 'avg_pick','rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]
# sum feature categories
pos['TE']['sum_features'] = ['receptions', 'rec_yds', 'tgt', 'rec_td', 'qb_yds']

# max feature categories
pos['TE']['max_features'] = ['fp', 'rec_td', 'tgt', 'ms_tgts', 'rec_yds', 'ms_rec_yd', 'rec_yd_per_game',
                             'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]

# age feature categories
pos['TE']['age_features'] = ['fp', 'rec_yd_per_game', 'receptions', 'tgt', 'ms_tgts', 'ms_rec_yd', 
                             'avg_pick', 'ms_yds_per_tgts','rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]

In [None]:
#==========
# Loop to create statistical predictions
#==========
# For every target metric of the selected position:
#   1. build training / prediction frames with features_target
#   2. drop low-correlation features, then high-VIF features
#   3. run the ensemble hyperparameter search (validation) and append its summary to a csv
#   4. predict the upcoming season with the best configuration
#   5. merge the per-metric predictions into df_train_results / df_test_results
# Trained models, search results and the model columns are collected in `output`.

output = {}

# folder receiving the hyperparameter-search summaries. It gets its own variable so the
# notebook-wide `path` (project root used by the helper-import and data-loading cells)
# is not overwritten when this cell runs.
param_search_path = '/Users/Mark/Documents/Github/Fantasy_Football/Scripts/Analysis/ParamSearch/'

for metric in pos[set_pos]['metrics']:
    
    # create dataframes to store chunk data
    df_train_chunks = pd.DataFrame()
    df_test_chunks = pd.DataFrame()

    # print which metric is being calculated
    print('Running Models for ' + metric)
    print('----------------------------------')

    #--------
    # Create train and predict dataframes
    #--------
    
    # create training and prediction dataframes
    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=metric)
    
    # drop any rows that have a null target value (likely due to injuries or other missed season)
    df_train_full = df_train_full.dropna(subset=['y_act']).reset_index(drop=True)
    # fill remaining gaps with column means; numeric_only keeps string columns such as
    # 'player' out of the mean (pandas >= 2 raises on them otherwise)
    df_train_full = df_train_full.fillna(df_train_full.mean(numeric_only=True))
    df_predict_full = df_predict_full.dropna().reset_index(drop=True)
    
    print('Number of Training Samples:', df_train_full.shape[0])
    
    # one entry per metric, created before the k loop so that results of several
    # draft-position chunks can live side by side instead of overwriting each other
    output[metric] = {}
    
    # loop through different k's specific to draft position cutoffs
    for k in [1]:
        
        if k == 1:
            
            df_train = df_train_full
            df_predict = df_predict_full
            #df_train = df_train_full[df_train_full.avg_pick < 4.6].reset_index(drop=True)
            #df_predict = df_predict_full[df_predict_full.avg_pick < 4.6].reset_index(drop=True)
        
        #if k == 2:
        #    df_train = df_train_full[(df_train_full.avg_pick >= 3.6) & (df_train_full.avg_pick < 4.6)].reset_index(drop=True)
        #    df_predict = df_predict_full[(df_predict_full.avg_pick >= 3.6) & (df_predict_full.avg_pick < 4.6)].reset_index(drop=True)
        
        #if k == 2:
        #    df_train = df_train_full[df_train_full.avg_pick >= 4.6].reset_index(drop=True)
        #    df_predict = df_predict_full[df_predict_full.avg_pick >= 4.6].reset_index(drop=True)
            
        #--------
        # Remove low correlation features and high VIF features
        #--------

        # remove low correlation features
        df_train, df_predict = corr_removal(df_train, df_predict, corr_cutoff=corr_cutoff)

        # select only features with low vif for modeling
        transformer = ReduceVIF(thresh=vif_thresh, scale=True, print_progress=False)
        df_train_ = transformer.fit_transform(df_train.drop(['y_act', 'player'], axis=1), df_train.y_act)

        # extract best columns and filter down df_predict
        # (identifier columns are re-added; any duplicates this creates are dropped right after)
        best_cols = list(df_train_.columns)
        best_cols.extend(['player', 'year', 'avg_pick'])
        df_predict = df_predict[best_cols]
        df_predict = df_predict.loc[:,~df_predict.columns.duplicated()]

        # add target and filter down df_train
        best_cols.extend(['y_act', 'year', 'avg_pick'])
        df_train = df_train[best_cols]
        df_train = df_train.loc[:,~df_train.columns.duplicated()]

        #--------
        # Run ensemble model with parameter optimization
        #--------

        # master dictionary of search spaces keyed by estimator name
        # (est_names must line up one-to-one, in order, with param_list)
        param_list = [lgbm_params, xgb_params, lasso_params, ridge_params]
        est_names = ['lgbm', 'xgb', 'lasso', 'ridge']
        params = dict(zip(est_names, param_list))

        print('Training Ensemble Model')
        param_results, summary, df_train_results_, errors = validation(est_names, params, df_train, 
                                                                       iterations=iter_rounds, random_state=1234, skip_years=1)

        #--------
        # Print best results
        #--------

        # print a summary of error metrics, weightings of various models, and a comparison to 
        # using straight adp as a prediction for next year's stats
        print(summary.head(10))
        # append the summary to a csv named after the current filter / model settings
        label = '{}{}_G{}_Touch{}_Corr{}_VIF{}.csv'.format(param_search_path, set_pos, req_games, req_touch,
                                                           corr_cutoff, vif_thresh)
        summary.to_csv(label, index=False, mode='a')

        # pull out the best result for the random hyperparameter search of models
        best_result = summary.index[0]

        # pass the best hyperparameters into the generate_predictions function, which
        # will return the test results for the current year and the trained best models
        df_test_results_, models = generate_predictions(best_result, param_results, summary, df_train, df_predict)

        #--------
        # Aggregate all results through merging
        #--------

        # add models and search results for this chunk to the output dictionary
        output[metric][k] = {}
        output[metric][k]['models'] = models
        output[metric][k]['params'] = param_results

        # add the model input columns to the output dictionary
        cols = list(df_train.columns)
        cols.remove('y_act')
        cols.remove('player')
        output[metric]['cols'] = cols
        
        # concat the chunk for each metric together into one dataframe
        df_train_results_ = df_train_results_.rename(columns={'pred': 'pred_' + metric})
        df_train_results_ = df_train_results_[['player', 'year', 'pred_' + metric]]
        df_train_chunks = pd.concat([df_train_chunks, df_train_results_], axis=0).reset_index(drop=True)
        
        # concat the chunk for each metric together into one dataframe
        df_test_results_ = df_test_results_.rename(columns={'pred': 'pred_' + metric})
        df_test_results_ = df_test_results_[['player', 'pred_' + metric]]
        df_test_chunks = pd.concat([df_test_chunks, df_test_results_], axis=0).reset_index(drop=True)
    
    # merge the train results for the given metric with all other metric outputs
    df_train_results = pd.merge(df_train_results, df_train_chunks, 
                                how='inner', left_on=['player', 'year'], right_on=['player', 'year'])
    # merge the test results for the given metric with all other metric outputs
    df_test_results = pd.merge(df_test_results, df_test_chunks, 
                               how='inner', left_on='player', right_on='player')

# after loop, set the df_train to have the y_act as fp_per_game
df_train, df_predict = features_target(df, earliest_year, set_year-1, 
                                           pos[set_pos]['med_features'], 
                                           pos[set_pos]['sum_features'],
                                           pos[set_pos]['max_features'], 
                                           pos[set_pos]['age_features'],
                                           target_feature='fp_per_game')

## Write out results to Postgres

In [None]:
#--------
# Append additional stat categories to the results
#--------

# add actual results and adp to the train df (year is only needed as a join key)
train_results = pd.merge(df_train_results, df_train[['player', 'year', 'age', 'year_exp', 'avg_pick', 'y_act']],
                         how='inner', on=['player', 'year']).drop('year', axis=1)

# add adp to the test df
test_results = pd.merge(df_test_results, df_predict[['player', 'age', 'year_exp', 'avg_pick']],
                        how='inner', on='player')

#--------
# Set up proper database connections to save out single dataset
#--------

# write through table_info so the schema / engine match the ones the SQLite export
# cell reads from (previously the schema name was repeated here as a literal)
train_results.to_sql(set_pos + '_Train_' + str(set_year), table_info['engine'], schema=table_info['schema'],
                     index=False, if_exists='replace')
test_results.to_sql(set_pos + '_Test_' + str(set_year), table_info['engine'], schema=table_info['schema'],
                    index=False, if_exists='replace')

In [None]:
#--------
# Calculate Fantasy Points for Given Scoring System
#-------- 

# point values for the predicted stat categories (the trailing fumble value is dropped,
# since no fumble metric is predicted)
pts_list = pts_dict[set_pos][:-1]
print(pts_list)

# prediction columns are addressed by name: the train frame carries an extra 'year'
# column, so a shared positional slice would scale 'year' and skip the last prediction
pred_cols = ['pred_' + m for m in pos[set_pos]['metrics']]

train_plot = df_train_results.copy()
test_plot = df_test_results.copy()

# multiply stat categories by corresponding point values
train_plot[pred_cols] = train_plot[pred_cols] * pts_list
test_plot[pred_cols] = test_plot[pred_cols] * pts_list

# add a total predicted points stat category
train_plot['pred'] = train_plot[pred_cols].sum(axis=1)
test_plot['pred'] = test_plot[pred_cols].sum(axis=1)

#==========
# Plot Predictions for Each Player
#==========

# set length of plot based on number of results (at least 1 so the figure height is never zero)
plot_length = max(1, int(test_plot.shape[0] / 3.5))

# plot results from highest predicted FP to lowest predicted FP
test_plot.sort_values('pred').plot.barh(x='player', y='pred', figsize=(5, plot_length));

In [None]:
#==========
# If desired, plot feature importances for a given metric / model
#==========
# Controlled by the plot_importance flag set in the user-input cell; the flag is no
# longer forced to True here, so switching it off actually skips the plot.
if plot_importance:
    
    # metric / chunk / model to inspect
    metric = 'int_per_game'
    if metric not in output:
        # 'int_per_game' is only modeled for QB; fall back to the first metric of the
        # current position instead of failing with a KeyError
        metric = pos[set_pos]['metrics'][0]
    k = 1
    j = 3  # index into the trained model list (presumably follows est_names order — TODO confirm)

    model = output[metric][k]['models'][j]

    # use feature_importances_ when the model exposes it, otherwise fall back to coef_
    # (replaces a bare except that also hid unrelated errors raised while plotting)
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        importances = model.coef_
    plot_results(importances, col_names=output[metric]['cols'])

# Creating Salary, Injury Tables and SQLite Output

### Push Salary to DB

In [None]:
# load the salary file of the season being analyzed and push it to Postgres.
# The file name now follows set_year, like the target table name does, so another
# season's salaries cannot end up in a table labelled with the wrong year.
sal = pd.read_csv('/Users/Mark/Documents/GitHub/Fantasy_Football/Data/OtherData/Salaries/salaries_{}.csv'.format(set_year))
sal.to_sql('salaries_' + str(set_year), table_info['engine'], schema=table_info['schema'],
           index=False, if_exists='replace')

### Push Injuries to DB

In [None]:
from sklearn.preprocessing import StandardScaler

# the injury-predictor export has no header row, so the six columns are named by hand
inj = pd.read_csv('/Users/Mark/Documents/GitHub/Fantasy_Football/Data/OtherData/InjuryPredictor/injury_predictor_2019_v2.csv',
                  header=None, encoding='latin-1')
inj.columns = ['player', 'pct_miss_one', 'proj_games_missed', 'inj_pct_per_game', 'inj_risk', 'points']


def _pct_to_float(text):
    # '12.5%' -> 12.5
    return float(text.strip('%'))


# keep only the part of the player field before the first comma
inj['player'] = inj['player'].apply(lambda raw: raw.split(',')[0])
for pct_col in ['pct_miss_one', 'inj_pct_per_game']:
    inj[pct_col] = inj[pct_col].apply(_pct_to_float)
inj = inj.drop(['points', 'inj_risk'], axis=1)

# standardize the three remaining risk columns, then shift each by |min| so it starts at zero
risk_cols = ['pct_miss_one', 'proj_games_missed', 'pct_per_game']
X = StandardScaler().fit_transform(inj.iloc[:, 1:])
scaled = pd.DataFrame(X, columns=risk_cols)
scaled = scaled + scaled.min().abs()

# overall risk is the row-wise mean of the shifted columns
inj = pd.concat([inj[['player']], scaled], axis=1)
inj['mean_risk'] = scaled.mean(axis=1)
inj = inj[['player', 'mean_risk']].sort_values(by='mean_risk').reset_index(drop=True)

# manual overrides: one player is pinned to a fixed risk, the others get a flat bump
inj.loc[inj.player == 'Kareem Hunt', 'mean_risk'] = 8
risk_bumps = {
    'Melvin Gordon': 2,
    'Todd Gurley': 1,
    'Antonio Brown': 1,
    'Kenyan Drake': 2,
    'Derrius Guice': 2,
}
for name, bump in risk_bumps.items():
    inj.loc[inj.player == name, 'mean_risk'] += bump

inj.to_sql('injuries_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')

### Drop All Data into SQLite Database

In [None]:
# Copy every result table from Postgres into the simulation's SQLite database.
db = sqlite3.connect('/Users/Mark/Desktop/FF_Sim/SimInput_v2.sqlite3')

try:
    # the loop variable is deliberately not called set_pos: reusing that name left the
    # notebook-wide position setting at 'TE' for every cell executed after this one
    for pos_name in ['QB', 'RB', 'WR', 'TE']:
        for split in ['Train', 'Test']:
            table_name = '{}_{}_{}'.format(pos_name, split, set_year)
            frame = pd.read_sql_query('SELECT * FROM {}."{}"'.format(table_info['schema'], table_name),
                                      table_info['engine'])
            frame.to_sql(name=table_name, con=db, if_exists='replace', index=False)

    # salary / injury tables follow set_year, matching the names they were written under
    sal = pd.read_sql_query('SELECT * FROM {}."salaries_{}"'.format(table_info['schema'], set_year),
                            table_info['engine'])
    sal.to_sql(name='salaries_{}'.format(set_year), con=db, if_exists='replace', index=False)

    inj = pd.read_sql_query('SELECT * FROM {}."injuries_{}"'.format(table_info['schema'], set_year),
                            table_info['engine'])
    inj.to_sql(name='injuries_{}'.format(set_year), con=db, if_exists='replace', index=False)
finally:
    # release the SQLite file handle even if one of the copies fails
    db.close()

# Comparing Fantasy Pros

In [None]:
# Rebuild the actual fantasy points per training row from the actual per-game stats.
fantasy_points = []

# stat categories for the current position, taken from the position dictionary so
# they stay in sync with the metrics that were modeled
cat_list = list(pos[set_pos]['metrics'])

# collect the actual value (y_act) of every category
for m in cat_list:
    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=m)
    fantasy_points.append(list(df_train_full.y_act.values))
    
# one column per category, one row per training sample
fantasy_pts = pd.DataFrame(fantasy_points).T.reset_index(drop=True)

# weight each category with the configured point values (fumble, the last entry, has no
# matching category); previously the weights were repeated as literals per position
fantasy_pts = (fantasy_pts * pts_dict[set_pos][:-1]).sum(axis=1)
    
fantasy_pts.name = 'fantasy_pts'

In [None]:
# Walk-forward comparison, per stat category, of two Lasso models:
#   * "FP"  - trained only on the FantasyPros columns
#   * "All" - trained on every other numeric feature
# For each test year the models are fit on all earlier years; predictions are collected
# in stats_fp / stats_all (one list per category, ordered year by year).
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

stats_fp = []
stats_all = []

# the FantasyPros table does not depend on the metric: read it once and close the connection
fp_conn = sqlite3.connect('/Users/Mark/Documents/Github/Fantasy_Football/Data/Season_Stats.sqlite3')
try:
    fp = pd.read_sql_query('SELECT * FROM FantasyPros', con=fp_conn)
finally:
    fp_conn.close()
# shift the year back by one so FantasyPros rows join onto the previous year's feature rows
fp.year = fp.year-1

# column groups used to split the merged frame into the two model inputs
fp_cols = ['year', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']
non_feature_cols = ['player', 'team', 'pos', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']


def _rmse(actual, predicted):
    # root mean squared error, rounded to 3 decimals
    return round(np.sqrt(mean_squared_error(actual, predicted)), 3)


def _clean(frame):
    # Lasso cannot handle inf / NaN: turn both into 0
    return frame.replace([np.inf, -np.inf], np.nan).fillna(0)


for metric in cat_list:

    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=metric)

    # attach the actual fantasy points, then keep only players with FantasyPros data
    df_train_full = pd.concat([df_train_full, fantasy_pts], axis=1)
    df_train_full = pd.merge(df_train_full, fp, how='inner', on=['player', 'year'])
    df_train_full = df_train_full[df_train_full.fp > 5]

    df_train_full = df_train_full.dropna()
    lass = Lasso(alpha=250)
#     y = 'fantasy_pts'
#     y_other = 'y_act'
    
    # y is the column being predicted; y_other is only dropped from the features
    y_other = 'fantasy_pts'
    y = 'y_act'

    stat_all = []
    stat_fp = []
    results_all = []
    results_fp = []
    for i in [2012, 2013, 2014, 2015, 2016, 2017]:

        print(i)
        # train on every season before year i
        train_rows = df_train_full.year < i
        X_train = df_train_full.loc[train_rows].drop([y_other, y], axis=1)
        y_train = df_train_full.loc[train_rows, y]

        X_fp = X_train[fp_cols]
        X_all = X_train.drop(non_feature_cols, axis=1)

        # evaluate on season i
        test_rows = df_train_full.year == i
        X_predict = df_train_full.loc[test_rows].drop([y_other, y], axis=1)
        X_pred_fp = X_predict[fp_cols]
        X_pred_all = X_predict.drop(non_feature_cols, axis=1)

        y_actual = df_train_full.loc[test_rows, y]

        lass.fit(X_fp, y_train)
        fp_pred = lass.predict(X_pred_fp)
        stat_fp.extend(list(fp_pred))
        fp_error = _rmse(y_actual, fp_pred)
        print('FP error:', fp_error)
        results_fp.append(fp_error)

        lass.fit(_clean(X_all), y_train)
        all_pred = lass.predict(_clean(X_pred_all))
        stat_all.extend(list(all_pred))
        all_error = _rmse(y_actual, all_pred)
        print('All error:', all_error)
        results_all.append(all_error)
    
    stats_fp.append(stat_fp)
    stats_all.append(stat_all)
    
    # when predicting fantasy points directly, one pass is enough: report and stop
    if y == 'fantasy_pts':
        print('--------------')
        print('Fantasy Pros straight FP Error:', round(np.mean(results_fp), 3))
        print('All straight FP Error:', round(np.mean(results_all), 3))
        break

In [None]:
#----------------
# Convert Fantasy Pros and Lasso Stat Results to Points
#----------------

# imported here as well so the cell does not depend on an import made inside another cell
from sklearn.metrics import mean_squared_error

# point value per predicted category for the current position (fumble, the last entry,
# has no predicted category); replaces the per-position literal weight lists
pt_values = pts_dict[set_pos][:-1]

# one column per category -> weighted sum = predicted fantasy points per row
df_all = (pd.DataFrame(stats_all).T * pt_values).sum(axis=1)
df_fp = (pd.DataFrame(stats_fp).T * pt_values).sum(axis=1)

# evaluation rows: every test season up to the last one looped over (i).
# The predictions were appended season by season, so the rows are put in the same
# order with a stable sort on year; otherwise the positional comparison below pairs
# predictions with the wrong players whenever the frame is not already sorted by year.
eval_mask = (df_train_full.year <= i) & (df_train_full.year > 2011)
eval_rows = df_train_full.loc[eval_mask].sort_values('year', kind='mergesort')

y_test = eval_rows[y_other]

print('Lasso error:', round(np.sqrt(mean_squared_error(y_test, df_all)), 2))
print('FantasyPros error:', round(np.sqrt(mean_squared_error(y_test, df_fp)), 2))

#----------------
# Merge Fantasy Pros Data with Full Model Results to get Matching Player Sets
#----------------

# NOTE(review): an inner merge can drop rows, in which case full_models and y_test no
# longer have the same length — verify before comparing them
full_models = pd.merge(eval_rows[['player', 'year']],
                       df_train_results,
                       how='inner', on=['player', 'year'])

# columns after (player, year) are the per-category predictions of the full models
full_models['fantasy_pts'] = (full_models.iloc[:, 2:] * pt_values).sum(axis=1)

In [None]:
# root of the mean squared difference between the full-model fantasy points and y_test
rmse_full_models = np.sqrt(abs(mean_squared_error(full_models.fantasy_pts, y_test)))
print('All Models:', round(np.mean(rmse_full_models), 2))

# RB

### No Air Yards (Large Dataset)

In [None]:
# gap between 4.34 and 3.8 as a fraction of their mean
# (presumably benchmark error vs. model error for RB — TODO confirm which is which)
(4.34-3.8) / (np.mean([4.34, 3.8]))

### Using Air Yards (Small Dataset)

In [None]:
# gap between 4.34 and 4 as a fraction of their mean
# (presumably benchmark error vs. model error for RB with air yards — TODO confirm)
(4.34-4) / (np.mean([4.34, 4]))

# WR

### No Air Yards (Large Dataset)

In [None]:
# gap between 3.18 and 2.83 as a fraction of their mean
# (presumably benchmark error vs. model error for WR — TODO confirm which is which)
(3.18 - 2.83) / (np.mean([3.18, 2.83]))

### Using Air Yards (Small Dataset)

In [None]:
# gap between 3.18 and 2.9 as a fraction of their mean
# (presumably benchmark error vs. model error for WR with air yards — TODO confirm)
(3.18 - 2.9) / (np.mean([3.18, 2.9]))

# TE (all years, no air yards)

In [None]:
# gap between 2.66 and 2.42 as a fraction of their mean
# (presumably benchmark error vs. model error for TE — TODO confirm which is which)
(2.66-2.42) / (np.mean([2.66, 2.42]))

# QB

### Starting 2009

In [None]:
# gap between 3.13 and 3.01 as a fraction of their mean
# (presumably benchmark error vs. model error for QB — TODO confirm which is which)
(3.13 - 3.01) / (np.mean([3.13, 3.01]))