# User Inputs

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sqlalchemy import create_engine

import os

# NOTE(review): presumably lets duplicate OpenMP runtimes coexist so lightgbm can load;
# the same flag is set again after the helper functions are imported -- confirm it is needed twice
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
#==========
# General Setting
#==========

# root folder of the project; other locations in the notebook are built from it
path = '/Users/Mark/Documents/Github/Fantasy_Football/'

# sqlite file (under path + 'Data/') that holds the positional model inputs
db_name = 'Model_Inputs.sqlite3'

# set to position to analyze: 'RB', 'WR', 'QB', or 'TE'
set_pos = 'WR'


#==========
# Postgres Database
#==========

# postgres login information
# NOTE(review): these credentials are committed in plain text. Each entry can be
# overridden through an environment variable named FF_PG_<KEY> (e.g. FF_PG_PASSWORD);
# consider removing the literal password once the environment is set up.
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432',
    'DATABASE_NAME': 'fantasyfootball'
}
pg_log = {key: os.environ.get('FF_PG_' + key, default) for key, default in pg_log.items()}

# create engine for connecting to database
# - the dialect must be spelled 'postgresql'; the old 'postgres' alias is rejected by
#   SQLAlchemy 1.4 and later
# - user and password are percent-encoded so characters such as '#', '@' or '/' cannot
#   corrupt the URL
from urllib.parse import quote

engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    quote(pg_log['USER'], safe=''), quote(pg_log['PASSWORD'], safe=''),
    pg_log['HOST'], pg_log['PORT'], pg_log['DATABASE_NAME']))

# engine and schema used when writing out / reading back intermediate results
table_info = {
    'engine': engine,
    'schema': 'websitedev',
}

#==========
# Data Filtering
#==========

# year to analyze and the earliest year of history to include
set_year, earliest_year = 2019, 2001

# required games and touches (or passes thrown) for a past season to be considered
req_games, req_touch = 8, 60


#==========
# Fantasy Point Values
#==========

# point value of every statistical category
pass_yd_per_pt = 0.04
pass_td_pt = 5
int_pts = -2
sacks = -1
rush_yd_per_pt = 0.1
rec_yd_per_pt = 0.1
rush_rec_td = 7
ppr = 0.5
fumble = -2

# point values that apply to each position, in the order the scoring code expects
pts_dict = {
    'QB': [pass_yd_per_pt, pass_td_pt, rush_yd_per_pt, rush_rec_td, int_pts, sacks, fumble],
    'RB': [rush_yd_per_pt, rec_yd_per_pt, ppr, rush_rec_td, fumble],
    'WR': [rec_yd_per_pt, ppr, rush_rec_td, fumble],
    'TE': [rec_yd_per_pt, ppr, rush_rec_td, fumble],
}


#==========
# Model Settings
#==========

# minimum correlation with the target for a feature to survive the first reduction
corr_cutoff = 0.25

# VIF threshold used to keep the remaining features
vif_thresh = 5

# number of hyperparameter-search rounds when training the ensemble
iter_rounds = 50

# whether feature importances are plotted after modeling
plot_importance = True

# Load Libraries

In [None]:
# core packages
import pandas as pd
import numpy as np
import os
import sqlite3

# jupyter specifications
# setting this pandas option to None switches off the chained-assignment warning
pd.options.mode.chained_assignment = None
# NOTE(review): InteractiveShell is imported but not referenced in the visible cells
from IPython.core.interactiveshell import InteractiveShell
# IPython magics: draw matplotlib figures inside the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# plotting functions
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# change directory to the helper scripts so the local modules below can be imported
# NOTE(review): the working directory is never changed back afterwards, so it stays at
# Helper_Scripts for the rest of the session
os.chdir(path + 'Scripts/Analysis/Helper_Scripts')

# load custom plot functions and apply them to pyplot
from my_plot import PrettyPlot
PrettyPlot(plt)

# load custom helper functions and ensure lightgbm runs
# NOTE(review): the star-import is presumably where `pos`, `features_target`, `calculate_fp`,
# `corr_removal`, `ReduceVIF`, `validation`, `generate_predictions`, `plot_results` and the
# *_params dictionaries used in later cells come from -- confirm against helper_functions
from helper_functions import *
os.environ['KMP_DUPLICATE_LIB_OK']='True';

# Merge and Clean Data Files

In [None]:
#==========
# Pull and clean compiled data
#==========

# connect to database and pull in positional data; the table is named <position>_<year>.
# The connection is closed as soon as the frame is loaded so no handle is leaked.
conn = sqlite3.connect(path + 'Data/' + db_name)
try:
    df = pd.read_sql_query('SELECT * FROM ' + set_pos + '_' + str(set_year), con=conn)
finally:
    conn.close()

#pff = pd.read_sql_query('SELECT * FROM PFF_Receiving', con=sqlite3.connect(path + 'Data/Season_Stats.sqlite3'))
#df = pd.merge(df, pff, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# split old and new to filter past years based on touches.
# leave all new players in to ensure everyone gets a prediction
old = df[(df[pos[set_pos]['touch_filter']] > req_touch) & (df.games > req_games) & (df.year < set_year-1)].reset_index(drop=True)
this_year = df[df.year==set_year-1]

# merge old and new back together after filtering
df = pd.concat([old, this_year], axis=0)

# create dataframes to store results: (player, year) keys for the training rows and
# player keys for the rows that will be predicted
df_train_results = pd.DataFrame([old.player, old.year]).T
df_test_results = pd.DataFrame([this_year.player]).T

# calculate FP and generate list of relevant metrics
df = calculate_fp(df, pts_dict, pos=set_pos).reset_index(drop=True)

In [None]:
#==========
# Loop to create statistical predictions
#==========

# output[metric][k] holds the trained models and search results for each draft-position
# chunk k; output[metric]['cols'] holds the feature columns that were modeled
output = {}

# folder that collects the hyperparameter-search summaries. It is derived from the core
# `path` rather than re-assigning `path`, so cells that build other locations from `path`
# still work when they are re-run after this loop.
param_search_dir = path + 'Scripts/Analysis/ParamSearch/'

for metric in pos[set_pos]['metrics']:
    
    # create dataframes to store chunk data
    df_train_chunks = pd.DataFrame()
    df_test_chunks = pd.DataFrame()

    # print which metric is being calculated
    print('Running Models for ' + metric)
    print('----------------------------------')

    #--------
    # Create train and predict dataframes
    #--------
    
    # create training and prediction dataframes
    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=metric)
    
    # drop any rows that have a null target value (likely due to injuries or other missed season)
    df_train_full = df_train_full.dropna(subset=['y_act']).reset_index(drop=True)
    # fill the remaining gaps with column means; numeric_only=True skips text columns such
    # as `player`, which pandas 2.x refuses to average
    df_train_full = df_train_full.fillna(df_train_full.mean(numeric_only=True))
    df_predict_full = df_predict_full.dropna().reset_index(drop=True)
    
    # loop through different k's specific to draft position cutoffs
    for k in [1]:
        
        if k == 1:
            
            df_train = df_train_full
            df_predict = df_predict_full
            #df_train = df_train_full[df_train_full.avg_pick < 4.6].reset_index(drop=True)
            #df_predict = df_predict_full[df_predict_full.avg_pick < 4.6].reset_index(drop=True)
        
        #if k == 2:
        #    df_train = df_train_full[(df_train_full.avg_pick >= 3.6) & (df_train_full.avg_pick < 4.6)].reset_index(drop=True)
        #    df_predict = df_predict_full[(df_predict_full.avg_pick >= 3.6) & (df_predict_full.avg_pick < 4.6)].reset_index(drop=True)
        
        #if k == 2:
        #    df_train = df_train_full[df_train_full.avg_pick >= 4.6].reset_index(drop=True)
        #    df_predict = df_predict_full[df_predict_full.avg_pick >= 4.6].reset_index(drop=True)
            
        #--------
        # Remove low correlation features and high VIF features
        #--------

        # remove low correlation features
        df_train, df_predict = corr_removal(df_train, df_predict, corr_cutoff=corr_cutoff)

        # select only features with low vif for modeling
        transformer = ReduceVIF(thresh=vif_thresh, scale=True, print_progress=False)
        df_train_ = transformer.fit_transform(df_train.drop(['y_act', 'player'], axis=1), df_train.y_act)

        # extract best columns and filter down df_predict; the identifier columns are
        # re-added in case the VIF step removed them, and duplicated names are dropped
        best_cols = list(df_train_.columns)
        best_cols.extend(['player', 'year', 'avg_pick'])
        df_predict = df_predict[best_cols]
        df_predict = df_predict.loc[:,~df_predict.columns.duplicated()]

        # add target and filter down df_train
        best_cols.extend(['y_act', 'year', 'avg_pick'])
        df_train = df_train[best_cols]
        df_train = df_train.loc[:,~df_train.columns.duplicated()]

        #--------
        # Run ensemble model with parameter optimization
        #--------

        # generate a master dictionary of parameters keyed by estimator name
        # (the order of param_list must match the order of est_names)
        param_list = [lgbm_params, xgb_params, lasso_params, ridge_params]
        est_names = ['lgbm', 'xgb', 'lasso', 'ridge']
        params = dict(zip(est_names, param_list))

        print('Training Ensemble Model')
        param_results, summary, df_train_results_, errors = validation(est_names, params, df_train, iterations=iter_rounds, random_state=1234)

        #--------
        # Print best results
        #--------

        # print a summary of error metrics, weightings of various models, and a comparison to 
        # using straight adp as a prediction for next year's stats
        print(summary.head(10))
        # append the summary to a csv named after the current filter / model settings
        label = '{}{}_G{}_Touch{}_Corr{}_VIF{}.csv'.format(param_search_dir, set_pos, req_games, req_touch, corr_cutoff, vif_thresh)
        summary.to_csv(label, index=False, mode='a')

        # pull out the best result for the random hyperparameter search of models
        best_result = summary.index[0]

        # pass the best hyperparameters into the generate_predictions function, which
        # will return the test results for the current year and the trained best models
        df_test_results_, models = generate_predictions(best_result, param_results, summary, df_train, df_predict)

        #--------
        # Aggregate all results through merging
        #--------

        # add models and params to the output dictionary; setdefault keeps the entries of
        # earlier k chunks instead of resetting the metric on every pass
        output.setdefault(metric, {})
        output[metric][k] = {}
        output[metric][k]['models'] = models
        output[metric][k]['params'] = param_results

        # add columns to output dictionary
        cols = list(df_train.columns)
        cols.remove('y_act')
        cols.remove('player')
        output[metric]['cols'] = cols
        
        # concat the chunk for each metric together into one dataframe
        df_train_results_ = df_train_results_.rename(columns={'pred': 'pred_' + metric})
        df_train_results_ = df_train_results_[['player', 'year', 'pred_' + metric]]
        df_train_chunks = pd.concat([df_train_chunks, df_train_results_], axis=0).reset_index(drop=True)
        
        # concat the chunk for each metric together into one dataframe
        df_test_results_ = df_test_results_.rename(columns={'pred': 'pred_' + metric})
        df_test_results_ = df_test_results_[['player', 'pred_' + metric]]
        df_test_chunks = pd.concat([df_test_chunks, df_test_results_], axis=0).reset_index(drop=True)
    
    # merge the train results for the given metric with all other metric outputs
    df_train_results = pd.merge(df_train_results, df_train_chunks, 
                                how='inner', left_on=['player', 'year'], right_on=['player', 'year'])
    # merge the test results for the given metric with all other metric outputs
    df_test_results = pd.merge(df_test_results, df_test_chunks, 
                               how='inner', left_on='player', right_on='player')

# after loop, set the df_train to have the y_act as fp_per_game
df_train, df_predict = features_target(df, earliest_year, set_year-1, 
                                           pos[set_pos]['med_features'], 
                                           pos[set_pos]['sum_features'],
                                           pos[set_pos]['max_features'], 
                                           pos[set_pos]['age_features'],
                                           target_feature='fp_per_game')

## Write out results to Postgres

In [None]:
#--------
# Append additional stat categories to the results
#--------

# add actual results and adp to the train df
train_results = pd.merge(df_train_results, df_train[['player', 'year', 'age', 'year_exp', 'avg_pick', 'y_act']],
                           how='inner', left_on=['player', 'year'], right_on=['player', 'year']).drop('year', axis=1)

# add adp to the test df
test_results = pd.merge(df_test_results, df_predict[['player', 'age', 'year_exp', 'avg_pick']],
                           how='inner', left_on='player', right_on='player')

#--------
# Set up proper database connections to save out single dataset
#--------

# write both frames as <position>_<Train|Test>_<year>. The engine and schema come from
# table_info so the writes land where the export cell later reads them back from.
for split_name, split_results in [('Train', train_results), ('Test', test_results)]:
    split_results.to_sql('{}_{}_{}'.format(set_pos, split_name, set_year), table_info['engine'],
                         schema=table_info['schema'], index=False, if_exists='replace')

# #--------
# # Calculate Fantasy Points for Given Scoring System
# #-------- 

# # extract points list and get the idx of point attributes based on length of list
# pts_list = pts_dict[set_pos][:-1]
# print(pts_list)
# c_idx = len(pts_list) + 1

# # multiply stat categories by corresponding point values
# train_results.iloc[:, 1:c_idx] = train_results.iloc[:, 1:c_idx] * pts_list
# test_results.iloc[:, 1:c_idx] = test_results.iloc[:, 1:c_idx] * pts_list

# # add a total predicted points stat category
# train_results.loc[:, 'pred'] = train_results.iloc[:, 1:c_idx].sum(axis=1)
# test_results.loc[:, 'pred'] = test_results.iloc[:, 1:c_idx].sum(axis=1)

# #==========
# # Plot Predictions for Each Player
# #==========

# # set length of plot based on number of results
# plot_length = int(test_results.shape[0] / 3.5)

# # plot results from highest predicted FP to lowest predicted FP
# test_results.sort_values('pred').plot.barh(x='player', y='pred', figsize=(5, plot_length));

In [None]:
#==========
# If desired, plot feature importances for a given metric / model
#==========

# the plot_importance flag from the user-input cell decides whether anything is drawn
# (it is intentionally not re-assigned here)
if plot_importance:
    
    # metric to inspect, draft-position chunk (k) and position (j) in the trained model list
    metric = 'rush_yd_per_game'
    k = 1
    j = 3

    model = output[metric][k]['models'][j]

    # models without feature_importances_ fall back to their coefficients; only the
    # missing attribute is handled so any other failure surfaces instead of being retried
    try:
        importances = model.feature_importances_
    except AttributeError:
        importances = model.coef_

    plot_results(importances, col_names=output[metric]['cols']);

# Creating Salary, Injury Tables and SQLite Output

In [None]:
# load the salary csv and replace this year's salary table in the website schema
salary_table = 'salaries_' + str(set_year)
sal = pd.read_csv('/Users/Mark/Desktop/salaries.csv')
sal.to_sql(salary_table, engine, schema='websitedev', index=False, if_exists='replace')

In [None]:
from sklearn.preprocessing import StandardScaler

# names given to the three scaled risk inputs
risk_cols = ['pct_miss_one', 'proj_games_missed', 'pct_per_game']

# read the header-less injury file and label its columns
inj = pd.read_csv('/Users/Mark/Desktop/injury_predictor_2019.csv',  encoding='latin-1', header=None)
inj.columns = ['player', 'pct_miss_one', 'proj_games_missed', 'inj_pct_per_game', 'inj_risk', 'points']

# keep only the text before the first comma of the player field
inj['player'] = inj['player'].apply(lambda raw_name: raw_name.split(',')[0])

# turn the percentage strings into floats
for pct_col in ['pct_miss_one', 'inj_pct_per_game']:
    inj[pct_col] = inj[pct_col].apply(lambda raw_pct: float(raw_pct.strip('%')))

inj = inj.drop(['points', 'inj_risk'], axis=1)

# standardize the three risk inputs, then shift every column so its minimum is zero
X = StandardScaler().fit_transform(inj.iloc[:, 1:])
scaled_risk = pd.DataFrame(X, columns=risk_cols)
inj = pd.concat([inj[['player']], scaled_risk], axis=1)
for col in risk_cols:
    inj[col] = inj[col] + abs(inj[col].min())

# average the shifted inputs into a single score and order players by it
inj['mean_risk'] = inj[risk_cols].mean(axis=1)
inj = inj[['player', 'mean_risk']].sort_values(by='mean_risk').reset_index(drop=True)

# manual overrides for individual players
inj.loc[inj['player'] == 'Kareem Hunt', 'mean_risk'] = 8
inj.loc[inj['player'] == 'Melvin Gordon', 'mean_risk'] += 1

inj = inj.loc[:, ['player', 'mean_risk']]
inj.to_sql('injuries_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')

In [None]:
# copy the Postgres result tables into the sqlite file read by the simulation
db = sqlite3.connect('/Users/Mark/Desktop/FF_Sim/SimInput_v1.sqlite3')

try:
    # a separate loop variable is used so the notebook-wide `set_pos` setting is not
    # overwritten (later cells still look up pos[set_pos])
    for position in ['QB', 'RB', 'WR', 'TE']:

        train_table = '{}_Train_{}'.format(position, set_year)
        test_table = '{}_Test_{}'.format(position, set_year)

        train = pd.read_sql_query('SELECT * FROM {}."{}"'.format(table_info['schema'], train_table),
                                  table_info['engine'])
        test = pd.read_sql_query('SELECT * FROM {}."{}"'.format(table_info['schema'], test_table),
                                 table_info['engine'])

        train.to_sql(name=train_table, con=db, if_exists='replace', index=False)
        test.to_sql(name=test_table, con=db, if_exists='replace', index=False)

    # salary and injury tables follow set_year, matching the names they were written under
    sal_table = 'salaries_{}'.format(set_year)
    sal = pd.read_sql_query('SELECT * FROM {}."{}"'.format(table_info['schema'], sal_table),
                            table_info['engine'])
    sal.to_sql(name=sal_table, con=db, if_exists='replace', index=False)

    inj_table = 'injuries_{}'.format(set_year)
    inj = pd.read_sql_query('SELECT * FROM {}."{}"'.format(table_info['schema'], inj_table),
                            table_info['engine'])
    inj.to_sql(name=inj_table, con=db, if_exists='replace', index=False)
finally:
    # always release the sqlite file handle, even if one of the copies fails
    db.close()

# Comparing Fantasy Pros

In [None]:
# metrics that make up the fantasy score and the point value applied to each
# (a variant that also includes rushing yards would be
#  ['rush_yd_per_game', 'rec_yd_per_game', 'rec_per_game', 'td_per_game'] with [0.1, 0.1, 0.5, 7])
scoring_metrics = ['rec_yd_per_game', 'rec_per_game', 'td_per_game']
scoring_weights = [0.1, 0.5, 7]

# collect the actual target values of every metric, one list per metric
fantasy_points = []
for m in scoring_metrics:
    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=m)
    fantasy_points.append(list(df_train_full['y_act'].values))

# one column per metric, weighted and summed row-wise into a single fantasy-point series
per_metric_actuals = pd.DataFrame(fantasy_points).T.reset_index(drop=True)
weighted_actuals = per_metric_actuals * scoring_weights
fantasy_pts = weighted_actuals.sum(axis=1)
fantasy_pts.name = 'fantasy_pts'

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# load the FantasyPros table once (it does not depend on the metric) and close the
# connection straight away instead of opening a new one on every pass of the loop
fp_conn = sqlite3.connect('/Users/Mark/Documents/Github/Fantasy_Football/Data/Season_Stats.sqlite3')
try:
    fp = pd.read_sql_query('SELECT * FROM FantasyPros', con=fp_conn)
finally:
    fp_conn.close()

# shift the FantasyPros year back by one before joining on (player, year)
# NOTE(review): presumably aligns a projection season with the feature season -- confirm
fp.year = fp.year-1

# columns fed to the FantasyPros-only model, and columns removed for the all-features model
fp_cols = ['year', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']
non_model_cols = ['player', 'team', 'pos', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']

lass = Lasso(alpha=250)

# y is the modeled target; y_other is excluded from the features and used for the
# fantasy-point comparison in the following cells
#     y = 'fantasy_pts'
#     y_other = 'y_act'
y_other = 'fantasy_pts'
y = 'y_act'

# one list of out-of-sample predictions per metric
stats_fp = []
stats_all = []
#for metric in ['rush_yd_per_game', 'rec_yd_per_game', 'rec_per_game', 'td_per_game']:
for metric in ['rec_yd_per_game', 'rec_per_game', 'td_per_game']:

    df_train_full, df_predict_full = features_target(df,
                                                     earliest_year, set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=metric)

    # attach the actual fantasy points and the FantasyPros columns, then keep complete
    # rows whose `fp` value is above 5
    df_train_full = pd.concat([df_train_full, fantasy_pts], axis=1)
    df_train_full = pd.merge(df_train_full, fp, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])
    df_train_full = df_train_full[df_train_full.fp > 5]
    df_train_full = df_train_full.dropna()

    # per-metric accumulators; note they are reset for every metric
    stat_all = []
    stat_fp = []
    results_all = []
    results_fp = []

    # walk forward one season at a time: train on every earlier year, predict year i
    # (`i` is deliberately left in scope; later cells read its final value)
    for i in [2012, 2013, 2014, 2015, 2016, 2017]:

        print(i)
        X_train = df_train_full.loc[df_train_full.year < i].drop([y_other, y], axis=1)
        y_train = df_train_full.loc[df_train_full.year < i, y]

        X_fp = X_train[fp_cols]
        X_all = X_train.drop(non_model_cols, axis=1)

        X_predict = df_train_full.loc[df_train_full.year == i].drop([y_other, y], axis=1)
        X_pred_fp = X_predict[fp_cols]
        X_pred_all = X_predict.drop(non_model_cols, axis=1)

        y_pred = df_train_full.loc[df_train_full.year == i, y]

        # model built only on the FantasyPros columns
        lass.fit(X_fp, y_train)
        fp_pred = lass.predict(X_pred_fp)
        stat_fp.extend(list(fp_pred))
        fp_rmse = round(np.sqrt(mean_squared_error(y_pred, fp_pred)), 3)
        print('FP error:', fp_rmse)
        results_fp.append(fp_rmse)

        # model built on every remaining feature
        lass.fit(X_all, y_train)
        all_pred = lass.predict(X_pred_all)
        stat_all.extend(list(all_pred))
        all_rmse = round(np.sqrt(mean_squared_error(y_pred, all_pred)), 3)
        print('All error:', all_rmse)
        results_all.append(all_rmse)
    
    stats_fp.append(stat_fp)
    stats_all.append(stat_all)

In [None]:
# average of the per-year errors; results_fp / results_all are re-created for each metric
# in the comparison loop, so these figures describe the last metric that was run
fp_mean_error = round(np.mean(results_fp), 3)
print('Fantasy Pros straight FP Error:', fp_mean_error)
all_mean_error = round(np.mean(results_all), 3)
print('All straight FP Error:', all_mean_error)

In [None]:
# point value of each predicted metric, in the order the metrics were modeled
# (the variant with rushing yards would use [0.1, 0.1, 0.5, 7])
stat_weights = [0.1, 0.5, 7]

# one column per metric -> weighted row sums give predicted fantasy points
all_by_metric = pd.DataFrame(stats_all).T
fp_by_metric = pd.DataFrame(stats_fp).T
df_all = (all_by_metric * stat_weights).sum(axis=1)
df_fp = (fp_by_metric * stat_weights).sum(axis=1)

# actual fantasy points for the evaluated seasons (after 2011, up to the last year looped over)
# NOTE(review): predictions were appended year by year while y_test keeps df_train_full's
# row order -- confirm the frame is sorted by year so the rows line up
eval_rows = (df_train_full.year <= i) & (df_train_full.year > 2011)
y_test = df_train_full.loc[eval_rows, y_other]

In [None]:
# root-mean-squared error of each weighted prediction series against y_test
lasso_rmse = np.sqrt(mean_squared_error(df_all, y_test))
print('Lasso error:', round(lasso_rmse, 2))
fantasypros_rmse = np.sqrt(mean_squared_error(df_fp, y_test))
print('FantasyPros error:', round(fantasypros_rmse, 2))

In [None]:
# (player, year) keys of the evaluated seasons, joined to the ensemble's training predictions
eval_keys = df_train_full.loc[(df_train_full.year <= i) & (df_train_full.year > 2011), ['player', 'year']]
full_models = pd.merge(eval_keys, df_train_results,
                       how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

In [None]:
# weight every prediction column (everything after player and year) and sum per row
# (the variant with rushing yards would use [0.1, 0.1, 0.5, 7])
prediction_weights = [0.1, 0.5, 7]
prediction_cols = full_models.iloc[:, 2:]
full_models['fantasy_pts'] = (prediction_cols * prediction_weights).sum(axis=1)

In [None]:
# root-mean-squared error of the ensemble's fantasy points against y_test
ensemble_rmse = np.sqrt(mean_squared_error(full_models.fantasy_pts, y_test))
print('All Models:', round(ensemble_rmse, 2))

In [None]:
# difference between the two values divided by their mean (relative gap)
# NOTE(review): 4.34 and 3.93 look like hand-copied error figures -- confirm their source
(4.34-3.93) / (np.mean([4.34, 3.93]))

In [None]:
# difference between the two values divided by their mean (relative gap)
# NOTE(review): 3.2 and 2.88 look like hand-copied error figures -- confirm their source
(3.2 - 2.88) / (np.mean([3.2, 2.88]))