# Load Libraries

In [1]:
# core packages
import pandas as pd
import numpy as np
import os
import sqlite3
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sqlalchemy import create_engine
from imblearn.over_sampling import SMOTE
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# jupyter specifications
pd.options.mode.chained_assignment = None
from IPython.core.interactiveshell import InteractiveShell
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# plotting functions
import matplotlib.pyplot as plt
import seaborn as sns

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# set core path
path = '/Users/Mark/Documents/Github/Fantasy_Football/'

# change directory temporarily to helper scripts
os.chdir(path + 'Scripts/Analysis/Helper_Scripts')

# load custom plot functions
from my_plot import PrettyPlot
PrettyPlot(plt)

# load custom helper functions and ensure lightgbm runs
from helper_functions import *
os.environ['KMP_DUPLICATE_LIB_OK']='True';

# User Inputs

In [175]:
#==========
# General Setting
#==========

db_name = 'Model_Inputs.sqlite3'

# set to position to analyze: 'RB', 'WR', 'QB', or 'TE'
set_pos = 'RB'


#==========
# Postgres Database
#==========

# postgres login information
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432', 
    'DATABASE_NAME': 'fantasyfootball'
}

# create engine for connecting to database
engine = create_engine('postgres+psycopg2://{}:{}@{}:{}/{}'.format(pg_log['USER'], pg_log['PASSWORD'], pg_log['HOST'],
                                                                   pg_log['PORT'], pg_log['DATABASE_NAME']))

# specify schema and table to write out intermediate results
table_info = {
    'engine': engine,
    'schema': 'websitedev',
}

#==========
# Data Filtering
#==========

# set year to analyze
set_year = 2019

# set required touches (or pass thrown) and games for consideration
req_games = 8


#==========
# Fantasy Point Values
#==========

# define point values for all statistical categories
pass_yd_per_pt = 0.04 
pass_td_pt = 4
int_pts = -2
sacks = -1
rush_yd_per_pt = 0.1 
rec_yd_per_pt = 0.1
rush_rec_td = 7
ppr = 0.5
fumble = -2

# creating dictionary containing point values for each position
pts_dict = {}
pts_dict['QB'] = [pass_yd_per_pt, pass_td_pt, rush_yd_per_pt, rush_rec_td, int_pts, sacks]
pts_dict['RB'] = [rush_yd_per_pt, rec_yd_per_pt, ppr, rush_rec_td]
pts_dict['WR'] = [rec_yd_per_pt, ppr, rush_rec_td]
pts_dict['TE'] = [rec_yd_per_pt, ppr, rush_rec_td]


#==========
# Model Settings
#==========

pos['QB']['req_touch'] = 50
pos['RB']['req_touch'] = 50
pos['WR']['req_touch'] = 30
pos['TE']['req_touch'] = 30

pos['QB']['earliest_year'] = 2010
pos['RB']['earliest_year'] = 2001
pos['WR']['earliest_year'] = 1999
pos['TE']['earliest_year'] = 1999

pos['QB']['skip_years'] = 1
pos['RB']['skip_years'] = 3
pos['WR']['skip_years'] = 3
pos['TE']['skip_years'] = 3

pos['QB']['iter_rounds'] = 50
pos['RB']['iter_rounds'] = 100
pos['WR']['iter_rounds'] = 100
pos['TE']['iter_rounds'] = 100

pos['QB']['use_ay'] = False
pos['RB']['use_ay'] = False
pos['WR']['use_ay'] = True
pos['TE']['use_ay'] = True

# whether or not to plot feature importances following modeling
plot_importance = True

In [176]:
#==========
# Pull and clean compiled data
#==========

# connect to database and pull in positional data
conn = sqlite3.connect(path + 'Data/' + db_name)
df = pd.read_sql_query('SELECT * FROM ' + set_pos + '_' + str(set_year), con=conn)

if pos[set_pos]['use_ay']:
    ay = pd.read_sql_query('SELECT * FROM AirYards', con=sqlite3.connect(path + 'Data/Season_Stats.sqlite3'))
    df = pd.merge(df, ay, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# pff_rec = pd.read_sql_query('SELECT * FROM PFF_Receiving', con=sqlite3.connect(path + 'Data/Season_Stats.sqlite3'))
# pff_rec = pff_rec.drop_duplicates(subset=['player', 'year']).reset_index(drop=True)
# df = pd.merge(df, pff_rec, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# split old and new to filter past years based on touches.
# leave all new players in to ensure everyone gets a prediction
old = df[(df[pos[set_pos]['touch_filter']] > pos[set_pos]['req_touch']) & \
         (df.games > req_games) & (df.year < set_year-1)].reset_index(drop=True)
this_year = df[df.year==set_year-1]

# merge old and new back together after filtering
df = pd.concat([old, this_year], axis=0)

# create dataframes to store results
df_train_results = pd.DataFrame([old.player, old.year]).T
df_test_results = pd.DataFrame([this_year.player, this_year.year]).T

# calculate FP and generate list of relevant metrics
df = calculate_fp(df, pts_dict, pos=set_pos).reset_index(drop=True)

In [177]:
if pos[set_pos]['use_ay']:
    cols = ['rec_yd_per_game', 'rec_per_game', 'td_per_game', 'wosp', 'air_yards', 
            'rz_20_tgt', 'rz_20_receptions', 'avg_pick', 'min_teammate']
    
elif set_pos == 'QB':
    cols = ['fp', 'qb_tds','qb_rating', 'qb_yds', 'pass_off', 'qb_complete_pct', 'qb_td_pct', 
             'sack_pct', 'avg_pick', 'sacks_allowed', 'qbr', 'adj_yd_per_att', 'adj_net_yd_per_att',
             'int', 'int_pct', 'rush_att', 'rush_yds', 'rush_yd_per_game', 'rush_td', 'rush_yd_per_att']
else:
    cols = ['rec_yd_per_game', 'rec_per_game', 'td_per_game', 'rz_20_rush_att', 'rz_5_rush_att', 
            'rz_20_rush_pct', 'rz_5_rush_pct','ms_rush_att',
            'ms_rush_yd', 'ms_rush_td', 'ms_rec_yd', 'ms_tgts', 'rush_rec_ratio',
            'rz_20_tgt', 'rz_20_receptions', 'avg_pick', 'teammate_diff_min', 'teammate_diff_avg']

def add_exp_metrics(df, cols):
    for lab in cols:
        d = df.groupby('year_exp').agg('mean')[lab].reset_index()
        d = d.rename(columns={lab: lab + '_exp'})
        df = pd.merge(df, d, how='inner', left_on='year_exp', right_on='year_exp')
        df[lab + '_exp_diff'] = df[lab] - df[lab + '_exp']
        df[lab + '_exp_div'] = df[lab] / df[lab + '_exp']
        
    return df
        
df = add_exp_metrics(df, cols)

# Merge and Clean Data Files

In [178]:
#---------
# QB Dictionary
#---------

pos['QB']['med_features'] = ['fp', 'qb_tds','qb_rating', 'qb_yds', 'pass_off', 'qb_complete_pct', 'qb_td_pct', 
                             'sack_pct', 'avg_pick', 'sacks_allowed', 'qbr', 'adj_yd_per_att', 'adj_net_yd_per_att',
                             'int', 'int_pct', 'rush_att', 'rush_yds', 'rush_td', 'rush_yd_per_game', 'rush_yd_per_att',
                             'rz_20_pass_complete', 'rz_20_pass_att',
                               'rz_20_complete_pct', 'rz_20_pass_yds', 'rz_20_pass_td', 'rz_20_int',
                               'rz_10_pass_complete', 'rz_10_pass_att', 'rz_10_complete_pct',
                               'rz_10_pass_yds', 'rz_10_pass_td', 'rz_10_int', 'rz_20_rush_att',
                               'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_rush_pct', 'rz_10_rush_att',
                               'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_rush_pct', 'rz_5_rush_att',
                               'rz_5_rush_yds', 'rz_5_rush_td', 'rz_5_rush_pct']
pos['QB']['max_features'] = ['fp', 'qb_rating', 'qb_yds', 'qb_tds', 'int', 'int_pct', 'sack_pct', 'rush_yd_per_att']
pos['QB']['age_features'] = ['fp', 'qb_rating', 'qb_yds', 'qb_complete_pct', 'qb_td_pct', 'sack_pct', 'rush_yd_per_game', 
                             'avg_pick', 'qbr', 'int', 'int_pct', 'rush_att', 'rush_yds', 'rush_td', 'rush_yd_per_att']
pos['QB']['sum_features'] = ['qb_tds', 'qb_yds', 'fourth_qt_comeback', 'game_winning_drives', 'fp']


#---------
# RB Dictionary
#---------

# median feature categories
pos['RB']['med_features'] = ['fp', 'tgt', 'receptions', 'total_touches', 'rush_yds', 'rec_yds', 
                           'rush_yd_per_game', 'rec_yd_per_game', 'rush_td', 'games_started', 
                           'qb_rating', 'qb_yds', 'pass_off', 'tm_rush_td', 'tm_rush_yds', 
                           'tm_rush_att', 'adjust_line_yds', 'ms_rush_yd', 'ms_rec_yd', 'ms_rush_td',
                           'avg_pick', 'fp_per_touch', 'team_rush_avg_att',
                            'rz_20_rush_att', 'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                            'rz_10_rush_att', 'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rz_5_rush_att', 'rz_5_rush_yds', 'rz_5_rush_td',
                            'rec_yd_per_game_exp', 'rec_yd_per_game_exp_diff',
       'rec_yd_per_game_exp_div', 'rec_per_game_exp', 'rec_per_game_exp_diff',
       'rec_per_game_exp_div', 'td_per_game_exp', 'td_per_game_exp_diff',
       'td_per_game_exp_div', 'rz_20_rush_att_exp', 'rz_20_rush_att_exp_diff',
       'rz_20_rush_att_exp_div', 'rz_5_rush_att_exp', 'rz_5_rush_att_exp_diff',
       'rz_5_rush_att_exp_div', 'rz_20_rush_pct_exp',
       'rz_20_rush_pct_exp_diff', 'rz_20_rush_pct_exp_div',
       'rz_5_rush_pct_exp', 'rz_5_rush_pct_exp_diff', 'rz_5_rush_pct_exp_div',
       'ms_rush_att_exp', 'ms_rush_att_exp_diff', 'ms_rush_att_exp_div',
       'ms_rush_yd_exp', 'ms_rush_yd_exp_diff', 'ms_rush_yd_exp_div',
       'ms_rush_td_exp', 'ms_rush_td_exp_diff', 'ms_rush_td_exp_div',
       'ms_rec_yd_exp', 'ms_rec_yd_exp_diff', 'ms_rec_yd_exp_div',
       'ms_tgts_exp', 'ms_tgts_exp_diff', 'ms_tgts_exp_div',
       'rush_rec_ratio_exp', 'rush_rec_ratio_exp_diff',
       'rush_rec_ratio_exp_div', 'rz_20_tgt_exp', 'rz_20_tgt_exp_diff',
       'rz_20_tgt_exp_div', 'rz_20_receptions_exp',
       'rz_20_receptions_exp_diff', 'rz_20_receptions_exp_div', 'avg_pick_exp',
       'avg_pick_exp_diff', 'avg_pick_exp_div', 'teammate_diff_min_exp',
       'teammate_diff_min_exp_diff', 'teammate_diff_min_exp_div',
       'teammate_diff_avg_exp', 'teammate_diff_avg_exp_diff',
       'teammate_diff_avg_exp_div']

# sum feature categories
pos['RB']['sum_features'] = ['total_touches', 'att', 'total_yds', 'rush_td', 'fp', 'rec_yds', 
                             'rush_yds', 'qb_yds']

# max feature categories
pos['RB']['max_features'] = ['fp', 'rush_td', 'tgt', 'rush_yds', 'rec_yds', 'total_yds', 
                             'rush_yd_per_game', 'rec_yd_per_game', 'ms_rush_yd']

# age feature categories
pos['RB']['age_features'] = ['fp', 'rush_yd_per_game', 'rec_yd_per_game', 'total_touches', 'receptions', 'tgt',
                             'ms_rush_yd', 'ms_rec_yd', 'available_rush_att', 'available_tgt', 'total_touches_sum',
                             'total_yds_sum', 'avg_pick', 'fp_per_touch', 'ms_rush_yd_per_att', 'ms_tgts',
                            'rz_20_rush_att', 'rz_20_rush_yds', 'rz_20_rush_td', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                            'rz_10_rush_att', 'rz_10_rush_yds', 'rz_10_rush_td', 'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rz_5_rush_att', 'rz_5_rush_yds', 'rz_5_rush_td',
                            'rec_yd_per_game_exp', 'rec_yd_per_game_exp_diff',
       'rec_yd_per_game_exp_div', 'rec_per_game_exp', 'rec_per_game_exp_diff',
       'rec_per_game_exp_div', 'td_per_game_exp', 'td_per_game_exp_diff',
       'td_per_game_exp_div', 'rz_20_rush_att_exp', 'rz_20_rush_att_exp_diff',
       'rz_20_rush_att_exp_div', 'rz_5_rush_att_exp', 'rz_5_rush_att_exp_diff',
       'rz_5_rush_att_exp_div', 'rz_20_rush_pct_exp',
       'rz_20_rush_pct_exp_diff', 'rz_20_rush_pct_exp_div',
       'rz_5_rush_pct_exp', 'rz_5_rush_pct_exp_diff', 'rz_5_rush_pct_exp_div',
       'ms_rush_att_exp', 'ms_rush_att_exp_diff', 'ms_rush_att_exp_div',
       'ms_rush_yd_exp', 'ms_rush_yd_exp_diff', 'ms_rush_yd_exp_div',
       'ms_rush_td_exp', 'ms_rush_td_exp_diff', 'ms_rush_td_exp_div',
       'ms_rec_yd_exp', 'ms_rec_yd_exp_diff', 'ms_rec_yd_exp_div',
       'ms_tgts_exp', 'ms_tgts_exp_diff', 'ms_tgts_exp_div',
       'rush_rec_ratio_exp', 'rush_rec_ratio_exp_diff',
       'rush_rec_ratio_exp_div', 'rz_20_tgt_exp', 'rz_20_tgt_exp_diff',
       'rz_20_tgt_exp_div', 'rz_20_receptions_exp',
       'rz_20_receptions_exp_diff', 'rz_20_receptions_exp_div', 'avg_pick_exp',
       'avg_pick_exp_diff', 'avg_pick_exp_div', 'teammate_diff_min_exp',
       'teammate_diff_min_exp_diff', 'teammate_diff_min_exp_div',
       'teammate_diff_avg_exp', 'teammate_diff_avg_exp_diff',
       'teammate_diff_avg_exp_div']


#---------
# WR Dictionary
#---------

# median feature categories
pos['WR']['med_features'] = ['fp', 'tgt', 'receptions', 'rec_yds', 'rec_yd_per_game', 'rec_td', 'games_started', 
                             'qb_rating', 'qb_yds', 'pass_off', 'ms_tgts', 'ms_rec_yd', 
                             'tm_net_pass_yds', 'avg_pick',  'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rec_yd_per_game_exp_diff', 'rec_per_game_exp_diff', 'td_per_game_exp_diff', 
                            'wosp', 'wosp_exp_diff', 'wosp_exp_div', 'air_yards_exp_diff',
                             'rz_20_tgt_exp_diff', 'rz_20_receptions_exp_diff'
                            ]

# sum feature categories
pos['WR']['sum_features'] = ['receptions', 'rec_yds', 'tgt']

# max feature categories
pos['WR']['max_features'] = ['fp', 'rec_td', 'tgt', 'ms_tgts', 'ms_rec_yd', 'rec_yd_per_game',
                             'rz_20_tgt', 'rz_20_receptions', 
                             'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',]

# age feature categories
pos['WR']['age_features'] = ['fp', 'rec_yd_per_game', 'receptions', 'tgt', 'ms_tgts', 'ms_rec_yd', 
                             'avg_pick', 'ms_yds_per_tgts', 'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rec_yd_per_game_exp_diff', 'rec_per_game_exp_diff', 'td_per_game_exp_diff',
                            'rec_yd_per_game_exp_div', 'rec_per_game_exp_div', 'td_per_game_exp_div',
                            'wosp', 'wosp_exp_diff', 'wosp_exp_div', 'air_yards_exp_diff',
                             'rz_20_tgt_exp_diff', 'rz_20_receptions_exp_diff'
                            ]

#---------
# TE Dictionary
#---------

# median feature categories
pos['TE']['med_features'] = ['fp', 'tgt', 'receptions', 'rec_yds', 'rec_yd_per_game', 'rec_td', 'games_started', 
                             'qb_rating', 'qb_yds', 'pass_off', 'ms_tgts', 'ms_rec_yd', 
                             'tm_net_pass_yds', 'avg_pick','rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rec_yd_per_game_exp_diff', 'rec_per_game_exp_diff', 'td_per_game_exp_diff', 
                            'wosp', 'wosp_exp_diff', 'wosp_exp_div', 'air_yards_exp_diff',
                             'rz_20_tgt_exp_diff', 'rz_20_receptions_exp_diff']
# sum feature categories
pos['TE']['sum_features'] = ['receptions', 'rec_yds', 'tgt', 'rec_td', 'qb_yds']

# max feature categories
pos['TE']['max_features'] = ['fp', 'rec_td', 'tgt', 'ms_tgts', 'rec_yds', 'ms_rec_yd', 'rec_yd_per_game',
                             'rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',]

# age feature categories
pos['TE']['age_features'] = ['fp', 'rec_yd_per_game', 'receptions', 'tgt', 'ms_tgts', 'ms_rec_yd', 
                             'avg_pick', 'ms_yds_per_tgts','rz_20_tgt', 'rz_20_receptions', 
                            'rz_20_catch_pct', 'rz_20_rec_yds', 'rz_20_rec_tds',
                             'rz_10_tgt', 'rz_10_receptions', 
                            'rz_10_catch_pct', 'rz_10_rec_yds', 'rz_10_rec_tds',
                            'rec_yd_per_game_exp_diff', 'rec_per_game_exp_diff', 'td_per_game_exp_diff', 
                            'wosp', 'wosp_exp_diff', 'wosp_exp_div', 'air_yards_exp_diff',
                             'rz_20_tgt_exp_diff', 'rz_20_receptions_exp_diff']

In [None]:
#==========
# Loop to create statistical predictions
#==========

output = {}


for metric in pos[set_pos]['metrics']:
    
    # print which metric is being calculated
    print('Running Models for ' + metric)
    print('----------------------------------')

    #--------
    # Create train and predict dataframes
    #--------
    
    # create training and prediction dataframes
    df_train, df_predict = features_target(df,
                                           pos[set_pos]['earliest_year'], set_year-1,
                                           pos[set_pos]['med_features'],
                                           pos[set_pos]['sum_features'],
                                           pos[set_pos]['max_features'],
                                           pos[set_pos]['age_features'],
                                           target_feature=metric)
    
    df_train = convert_to_float(df_train)
    df_predict = convert_to_float(df_predict)
    
    # drop any rows that have a null target value (likely due to injuries or other missed season)
    df_train = df_train.dropna(subset=['y_act']).reset_index(drop=True)
    df_train = df_train.fillna(df_train.mean())
    df_predict = df_predict.dropna().reset_index(drop=True)

    print('Number of Training Samples:', df_train.shape[0])

    #--------
    # Run ensemble model with parameter optimization
    #--------

    # generate a master dictionary of parameters
    param_list = [lgbm_params, xgb_params, lasso_params, ridge_params]
    est_names = ['lgbm', 'xgb', 'lasso', 'ridge']

    params = {}
    for i, param in enumerate(param_list):
        params[est_names[i]] = param

    print('Training Ensemble Model')
    param_results, summary, results_tracker, errors = validation(est_names, params, df_train, 
                                                                           iterations=pos[set_pos]['iter_rounds'], 
                                                                           random_state=1234, 
                                                                           skip_years=pos[set_pos]['skip_years'])

    #--------
    # Print best results
    #--------

    # print a summary of error metrics, weightings of various models, and a comparison to 
    # using straight adp as as a prediction for next year's stats
    print(summary.head(10))

    # pull out the best result for the random hyperparameter search of models
    best_result = summary.index[0]
    df_train = corr_collinear_removal(df_train, param_results[best_result]['corr_cutoff'],
                                       param_results[best_result]['collinear_cutoff'])
    train_cols = list(df_train.columns)
    train_cols.remove('y_act')
    df_predict = df_predict[train_cols]

    # pass the best hyperparameters into the generation_prediction function, which
    # will return the test results for the current year and the trained best models
    df_test_results_, models = generate_predictions(best_result, param_results, summary, df_train, df_predict)

    #--------
    # Save all relevant metrics to output dictionary
    #--------

    # add models to output dictionary
    output[metric] = {}
    output[metric] = {}
    output[metric]['models'] = models

    # add params to output dictionary
    output[metric]['params'] = param_results

    # add columns to output dictionary
    cols = list(df_train.columns)
    cols.remove('y_act')
    cols.remove('player')
    output[metric]['cols'] = cols
    
    #--------
    # Aggregate all results through merging
    #--------
    
    df_train_results_ = results_tracker[best_result]

    # rename and select only relevant columns for appending
    df_train_results_ = df_train_results_.rename(columns={'pred': 'pred_' + metric,
                                                          'y_act': 'act_' + metric})
    df_train_results_ = df_train_results_[['player', 'year', 'pred_' + metric, 'act_' + metric]]

    # concat the chunk for each metric together into one dataframe
    df_test_results_ = df_test_results_.rename(columns={'pred': 'pred_' + metric})
    df_test_results_ = df_test_results_[['player', 'year', 'pred_' + metric]]

    # merge the train results for the given metric with all other metric outputs
    df_train_results = pd.merge(df_train_results, df_train_results_, 
                                how='inner', left_on=['player', 'year'], right_on=['player', 'year'])
    # merge the test results for the given metric with all other metric outputs
    df_test_results = pd.merge(df_test_results, df_test_results_, 
                               how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# reorder the results of the output to have predicted before actual
col_order = ['player', 'year']
col_order.extend([c for c in df_train_results.columns if 'pred' in c])
col_order.extend([c for c in df_train_results.columns if 'act' in c])
df_train_results = df_train_results[col_order]

In [None]:
#--------
# Calculate Fantasy Points for Given Scoring System
#-------- 

# extract points list and get the idx of point attributes based on length of list
pts_list = pts_dict[set_pos]
c_idx = len(pts_list) + 2

train_plot = df_train_results.copy()
test_plot = df_test_results.copy()

# multiply stat categories by corresponding point values
train_plot.iloc[:, 2:c_idx] = train_plot.iloc[:, 2:c_idx] * pts_list
test_plot.iloc[:, 2:c_idx] = test_plot.iloc[:, 2:c_idx] * pts_list

# add a total predicted points stat category
train_plot.loc[:, 'pred'] = train_plot.iloc[:, 2:c_idx].sum(axis=1)
test_plot.loc[:, 'pred'] = test_plot.iloc[:, 2:c_idx].sum(axis=1)

#==========
# Plot Predictions for Each Player
#==========

# set length of plot based on number of results
plot_length = int(test_plot.shape[0] / 3.5)

# plot results from highest predicted FP to lowest predicted FP
test_plot.sort_values('pred').plot.barh(x='player', y='pred', figsize=(5, plot_length));

In [None]:
#==========
# If desired, plot feature importances for a given metric / model
#==========
plot_importance=True
if plot_importance == True:
    
    metric = 'rec_per_game'
    j = 3
    try:
        plot_results(output[metric]['models'][j].feature_importances_, col_names=output[metric]['cols']);
    except:
        plot_results(output[metric]['models'][j].coef_, col_names=output[metric]['cols']);

# Comparing Fantasy Pros

In [None]:
# create training and prediction dataframes
fantasy_points = []

for m in pos[set_pos]['metrics']:
    df_train_full, df_predict_full = features_target(df,
                                                     pos[set_pos]['earliest_year'], set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=m)
    fantasy_points.append(list(df_train_full.y_act.values))
    
fantasy_pts = pd.DataFrame(fantasy_points).T.reset_index(drop=True)
fantasy_pts = (fantasy_pts * pts_dict[set_pos]).sum(axis=1)
fantasy_pts.name = 'fantasy_pts'

In [None]:
stats_fp = []
stats_all = []
min_year = int(max(df_train_results.year.min(), 2012))
    
for metric in pos[set_pos]['metrics']:

    df_train_full, df_predict_full = features_target(df,
                                                     pos[set_pos]['earliest_year'], set_year-1,
                                                     pos[set_pos]['med_features'],
                                                     pos[set_pos]['sum_features'],
                                                     pos[set_pos]['max_features'],
                                                     pos[set_pos]['age_features'],
                                                     target_feature=metric)

    df_train_full = pd.concat([df_train_full, fantasy_pts], axis=1)

    fp = pd.read_sql_query('SELECT * FROM FantasyPros', con=sqlite3.connect('/Users/Mark/Documents/Github/Fantasy_Football/Data/Season_Stats.sqlite3'))
    fp.year = fp.year-1
    df_train_full = pd.merge(df_train_full, fp, how='inner', left_on=['player', 'year'], right_on=['player', 'year'])
    df_train_full = df_train_full[df_train_full.fp > 5]


    from sklearn.linear_model import Lasso
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import mean_squared_error
    from sklearn.linear_model import Ridge


    df_train_full = df_train_full.dropna()
    lass = Lasso(alpha=250)
#     y = 'fantasy_pts'
#     y_other = 'y_act'
    
    y_other = 'fantasy_pts'
    y = 'y_act'


    stat_all = []
    stat_fp = []
    results_all = []
    results_fp = []
    for i in range(min_year, (set_year-1)):

        print(i)
        X_train = df_train_full.loc[df_train_full.year < i].drop([y_other, y], axis=1)
        y_train = df_train_full.loc[df_train_full.year < i, y]

        X_fp = X_train[['year', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']]
        X_all = X_train.drop(['player', 'team', 'pos','rank', 'adp', 'best', 'worst', 'avg', 'std_dev' ], axis=1)

        X_predict = df_train_full.loc[df_train_full.year== i].drop([y_other, y], axis=1)
        X_pred_fp = X_predict[['year', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev']]
        X_pred_all = X_predict.drop(['player', 'team', 'pos', 'rank', 'adp', 'best', 'worst', 'avg', 'std_dev'], axis=1)

        y_pred = df_train_full.loc[df_train_full.year == i, y]

        lass.fit(X_fp, y_train)
        fp_pred = lass.predict(X_pred_fp)
        stat_fp.extend(list(fp_pred))
        print('FP error:', round(np.mean(np.sqrt(abs(mean_squared_error(fp_pred, y_pred)))), 3))
        results_fp.append(round(np.mean(np.sqrt(abs(mean_squared_error(fp_pred, y_pred)))), 3))

        lass.fit(X_all.replace([np.inf, -np.inf], np.nan).fillna(0), y_train)
        all_pred = lass.predict(X_pred_all.replace([np.inf, -np.inf], np.nan).fillna(0))
        stat_all.extend(list(all_pred))
        print('All error:', round(np.mean(np.sqrt(abs(mean_squared_error(all_pred, y_pred)))), 3))
        results_all.append(round(np.mean(np.sqrt(abs(mean_squared_error(all_pred, y_pred)))), 3))
    
    stats_fp.append(stat_fp)
    stats_all.append(stat_all)
    
    if y == 'fantasy_pts':
        print('--------------')
        print('Fantasy Pros straight FP Error:', round(np.mean(results_fp), 3))
        print('All straight FP Error:', round(np.mean(results_all), 3))
        break

In [None]:
#----------------
# Convert Fantasy Pros and Lasso Stat Results to Points
#----------------

df_all = pd.DataFrame(stats_all).T
df_fp = pd.DataFrame(stats_fp).T
    
df_all = (df_all * pts_dict[set_pos]).sum(axis=1)
df_fp = (df_fp * pts_dict[set_pos]).sum(axis=1)

y_test = df_train_full.loc[(df_train_full.year <= i) & (df_train_full.year >= min_year), y_other]

print('Lasso error:', round(np.mean(np.sqrt(abs(mean_squared_error(df_all, y_test)))), 2))
print('FantasyPros error:', round(np.mean(np.sqrt(abs(mean_squared_error(df_fp, y_test)))), 2))

#----------------
# Merge Fantasy Pros Data with Full Model Results to get Matching Player Sets
#----------------

full_models = pd.merge(
              df_train_full.loc[(df_train_full.year <= i) & (df_train_full.year >= min_year), ['player', 'year']],
              df_train_results, 
              how='inner', left_on=['player', 'year'], right_on=['player', 'year']).reset_index(drop=True)

y_test = df_train_full.loc[(df_train_full.year <= i) & (df_train_full.year >= min_year), y_other].reset_index(drop=True)

if set_pos == 'RB':
    pts = pts_dict[set_pos]
    full_models['fantasy_pts'] = (full_models.iloc[:,2:(len(pts)+2)]* pts).sum(axis=1)

elif set_pos == 'WR' or set_pos =='TE':
    pts = pts_dict[set_pos]
    full_models['fantasy_pts'] = (full_models.iloc[:,2:(len(pts)+2)]*pts).sum(axis=1)
    
elif set_pos == 'QB':
    pts = pts_dict[set_pos]
    full_models['fantasy_pts'] = (full_models.iloc[:,2:(len(pts)+2)]* pts).sum(axis=1)

In [None]:
print('All Models:', round(np.mean(np.sqrt(abs(mean_squared_error(full_models.fantasy_pts, y_test)))), 2))

## RB Compare

In [None]:
(4.34-3.67) / (np.mean([4.34, 3.67]))

## WR Compare

In [None]:
(3.37 - 2.85) / (np.mean([3.37, 2.85]))

## TE Compare

In [None]:
(2.61-2.33) / (np.mean([2.61, 2.33]))

## QB Compare

In [None]:
(2.68 - 2.43) / (np.mean([2.68, 2.43]))

## Write out results to Postgres

In [None]:
#--------
# Append additional stat categories to the results
#--------

# add actual results and adp to the train df
train_results = pd.merge(df_train_results, df[['player', 'year', 'age', 'year_exp', 'avg_pick']],
                           how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

# add adp to the test df
test_results = pd.merge(df_test_results, df[['player', 'year', 'age', 'year_exp', 'avg_pick']],
                           how='inner', left_on=['player', 'year'], right_on=['player', 'year'])

#--------
# Set up proper database connections to save out single dataset
#--------

train_results.to_sql(set_pos + '_Train_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')
test_results.to_sql(set_pos + '_Test_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')

# Break Out Model

In [180]:
lgbm_params_class = {
    'n_estimators':[10, 15, 20, 25, 30, 40],
    'max_depth':[2, 3, 4, 5],
    'feature_fraction':[0.4, 0.5, 0.6, 0.65, 0.7, 0.8],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'min_child_weight': [1, 5, 10, 12, 15, 20],
}

xgb_params_class = {
    'n_estimators': [10, 15, 20, 25, 30, 40, 50], 
    'max_depth': [2, 3, 4, 5], 
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'min_child_weight': [5, 10, 15, 20, 25, 30, 35],
    'feature_fraction':[0.5, 0.6, 0.7, 0.8, 0.9]
}

rf_params_class = {
    'n_estimators': [30, 40, 50, 60, 75, 100, 125, 150], 
    'max_depth': [3, 4, 5, 6, 7, 8], 
    'min_samples_leaf': [1, 2, 3, 5],
    'max_features':[0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

logit_params = {
    'C': [0.1, 1, 5, 10, 25, 50, 100, 150, 200, 250, 500]
}

svm_params = {
    "C": [0.1, 1, 5, 10, 25, 50, 100],
    'degree': [1, 2, 3]
}

def get_estimator_class(name, params, rand=True, random_state=None):
    
    import random
    from numpy import random
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    
    state = random.RandomState(random_state)
    
    rnd_params = {}
    tmp_params = params[name]
    if rand == True:
        for line in tmp_params.items():
            rnd_params[line[0]] = state.choice(line[1])
    else:
        rnd_params = tmp_params
    
    if name == 'lgbm':
        estimator = LGBMClassifier(random_state=1234, **rnd_params, min_data=1)
        
    if name == 'xgb':
        estimator = XGBClassifier(random_state=1234, **rnd_params)
        
    if name == 'rf':
        estimator = RandomForestClassifier(random_state=1234, **rnd_params)
        
    if name == 'lr':
        estimator = LogisticRegression(random_state=1234, **rnd_params, solver='liblinear', tol=.001)
        
    if name == 'svm':
        estimator = SVC(probability=True, gamma='scale', **rnd_params)
        
    return estimator, rnd_params

In [181]:
# create training and prediction dataframes
df_train, df_predict = features_target(df,
                                       pos[set_pos]['earliest_year'], set_year-1,
                                       pos[set_pos]['med_features'],
                                       pos[set_pos]['sum_features'],
                                       pos[set_pos]['max_features'],
                                       pos[set_pos]['age_features'],
                                       target_feature='fp_per_game')

df_train = convert_to_float(df_train)
df_predict = convert_to_float(df_predict)

# drop any rows that have a null target value (likely due to injuries or other missed season)
df_train = df_train.dropna(subset=['y_act']).reset_index(drop=True)
df_train = df_train.fillna(df_train.mean())
df_predict = df_predict.dropna().reset_index(drop=True)

In [205]:
# get the predictions based on ADP and filter to outlier cases
outlier, lr = get_adp_predictions(df_train, year_min_int=2, pct_off=0.1, act_ppg=12)
outlier = pd.merge(df_train, outlier.drop(['y_act', 'avg_pick'], axis=1), how='inner',
                   left_on=['player', 'year'], right_on=['player', 'year'])

# maintain the actual points scored to join back later
y_act = outlier[['player', 'year', 'y_act', 'pct_off']]
outlier = outlier.drop(['y_act', 'pct_off', 'avg_pick_pred'], axis=1)

# remove collinear variables based on difference of means between the 0 and 1 labeled groups
keep_cols = ['player', 'pos', 'team', 'year', 'label']
outlier = remove_classification_collinear(outlier, collinear_cutoff=0.5, keep_cols=keep_cols)
outlier = outlier.rename(columns={'label': 'y_act'})

try:
    outlier.loc[outlier.rz_td_ratio == np.inf, 'rz_td_ratio'] = 0
except:
    pass

outlier_predict = df_predict[[c for c in outlier.columns if c != 'y_act']]

In [206]:
outlier.y_act.value_counts()

0    282
1     94
Name: y_act, dtype: int64

rookie_rb  = pd.read_sql_query('SELECT * FROM Rookie_RB_Stats', 
                               con=sqlite3.connect('/Users/Mark/Documents/Github/Fantasy_Football/Data/Season_Stats.sqlite3'))
rookie_rb = rookie_rb.drop(['pp_age', 'draft_year', 'rush_yd_per_game', 'rec_yd_per_game', 
                            'rec_per_game', 'td_per_game',
                            'team', 'avg_pick', 'log_avg_pick'], axis=1)
outlier = pd.merge(outlier, rookie_rb, how='inner', left_on='player', right_on='player')

In [207]:
# generate a master dictionary of parameters
param_list = [lgbm_params_class, xgb_params_class, logit_params, rf_params_class, svm_params]
est_names = ['lgbm', 'xgb', 'lr', 'rf', 'svm']

# param_list = [logit_params]
# est_names = ['lr']

params = {}
for i, param in enumerate(param_list):
    params[est_names[i]] = param

iterations = 100
scale = False

In [237]:
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
import datetime

np.random.seed(1234)

param_tracker = {}
results_tracker = {}
use_smote = True

for est_name in est_names:
    param_tracker[est_name] = {}
    results_tracker[est_name] = {}

for i in range(0, iterations):
        
    # update random state to pull new params, but keep consistency based on starting state
    random_state = 100 + i*20 + i*3

    # print update on progress
    if (i+1) % 10 == 0:
        print(str(datetime.datetime.now())[:-7])
        print('Completed ' + str(i+1) + '/' + str(iterations) + ' iterations')
    
    for est_name in est_names:
        
        est, _ = get_estimator_class(est_name, params, rand=True, random_state=random_state)
        
        # if using smote, set the zero class weight to something close to 1
        if use_smote:
            zero_weight = np.random.uniform(1, 1.5)
        else:
            label_cts = outlier.y_act.value_counts()
            zero_weight = np.random.uniform(1, 1.5) * label_cts[1] / label_cts[0]
        
        est.class_weight = {0: zero_weight, 1: 1}
        
        # run through all years for given estimator and save errors and predictions
        val_error = []    
        train_error = [] 
        val_predictions = np.array([]) 
        years = outlier.year.unique()[1:]
        
        # create empty sub-dictionary for current iteration storage
        results_tracker[est_name][i] = {}
        param_tracker[est_name][i] = est

        for acc_metric in ['f1_score', 'precision', 'recall']:
            results_tracker[est_name][i][acc_metric] = []
        
        for m in years[:-1]:
                
            # create training set for all previous years and validation set for current year
            train_split = outlier[outlier.year < m]
            val_split = outlier[outlier.year == m]

            # splitting the train and validation sets into X_train, y_train, X_val and y_val
            X_train, X_val, y_train, y_val = X_y_split(train_split, val_split, scale=scale)
            
            if use_smote:
                knn = int(len(y_train[y_train==1])*0.5)
                smt = SMOTE(k_neighbors=knn)
                X_train, y_train = smt.fit_resample(X_train, y_train)
                
                X_val = X_val.values
            
            est.fit(X_train, y_train)
            val_predict = est.predict(X_val)
            
            # calculate accuracy metrics
            results_tracker[est_name][i]['f1_score'].append(f1_score(y_val, val_predict))
            results_tracker[est_name][i]['precision'].append(precision_score(y_val, val_predict))
            results_tracker[est_name][i]['recall'].append(recall_score(y_val, val_predict))
            
        for acc_metric in ['f1_score', 'precision', 'recall']:
            results_tracker[est_name][i][acc_metric] = np.mean(results_tracker[est_name][i][acc_metric])
        

2019-12-01 21:35:45
Completed 10/100 iterations
2019-12-01 21:36:46
Completed 20/100 iterations
2019-12-01 21:37:37
Completed 30/100 iterations
2019-12-01 21:38:27
Completed 40/100 iterations
2019-12-01 21:39:22
Completed 50/100 iterations
2019-12-01 21:40:19
Completed 60/100 iterations
2019-12-01 21:41:15
Completed 70/100 iterations
2019-12-01 21:42:05
Completed 80/100 iterations
2019-12-01 21:42:56
Completed 90/100 iterations
2019-12-01 21:43:54
Completed 100/100 iterations


In [238]:
best_models = {}
for model in ['lr', 'lgbm', 'rf', 'xgb', 'svm']:
    df = pd.DataFrame(results_tracker[model]).T
    idx = df[df.f1_score == df.f1_score.max()].index[0]
    print(model, df[df.index==idx])
    best_models[model] = idx

lr     f1_score  precision    recall
97  0.365191   0.340253  0.424143
lgbm     f1_score  precision    recall
58  0.274565   0.350758  0.259975
rf     f1_score  precision    recall
82  0.265185    0.32338  0.278202
xgb     f1_score  precision    recall
10  0.346884   0.287301  0.619345
svm     f1_score  precision    recall
86  0.372626   0.275908  0.647962


In [243]:
# splitting the train and validation sets into X_train, y_train, X_val and y_val
year_pred = 2018

if year_pred == df_predict.year.max():
    df_val = outlier_predict.copy()
    df_val['y_act'] = None
else:
    df_val = outlier[outlier.year==year_pred].reset_index(drop=True).copy()

X_train, X_val, y_train, y_val = X_y_split(outlier[outlier.year != year_pred], df_val, scale=scale)

param_tracker['lr'][best_models['lr']].fit(X_train, y_train)
lr_pred = param_tracker['lr'][best_models['lr']].predict_proba(X_val)[:,1]

param_tracker['rf'][best_models['rf']].fit(X_train, y_train)
rf_pred = param_tracker['rf'][best_models['rf']].predict_proba(X_val)[:,1]

param_tracker['xgb'][best_models['xgb']].fit(X_train, y_train)
xgb_pred = param_tracker['xgb'][best_models['xgb']].predict_proba(X_val)[:,1]

param_tracker['lgbm'][best_models['lgbm']].fit(X_train, y_train)
lgbm_pred = param_tracker['lgbm'][best_models['lgbm']].predict_proba(X_val)[:,1]

param_tracker['svm'][best_models['svm']].fit(X_train, y_train)
svm_pred = param_tracker['svm'][best_models['svm']].predict_proba(X_val)[:,1]

In [246]:
results = pd.concat([df_val[['player', 'year', 'y_act'
                            ]], 
                     pd.Series(lr_pred, name='lr_pred'), 
                     pd.Series(rf_pred, name='rf_pred'),
                     pd.Series(xgb_pred, name='xgb_pred'),
                     pd.Series(lgbm_pred, name='lgbm_pred'),
                     pd.Series(svm_pred, name='svm_pred'),
                     pd.Series(None, name='total')
                    ], axis=1)

results['total'] = results.iloc[:, 3:].drop('total', axis=1).sum(axis=1)

results.sort_values(by='total', ascending=False)

Unnamed: 0,player,year,y_act,lr_pred,rf_pred,xgb_pred,lgbm_pred,svm_pred,total
1,Saquon Barkley,2018.0,,0.650281,0.452139,0.359333,0.320746,0.247547,2.030045
17,Kenyan Drake,2018.0,,0.63619,0.360842,0.43266,0.296729,0.249615,1.976036
29,Dalvin Cook,2018.0,,0.353811,0.414568,0.316271,0.586288,0.250294,1.921232
38,Leonard Fournette,2018.0,,0.381699,0.427079,0.345936,0.495758,0.247966,1.898439
2,Christian McCaffrey,2018.0,,0.344191,0.544749,0.387123,0.365526,0.248023,1.889613
3,Alvin Kamara,2018.0,,0.506578,0.493777,0.285745,0.315147,0.247184,1.84843
18,Tevin Coleman,2018.0,,0.40771,0.261987,0.418213,0.442556,0.247932,1.778399
55,Kalen Ballage,2018.0,,0.407925,0.392513,0.321427,0.339611,0.250019,1.711496
30,Derrius Guice,2018.0,,0.299695,0.316584,0.397258,0.407187,0.249143,1.669867
9,Joe Mixon,2018.0,,0.290091,0.377977,0.325894,0.411915,0.249491,1.655368


# Creating Salary, Injury Tables and SQLite Output

### Push Salary to DB

In [None]:
sal = pd.read_csv('/Users/Mark/Documents/GitHub/Fantasy_Football/Data/OtherData/Salaries/salaries_2019_nv.csv')
sal.to_sql('salaries_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')

### Push Injuries to DB

In [None]:
from sklearn.preprocessing import StandardScaler

inj = pd.read_csv('/Users/Mark/Documents/GitHub/Fantasy_Football/Data/OtherData/InjuryPredictor/injury_predictor_2019_v2.csv',  
                  encoding='latin-1', header=None)
inj.columns = ['player', 'pct_miss_one', 'proj_games_missed', 'inj_pct_per_game', 'inj_risk', 'points']
inj.player = inj.player.apply(lambda x: x.split(',')[0])
inj.pct_miss_one = inj.pct_miss_one.apply(lambda x: float(x.strip('%')))
inj.inj_pct_per_game = inj.inj_pct_per_game.apply(lambda x: float(x.strip('%')))
inj = inj.drop(['points', 'inj_risk'], axis=1)

X = StandardScaler().fit_transform(inj.iloc[:, 1:])
inj = pd.concat([pd.DataFrame(inj.player), 
                 pd.DataFrame(X, columns=['pct_miss_one', 'proj_games_missed', 'pct_per_game'])], 
                axis=1)
for col in ['pct_miss_one', 'proj_games_missed', 'pct_per_game']:
    inj[col] = inj[col] + abs(inj[col].min())

inj['mean_risk'] = inj.iloc[:, 1:].mean(axis=1)
inj = inj[['player', 'mean_risk']].sort_values(by='mean_risk').reset_index(drop=True)
inj.loc[inj.player=='Kareem Hunt', 'mean_risk'] = 8
inj.loc[inj.player=='Melvin Gordon', 'mean_risk'] = inj.loc[inj.player=='Melvin Gordon', 'mean_risk'] + 4
inj.loc[inj.player=='Ezekiel Elliott', 'mean_risk'] = inj.loc[inj.player=='Ezekiel Elliott', 'mean_risk'] + 2
inj.loc[inj.player=='Todd Gurley', 'mean_risk'] = inj.loc[inj.player=='Todd Gurley', 'mean_risk'] + 1
inj.loc[inj.player=='Antonio Brown', 'mean_risk'] = inj.loc[inj.player=='Antonio Brown', 'mean_risk'] + 1
inj.loc[inj.player=='Derrius Guice', 'mean_risk'] = inj.loc[inj.player=='Derrius Guice', 'mean_risk'] + 1
inj.loc[inj.player=='A.J. Green', 'mean_risk'] = inj.loc[inj.player=='A.J. Green', 'mean_risk'] + 2


inj = inj[['player', 'mean_risk']]
inj.to_sql('injuries_' + str(set_year), engine, schema='websitedev', index=False, if_exists='replace')

### Drop All Data into SQLite Database

In [None]:
db = sqlite3.connect('/Users/Mark/Desktop/FF_Sim/SimInput_v2.sqlite3')

for set_pos in ['QB', 'RB', 'WR', 'TE']:
    
    train = pd.read_sql_query('SELECT * FROM {}."{}_Train_{}"' \
                                     .format(table_info['schema'], set_pos, str(set_year)), table_info['engine'])
    test = pd.read_sql_query('SELECT * FROM {}."{}_Test_{}"' \
                                    .format(table_info['schema'], set_pos, str(set_year)), table_info['engine'])
    
    train.to_sql(name='{}_Train_{}'.format(set_pos, str(set_year)), con=db, if_exists='replace', index=False)
    test.to_sql(name='{}_Test_{}'.format(set_pos, str(set_year)), con=db, if_exists='replace', index=False)
    

sal = pd.read_sql_query('SELECT * FROM {}."salaries_2019"' \
                                    .format(table_info['schema'], set_pos, str(set_year)), table_info['engine'])
sal.to_sql(name='salaries_2019', con=db, if_exists='replace', index=False)

inj = pd.read_sql_query('SELECT * FROM {}."injuries_2019"' \
                                    .format(table_info['schema'], set_pos, str(set_year)), table_info['engine'])
inj.to_sql(name='injuries_2019', con=db, if_exists='replace', index=False)