# General Settings

In [None]:
#==============
# Load Packages
#==============

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import os

#===============
# Settings and User Inputs
#===============

# set core path (defined first because the helper-script location below is built from it)
path = '/Users/Mark/Documents/Github/Fantasy_Football/'

# change the working directory to the helper scripts so they can be imported
os.chdir(path + 'Scripts/Analysis/Helper_Scripts')

# load custom helper functions
from helper_functions import *

# postgres login information
# NOTE(review): credentials are hard-coded in the notebook; consider loading them
# from the environment or a file that is not committed.
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432', 
    'DATABASE_NAME': 'fantasyfootball'
}

# create engine for connecting to database
# ('postgresql' is the canonical dialect name; the 'postgres' alias is rejected by
# SQLAlchemy 1.4 and later)
engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(pg_log['USER'], pg_log['PASSWORD'], pg_log['HOST'],
                                                                     pg_log['PORT'], pg_log['DATABASE_NAME']))

# define dictionary that contains all relevant point values
pts_dict = {}
pts_dict['QB'] = [0.04, 5, 0.1, 7, -2, -1]
pts_dict['RB'] = [0.1, 0.1, 0.5, 7]
pts_dict['WR'] = [0.1, 0.5, 7]
pts_dict['TE'] = [0.1, 0.5, 7]

# set random user id
user_id=20

# specify schema and table to write out intermediate results
table_info = {
    'schema': 'website',
}

# set year
year = 2018

# Pulling in Player Data

In [None]:
def rf_train_data(pts_dict, pos, engine, table_info, set_year=2018):
    
    '''
    Read the stored train / test / predict tables for one position from the database and
    pass them through format_results together with that position's scoring settings.

    Input: pts_dict   -- dictionary keyed by position label (e.g. 'QB') holding the point
                         values handed to format_results.
           pos        -- position label carrying a one-character prefix (e.g. 'aQB'); the
                         prefix is stripped before the label is used.
           engine     -- database connection given to pd.read_sql_query.
           table_info -- dictionary whose 'schema' entry names the schema to read from.
           set_year   -- year suffix of the tables to read.
    Return: The (df_train_results, df_test_results) pair produced by format_results.
    '''

    # strip the ordering prefix to get the plain position label
    position = pos[1:]
    schema = table_info['schema']

    #--------
    # Connect to Database and Pull Player Data
    #--------

    def _read_table(kind):
        # tables are named "<position>_<kind>_<year>" inside the configured schema
        query = 'SELECT * FROM {}."{}_{}_{}"'.format(schema, position, kind, str(set_year))
        return pd.read_sql_query(query, engine)

    df_train_results = _read_table('Train_Results')
    df_test_results = _read_table('Test_Results')
    df_train = _read_table('Train')
    df_predict = _read_table('Predict')

    #--------
    # Calculate Fantasy Points for Given Scoring System
    #--------

    df_train_results, df_test_results = format_results(df_train_results, df_test_results, 
                                                       df_train, df_predict, 
                                                       pts_dict[position])

    return df_train_results, df_test_results

In [None]:
class ClusteringDev():
    '''
    Groups players into clusters with a tuned decision tree (each leaf is a cluster) and
    blends every test prediction with the mean points scored by its cluster.
    '''
    
    def __init__(self, df_train, df_test):
        '''
        Input: df_train -- training dataframe; must contain 'y_act' and 'error' columns.
               df_test  -- test dataframe; must contain 'y_act' and 'error' columns.
        '''
        
        # create self versions of train and test
        self.df_train = df_train
        self.df_test = df_test
    
        # create df for clustering by selecting numeric values and dropping the
        # target and error columns
        numeric_types = ['float', 'int', 'uint8']
        non_features = ['y_act', 'error']
        self.X_train = df_train.select_dtypes(include=numeric_types).drop(non_features, axis=1)
        self.X_test = df_test.select_dtypes(include=numeric_types).drop(non_features, axis=1)
        self.y = df_train.y_act
    
        #=========
        # Set the decision tree search params for each position
        #=========

        self.pos = {'QB': {}, 'RB': {}, 'WR': {}, 'TE': {}}

        self.pos['QB']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

        self.pos['RB']['tree_params'] = {
            'max_depth': [5, 6, 7],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

        self.pos['WR']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [20, 25, 30],
            'splitter': ['random']
        }

        self.pos['TE']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

    @staticmethod
    def _searching(est, params, X_grid, y_grid, n_jobs=1, print_results=True):
        '''
        Function to perform GridSearchCV over params and return est itself, set to the
        best parameters found and fitted on all of X_grid / y_grid.

        print_results is accepted for backward compatibility and is not used.
        '''
        import inspect
        from sklearn.model_selection import GridSearchCV

        search_kwargs = {
            'estimator': est,
            'param_grid': params,
            'scoring': 'neg_mean_squared_error',
            'n_jobs': n_jobs,
            'cv': 3,
            'return_train_score': True,
            # est is fitted explicitly below, so skip the search's own final refit
            'refit': False,
        }

        # the iid argument no longer exists in recent scikit-learn releases;
        # only pass it where the installed version still accepts it
        if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
            search_kwargs['iid'] = False

        search = GridSearchCV(**search_kwargs)
        search.fit(X_grid, y_grid)

        est.set_params(**search.best_params_)
        est.fit(X_grid, y_grid)

        return est

    def fit_and_predict_tree(self, pos, print_results=False):
        '''
        Tune and fit the tree for position pos, then attach cluster statistics to the
        test players.

        Input: pos -- plain position label ('QB', 'RB', 'WR' or 'TE').
        Return: dataframe with columns player, pred, ClusterMean, PredMean, ClusterStd,
                where PredMean is the equal-weight average of pred and ClusterMean.
        '''
        
        from sklearn.tree import DecisionTreeRegressor
        
        dtree = self._searching(DecisionTreeRegressor(random_state=1), self.pos[pos]['tree_params'], 
                                self.X_train, self.y, print_results=print_results)
        
        #----------
        # Calculate each cluster's mean and standard deviation
        #----------

        # label each training row with its leaf; reuse the training index so the
        # labels line up with y even when the index is not 0..n-1
        train_clusters = pd.Series(dtree.apply(self.X_train), index=self.X_train.index, name='Cluster')
        train_results = pd.concat([train_clusters, self.y], axis=1)

        # calculate the average and standard deviation of points scored by cluster
        train_results = train_results.groupby('Cluster', as_index=False).agg({'y_act': ['mean', 'std']})
        train_results.columns = ['Cluster', 'ClusterMean', 'ClusterStd']

        #----------
        # Add the cluster to test results and resulting group mean / std: Player | Pred | StdDev
        #----------

        # grab the player, prediction, and add cluster to dataset (index-aligned as above)
        test_clusters = pd.Series(dtree.apply(self.X_test), index=self.df_test.index, name='Cluster')
        test_results = pd.concat([self.df_test[['player', 'pred']], test_clusters], axis=1)

        # merge the test results with the train result on cluster to add mean cluster and std
        test_results = pd.merge(test_results, train_results, how='inner', left_on='Cluster', right_on='Cluster')

        # calculate an overall prediction mean
        test_results['PredMean'] = (0.5*test_results.pred + 0.5*test_results.ClusterMean)
        
        test_results = test_results[['player','pred', 'ClusterMean', 'PredMean', 'ClusterStd']]
        
        return test_results

In [None]:
# time one full pass of the tree clustering across all four positions
import time
start = time.time()
# each label carries a one-character prefix that is stripped with pos[1:] before use
for pos in ['aQB', 'bRB', 'cWR', 'dTE']:

    # pull the stored results for the current position
    train_results, test_results = rf_train_data(pts_dict, pos, engine, table_info)
    
    # fit the tree and build the cluster output; dist_input is overwritten on each pass
    cluster_dev = ClusteringDev(train_results, test_results)
    dist_input = cluster_dev.fit_and_predict_tree(pos[1:])
# elapsed wall-clock seconds for the whole loop
time.time()-start

In [None]:
from sklearn.metrics import mean_squared_error

# position to back-test (the leading character is stripped with pos[1:] before use)
pos = 'aTE'

# the stored results do not depend on the test year, so pull them from the database once
train_results, test_results = rf_train_data(pts_dict, pos, engine, table_info)

# create objects to hold results
yearly_frames = []
all_results = []

# loop through each test year
for test_year in range(2007, 2017):
    
    # train on every earlier year and test on the current year
    train_tmp = train_results[train_results.year < test_year].drop('year', axis=1).reset_index(drop=True)
    test_tmp = train_results[train_results.year==test_year].drop('year', axis=1).reset_index(drop=True)
    
    # run the clustering algorithm
    cluster_dev = ClusteringDev(train_tmp, test_tmp)
    dist_input = cluster_dev.fit_and_predict_tree(pos[1:])
    
    # paste actual values back to test table and calculate error metrics
    dist_input = pd.merge(dist_input, test_tmp[['player', 'y_act']], how='inner', left_on='player', right_on='player')
    cluster_error = mean_squared_error(dist_input.ClusterMean, dist_input.y_act)
    pred_error = mean_squared_error(dist_input.pred, dist_input.y_act)
    combined_error = mean_squared_error(dist_input.PredMean, dist_input.y_act)
    
    # collect results; the per-year frames are concatenated once after the loop
    yearly_frames.append(dist_input)
    all_results.append([np.sqrt(cluster_error), np.sqrt(pred_error), np.sqrt(combined_error)])

# stack the per-year player level results
all_data = pd.concat(yearly_frames, axis=0)

# pull out final results into a dataframe for error metrics (one RMSE row per test year)
all_results = pd.DataFrame(all_results, columns=['ClusterError', 'PredError', 'CombinedError'])
    
# for i in test_results.iterrows():
#     x=np.random.normal(loc=i[1]['PredMean'], scale=i[1]['ClusterStd'], size=1000)*16

In [None]:
class ClusteringBayes():
    '''
    Assigns every train and test player to a cluster (a leaf of a tuned decision tree)
    and returns both labelled tables for downstream Bayesian updating.
    '''
    
    def __init__(self, df_train, df_test):
        '''
        Input: df_train -- training dataframe; must contain 'y_act' and 'error' columns.
               df_test  -- test dataframe; must contain 'y_act' and 'error' columns.
        '''
        
        # create self versions of train and test
        self.df_train = df_train
        self.df_test = df_test
    
        # create df for clustering by selecting numeric values and dropping the
        # target and error columns
        numeric_types = ['float', 'int', 'uint8']
        non_features = ['y_act', 'error']
        self.X_train = df_train.select_dtypes(include=numeric_types).drop(non_features, axis=1)
        self.X_test = df_test.select_dtypes(include=numeric_types).drop(non_features, axis=1)
        self.y = df_train.y_act
    
        #=========
        # Set the decision tree search params for each position
        #=========

        self.pos = {'QB': {}, 'RB': {}, 'WR': {}, 'TE': {}}

        self.pos['QB']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

        self.pos['RB']['tree_params'] = {
            'max_depth': [5, 6, 7],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

        self.pos['WR']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [20, 25, 30],
            'splitter': ['random']
        }

        self.pos['TE']['tree_params'] = {
            'max_depth': [4, 5, 6],
            'min_samples_split': [2],
            'min_samples_leaf': [15, 20, 25],
            'splitter': ['random']
        }

    @staticmethod
    def _searching(est, params, X_grid, y_grid, n_jobs=1, print_results=True):
        '''
        Function to perform GridSearchCV over params and return est itself, set to the
        best parameters found and fitted on all of X_grid / y_grid.

        print_results is accepted for backward compatibility and is not used.
        '''
        import inspect
        from sklearn.model_selection import GridSearchCV

        search_kwargs = {
            'estimator': est,
            'param_grid': params,
            'scoring': 'neg_mean_squared_error',
            'n_jobs': n_jobs,
            'cv': 3,
            'return_train_score': True,
            # est is fitted explicitly below, so skip the search's own final refit
            'refit': False,
        }

        # the iid argument no longer exists in recent scikit-learn releases;
        # only pass it where the installed version still accepts it
        if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
            search_kwargs['iid'] = False

        search = GridSearchCV(**search_kwargs)
        search.fit(X_grid, y_grid)

        est.set_params(**search.best_params_)
        est.fit(X_grid, y_grid)

        return est

    def fit_and_predict_tree(self, pos, print_results=False):
        '''
        Tune and fit the tree for position pos and label both datasets with clusters.

        Input: pos -- plain position label ('QB', 'RB', 'WR' or 'TE').
        Return: (train_results, test_results); train_results has columns player, pred,
                cluster, y_act and test_results has columns player, pred, cluster.
        '''
        
        from sklearn.tree import DecisionTreeRegressor
        
        dtree = self._searching(DecisionTreeRegressor(random_state=1), self.pos[pos]['tree_params'], 
                                self.X_train, self.y, print_results=print_results)
        
        #----------
        # Label the training rows with their cluster
        #----------

        # reuse the training index so the labels line up with the player rows even
        # when the index is not 0..n-1
        train_clusters = pd.Series(dtree.apply(self.X_train), index=self.df_train.index, name='cluster')
        train_results = pd.concat([self.df_train[['player', 'pred']], train_clusters, self.y], axis=1)

        #----------
        # Label the test rows with their cluster
        #----------

        # grab the player, prediction, and add cluster to dataset (index-aligned as above)
        test_clusters = pd.Series(dtree.apply(self.X_test), index=self.df_test.index, name='cluster')
        test_results = pd.concat([self.df_test[['player', 'pred']], test_clusters], axis=1)
        
        return train_results, test_results

In [None]:
def create_distributions(df_train, df_test, prior_repeats=15, dist_size=1000, show_plots=False):
    '''
    Combine each test player's prior mean with the mean of the training cluster the
    player falls in, weighted by sample size.

    For every player in df_test:
      * the prior sample is this year's prediction repeated prior_repeats times plus
        the player's first five y_act rows in df_train (if any); m_0 is its mean and
        n_0 its size.
      * the update sample is y_act of every df_train row in the player's cluster;
        ybar is its mean and n its size.
      * the posterior mean is m_n = (n*ybar + n_0*m_0) / (n + n_0).

    Input: df_train      -- dataframe with 'player', 'cluster' and 'y_act' columns.
           df_test       -- dataframe with 'player', 'pred' and 'cluster' columns.
           prior_repeats -- number of times each prediction is repeated in the prior.
           dist_size     -- accepted for backward compatibility; not used.
           show_plots    -- accepted for backward compatibility; not used.
    Return: dataframe with one row per df_test row, ordered by descending 'pred';
            column 0 holds the player and column 1 the posterior mean.
    '''

    df_test = df_test.sort_values(by='pred', ascending=False)

    # one [player, posterior mean] entry per test row; turned into a frame once at the end
    rows = []

    for player in df_test.player:

        player_test = df_test[df_test.player == player]

        #--------
        # Prior: this year's prediction (repeated) plus previous actual results
        #--------

        this_year = player_test.pred

        # NOTE(review): takes the first five rows as stored in df_train; confirm the
        # frame is ordered so that these are the intended seasons
        history = df_train.loc[df_train.player == player, 'y_act'].iloc[:5]

        prior_points = pd.concat([this_year] * prior_repeats + [history], ignore_index=True)
        m_0 = prior_points.mean()
        n_0 = len(prior_points)

        #--------
        # Data: actual points scored by training players in the same cluster
        #--------

        ty_cluster = player_test.cluster.values[0]
        update_data = df_train.loc[df_train.cluster == ty_cluster, 'y_act']
        n = len(update_data)

        #--------
        # Posterior mean
        #--------

        if n == 0:
            # no training rows share this cluster, so nothing updates the prior
            m_n = m_0
        else:
            ybar = update_data.mean()
            m_n = (n*ybar + n_0*m_0) / (n + n_0)

        rows.append([player, m_n])

    return pd.DataFrame(rows)




In [None]:
# NOTE: this cell relies on `pos`, `mean_squared_error` and `all_results` left behind
# by the cluster back-test cell, so that cell has to be run first.
bayes_errors = []

# the stored results do not depend on the test year, so pull them from the database once
train_results, test_results = rf_train_data(pts_dict, pos, engine, table_info)

for test_year in range(2007, 2017):

    # train on every earlier year and test on the current year
    train_tmp = train_results[train_results.year < test_year].drop('year', axis=1).reset_index(drop=True)
    test_tmp = train_results[train_results.year==test_year].drop('year', axis=1).reset_index(drop=True)

    # run the clustering algorithm
    cluster_dev = ClusteringBayes(train_tmp, test_tmp)
    train_bayes, test_bayes = cluster_dev.fit_and_predict_tree(pos[1:])

    # column 0 of the output is the player and column 1 the updated mean
    bayes_output = create_distributions(train_bayes, test_bayes)
    bayes_output = pd.merge(bayes_output, test_tmp, how='inner', left_on=0, right_on='player')

    # RMSE of the updated mean against the actual points for this year
    bayes_errors.append(np.sqrt(mean_squared_error(bayes_output.iloc[:,1], bayes_output.y_act)))

# add the per-year Bayes RMSE next to the cluster back-test errors
all_results = pd.concat([all_results, pd.Series(bayes_errors, name='BayesError')], axis=1)