# General Settings

In [None]:
#==============
# Load Packages
#==============

# set core path
# NOTE(review): absolute, machine-specific path; the chdir below depends on it existing
path = '/Users/Mark/Documents/Github/Fantasy_Football/'

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import os

# change the working directory to the helper scripts folder so the import below resolves.
# The directory is not changed back anywhere in this cell, so any later relative
# paths are resolved from Helper_Scripts as well.
os.chdir(path + 'Scripts/Analysis/Helper_Scripts')

# load custom helper functions
# NOTE(review): the star import can silently shadow any name defined above
# (pd, np, path, ...) -- confirm what helper_functions exports
from helper_functions import *;


#===============
# Settings and User Inputs
#===============

#--------
# League Settings
#--------

# scoring weights: each value is multiplied against a raw stat column in data_load,
# so despite the "*_yd_per_pt" names the yardage entries act as points per yard
pass_yd_per_pt, pass_td_pt = 0.04, 5
int_pts, sacks = -2, -1
rush_yd_per_pt = rec_yd_per_pt = 0.1
rush_rec_td = 7
ppr = 0.5

# scoring weights grouped by position; data_load applies each list positionally to
# columns 1..n of that position's tables, so the order of the entries matters
pts_dict = {
    'QB': [pass_yd_per_pt, pass_td_pt, rush_yd_per_pt, rush_rec_td, int_pts, sacks],
    'RB': [rush_yd_per_pt, rec_yd_per_pt, ppr, rush_rec_td],
    'WR': [rec_yd_per_pt, ppr, rush_rec_td],
    'TE': [rec_yd_per_pt, ppr, rush_rec_td],
}

#--------
# Database Login Info
#--------

# postgres login information
# NOTE(review): credentials are hard-coded in source control; consider loading them
# from environment variables or an untracked config file and rotating this password
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432',
    'DATABASE_NAME': 'fantasyfootball'
}

# create engine for connecting to database
# the dialect has to be spelled "postgresql": the legacy "postgres" alias was removed
# in SQLAlchemy 1.4, while "postgresql" is accepted by old and new releases alike
engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(pg_log['USER'], pg_log['PASSWORD'],
                                                                     pg_log['HOST'], pg_log['PORT'],
                                                                     pg_log['DATABASE_NAME']))

# connection and schema handed to every function that reads from the database
table_info = {
    'engine': engine,
    'schema': 'websitedev',
}

# set year
# NOTE(review): not referenced by the functions visible in this section, which
# use their own 2018 defaults -- confirm whether this is meant to drive them
year = 2018

# Pulling in Player Data

In [None]:
def data_load(pts_dict, pos, table_info, set_year=2018):

    '''
    Pull the train and test tables for one position group and convert the raw
    statistical columns into fantasy points for the supplied scoring system.

    Input: pts_dict   - dictionary mapping a position (e.g. 'QB') to a list of point values
           pos        - position label whose first character is dropped before use (pos[1:])
           table_info - dictionary holding the database connection ('engine') and 'schema'
           set_year   - year suffix of the tables to read
    Return: (train, test) dataframes. In each, columns 1..len(point list) have been scaled
            by their point values and a 'pred' column holds the row-wise sum of those columns.
    '''

    import pandas as pd

    # the leading character of pos is a prefix; the remainder names tables and pts_dict keys
    position = pos[1:]

    #--------
    # Connect to Database and Pull Player Data
    #--------

    query_templates = ('SELECT * FROM {}."{}_Train_{}"',
                       'SELECT * FROM {}."{}_Test_{}"')

    frames = []
    for template in query_templates:
        query = template.format(table_info['schema'], position, str(set_year))
        frames.append(pd.read_sql_query(query, table_info['engine']))

    #--------
    # Calculate Fantasy Points for Given Scoring System
    #--------

    # column 0 is skipped; the next len(weights) columns are the stat categories
    weights = pts_dict[position]
    stop = len(weights) + 1

    for frame in frames:

        # convert each stat category to points, then total them per row
        frame.iloc[:, 1:stop] = frame.iloc[:, 1:stop] * weights
        frame.loc[:, 'pred'] = frame.iloc[:, 1:stop].sum(axis=1)

    train, test = frames

    return train, test


def _searching(est, pos, X_grid, y_grid, n_jobs=1):
    '''
    Tune an estimator with a 3-fold GridSearchCV over a position-specific
    parameter grid, then refit that estimator on all of the supplied data.

    Input: est    - unfitted estimator accepting the decision tree parameters listed below
           pos    - position label whose first character is dropped before use (pos[1:])
           X_grid - feature matrix used for both the search and the final fit
           y_grid - target values aligned with X_grid
           n_jobs - number of parallel jobs passed to GridSearchCV
    Return: the same `est` object, updated in place with the best parameters found
            (lowest cross-validated mean squared error) and fitted on X_grid / y_grid.
    Raises: KeyError if pos[1:] is not one of 'QB', 'RB', 'WR', 'TE'.
    '''
    import inspect
    from sklearn.model_selection import GridSearchCV

    #=========
    # Set the decision tree search params for each position
    #=========

    params = {}

    params['QB'] = {
        'max_depth': [4, 5],
        'min_samples_split': [2],
        'min_samples_leaf': [15, 20, 25],
        'splitter': ['random']
    }

    params['RB'] = {
        'max_depth': [5, 6, 7],
        'min_samples_split': [2],
        'min_samples_leaf': [15, 20, 25],
        'splitter': ['random']
    }

    params['WR'] = {
        'max_depth': [4, 5, 6],
        'min_samples_split': [2],
        'min_samples_leaf': [20, 25, 30],
        'splitter': ['random']
    }

    params['TE'] = {
        'max_depth': [4, 5],
        'min_samples_split': [2],
        'min_samples_leaf': [15, 20, 25],
        'splitter': ['random']
    }

    # set up GridSearch object
    search_kwargs = {
        'estimator': est,
        'param_grid': params[pos[1:]],
        'scoring': 'neg_mean_squared_error',
        'n_jobs': n_jobs,
        'cv': 3,
        'return_train_score': False,
    }

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
    # raises a TypeError; only forward it to releases that still accept the keyword
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        search_kwargs['iid'] = False

    Search = GridSearchCV(**search_kwargs)

    # try all combination of parameters with the fit
    search_results = Search.fit(X_grid, y_grid)

    # copy the winning parameter combination onto the caller's estimator
    est.set_params(**search_results.best_params_)

    # fit the optimal estimator with the data
    est.fit(X_grid, y_grid)

    return est


def tree_cluster(train, test, pos):
    '''
    Group players into clusters using the leaves of a tuned decision tree and
    attach each cluster's historical mean / standard deviation to the test players.

    Input: train - dataframe with numeric feature columns plus the target 'y_act'
           test  - dataframe with the same numeric feature columns plus 'player' and 'pred'
           pos   - position label, forwarded to _searching and stored in the output
    Return: dataframe with one row per test player and the columns
            'player', 'pos', 'PredMean' (equal blend of 'pred' and the cluster mean)
            and 'ClusterStd' (standard deviation of 'y_act' within the cluster).
    '''

    from sklearn.tree import DecisionTreeRegressor

    # create df for clustering by selecting numeric values and dropping y_act;
    # test may or may not carry y_act, so it is dropped only if present to keep
    # the feature count identical to the training matrix
    numeric_types = ['float', 'int', 'uint8']
    X_train = train.select_dtypes(include=numeric_types).drop('y_act', axis=1)
    X_test = test.select_dtypes(include=numeric_types).drop('y_act', axis=1, errors='ignore')
    y = train.y_act

    #----------
    # Train the Decision Tree with GridSearch optimization
    #----------

    # train decision tree with _searching method
    dtree = _searching(DecisionTreeRegressor(random_state=1), pos, X_train, y)

    #----------
    # Calculate each cluster's mean and standard deviation
    #----------

    # the leaf each row lands in is its cluster; reuse the frame's own index so the
    # column-wise concat lines rows up even when the index is not 0..n-1
    train_clusters = pd.Series(dtree.apply(X_train), index=train.index, name='Cluster')
    train_results = pd.concat([train_clusters, y], axis=1)

    # calculate the average and standard deviation of points scored by cluster
    train_results = train_results.groupby('Cluster', as_index=False).agg({'y_act': ['mean', 'std']})
    train_results.columns = ['Cluster', 'ClusterMean', 'ClusterStd']

    #----------
    # Add the cluster to test results and resulting group mean / std: Player | Pred | StdDev
    #----------

    # grab the player, prediction, and add cluster to dataset
    test_clusters = pd.Series(dtree.apply(X_test), index=test.index, name='Cluster')
    test_results = pd.concat([test[['player', 'pred']], test_clusters], axis=1)

    # merge the test results with the train result on cluster to add mean cluster and std
    test_results = pd.merge(test_results, train_results, how='inner', on='Cluster')

    # calculate an overall prediction mean and add position to dataset
    test_results['PredMean'] = (0.5*test_results.pred + 0.5*test_results.ClusterMean)
    test_results['pos'] = pos

    # pull out relevant results for creating distributions
    test_results = test_results[['player', 'pos', 'PredMean', 'ClusterStd']]

    return test_results


def create_distributions(set_year=2018, n_samples=1500):
    '''
    Build a sampled point distribution for every player across all positions and
    join it with player salaries.

    Input: set_year  - year suffix of the train / test tables passed to data_load
           n_samples - number of normal draws generated per player
    Return: dataframe indexed by player containing n_samples integer columns
            (draws multiplied by 16 and stored as unsigned 16-bit values), the 'pos'
            column and the columns of the salaries table. Players without a row in
            the salaries table are dropped by the inner join.
    '''

    # collect the per-position results and combine them once at the end
    position_results = []
    for pos in ['aQB', 'bRB', 'cWR', 'dTE']:

        # extract the train and test data for passing into the tree algorithm
        train, test = data_load(pts_dict, pos, table_info, set_year=set_year)

        # obtain the cluster standard deviation + the mixed prediction / cluster mean
        position_results.append(tree_cluster(train, test, pos))

    tree_output = pd.concat(position_results, axis=0, ignore_index=True)

    # draw a normal distribution per player around the blended mean
    uint16_max = np.iinfo(np.uint16).max
    data = []
    for mean, std in zip(tree_output['PredMean'], tree_output['ClusterStd']):
        scaled = np.random.normal(loc=mean, scale=std, size=n_samples) * 16

        # negative draws would wrap around to values near 65535 when cast to an
        # unsigned type, so clamp to the representable range before converting
        dist = list(np.uint16(np.clip(scaled, 0, uint16_max)))
        data.append(dist)

    # create the player, position, point distribution dataframe
    data = pd.concat([tree_output.player, pd.DataFrame(data), tree_output.pos], axis=1)

    # add salaries to the dataframe and set index to player
    salaries = pd.read_sql_query('SELECT * FROM {}."salaries"'.format(table_info['schema']), table_info['engine'])
    data = pd.merge(data, salaries, how='inner', left_on='player', right_on='player')
    data = data.set_index('player')

    return data

In [None]:
%timeit x = create_distributions()

In [None]:
x