# General Settings

In [None]:
#==============
# Load Packages
#==============

# set core path
# NOTE(review): machine-specific absolute path; the trailing slash matters
# because sub-paths are appended below by plain string concatenation
path = '/Users/Mark/Documents/Github/Fantasy_Football/'

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import os

# change the working directory to the helper scripts folder, presumably so that
# `helper_functions` can be imported by its bare module name below
# NOTE(review): although this was described as temporary, nothing visible in
# this section changes the working directory back afterwards
os.chdir(path + 'Scripts/Analysis/Helper_Scripts')

# load custom helper functions
# (wildcard import, so the names it provides are not visible here; `format_results`
#  used further down presumably comes from this module -- TODO confirm)
from helper_functions import *;


#===============
# Settings and User Inputs
#===============

# postgres login information
# NOTE(review): credentials are hard-coded in source; consider loading them from
# the environment or from a config file that is kept out of version control
pg_log = {
    'USER': 'postgres',
    'PASSWORD': 'Ctdim#1bf!!!!!',
    'HOST': 'localhost',
    'PORT': '5432',
    'DATABASE_NAME': 'fantasyfootball'
}

# create engine for connecting to database
#  - the dialect is spelled 'postgresql' because the old 'postgres' alias is
#    rejected by SQLAlchemy 1.4 and later ('postgresql' works on every version)
#  - the password is percent-encoded so that reserved URL characters in it
#    (e.g. '#', '@', '/') cannot be mis-parsed as URL structure
from urllib.parse import quote

engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    pg_log['USER'], quote(pg_log['PASSWORD'], safe=''), pg_log['HOST'],
    pg_log['PORT'], pg_log['DATABASE_NAME']))

# point values for each position group, keyed by position
# (the order inside each list presumably has to match what the helper that
#  consumes it expects -- TODO confirm against helper_functions)
pts_dict = {
    'QB': [0.04, 5, 0.1, 7, -2, -1],
    'RB': [0.1, 0.1, 0.5, 7],
    'WR': [0.1, 0.5, 7],
    'TE': [0.1, 0.5, 7],
}

# arbitrary user id
user_id = 20

# schema that holds the intermediate result tables
table_info = dict(schema='website')

# season to run
year = 2018

# Pulling in Player Data

In [None]:
def rf_train_data(pts_dict, pos, engine, table_info, set_year=2018):

    '''
    This function reads the stored train / test results and the train / predict
    datasets for one position group and passes them to format_results to be
    converted into fantasy points for a given scoring system.

    Input: pts_dict   -- dictionary mapping a position ('QB', 'RB', ...) to its
                         list of point values.
           pos        -- position label carrying a one-character prefix (the
                         driver loop passes e.g. 'aQB'); the first character is
                         stripped to get the position used in the table names
                         and as the key into pts_dict.
           engine     -- database connection used to run the queries.
           table_info -- dictionary whose 'schema' entry names the schema that
                         holds the tables.
           set_year   -- year suffix of the tables to read.
    Return: A tuple (df_train_results, df_test_results) as returned by
            format_results, with the 'year' column dropped from the training
            results only.
    '''

    # strip the one-character prefix to get the real position label
    position = pos[1:]

    #--------
    # Connect to Database and Pull Player Data
    #--------

    def read_table(kind):
        # tables are named <schema>."<position>_<kind>_<year>"
        query = 'SELECT * FROM {}."{}_{}_{}"'.format(table_info['schema'], position,
                                                     kind, str(set_year))
        return pd.read_sql_query(query, engine)

    df_train_results = read_table('Train_Results')
    df_test_results = read_table('Test_Results')
    df_train = read_table('Train')
    df_predict = read_table('Predict')

    #--------
    # Calculate Fantasy Points for Given Scoring System
    #--------

    # pass in raw statistical results and convert to fantasy points scored
    df_train_results, df_test_results = format_results(df_train_results, df_test_results,
                                                       df_train, df_predict,
                                                       pts_dict[position])

    # drop the year from the training dataset
    df_train_results = df_train_results.drop('year', axis=1)

    return df_train_results, df_test_results

In [None]:
class DataGeneration():
    '''
    Fits a decision tree on the training data and treats each leaf of the tree
    as a cluster. The mean and standard deviation of the actual points scored
    within each training cluster are then attached to the test players that
    fall into the same leaf.
    '''

    def __init__(self, df_train, df_test):
        '''
        Input: df_train -- training dataframe; its numeric columns must include
                           'y_act' (used as the target) and 'error' (dropped
                           from the features).
               df_test  -- dataframe to predict for; fit_and_predict_tree also
                           reads its 'player' and 'pred' columns.
        '''

        # create self versions of train and test
        self.df_train = df_train
        self.df_test = df_test

        # create df for clustering by selecting numeric values and dropping
        # the target (y_act) and the error column from the training features
        numeric_types = ['float', 'int', 'uint8']
        self.X_train = df_train.select_dtypes(include=numeric_types).drop(['y_act', 'error'], axis=1)
        self.X_test = df_test.select_dtypes(include=numeric_types)
        self.y = df_train.y_act

        #=========
        # Set the tree search params for each position
        #=========

        self.pos = {
            'QB': {
                'tree_params': {
                    'max_depth': [4, 5],
                    'min_samples_split': [2],
                    'min_samples_leaf': [15, 20, 25],
                    'splitter': ['random']
                }
            },
            'RB': {
                'tree_params': {
                    'max_depth': [5, 6, 7],
                    'min_samples_split': [2],
                    'min_samples_leaf': [15, 20, 25],
                    'splitter': ['random']
                }
            },
            'WR': {
                'tree_params': {
                    'max_depth': [4, 5, 6],
                    'min_samples_split': [2],
                    'min_samples_leaf': [20, 25, 30],
                    'splitter': ['random']
                }
            },
            'TE': {
                'tree_params': {
                    'max_depth': [4, 5],
                    'min_samples_split': [2],
                    'min_samples_leaf': [15, 20, 25],
                    'splitter': ['random']
                }
            },
        }

    @staticmethod
    def _searching(est, params, X_grid, y_grid, n_jobs=1):
        '''
        Function to perform GridSearchCV (3-fold, scored on negative mean
        squared error) over params and return the estimator refit on all of
        the data with the best parameters found.

        Input: est    -- unfitted estimator; it is modified in place.
               params -- parameter grid to search.
               X_grid -- features.
               y_grid -- target.
               n_jobs -- number of parallel jobs for the search.
        Return: est, set to the best parameters and fitted on X_grid / y_grid.
        '''
        import inspect
        from sklearn.model_selection import GridSearchCV

        # set up GridSearch object
        search_kwargs = {
            'estimator': est,
            'param_grid': params,
            'scoring': 'neg_mean_squared_error',
            'n_jobs': n_jobs,
            'cv': 3,
            'return_train_score': False,
        }

        # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing
        # it there raises a TypeError; only pass it on versions that still
        # accept it so the fold averaging stays the same on older installs
        if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
            search_kwargs['iid'] = False

        # try all combination of parameters with the fit
        search_results = GridSearchCV(**search_kwargs).fit(X_grid, y_grid)

        # apply the best parameters to the estimator that was passed in
        est.set_params(**search_results.best_params_)

        # fit the optimal estimator with the data
        est.fit(X_grid, y_grid)

        return est

    def fit_and_predict_tree(self, pos, print_results=False):
        '''
        Fit a grid-searched decision tree for one position and return the
        blended prediction and cluster standard deviation for each test player.

        Input: pos           -- position key into self.pos ('QB', 'RB', 'WR', 'TE').
               print_results -- currently unused; kept so existing callers
                                keep working.
        Return: A dataframe with columns player | PredMean | ClusterStd, where
                PredMean is the equal-weight average of the player's 'pred'
                value and the mean actual points of their training cluster.
        '''

        #----------
        # Train the Decision Tree with GridSearch optimization
        #----------

        from sklearn.tree import DecisionTreeRegressor

        # train decision tree with _searching method
        dtree = self._searching(DecisionTreeRegressor(random_state=1), self.pos[pos]['tree_params'],
                                self.X_train, self.y)

        #----------
        # Calculate each cluster's mean and standard deviation
        #----------

        # the leaf each training row lands in is its cluster; the series keeps
        # the training index so it lines up with y even when that index is not
        # a plain 0..n-1 range
        train_clusters = pd.Series(dtree.apply(self.X_train), index=self.X_train.index, name='Cluster')
        train_results = pd.concat([train_clusters, self.y], axis=1)

        # calculate the average and standard deviation of points scored by cluster
        train_results = train_results.groupby('Cluster', as_index=False).agg({'y_act': ['mean', 'std']})
        train_results.columns = ['Cluster', 'ClusterMean', 'ClusterStd']

        #----------
        # Add the cluster to test results and resulting group mean / std: Player | Pred | StdDev
        #----------

        # grab the player, prediction, and add cluster to dataset (again kept
        # on the test index so rows cannot be misaligned)
        test_clusters = pd.Series(dtree.apply(self.X_test), index=self.X_test.index, name='Cluster')
        test_results = pd.concat([self.df_test[['player', 'pred']], test_clusters], axis=1)

        # merge the test results with the train result on cluster to add mean cluster and std
        test_results = pd.merge(test_results, train_results, how='inner', on='Cluster')

        # calculate an overall prediction mean
        test_results['PredMean'] = (0.5*test_results.pred + 0.5*test_results.ClusterMean)

        # pull out relevant results for creating distributions
        test_results = test_results[['player', 'PredMean', 'ClusterStd']]

        return test_results

In [None]:
import time
start = time.time()

# number of random draws generated per player, and the constant every draw is
# multiplied by (presumably per-game points scaled to a 16-game season -- TODO confirm)
n_draws = 1500
draw_multiplier = 16

# build the Player | PredMean | ClusterStd | pos table one position at a time;
# the frames are collected in a list and concatenated once instead of growing
# a dataframe inside the loop
dist_frames = []
for pos in ['aQB', 'bRB', 'cWR', 'dTE']:

    train_results, test_results = rf_train_data(pts_dict, pos, engine, table_info)

    cluster_dev = DataGeneration(train_results, test_results)
    dist_input = cluster_dev.fit_and_predict_tree(pos[1:])
    dist_input['pos'] = pos[1:]
    dist_frames.append(dist_input)

distributions = pd.concat(dist_frames).reset_index(drop=True)

# draw every player's distribution in one call: the (n_players, 1) mean and
# std columns broadcast against the (n_players, n_draws) output, so row i
# holds n_draws samples from Normal(PredMean[i], ClusterStd[i])
pred_mean = distributions['PredMean'].values.reshape(-1, 1)
cluster_std = distributions['ClusterStd'].values.reshape(-1, 1)
data = np.random.normal(loc=pred_mean, scale=cluster_std,
                        size=(len(distributions), n_draws)) * draw_multiplier

data = pd.concat([distributions.player, distributions.pos, pd.DataFrame(data)], axis=1)
time.time()-start