# User Inputs

# Load Packages

In [1]:
# season to process; used as the inclusive upper bound of the RB ADP pull below
year=2018

In [2]:
import pandas as pd
import os
import sqlite3
from data_functions import *
pd.options.mode.chained_assignment = None
import numpy as np

# set core path (absolute and machine-specific; the database lives under its Data folder)
path = '/Users/Mark/Documents/Github/Fantasy_Football'

# set the database name (file name of the SQLite database holding the season stats)
db_name = 'Season_Stats.sqlite3'

In [3]:
#==========
# Clean the ADP data
#==========

'''
Cleaning the ADP data by selecting relevant features, and extracting the name and team
from the combined string column. Note that the year is not shifted back because the 
stats will be used to calculate FP/G for the rookie in that season, but will be removed
prior to training. Thus, the ADP should match the year from the stats.
'''

def clean_adp(data_adp, year):
    """Clean one season of raw ADP data into a player/team/year/avg_pick frame.

    Args:
        data_adp: Raw ADP table for a single season. After its first column is
            discarded it must expose 'Player' and 'Avg. Pick', either under
            those names or under the integer labels 1 and 2.
        year: Season stamped on every row. It is not shifted back a year, so
            it lines up with the stats of that same season.

    Returns:
        A new DataFrame with the columns ``player``, ``team``, ``year`` (float)
        and ``avg_pick``. The frame passed in is not modified.

    Note:
        Relies on ``team_select``, ``name_select`` and ``name_clean`` being in
        scope (presumably provided by the ``data_functions`` star import).
    """

    #--------
    # Select relevant columns and clean special figures
    #--------

    # assign() returns a new frame, so the caller's frame is left untouched;
    # the first column is then dropped and positional labels get their names
    df_adp = data_adp.assign(year=year).iloc[:, 1:].rename(columns={
        1: 'Player',
        2: 'Avg. Pick',
        3: 'Min. Pick',
        4: 'Max. Pick',
        5: '# Drafts Selected In'
    })

    # selecting relevant columns and dropping na
    df_adp = df_adp[['Player', 'year', 'Avg. Pick']].dropna()

    # store year as float (no shift: ADP year must match the stats year)
    df_adp['year'] = df_adp.year.astype('float')

    # selecting team and player name information from combined string;
    # the team must be read before 'Player' is overwritten with the name
    df_adp['Tm'] = df_adp.Player.apply(team_select)
    df_adp['Player'] = df_adp.Player.apply(name_select)
    df_adp['Player'] = df_adp.Player.apply(name_clean)

    # format and rename columns
    df_adp = df_adp[['Player', 'Tm', 'year', 'Avg. Pick']]

    colnames_adp = {
        'Player': 'player',
        'Tm': 'team',
        'year': 'year',
        'Avg. Pick': 'avg_pick'
    }

    return df_adp.rename(columns=colnames_adp)

# Running Backs

In [4]:
url_adp_rush = 'http://www03.myfantasyleague.com/{}/adp?COUNT=100&POS=RB&ROOKIES=0&INJURED=1&CUTOFF=5&FRANCHISES=-1&IS_PPR=-1&IS_KEEPER=0&IS_MOCK=0&TIME='
data_adp_rush = pd.DataFrame()

# Pull one RB ADP table per season from 2004 through `year` (inclusive).
# The loop uses its own name so the user-input `year` is never rebound, and
# the cleaned frames are collected and concatenated once after the loop.
rb_adp_frames = []
for adp_year in range(2004, year+1):
    url_year = url_adp_rush.format(str(adp_year))
    # the second table parsed from the page is the one used as the ADP listing
    f = pd.read_html(url_year, header=0)[1]
    f = f.assign(year=adp_year)
    f = clean_adp(f, adp_year)
    rb_adp_frames.append(f)

# an empty season range yields an empty frame rather than a concat error
rb_adp = pd.concat(rb_adp_frames, axis=0) if rb_adp_frames else pd.DataFrame()
rb_adp = rb_adp.reset_index(drop=True)

In [5]:
#===========
# Pull out Rookie seasons from training dataframe
#===========

'''
Loop through each player and select their minimum year, which will likely be their 
rookie season. Weird outliers will be removed later on.
'''

# open the stats database configured at the top of the notebook
conn = sqlite3.connect(os.path.join(path, 'Data', db_name))
query = "SELECT * FROM RB_Stats A"
rb = pd.read_sql_query(query, con=conn)

# For every player keep the row(s) of his earliest season. Groups are visited
# in order of first appearance (sort=False), and the pieces are concatenated
# once instead of growing a frame with DataFrame.append (removed in pandas 2.0).
first_seasons = [
    player_rows[player_rows.year == player_rows.year.min()]
    for _, player_rows in rb.groupby('player', sort=False)
]
rookies = pd.concat(first_seasons, axis=0) if first_seasons else pd.DataFrame()

# reset index and grab basic info columns and target columns
rookies = rookies.reset_index(drop=True)
rookies = rookies[['player', 'pos', 'team', 'games', 'games_started', 'year', 'age', 'rush_yds',
                   'rush_td', 'receptions', 'rec_yds', 'rec_td']]

# per-game targets derived from the season totals
rookies['total_td'] = rookies.rec_td + rookies.rush_td
rookies['rush_yd_per_game'] = rookies.rush_yds / rookies.games
rookies['rec_yd_per_game'] = rookies.rec_yds / rookies.games
rookies['rec_per_game'] = rookies.receptions / rookies.games
rookies['td_per_game'] = rookies.total_td / rookies.games

# the raw totals are no longer needed once the per-game rates exist
rookies = rookies.drop(['rush_yds', 'rec_yds', 'rec_td', 'receptions', 'games', 'rush_td', 'total_td'], axis=1)

In [7]:
#==========
# Pulling in the Player Profiler statistics
#==========

'''
Pull in the player profiler statistics and clean up any formatting issues. The cleaned
statistics are then joined to the existing player dataframe in the following cell.
'''

# raw Player Profiler header -> feature name used in the rest of the notebook
colnames = {
    'Full Name': 'player',
    'Position': 'position',
    '20-Yard Shuttle': 'shuffle_20_yd',
    'Athleticism Score': 'athlete_score',
    'SPARQ-x': 'sparq',
    '3-Cone Drill': 'three_cone',
    'Bench Press': 'bench_press',
    'Speed Score': 'speed_score',
    '40-Yard Dash': 'forty',
    'Broad Jump': 'broad_jump',
    'Vertical Jump': 'vertical',
    'Burst Score': 'burst_score',
    'Agility Score': 'agility_score',
    'Hand Size': 'hand_size',
    'Age': 'pp_age',
    'Arm Length': 'arm_length',
    'Height (Inches)': 'height',
    'Weight': 'weight',
    'Draft Pick': 'draft_pick',
    'BMI': 'bmi',
    'Breakout Age': 'breakout_age',
    'College YPC': 'college_ypc',
    'Breakout Year': 'breakout_year',
    'College Dominator Rating': 'dominator_rating',
    'College Target Share': 'college_tgt_share',
}

# read the profiler csv, turn dash placeholders into nulls, apply the feature
# names, and give undrafted players the draft slot 7.33
data_pp = (
    pd.read_csv('/Users/Mark/Documents/Github/Fantasy_Football/Data/RookieRBProfiler.csv')
    .replace("-", float('nan'))
    .rename(columns=colnames)
    .replace("Undrafted", 7.33)
)

def draft_pick(col):
    """Convert a 'round.pick' draft slot into a continuous pick number.

    The value is stringified and split on '.', giving the round and the pick
    within the round; the result is ``32 * round + pick - 32`` (the formula
    treats every round as 32 picks).

    Args:
        col: Draft slot such as ``'2.10'`` or ``7.33``; may be missing
            (NaN or None).

    Returns:
        The continuous pick number as a float, or NaN when ``col`` is missing.
    """
    # a missing slot would stringify to 'nan', which has no pick component
    if pd.isna(col):
        return float('nan')
    x = [float(val) for val in str(col).split('.')]
    return 32*x[0] + x[1] - 32

# create continuous draft pick number by mapping every 'draft_pick' entry
# through draft_pick()
data_pp['draft_pick'] = data_pp['draft_pick'].apply(draft_pick)

def weight_clean(col):
    """Return the part of ``col`` before its first space, as a float.

    The value is stringified first, so ``'215 lbs'`` becomes ``215.0`` and a
    plain number passes through as its float value.
    """
    leading_token, _, _ = str(col).partition(' ')
    return float(leading_token)

# clean up the weight to remove lbs
data_pp['weight'] = data_pp['weight'].apply(weight_clean)

# Convert every column after the first two to float. Assigning by column label
# replaces the columns outright, so the float dtype is kept; assigning through
# .iloc[:, 2:] can write the values back into the existing object columns.
numeric_cols = data_pp.columns[2:]
data_pp[numeric_cols] = data_pp[numeric_cols].astype('float')

# select only relevant columns before joining
data_pp = data_pp[['player', 'pp_age', 'shuffle_20_yd', 'athlete_score', 'sparq', 'three_cone', 'bench_press',
                   'speed_score', 'forty', 'broad_jump', 'vertical', 'burst_score', 'agility_score',
                   'hand_size', 'arm_length', 'height', 'weight', 'draft_pick', 'bmi', 'breakout_age' ,
                   'college_ypc', 'breakout_year', 'dominator_rating', 'college_tgt_share']]

In [8]:
#===========
# Merge player profiler data with statistical data
#===========

'''
The college and combine statistics are merged with the traditional statistics using an
inner join, so players missing from either source are dropped. Known non-rookies are then
removed, and the remaining null values are filled with the column median.
'''

# merge statistical data with player_profiler data (inner join: only players
# present in both frames survive)
rookies = pd.merge(rookies, data_pp, how='inner', right_on='player', left_on='player')

# select players who aren't rookies and drop
to_drop = ['Rex Burkhead', 'Corey Grant', 'Shaun Draughn', 'LaDainian Tomlinson',
           'Chris Warren', 'Andre Brown', 'Matt Asiata', 'Chris Ivory', 'Mike Tolbert',
           'Shonn Greene', 'Danny Woodhead']
to_drop_idx = rookies[rookies.player.isin(to_drop)].index
rookies = rookies.drop(to_drop_idx, axis=0).reset_index(drop=True)

# set the age to the player profile age and drop pp_age
rookies.loc[: , 'age'] = rookies.loc[:, 'pp_age']
rookies = rookies.drop('pp_age', axis=1)

# fill in null values with the median; columns that have no median (text
# columns such as the player name) are skipped, any other error surfaces
for col in rookies.columns:
    try:
        col_median = rookies[col].median()
    except (TypeError, ValueError):
        continue
    rookies.loc[:, col] = rookies.loc[:, col].fillna(col_median)

In [9]:
# attach ADP to each rookie season on player + year; 'team' exists in both
# frames, so keep the stats-side copy and discard the ADP-side one
adp_keys = ['player', 'year']
rookies = (
    rookies.merge(rb_adp, how='inner', left_on=adp_keys, right_on=adp_keys)
    .drop(columns='team_y')
    .rename(columns={'team_x': 'team'})
)

In [10]:
import matplotlib.pyplot as plt
# log-scaled versions of the two pick columns
rookies['log_draft_pick'] = np.log(rookies['draft_pick'])
rookies['log_avg_pick'] = np.log(rookies['avg_pick'])

# interaction terms built on the speed score
rookies['speed_weight'] = rookies['speed_score'] * rookies['weight']
rookies['speed_catch'] = rookies['speed_score'] * rookies['college_tgt_share']

# display only: most recent seasons first (the result is not stored)
rookies.sort_values(by='year', ascending=False)

In [15]:
# download the first table of the pro-football-reference combine results page;
# the query string restricts it to running backs (pos=RB) for 2019 only
# NOTE(review): the year is hard-coded in the URL rather than taken from `year` — confirm
combine_stats = pd.read_html('https://www.pro-football-reference.com/play-index/nfl-combine-results.cgi?request=1&year_min=2019&year_max=2019&pos%5B%5D=RB&show=all&order_by=year_id')[0]

In [19]:
# keep the identity, measurement and drill columns plus the draft result
combine_cols = ['Player', 'Pos', 'Age', 'Height', 'Wt', '40YD', 'Vertical',
                'BenchReps', 'Broad Jump', '3Cone', 'Shuttle', 'Drafted (tm/rnd/yr)']
combine_stats = combine_stats[combine_cols]


In [23]:
# correlation of every numeric feature with rushing yards per game, ascending;
# text columns (player, pos, team) are excluded up front because corr() on
# newer pandas raises on them instead of silently dropping them
rookies.select_dtypes(include=['number', 'bool']).corr()['rush_yd_per_game'].sort_values()

log_avg_pick        -0.514326
avg_pick            -0.493015
log_draft_pick      -0.457913
draft_pick          -0.369090
year                -0.155923
breakout_year       -0.141206
breakout_age        -0.110688
forty               -0.108094
vertical            -0.012490
college_tgt_share    0.027745
college_ypc          0.032221
burst_score          0.033275
sparq                0.067383
broad_jump           0.075400
bench_press          0.095831
age                  0.096272
speed_catch          0.102934
athlete_score        0.108746
shuffle_20_yd        0.108836
dominator_rating     0.118369
height               0.126386
three_cone           0.148212
agility_score        0.148414
hand_size            0.151401
bmi                  0.190257
arm_length           0.226396
weight               0.276960
speed_score          0.298959
speed_weight         0.342600
rec_yd_per_game      0.381557
rec_per_game         0.410913
td_per_game          0.767514
games_started        0.804301
rush_yd_pe

In [20]:
# display only: show the combine results frame
combine_stats

Unnamed: 0,Player,Pos,Age,Height,Wt,40YD,Vertical,BenchReps,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr)
0,Trayveon Williams,RB,21.0,5-8,206,4.51,33.0,19.0,121.0,7.44,4.44,Cincinnati Bengals / 6th / 182nd pick / 2019
1,James Williams,RB,22.0,5-9,197,4.58,36.5,,118.0,7.01,4.25,
2,Dexter Williams,RB,22.0,5-11,212,4.57,36.0,17.0,130.0,7.0,4.16,Green Bay Packers / 6th / 194th pick / 2019
3,Mike Weber,RB,21.0,5-10,211,4.47,33.5,22.0,,,,Dallas Cowboys / 7th / 218th pick / 2019
4,Devin Singletary,RB,21.0,5-7,203,4.66,35.0,15.0,117.0,7.32,4.4,Buffalo Bills / 3rd / 74th pick / 2019
5,Lj Scott,RB,,6-0,227,,33.0,21.0,120.0,7.27,4.34,
6,Jordan Scarlett,RB,23.0,5-11,208,4.47,30.0,21.0,116.0,7.37,4.63,Carolina Panthers / 5th / 154th pick / 2019
7,Miles Sanders,RB,21.0,5-11,211,4.49,36.0,20.0,124.0,6.89,4.19,Philadelphia Eagles / 2nd / 53rd pick / 2019
8,Tony Pollard,RB,21.0,6-0,210,4.52,35.0,13.0,121.0,,,Dallas Cowboys / 4th / 128th pick / 2019
9,Qadree Ollison,RB,22.0,6-1,228,4.58,29.5,19.0,114.0,7.53,4.31,Atlanta Falcons / 5th / 152nd pick / 2019


# write the RB rookie frame to the Rookie_RB_Stats table of the season stats
# database (append_to_db presumably comes from the data_functions star import)
# NOTE(review): if_exist='append' suggests re-running this adds the rows again — confirm
append_to_db(rookies, db_name='Season_Stats.sqlite3', table_name='Rookie_RB_Stats', if_exist='append')

# Wide Receivers

In [83]:
url_adp_rec = 'http://www03.myfantasyleague.com/{}/adp?COUNT=100&POS=WR&ROOKIES=0&INJURED=1&CUTOFF=5&FRANCHISES=-1&IS_PPR=-1&IS_KEEPER=0&IS_MOCK=0&TIME='
data_adp_rec = pd.DataFrame()

# Pull one WR ADP table per season from 2004 through 2017. The loop uses its
# own name so the user-input `year` is not left rebound to the last season,
# and the cleaned frames are concatenated once after the loop.
# NOTE(review): the upper bound is hard-coded and excludes 2018, unlike the RB
# pull which runs through `year` — confirm this is intended.
wr_adp_frames = []
for adp_year in range(2004, 2018):
    url_year = url_adp_rec.format(str(adp_year))
    # the second table parsed from the page is the one used as the ADP listing
    f = pd.read_html(url_year, header=0)[1]
    f = f.assign(year=adp_year)
    f = clean_adp(f, adp_year)
    wr_adp_frames.append(f)

wr_adp = pd.concat(wr_adp_frames, axis=0) if wr_adp_frames else pd.DataFrame()
wr_adp = wr_adp.reset_index(drop=True)

In [85]:
#===========
# Pull out Rookie seasons from training dataframe
#===========

'''
Loop through each player and select their minimum year, which will likely be their 
rookie season. Weird outliers will be removed later on.
'''

# open the stats database configured at the top of the notebook
conn = sqlite3.connect(os.path.join(path, 'Data', db_name))
query = "SELECT * FROM WR_Stats A"
# note: despite the name, `rb` holds the WR stats from here on
rb = pd.read_sql_query(query, con=conn)

# For every player keep the row(s) of his earliest season. Groups are visited
# in order of first appearance (sort=False), and the pieces are concatenated
# once instead of growing a frame with DataFrame.append (removed in pandas 2.0).
first_seasons = [
    player_rows[player_rows.year == player_rows.year.min()]
    for _, player_rows in rb.groupby('player', sort=False)
]
rookies = pd.concat(first_seasons, axis=0) if first_seasons else pd.DataFrame()

# reset index and grab basic info columns and target columns
rookies = rookies.reset_index(drop=True)
rookies = rookies[['player', 'pos', 'team', 'games', 'games_started', 'year', 'age',
                   'receptions', 'rec_yds', 'rec_td']]

# per-game targets derived from the season totals
rookies['total_td'] = rookies.rec_td
rookies['rec_yd_per_game'] = rookies.rec_yds / rookies.games
rookies['rec_per_game'] = rookies.receptions / rookies.games
rookies['td_per_game'] = rookies.total_td / rookies.games

# drop the raw totals; rec_td is deliberately left in the frame by this list
rookies = rookies.drop(['rec_yds', 'receptions', 'games',  'total_td'], axis=1)

In [91]:
# display only: list the column names of the profiler frame
data_pp.columns

Index(['Full Name', 'Position', 'BMI', 'Height (Inches)', 'Weight', 'Age',
       'Hand Size', 'Arm Length', '20-Yard Shuttle', 'Athleticism Score',
       'Catch Radius', 'Height-adjusted Speed Score', 'Bench Press',
       '3-Cone Drill', '40-Yard Dash', 'Broad Jump', 'SPARQ-x',
       'Vertical Jump', 'Burst Score', 'Agility Score', 'College YPR',
       'College Dominator Rating', 'Breakout Year', 'Breakout Age'],
      dtype='object')

In [100]:
#==========
# Pulling in the Player Profiler statistics
#==========

'''
Pull in the player profiler statistics and clean up any formatting issues, followed by
left joining the statistics to the existing player dataframe.
'''

# raw Player Profiler header -> feature name used in the rest of the notebook
colnames = {
    'Full Name': 'player',
    'Position': 'position',
    '20-Yard Shuttle': 'shuffle_20_yd',
    'Athleticism Score': 'athlete_score',
    'SPARQ-x': 'sparq',
    '3-Cone Drill': 'three_cone',
    'Bench Press': 'bench_press',
    'Height-adjusted Speed Score': 'speed_score',
    '40-Yard Dash': 'forty',
    'Broad Jump': 'broad_jump',
    'Vertical Jump': 'vertical',
    'Burst Score': 'burst_score',
    'Agility Score': 'agility_score',
    'Hand Size': 'hand_size',
    'Age': 'pp_age',
    'Arm Length': 'arm_length',
    'Height (Inches)': 'height',
    'Weight': 'weight',
    'Draft Pick': 'draft_pick',
    'BMI': 'bmi',
    'Breakout Age': 'breakout_age',
    'College YPR': 'college_ypr',
    'Breakout Year': 'breakout_year',
    'College Dominator Rating': 'dominator_rating',
    'College Target Share': 'college_tgt_share',
    'Catch Radius': 'catch_radius',
}

# read the WR profiler csv, turn dash placeholders into nulls, apply the
# feature names, and replace any "Undrafted" entry with the draft slot 7.33
data_pp = (
    pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/Player_Data/Rookie_WR/wr_player_profiler.csv')
    .replace("-", float('nan'))
    .rename(columns=colnames)
    .replace("Undrafted", 7.33)
)

def draft_pick(col):
    """Convert a 'round.pick' draft slot into a continuous pick number.

    The value is stringified and split on '.', giving the round and the pick
    within the round; the result is ``32 * round + pick - 32`` (the formula
    treats every round as 32 picks).

    Args:
        col: Draft slot such as ``'2.10'`` or ``7.33``; may be missing
            (NaN or None).

    Returns:
        The continuous pick number as a float, or NaN when ``col`` is missing.
    """
    # a missing slot would stringify to 'nan', which has no pick component
    if pd.isna(col):
        return float('nan')
    x = [float(val) for val in str(col).split('.')]
    return 32*x[0] + x[1] - 32

# create continuous draft pick number
# (left disabled: the WR profiler column listing shown in this notebook has no 'Draft Pick' field)
#data_pp['draft_pick'] = data_pp['draft_pick'].apply(draft_pick)

def weight_clean(col):
    """Return the part of ``col`` before its first space, as a float.

    The value is stringified first, so ``'215 lbs'`` becomes ``215.0`` and a
    plain number passes through as its float value.
    """
    leading_token, _, _ = str(col).partition(' ')
    return float(leading_token)

# clean up the weight to remove lbs
data_pp['weight'] = data_pp['weight'].apply(weight_clean)

# Convert every column after the first two to float. Assigning by column label
# replaces the columns outright, so the float dtype is kept; assigning through
# .iloc[:, 2:] can write the values back into the existing object columns.
numeric_cols = data_pp.columns[2:]
data_pp[numeric_cols] = data_pp[numeric_cols].astype('float')

# select only relevant columns before joining
data_pp = data_pp[['player', 'pp_age', 'shuffle_20_yd', 'athlete_score', 'sparq', 'three_cone', 'bench_press',
                   'speed_score', 'forty', 'broad_jump', 'vertical', 'burst_score', 'agility_score',
                   'hand_size', 'arm_length', 'height', 'weight',  'bmi', 'breakout_age' ,
                   'college_ypr', 'breakout_year', 'dominator_rating']]

In [101]:
#===========
# Merge player profiler data with statistical data
#===========

'''
The college and combine statistics are merged with the traditional statistics. Players who do
not have any player profiler information are dropped, as well as players who don't have 
any stats information (excluding this year's data). The dataframes are recombined for NA filling.
'''

# left-join the profiler measurements onto the rookie seasons
all_data = rookies.merge(data_pp, how='left', left_on='player', right_on='player')

# drop rows whose height is null; after the left join this includes every
# player without a profiler match
all_data = all_data.dropna(subset=['height'], axis=0)

# prediction rows are the 2018 season; training rows are earlier seasons and
# must additionally have a receiving-yards-per-game value
df_predict = all_data[all_data.year == 2018]
df_train = all_data[all_data.year < 2018].dropna(subset=['rec_yd_per_game'], axis=0)

# stack train and predict again so nulls can be filled over both, ordered by year
all_data = (
    pd.concat([df_train, df_predict], axis=0)
    .reset_index(drop=True)
    .sort_values(by='year')
)

In [110]:
# interaction feature: hand size times forty time
all_data['handsize_forty'] = all_data.hand_size * all_data.forty

# correlation of every numeric feature with games started; text columns are
# excluded up front because corr() on newer pandas raises on them instead of
# silently dropping them
all_data.select_dtypes(include=['number', 'bool']).corr()['games_started']

games_started         1.000000
year                 -0.110354
age                  -0.144836
rec_td                0.544668
rec_yd_per_game       0.671573
rec_per_game          0.670402
td_per_game           0.474986
pp_age                0.019126
shuffle_20_yd         0.099981
athlete_score         0.065389
sparq                -0.026290
three_cone            0.038885
bench_press           0.013708
speed_score           0.040115
forty                 0.048317
broad_jump           -0.012957
vertical             -0.065289
burst_score          -0.048543
agility_score         0.075127
hand_size             0.022626
arm_length            0.099568
height                0.080040
weight                0.097365
bmi                   0.037339
breakout_age          0.030172
college_ypr          -0.010901
breakout_year        -0.019486
dominator_rating      0.046858
handsize_dominator    0.028188
handsize_forty        0.028188
Name: games_started, dtype: float64

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Complete the final portion of cleanup, such as filling in null values,
dropping statistical columns that would leak information, creating train / predict
sets, and removing any remaining null values from the datasets.
'''

# fill NA with the column median for the columns from 'bmi' through
# 'breakout_year'; numeric_only=True keeps text columns such as the player
# name out of the median calculation, which newer pandas would reject
all_data.loc[:,'bmi':'breakout_year'] = all_data.loc[:,'bmi':'breakout_year'].fillna(all_data.median(numeric_only=True))

# split out train and predict
df_train = all_data[all_data.year < 2018]
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)

# drop any remaining NA from df_train and remove FP
# NOTE(review): no visible cell creates an 'fp' or 'fp_per_game' column on
# all_data — confirm these drops still match the frame before running
df_train = df_train.dropna()
df_train = df_train.drop('fp', axis=1)

# set age to player profiler age and drop fp columns
df_predict.age = df_predict.pp_age
df_predict = df_predict.drop(['pp_age', 'fp', 'fp_per_game'], axis=1)

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Complete the final portion of cleanup, such as filling in null values,
dropping statistical columns that would leak information, creating train / predict
sets, and removing any remaining null values from the datasets.
'''

# fill NA with the column median for the columns from 'bmi' through
# 'breakout_year'; numeric_only=True keeps text columns such as the player
# name out of the median calculation, which newer pandas would reject
all_data.loc[:,'bmi':'breakout_year'] = all_data.loc[:,'bmi':'breakout_year'].fillna(all_data.median(numeric_only=True))

# columns to drop (stats columns)
# NOTE(review): several of these names ('tgt', 'yds', 'td', 'catch_pct', ...)
# are not created by any visible cell — confirm the list against all_data
to_drop = ['tgt', 'receptions', 'yds', 'td', 'catch_pct', 'games', 'games_started', 'long',
           'rec_per_game', 'yd_per_game', 'yd_per_rec', 'fmb']

all_data = all_data.drop(to_drop, axis=1)

# split out train and predict
df_train = all_data[all_data.year < 2018]
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)

# drop any remaining NA from df_train and remove FP
# NOTE(review): no visible cell creates an 'fp' or 'fp_per_game' column on
# all_data — confirm these drops still match the frame before running
df_train = df_train.dropna()
df_train = df_train.drop('fp', axis=1)

# set age to player profiler age and drop fp columns
df_predict.age = df_predict.pp_age
df_predict = df_predict.drop(['pp_age', 'fp', 'fp_per_game'], axis=1)