# User Inputs

# Load Packages

In [None]:
import pandas as pd
import os
import sqlite3
from data_functions import *
pd.options.mode.chained_assignment = None
import numpy as np

# Running Backs

In [None]:
#==========
# Scraping the statistical and ADP data
#==========

'''
Download the rushing statistics and the running back ADP tables for the
configured seasons through the custom data_load function.
'''

# first and last season of player statistics to pull
year_start, year_end = 2000, 2017

# rushing statistics ({0} is presumably filled with the season by data_load)
url_player = 'https://www.pro-football-reference.com/years/{0}/rushing.htm'
data_player = data_load(year_start, year_end, url_player, 0)

# historical running back ADP, requested through year_end + 1
url_adp = ('http://www03.myfantasyleague.com/{0}/adp?COUNT=100&POS=RB&ROOKIES=0&INJURED=1&CUTOFF=10'
           '&FRANCHISES=-1&IS_PPR=1&IS_KEEPER=0&IS_MOCK=-1&TIME=')
data_adp = data_load(year_start, year_end + 1, url_adp, 1)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Promote the first scraped row to the header, renumber the columns, give every
stat a readable name, and clean special characters out of the player names.
'''

# the first row becomes the header; the first row and first column are then cut off
data_player.columns = data_player.iloc[0, :]
df_player = data_player.iloc[1:, 1:]

# replace the header labels with plain positional integers
df_player = df_player.T.reset_index(drop=True).T

# extra total yards column for some reason, which is why 25 gets no name below
df_player = df_player.drop(columns=25)

# readable names for positions 0 through 24, in order
leading_stats = [
    'player', 'team', 'age', 'pos', 'games', 'games_started',
    'att', 'rush_yds', 'rush_td', 'long_rush', 'yard_per_att',
    'rush_yd_per_game', 'att_per_game',
    'tgt', 'receptions', 'rec_yds', 'yd_per_rec', 'rec_td', 'rec_long',
    'rec_per_game', 'rec_yd_per_game', 'catch_pct',
    'total_touches', 'yard_per_touch', 'scrimmage_yds',
]
colnames_player = dict(enumerate(leading_stats))
colnames_player.update({26: 'total_td', 27: 'fmb', 28: 'year'})

# apply the names, then clean the player name and cast the season to float
df_player = df_player.rename(columns=colnames_player)
df_player['player'] = df_player['player'].apply(name_clean)
df_player['year'] = df_player['year'].astype('float')

In [None]:
#==========
# Clean the ADP data
#==========

'''
Clean the ADP data: keep the relevant columns and pull the player name and team
out of the combined string column. The year is intentionally NOT shifted back,
because the stats are used to compute FP/G for the rookie in that same season
and are removed before training. The ADP year therefore lines up with the
stats year as-is.
'''

# keep only the needed columns and discard rows with missing values
df_adp = data_adp[['Player', 'year', 'Avg. Pick']].dropna()

# the raw Player string combines name and team; split it into two columns
raw_player = df_adp['Player']
df_adp['Tm'] = raw_player.apply(team_select)
df_adp['Player'] = raw_player.apply(name_select).apply(name_clean)

# cast the year to float only (no shift, see the note above)
df_adp['year'] = df_adp['year'].astype('float')

# final column order and names
colnames_adp = {
    'Player': 'player',
    'Tm': 'team',
    'year': 'year',
    'Avg. Pick': 'avg_pick',
}
df_adp = df_adp[['Player', 'Tm', 'year', 'Avg. Pick']].rename(columns=colnames_adp)

In [None]:
#==========
# Merging and formatting all player-based data.
#==========

'''
Combine the statistical and ADP data into one dataframe, translate the ADP team
abbreviations to the ones used by the stats, keep only the relevant columns,
and cast the numerical features to float.
'''

# right join so every ADP row is kept, with or without matching stats
df_merged = pd.merge(df_player, df_adp, how='right', on=['player', 'year'])

# ADP abbreviation -> stats abbreviation; these ones are identical in both sources
same_abbrev = ['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL',
               'DEN', 'DET', 'HOU', 'IND', 'LAC', 'LAR', 'MIA', 'MIN', 'NYG',
               'NYJ', 'OAK', 'PHI', 'PIT', 'SEA', 'SFO', 'TEN', 'WAS']
adp_to_player_teams = {abbrev: abbrev for abbrev in same_abbrev}

# ...and these ones need translating
adp_to_player_teams.update({
    'GBP': 'GNB',
    'JAC': 'JAX',
    'KCC': 'KAN',
    'SDC': 'LAC',
    'RAM': 'LAR',
    'NEP': 'NWE',
    'NOS': 'NOR',
    'TBB': 'TAM',
})

# both inputs carried a 'team' column; the translated ADP one is kept as 'team'
df_merged['team_y'] = df_merged['team_y'].map(adp_to_player_teams)
df_merged = df_merged.rename(columns={'team_y': 'team'})

df_merged['pos'] = 'RB'

# rearrange columns
ordered_cols = [
    'player', 'pos', 'team', 'year', 'age', 'avg_pick',
    'att', 'rush_yds', 'yard_per_att', 'att_per_game', 'rush_yd_per_game',
    'rush_td', 'tgt', 'receptions', 'rec_yds', 'yd_per_rec', 'rec_td',
    'rec_long', 'long_rush', 'rec_per_game', 'rec_yd_per_game', 'catch_pct', 'scrimmage_yds',
    'total_td', 'fmb', 'games', 'games_started', 'total_touches',
]
df_merged = df_merged[ordered_cols]

# clean catch_pct, then cast everything from 'year' onward to float where possible
df_merged['catch_pct'] = df_merged['catch_pct'].apply(name_clean)
as_float = df_merged.iloc[:, 3:].astype('float', errors='ignore')
df_merged.iloc[:, 3:] = as_float

In [None]:
#==========
# Load team-based statistics
#==========

'''
Read the offensive line, quarterback, and team offense tables and left join
each of them onto the player data by team and year. This gives team-based
context for the rookies and enables the grouped statistics built later.
'''

# fill NA with LAR since all Rams players are missing team (fills non-Rams as well)
df_merged['team'] = df_merged['team'].fillna('LAR')

# team based stats for oline and quarterback play, plus team offense tables
oline = pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/oline_rankings.csv', index_col=0)
qb = pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/qb_stats_for_wr.csv', index_col=0)
team_off = pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/team_offensive_efficiency.csv', index_col=0)
team_stats = pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/traditional_team_stats.csv', index_col=0)

# set year + 1 since we aren't predicting forward for rookies
for team_df in (oline, qb, team_off, team_stats):
    team_df['year'] = team_df['year'] + 1

# give qb attempts a unique name before merging
qb = qb.rename(columns={'att': 'qb_att'})

# left join every team table onto the player data
team_files = [oline, qb, team_off, team_stats]
for team_df in team_files:
    df_merged = pd.merge(df_merged, team_df, how='left', on=['team', 'year'])

In [None]:
#==========
# Creating team based grouped statistics
#==========

'''
Build grouped statistics from the team and its other listed backs: the total
attempts and targets per team, and the min / max / mean teammate ADP per team.
'''


def _team_year_agg(frame, column, func, new_name):
    '''Aggregate `column` per (team, year) with `func` and return it renamed to `new_name`.'''
    grouped = frame.groupby(['team', 'year'], group_keys=False)[column].agg(func)
    return grouped.reset_index().rename(columns={column: new_name})


# total attempts per team; year + 1 so each row is joined to the previous season's total
team_touches = _team_year_agg(df_merged, 'att', np.sum, 'rb_att_on_team')
team_touches['year'] = team_touches['year'] + 1
df_merged = pd.merge(df_merged, team_touches, how='left', on=['team', 'year'])
df_merged['available_rush_att'] = 1 - (df_merged['rb_att_on_team'] / df_merged['tm_rush_att'])

# same construction for targets
team_tgts = _team_year_agg(df_merged, 'tgt', np.sum, 'tgt_on_team')
team_tgts['year'] = team_tgts['year'] + 1
df_merged = pd.merge(df_merged, team_tgts, how='left', on=['team', 'year'])
df_merged['available_tgt'] = 1 - (df_merged['tgt_on_team'] / df_merged['tm_pass_att'])

# teammate ADP metrics to see if top ranked player (all three computed before merging)
min_teammate = _team_year_agg(df_merged, 'avg_pick', np.min, 'min_teammate')
max_teammate = _team_year_agg(df_merged, 'avg_pick', np.max, 'max_teammate')
avg_teammate = _team_year_agg(df_merged, 'avg_pick', np.mean, 'avg_teammate')

names = ['min_teammate', 'max_teammate', 'avg_teammate']
for name, teammate_stat in zip(names, [min_teammate, max_teammate, avg_teammate]):
    df_merged = pd.merge(df_merged, teammate_stat, how='inner', on=['team', 'year'])
    # NOTE(review): 'teammate_diff' is overwritten on every pass, so it ends up as
    # avg_teammate - min_teammate; confirm this is the intended feature
    df_merged['teammate_diff'] = df_merged[name] - df_merged['min_teammate']

In [None]:
#==========
# Pulling in the Player Profiler statistics
#==========

'''
Read the player profiler statistics and tidy up their formatting so they can be
left joined onto the existing player dataframe afterwards.
'''

# raw player profiler header -> column name used in the rest of the notebook
colnames = {
    'Full Name': 'player',
    'Position': 'position',
    '20-Yard Shuttle': 'shuffle_20_yd',
    'Athleticism Score': 'athlete_score',
    'SPARQ-x': 'sparq',
    '3-Cone Drill': 'three_cone',
    'Bench Press': 'bench_press',
    'Speed Score': 'speed_score',
    '40-Yard Dash': 'forty',
    'Broad Jump': 'broad_jump',
    'Vertical Jump': 'vertical',
    'Burst Score': 'burst_score',
    'Agility Score': 'agility_score',
    'Hand Size': 'hand_size',
    'Age': 'pp_age',
    'Arm Length': 'arm_length',
    'Height (Inches)': 'height',
    'Weight': 'weight',
    'Draft Pick': 'draft_pick',
    'BMI': 'bmi',
    'Breakout Age': 'breakout_age',
    'College YPC': 'college_ypc',
    'Breakout Year': 'breakout_year',
    'College Dominator Rating': 'dominator_rating',
    'College Target Share': 'college_tgt_share',
}

data_pp = (
    pd.read_csv('/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/Player_Data/Rookie_RB/rb_player_profiler.csv')
    .replace("-", float('nan'))    # dashes become null
    .rename(columns=colnames)
    .replace("Undrafted", 7.33)    # undrafted players get draft slot 7.33
)

def draft_pick(col):
    '''
    Convert a "round.pick" draft slot into one continuous pick number.

    The value is stringified and split on '.'; the part before the dot is used
    as the round and the part after it as the pick within the round, combined
    as 32*round + pick - 32.

    col: the draft slot, e.g. the string '2.05' or the float 7.33.
    Returns the continuous pick number as a float, or NaN when col is missing.
    '''
    # a missing slot stringifies to 'nan', which has no '.' and used to raise IndexError
    if pd.isnull(col):
        return float('nan')

    # a float such as 1.10 stringifies to '1.1' (pick 1, not 10), so slots with a
    # trailing zero are only read correctly when they arrive as strings
    a = str(col).split('.')
    x = [float(val) for val in a]
    return 32*x[0] + x[1] - 32

# turn each round.pick slot into a continuous draft pick number
slot_values = data_pp['draft_pick']
data_pp['draft_pick'] = slot_values.apply(draft_pick)

def weight_clean(col):
    '''Return the text before the first space in col as a float (drops a trailing unit).'''
    number_text, _, _ = str(col).partition(' ')
    return float(number_text)

# strip the unit text from weight so only the number remains
raw_weight = data_pp['weight']
data_pp['weight'] = raw_weight.apply(weight_clean)

# cast everything from the third column onward to float
as_float = data_pp.iloc[:, 2:].astype('float')
data_pp.iloc[:, 2:] = as_float

# keep only the columns that are joined onto the player data
pp_keep = [
    'player', 'pp_age', 'shuffle_20_yd', 'athlete_score', 'sparq', 'three_cone', 'bench_press',
    'speed_score', 'forty', 'broad_jump', 'vertical', 'burst_score', 'agility_score',
    'hand_size', 'arm_length', 'height', 'weight', 'draft_pick', 'bmi', 'breakout_age',
    'college_ypc', 'breakout_year', 'dominator_rating', 'college_tgt_share',
]
data_pp = data_pp[pp_keep]

In [None]:
#===========
# Merge player profiler data with statistical data
#===========

'''
Left join the college and combine statistics onto the traditional statistics.
Rows without player profiler information are dropped, and so are rows before
2018 without any stats. The two parts are then stacked again for NA filling.
'''

# attach the player profiler columns by player name
all_data = pd.merge(df_merged, data_pp, how='left', on='player')

# a null draft_pick marks a player with no player profiler row; drop those
all_data = all_data.dropna(subset=['draft_pick'], axis=0)

# this year's rows are kept as-is; earlier rows must have stats (non-null tgt)
df_predict = all_data[all_data.year == 2018]
df_train = all_data[all_data.year < 2018].dropna(subset=['tgt'], axis=0)

# stack train and predict again so that null values can be filled together
all_data = pd.concat([df_train, df_predict], axis=0).reset_index(drop=True)

In [None]:
#===========
# Calculate Fantasy Points
#===========

'''
Compute the fantasy points. The target is what a player scored in year N,
since all statistical columns are dropped for the rookies afterwards.
All the player profiler stats are filled with the median (check better way?)
and the train and predict columns are split out.
'''

# scoring weights: per yard, per touchdown, per reception, per fumble
Yd_mult, TD_val, Rec_val, Fmb = 0.1, 6.0, 1, -1.0
params = [Yd_mult, TD_val, Rec_val, Fmb]

# total fantasy points as a weighted sum of the stat columns
yd_w, td_w, rec_w, fmb_w = params
all_data['fp'] = (
    yd_w * all_data['rush_yds']
    + yd_w * all_data['rec_yds']
    + td_w * all_data['rush_td']
    + td_w * all_data['rec_td']
    + rec_w * all_data['receptions']
    + fmb_w * all_data['fmb']
)

# fantasy points per game played
all_data['fp_per_game'] = all_data['fp'] / all_data['games']

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Finish the cleanup: fill null values, drop the statistical columns that would
leak information, build the train / predict sets, and remove any remaining
null values from the training set.
'''

# fill NA in the player profiler columns with the column medians
pp_block = all_data.loc[:, 'shuffle_20_yd':'college_tgt_share']
all_data.loc[:, 'shuffle_20_yd':'college_tgt_share'] = pp_block.fillna(all_data.median())

# stats columns to remove
to_drop = [
    'att', 'rush_yds', 'yard_per_att', 'att_per_game', 'rush_yd_per_game',
    'rush_td', 'tgt', 'receptions', 'rec_yds', 'yd_per_rec', 'rec_td', 'rec_long', 'long_rush',
    'rec_per_game', 'rec_yd_per_game', 'catch_pct', 'scrimmage_yds', 'total_td', 'fmb', 'games',
    'games_started', 'total_touches',
]
all_data = all_data.drop(columns=to_drop)

# training rows: before 2018, complete rows only, without total FP
df_train = all_data[all_data.year < 2018]
df_train = df_train.dropna().drop('fp', axis=1)

# prediction rows: 2018, age taken from player profiler, no fp columns
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)
df_predict['age'] = df_predict['pp_age']
df_predict = df_predict.drop(columns=['pp_age', 'fp', 'fp_per_game'])

In [None]:
#===========
# Select the Rookies from this year
#===========

'''
Restrict the prediction dataframe to the rookies of this year's class, plus a
few 2nd year players that are in questionable positions.
'''

# actual rookies, plus a few questionable 2nd year players
actual_rookies = [
    'Saquon Barkley', 'Rashaad Penny', 'Sony Michel',
    'Royce Freeman', 'Kerryon Johnson', 'Ronald Jones',
    'Nick Chubb', 'Bo Scarbrough', 'Kalen Ballage',
    'Ito Smith', 'Nyheim Hines', 'Jaylen Samuels',
    'Josh Adams', 'Mark Walton', 'Boston Scott',
    'Jordan Wilkins', 'Chase Edmonds', 'Justin Jackson',
]

# keep only the listed players
in_rookie_list = df_predict['player'].isin(actual_rookies)
df_predict = df_predict[in_rookie_list].reset_index(drop=True)

In [None]:
#===========
# Pull out Rookie seasons from training dataframe
#===========

'''
Loop through each player and select their minimum year, which will likely be their
rookie season. Also remove CHI Adrian Peterson from the dataset. Note that a few
players who were not actually rookies may be included. Can always filter by age
to remove extra players.
'''

# rows to discard: the CHI Adrian Peterson, plus every row with age above 25
# (parentheses spell out the precedence the original expression relied on: & before |)
is_chi_peterson = (df_train.player == 'Adrian Peterson') & (df_train.team == 'CHI')
ap = df_train[is_chi_peterson | (df_train.age > 25)].index
df_train = df_train.drop(ap, axis=0)

# collect each player's earliest-season rows and concatenate once;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every pass
rookie_frames = []
for player in df_train.player.unique():
    tmp = df_train[df_train.player == player]
    rookie_frames.append(tmp[tmp.year == tmp.year.min()])

# pd.concat rejects an empty list, so fall back to an empty frame as before
rookies = pd.concat(rookie_frames) if rookie_frames else pd.DataFrame()
rookies = rookies.reset_index(drop=True)

# Wide Receivers

In [None]:
#==========
# Scraping the statistical and ADP data
#==========

'''
Download the receiving statistics and the wide receiver ADP tables for the
configured seasons through the custom data_load function.
'''

# first and last season of player statistics to pull
year_start, year_end = 1998, 2017

# receiving statistics ({0} is presumably filled with the season by data_load)
url_player = 'https://www.pro-football-reference.com/years/{0}/receiving.htm'
data_player = data_load(year_start, year_end, url_player, 0)

# historical wide receiver ADP, requested through year_end + 1
url_adp = ('http://www03.myfantasyleague.com/{0}/adp?COUNT=100&POS=WR&ROOKIES=0&INJURED=1&CUTOFF=10'
           '&FRANCHISES=-1&IS_PPR=1&IS_KEEPER=0&IS_MOCK=-1&TIME=')
data_adp = data_load(year_start, year_end + 1, url_adp, 1)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Format the scraped receiving table, clean special characters out of the player
and catch percentage columns, rename the columns, and drop same-name players.
'''

# the shared helper formats the pulled player data
df_player = player_format(data_player)

# clean the player name and the catch percentage text
for raw_col in ('Player', 'Ctch%'):
    df_player[raw_col] = df_player[raw_col].apply(name_clean)

# scraped header -> column name used in the rest of the notebook
colnames_player = {
    'Player': 'player', 'Tm': 'team', 'Pos': 'pos', 'year': 'year', 'Age': 'age',
    'Ctch%': 'catch_pct', 'Fmb': 'fmb', 'G': 'games', 'GS': 'games_started',
    'Lng': 'long', 'R/G': 'rec_per_game', 'Rec': 'receptions', 'TD': 'td',
    'Tgt': 'tgt', 'Y/G': 'yd_per_game', 'Y/R': 'yd_per_rec', 'Yds': 'yds',
}
df_player = df_player.rename(columns=colnames_player)
df_player['year'] = df_player['year'].astype('float')

# drop players with same name
is_dropped_smith = (df_player.player == "Steve Smith") & df_player.team.isin(['STL', 'PHI', 'NYG'])
is_dropped_williams = (df_player.player == 'Mike Williams') & (df_player.team != 'LAC')
ss = df_player[is_dropped_smith].index
mw = df_player[is_dropped_williams].index

df_player = df_player.drop(ss, axis=0).drop(mw, axis=0).reset_index(drop=True)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Format the scraped receiving table, clean special characters out of the player
and catch percentage columns, rename the columns, and drop same-name players.
'''

# NOTE(review): this cell repeats the preceding "Clean the Statistical Data" cell
# verbatim; it rebuilds df_player from data_player, so it looks redundant
# (possibly an export artifact) - confirm and remove

# the shared helper formats the pulled player data
df_player = player_format(data_player)

# clean the player name and the catch percentage text
for raw_col in ('Player', 'Ctch%'):
    df_player[raw_col] = df_player[raw_col].apply(name_clean)

# scraped header -> column name used in the rest of the notebook
colnames_player = {
    'Player': 'player', 'Tm': 'team', 'Pos': 'pos', 'year': 'year', 'Age': 'age',
    'Ctch%': 'catch_pct', 'Fmb': 'fmb', 'G': 'games', 'GS': 'games_started',
    'Lng': 'long', 'R/G': 'rec_per_game', 'Rec': 'receptions', 'TD': 'td',
    'Tgt': 'tgt', 'Y/G': 'yd_per_game', 'Y/R': 'yd_per_rec', 'Yds': 'yds',
}
df_player = df_player.rename(columns=colnames_player)
df_player['year'] = df_player['year'].astype('float')

# drop players with same name
is_dropped_smith = (df_player.player == "Steve Smith") & df_player.team.isin(['STL', 'PHI', 'NYG'])
is_dropped_williams = (df_player.player == 'Mike Williams') & (df_player.team != 'LAC')
ss = df_player[is_dropped_smith].index
mw = df_player[is_dropped_williams].index

df_player = df_player.drop(ss, axis=0).drop(mw, axis=0).reset_index(drop=True)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Format the scraped receiving table, clean special characters out of the player
and catch percentage columns, rename the columns, and drop same-name players.
'''

# NOTE(review): this cell repeats the preceding "Clean the Statistical Data" cell
# verbatim; it rebuilds df_player from data_player, so it looks redundant
# (possibly an export artifact) - confirm and remove

# the shared helper formats the pulled player data
df_player = player_format(data_player)

# clean the player name and the catch percentage text
for raw_col in ('Player', 'Ctch%'):
    df_player[raw_col] = df_player[raw_col].apply(name_clean)

# scraped header -> column name used in the rest of the notebook
colnames_player = {
    'Player': 'player', 'Tm': 'team', 'Pos': 'pos', 'year': 'year', 'Age': 'age',
    'Ctch%': 'catch_pct', 'Fmb': 'fmb', 'G': 'games', 'GS': 'games_started',
    'Lng': 'long', 'R/G': 'rec_per_game', 'Rec': 'receptions', 'TD': 'td',
    'Tgt': 'tgt', 'Y/G': 'yd_per_game', 'Y/R': 'yd_per_rec', 'Yds': 'yds',
}
df_player = df_player.rename(columns=colnames_player)
df_player['year'] = df_player['year'].astype('float')

# drop players with same name
is_dropped_smith = (df_player.player == "Steve Smith") & df_player.team.isin(['STL', 'PHI', 'NYG'])
is_dropped_williams = (df_player.player == 'Mike Williams') & (df_player.team != 'LAC')
ss = df_player[is_dropped_smith].index
mw = df_player[is_dropped_williams].index

df_player = df_player.drop(ss, axis=0).drop(mw, axis=0).reset_index(drop=True)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Format the scraped receiving table, clean special characters out of the player
and catch percentage columns, rename the columns, and drop same-name players.
'''

# NOTE(review): this cell repeats the preceding "Clean the Statistical Data" cell
# verbatim; it rebuilds df_player from data_player, so it looks redundant
# (possibly an export artifact) - confirm and remove

# the shared helper formats the pulled player data
df_player = player_format(data_player)

# clean the player name and the catch percentage text
for raw_col in ('Player', 'Ctch%'):
    df_player[raw_col] = df_player[raw_col].apply(name_clean)

# scraped header -> column name used in the rest of the notebook
colnames_player = {
    'Player': 'player', 'Tm': 'team', 'Pos': 'pos', 'year': 'year', 'Age': 'age',
    'Ctch%': 'catch_pct', 'Fmb': 'fmb', 'G': 'games', 'GS': 'games_started',
    'Lng': 'long', 'R/G': 'rec_per_game', 'Rec': 'receptions', 'TD': 'td',
    'Tgt': 'tgt', 'Y/G': 'yd_per_game', 'Y/R': 'yd_per_rec', 'Yds': 'yds',
}
df_player = df_player.rename(columns=colnames_player)
df_player['year'] = df_player['year'].astype('float')

# drop players with same name
is_dropped_smith = (df_player.player == "Steve Smith") & df_player.team.isin(['STL', 'PHI', 'NYG'])
is_dropped_williams = (df_player.player == 'Mike Williams') & (df_player.team != 'LAC')
ss = df_player[is_dropped_smith].index
mw = df_player[is_dropped_williams].index

df_player = df_player.drop(ss, axis=0).drop(mw, axis=0).reset_index(drop=True)

In [None]:
#==========
# Clean the Statistical Data
#==========

'''
Format the scraped receiving table, clean special characters out of the player
and catch percentage columns, rename the columns, and drop same-name players.
'''

# NOTE(review): this cell repeats the preceding "Clean the Statistical Data" cell
# verbatim; it rebuilds df_player from data_player, so it looks redundant
# (possibly an export artifact) - confirm and remove

# the shared helper formats the pulled player data
df_player = player_format(data_player)

# clean the player name and the catch percentage text
for raw_col in ('Player', 'Ctch%'):
    df_player[raw_col] = df_player[raw_col].apply(name_clean)

# scraped header -> column name used in the rest of the notebook
colnames_player = {
    'Player': 'player', 'Tm': 'team', 'Pos': 'pos', 'year': 'year', 'Age': 'age',
    'Ctch%': 'catch_pct', 'Fmb': 'fmb', 'G': 'games', 'GS': 'games_started',
    'Lng': 'long', 'R/G': 'rec_per_game', 'Rec': 'receptions', 'TD': 'td',
    'Tgt': 'tgt', 'Y/G': 'yd_per_game', 'Y/R': 'yd_per_rec', 'Yds': 'yds',
}
df_player = df_player.rename(columns=colnames_player)
df_player['year'] = df_player['year'].astype('float')

# drop players with same name
is_dropped_smith = (df_player.player == "Steve Smith") & df_player.team.isin(['STL', 'PHI', 'NYG'])
is_dropped_williams = (df_player.player == 'Mike Williams') & (df_player.team != 'LAC')
ss = df_player[is_dropped_smith].index
mw = df_player[is_dropped_williams].index

df_player = df_player.drop(ss, axis=0).drop(mw, axis=0).reset_index(drop=True)

In [None]:
#==========
# Pulling in the Player Profiler statistics
#==========

'''
Read the wide receiver player profiler statistics and turn the dash
placeholders into nulls, ahead of joining them onto the player dataframe.
'''

# NOTE(review): unlike the running back section, no column renaming or cleaning
# follows this read in the visible source, while later cells use names such as
# 'player', 'height' and 'pp_age' - confirm a cell is not missing here

# read the player profiler csv and convert all dashes to null in one step
data_pp = pd.read_csv(
    '/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/Player_Data/Rookie_WR/wr_player_profiler.csv'
).replace("-", float('nan'))



In [None]:
#===========
# Merge player profiler data with statistical data
#===========

'''
Left join the college and combine statistics onto the traditional statistics.
Rows without player profiler information are dropped, and so are rows before
2018 without any stats. The two parts are then stacked again for NA filling.
'''

# NOTE(review): in the visible source df_merged is last built in the Running
# Backs section; no wide receiver merge precedes this cell - confirm the WR
# ADP / merge cells are not missing

# attach the player profiler columns by player name
all_data = pd.merge(df_merged, data_pp, how='left', on='player')

# a null height marks a player with no player profiler row; drop those
all_data = all_data.dropna(subset=['height'], axis=0)

# this year's rows are kept as-is; earlier rows must have stats (non-null tgt)
df_predict = all_data[all_data.year == 2018]
df_train = all_data[all_data.year < 2018].dropna(subset=['tgt'], axis=0)

# stack train and predict again so that null values can be filled together
all_data = pd.concat([df_train, df_predict], axis=0).reset_index(drop=True)

In [None]:
#===========
# Calculate Fantasy Points
#===========

'''
Compute the fantasy points. The target is what a player scored in year N,
since all statistical columns are dropped for the rookies afterwards.
All the player profiler stats are filled with the median (check better way?)
and the train and predict columns are split out.
'''

# scoring weights: per yard, per touchdown, per reception, per fumble
Yd_mult, TD_val, Rec_val, Fmb = 0.1, 6.0, 1, -1.0
params = [Yd_mult, TD_val, Rec_val, Fmb]

# total fantasy points as a weighted sum of the stat columns
yd_w, td_w, rec_w, fmb_w = params
all_data['fp'] = (
    yd_w * all_data['yds']
    + td_w * all_data['td']
    + rec_w * all_data['receptions']
    + fmb_w * all_data['fmb']
)

# fantasy points per game played
all_data['fp_per_game'] = all_data['fp'] / all_data['games']

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Finish the cleanup: fill null values, drop the statistical columns that would
leak information, build the train / predict sets, and remove any remaining
null values from the training set.
'''

# fill NA in the 'bmi' through 'breakout_year' columns with the column medians
pp_block = all_data.loc[:, 'bmi':'breakout_year']
all_data.loc[:, 'bmi':'breakout_year'] = pp_block.fillna(all_data.median())

# stats columns to remove
to_drop = [
    'tgt', 'receptions', 'yds', 'td', 'catch_pct', 'games', 'games_started', 'long',
    'rec_per_game', 'yd_per_game', 'yd_per_rec', 'fmb',
]
all_data = all_data.drop(columns=to_drop)

# training rows: before 2018, complete rows only, without total FP
df_train = all_data[all_data.year < 2018]
df_train = df_train.dropna().drop('fp', axis=1)

# prediction rows: 2018, age taken from player profiler, no fp columns
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)
df_predict['age'] = df_predict['pp_age']
df_predict = df_predict.drop(columns=['pp_age', 'fp', 'fp_per_game'])

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Complete the final portion of cleanup, such as filling in null values,
dropping statistical columns that would leak information, creating train / predict
sets, and removing any remaining null values from the datasets.
'''

# NOTE(review): this cell repeats the preceding "Remaining Clean-up" cell verbatim
# and looks redundant (possibly an export artifact) - confirm and remove

# fill all NA with median value
all_data.loc[:,'bmi':'breakout_year'] = all_data.loc[:,'bmi':'breakout_year'].fillna(all_data.median())

# columns to drop (stats columns)
to_drop = ['tgt', 'receptions', 'yds', 'td', 'catch_pct', 'games', 'games_started', 'long', 
           'rec_per_game', 'yd_per_game', 'yd_per_rec', 'fmb']

# errors='ignore': once the preceding identical cell has run these columns are
# already gone, and a strict drop would raise KeyError
all_data = all_data.drop(to_drop, axis=1, errors='ignore')

# split out train and predict
df_train = all_data[all_data.year < 2018]
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)

# drop any remaining NA from df_train and remove FP
df_train = df_train.dropna()
df_train = df_train.drop('fp', axis=1)

# set age to player profiler age and drop fp columns
df_predict['age'] = df_predict['pp_age']
df_predict = df_predict.drop(['pp_age', 'fp', 'fp_per_game'], axis=1)

In [None]:
#===========
# Remaining Clean-up
#===========

'''
Complete the final portion of cleanup, such as filling in null values,
dropping statistical columns that would leak information, creating train / predict
sets, and removing any remaining null values from the datasets.
'''

# NOTE(review): this cell repeats the preceding "Remaining Clean-up" cell verbatim
# and looks redundant (possibly an export artifact) - confirm and remove

# fill all NA with median value
all_data.loc[:,'bmi':'breakout_year'] = all_data.loc[:,'bmi':'breakout_year'].fillna(all_data.median())

# columns to drop (stats columns)
to_drop = ['tgt', 'receptions', 'yds', 'td', 'catch_pct', 'games', 'games_started', 'long', 
           'rec_per_game', 'yd_per_game', 'yd_per_rec', 'fmb']

# errors='ignore': once the preceding identical cell has run these columns are
# already gone, and a strict drop would raise KeyError
all_data = all_data.drop(to_drop, axis=1, errors='ignore')

# split out train and predict
df_train = all_data[all_data.year < 2018]
df_predict = all_data[all_data.year == 2018].reset_index(drop=True)

# drop any remaining NA from df_train and remove FP
df_train = df_train.dropna()
df_train = df_train.drop('fp', axis=1)

# set age to player profiler age and drop fp columns
df_predict['age'] = df_predict['pp_age']
df_predict = df_predict.drop(['pp_age', 'fp', 'fp_per_game'], axis=1)