In [1]:
import pandas as pd
from data_functions import *
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
#==========
# Scraping the statistical and ADP data
#==========

'''
Pull in statistical and ADP data for the given years using the custom data_load function.
'''

# starting and ending year for pulling player data
year_start = 1998
year_end = 2017

# pulling rushing statistics (the URL below is the per-season rushing table);
# '{0}' is a placeholder, presumably filled with each season's year by data_load
# NOTE(review): the meaning of the trailing 0 / 1 argument is defined in
# data_functions.data_load and is not visible here -- confirm before changing it
url_player = 'https://www.pro-football-reference.com/years/{0}/rushing.htm'
data_player = data_load(year_start, year_end, url_player, 0)

# pulling historical player adp (the query string restricts the results to POS=RB);
# the range runs one season past year_end because the ADP year is moved back by
# one when the ADP data is cleaned, so that it lines up with the statistics
url_adp = 'http://www03.myfantasyleague.com/{0}/adp?COUNT=100&POS=RB&ROOKIES=0&INJURED=1&CUTOFF=10\
&FRANCHISES=-1&IS_PPR=1&IS_KEEPER=0&IS_MOCK=-1&TIME='
data_adp = data_load(year_start, year_end+1, url_adp, 1)

In [51]:
#==========
# Clean the Statistical Data
#==========

'''
Clean the statistical data by selecting the column names, formatting the column names,
and cleaning up any special characters associated with the player column. Same-named
players that should not be kept are removed at the end.
'''

# use the first scraped row as the header, then discard that row and the first column
data_player.columns = data_player.iloc[0,:]
df_player = data_player.iloc[1:, 1:]

# relabel the columns 0..n-1 so they can be renamed by position below
df_player = df_player.T.reset_index(drop=True).T

# extra total yards column for some reason, col 25 is skipped below in naming
df_player = df_player.drop(25, axis=1)

colnames_player = {
    0: 'player', 
    1: 'team', 
    2: 'age',
    3: 'pos',
    4: 'games',
    5: 'games_started',
    6: 'att',
    7: 'rush_yds',
    8: 'rush_td',
    9: 'long_rush',
    10: 'yard_per_att',
    11: 'rush_yd_per_game',
    12: 'att_per_game',
    13: 'tgt',
    14: 'receptions',
    15: 'rec_yds',
    16: 'yd_per_rec',
    17: 'rec_td',
    18: 'rec_long',
    19: 'rec_per_game',
    20: 'rec_yd_per_game',
    21: 'catch_pct',
    22: 'total_touches',
    23: 'yard_per_touch',
    24: 'scrimmage_yds',
    26: 'total_td',
    27: 'fmb',
    28: 'year'   
}

# cleaning player name and stat categories
df_player = df_player.rename(columns = colnames_player)
df_player['player'] = df_player.player.apply(name_clean)
df_player['year'] = df_player.year.astype('float')

# removing duplicate players: (player, team) pairs identifying the unwanted records
duplicate_players = [
    ('Adrian Peterson', 'CHI'),
    ('Ricky Williams', 'IND'),
    ('David Johnson', 'SDG'),
]

# Build a positional boolean mask instead of dropping by index label: dropping by
# label would also remove unrelated rows whenever the scraped frame carries
# repeated index labels (e.g. yearly tables stacked without an index reset).
is_duplicate = np.zeros(len(df_player), dtype=bool)
for dup_name, dup_team in duplicate_players:
    is_duplicate |= ((df_player.player == dup_name) & (df_player.team == dup_team)).values

df_player = df_player[~is_duplicate]

In [52]:
#==========
# Clean the ADP data
#==========

'''
Cleaning the ADP data by selecting relevant features, and extracting the name and team
from the combined string column. The ADP year is shifted back by one so that each
average draft position lines up with the statistics it is matched against.
'''

# selecting relevant columns and dropping na
df_adp = data_adp[['Player', 'year', 'Avg. Pick']].dropna()

# selecting team and player name information from combined string
# (team must be extracted first: the next line overwrites the combined string)
df_adp['Tm'] = df_adp.Player.apply(team_select)
df_adp['Player'] = df_adp.Player.apply(name_select)
df_adp['Player'] = df_adp.Player.apply(name_clean)


# convert year to float and move back one year to match with stats
df_adp['year'] = df_adp.year.astype('float')
df_adp['year'] = df_adp['year'] - 1

# format and rename columns
df_adp = df_adp[['Player', 'Tm', 'year', 'Avg. Pick']]

colnames_adp = {
    'Player': 'player',
    'Tm': 'team',
    'year': 'year',
    'Avg. Pick': 'avg_pick'
}

df_adp = df_adp.rename(columns=colnames_adp)

# removing duplicate players: (player, team) pairs identifying the unwanted records
duplicate_players = [
    ('Adrian Peterson', 'CHI'),
    ('Ricky Williams', 'IND'),
]

# Filter with a positional boolean mask rather than dropping by index label, so
# that repeated index labels in the scraped frame cannot take unrelated rows
# along with the duplicates.
is_duplicate = np.zeros(len(df_adp), dtype=bool)
for dup_name, dup_team in duplicate_players:
    is_duplicate |= ((df_adp.player == dup_name) & (df_adp.team == dup_team)).values

df_adp = df_adp[~is_duplicate]

In [53]:
#==========
# Merging and formatting all player-based data.
#==========

'''
Join the statistical and adp data into a single, merged dataframe. Update the teams
to have a consistent abbreviation for later joining, and flag players whose ADP team
differs from the team in their statistics.
'''

# merge adp and player data; both frames carry a 'team' column, so the merge
# suffixes them as team_x (statistics) and team_y (ADP)
df_merged = pd.merge(df_player, df_adp, how='inner', on=['player', 'year'])

# ensure all teams have same abbreviations for matching
# (ADP abbreviation -> statistics abbreviation)
adp_to_player_teams = {
    
    'ARI': 'ARI',
    'ATL': 'ATL',
    'BAL': 'BAL',
    'BUF': 'BUF',
    'CAR': 'CAR',
    'CHI': 'CHI',
    'CIN': 'CIN',
    'CLE': 'CLE',
    'DAL': 'DAL',
    'DEN': 'DEN',
    'DET': 'DET',
    'GBP': 'GNB',
    'HOU': 'HOU',
    'IND': 'IND',
    'JAC': 'JAX',
    'KCC': 'KAN',
    'LAC': 'LAC',
    'SDC': 'LAC',
    'LAR': 'LAR',
    'RAM': 'LAR',
    'MIA': 'MIA',
    'MIN': 'MIN',
    'NEP': 'NWE',
    'NOS': 'NOR',
    'NYG': 'NYG',
    'NYJ': 'NYJ',
    'OAK': 'OAK',
    'PHI': 'PHI',
    'PIT': 'PIT',
    'SEA': 'SEA',
    'SFO': 'SFO',
    'TBB': 'TAM',
    'TEN': 'TEN',
    'WAS': 'WAS'
}

# abbreviations missing from the mapping become NaN here and 'FA' further down
df_merged['team_y'] = df_merged['team_y'].map(adp_to_player_teams)

# update old team names to LA team names
la_update = {
    'STL': 'LAR',
    'SDG': 'LAC'
}

# relabel directly on the column; values that are not keys of la_update are kept
df_merged['team_x'] = df_merged['team_x'].replace(la_update)

# create flag if player switched teams (1 = teams differ, 0 = same team)
df_merged['team_y'] = df_merged.team_y.fillna('FA')
df_merged['new_team'] = (df_merged['team_x'] != df_merged['team_y']).astype(int)

# keep current team 
df_merged = df_merged.drop('team_x', axis=1)
df_merged = df_merged.rename(columns = {'team_y': 'team'})

df_merged['pos'] = 'RB'

In [54]:
#==========
# Arranging statistical and ADP columns prior to merging
#==========

'''
Select and order relevant columns, followed by any remaining cleaning up of stats
and converting all numerical stats to float
'''

# rearrange columns
column_order = ['player', 'pos', 'team', 'year', 'age', 'avg_pick',
                'new_team', 'att', 'rush_yds', 'yard_per_att', 'att_per_game', 'rush_yd_per_game',
                'rush_td', 'tgt', 'receptions', 'rec_yds', 'yd_per_rec', 'rec_td',
                'rec_long', 'long_rush', 'rec_per_game', 'rec_yd_per_game', 'catch_pct', 'scrimmage_yds',
                'total_td', 'total_touches', 'fmb', 'games', 'games_started']
df_merged = df_merged[column_order]

# make all columns numeric
# NOTE(review): name_clean presumably strips the non-numeric characters from
# catch_pct so that it can be cast -- confirm in data_functions
df_merged['catch_pct'] = df_merged.catch_pct.apply(name_clean)

# Everything from 'year' (position 3) onwards is numeric. Cast with astype on a
# column mapping rather than assigning through .iloc: on current pandas an
# .iloc[:, 3:] assignment writes the values in place and leaves the columns
# with their old object dtype.
numeric_cols = list(df_merged.columns[3:])
df_merged = df_merged.astype({col: 'float' for col in numeric_cols})

In [55]:
# set David Johnson individual stats for last year to 90% of 2016's stats
# NOTE(review): presumably his recorded 2017 season is not representative -- confirm
dj_2017 = df_merged[(df_merged.player == 'David Johnson') & (df_merged.year == 2017)].index
dj_2016 = df_merged[(df_merged.player == 'David Johnson') & (df_merged.year == 2016)].index

# These are index labels, so address the rows with .loc (label based) rather than
# .iloc (position based), and name the columns instead of using positions 7:26 and
# 27:29. The slice 'att':'total_touches' is inclusive and covers the same run of
# columns as long as the column order set when arranging the columns is unchanged.
if len(dj_2017) > 0 and len(dj_2017) == len(dj_2016):
    df_merged.loc[dj_2017, 'att':'total_touches'] = df_merged.loc[dj_2016, 'att':'total_touches'].values*0.9
    df_merged.loc[dj_2017, ['games', 'games_started']] = 16
else:
    # make a missing or ambiguous match visible instead of failing on a shape mismatch
    print('David Johnson override skipped: found {0} row(s) for 2017 and {1} for 2016'.format(
        len(dj_2017), len(dj_2016)))

In [56]:
#==========
# Load team-based statistics
#==========

'''
Pull in the oline, quarterback, and overall team offense statistics, and join them 
to the player data. This will provide team-based context for the players, as well as
allow for grouped statistics generation.
'''

# fill NA with LAR since all Rams players are missing team (fills non-Rams as well)
# NOTE(review): missing teams are already filled with 'FA' when the new_team flag
# is created, so this line may no longer change anything -- confirm which label
# the Rams players are supposed to end up with
df_merged.loc[:, 'team'] = df_merged.team.fillna('LAR')

# single place to change when the CSV folder moves (trailing slash required)
DATA_DIR = '/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/'

# pull in team based stats for oline and quarterback play
oline = pd.read_csv(DATA_DIR + 'oline_rankings.csv', index_col=0)
qb = pd.read_csv(DATA_DIR + 'qb_stats_for_wr.csv', index_col=0)
team_off = pd.read_csv(DATA_DIR + 'team_offensive_efficiency.csv', index_col=0)
team_stats = pd.read_csv(DATA_DIR + 'traditional_team_stats.csv', index_col=0)

# give qb attempts a unique name before merging (df_merged already has an 'att' column)
qb = qb.rename(columns={'att': 'qb_att'})

# merge all df with the original; left joins keep every player row even when a
# team/year has no entry in one of the team files
team_files = [oline, qb, team_off, team_stats]
for team_df in team_files:
    df_merged = pd.merge(df_merged, team_df, how='left', on=['team', 'year'])

In [57]:
#==========
# Creating team based grouped statistics
#==========

'''
Create grouped statistics based on the team and teammates. For example,
create total touches by team feature, market share features, and the min, max,
and average ADP among the running backs on the same team and year.
'''

def _add_team_year_stat(df, source_col, func, new_col, how='left'):
    """Aggregate source_col per (team, year) and merge it back onto df as new_col.

    func is a pandas aggregation name ('sum', 'min', 'max', 'mean'). String names
    are used instead of numpy callables because passing e.g. np.sum to
    GroupBy.agg is deprecated. Returns the merged frame; df is not modified.
    """
    grouped = (df.groupby(['team', 'year'], group_keys=False)[source_col]
                 .agg(func)
                 .reset_index()
                 .rename(columns={source_col: new_col}))
    return pd.merge(df, grouped, how=how, on=['team', 'year'])


# groupby team and year to get total rb touches for each team
df_merged = _add_team_year_stat(df_merged, 'att', 'sum', 'rb_att_on_team')
# share of team rush attempts not taken by the listed backs, with (_2) and
# without the current player's own attempts counted as available
df_merged['available_rush_att'] = 1-(df_merged['rb_att_on_team'] / df_merged['tm_rush_att'])
df_merged['available_rush_att_2'] = 1-((df_merged['rb_att_on_team'] - df_merged['att']) / df_merged['tm_rush_att'])

# same construction for targets against team pass attempts
df_merged = _add_team_year_stat(df_merged, 'tgt', 'sum', 'tgt_on_team')
df_merged['available_tgt'] = 1-(df_merged['tgt_on_team'] / df_merged['tm_pass_att'])
df_merged['available_tgt_2'] = 1-((df_merged['tgt_on_team'] - df_merged['tgt']) / df_merged['tm_pass_att'])

# create market share statistics
df_merged['ms_rush_att'] = df_merged['att'] / df_merged['tm_rush_att']
df_merged['ms_rush_yd'] = df_merged['rush_yds'] / df_merged['tm_rush_yds']
df_merged['ms_rush_td'] = df_merged['rush_td'] / df_merged['tm_rush_td']
df_merged['ms_rec_yd'] = df_merged['rec_yds'] / df_merged['tm_pass_yds']
df_merged['ms_tgts'] = df_merged['tgt'] / df_merged['tm_pass_att']

df_merged['ms_rush_yd_per_att'] = df_merged['ms_rush_yd'] / df_merged['ms_rush_att']

df_merged['avail_x_newteam'] = df_merged['available_rush_att'] * df_merged['new_team']

# create teammate ADP metrics to see if top ranked player
teammate_stats = [
    ('min', 'min_teammate'),
    ('max', 'max_teammate'),
    ('mean', 'avg_teammate'),
]
for agg_name, stat_col in teammate_stats:
    df_merged = _add_team_year_stat(df_merged, 'avg_pick', agg_name, stat_col, how='inner')

# The previous loop reassigned teammate_diff on every pass, so only the last
# value (avg_teammate - min_teammate) survived. Compute it once and keep the
# column where it used to sit, directly after min_teammate.
# NOTE(review): this is the spread of teammate ADPs, not a comparison with the
# current player's own avg_pick -- confirm that is the intended feature.
df_merged.insert(
    df_merged.columns.get_loc('min_teammate') + 1,
    'teammate_diff',
    df_merged['avg_teammate'] - df_merged['min_teammate'],
)

In [58]:
# persist the finished running-back training table; to_csv is called with its
# defaults, so the row index is written as the first column
output_csv = '/Users/Mark/Desktop/Jupyter Projects/Fantasy Football/Data/CSV/rb_training_statistics.csv'
df_merged.to_csv(output_csv)