To Do:
* rush percentage as feature
* ewma feature instead of multiple fpts features
* L1 regularization
* Decision Tree regresssion

In [29]:
pd.__version__

'0.24.2'

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [2]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv',index_col='player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')

In [3]:
all_games.columns

Index(['age', 'date', 'defense_interception_touchdowns',
       'defense_interception_yards', 'defense_interceptions', 'defense_sacks',
       'defense_safeties', 'defense_tackle_assists', 'defense_tackles',
       'field_goal_attempts', 'field_goal_makes', 'game_location',
       'game_number', 'game_won', 'kick_return_attempts',
       'kick_return_touchdowns', 'kick_return_yards', 'opponent',
       'opponent_score', 'passing_attempts', 'passing_completions',
       'passing_interceptions', 'passing_rating', 'passing_sacks',
       'passing_sacks_yards_lost', 'passing_touchdowns', 'passing_yards',
       'player_team_score', 'point_after_attemps', 'point_after_makes',
       'punt_return_attempts', 'punt_return_touchdowns', 'punt_return_yards',
       'punting_attempts', 'punting_blocked', 'punting_yards',
       'receiving_receptions', 'receiving_targets', 'receiving_touchdowns',
       'receiving_yards', 'rushing_attempts', 'rushing_touchdowns',
       'rushing_yards', 'team', 'ye

In [4]:
# dictionary that has fantasy value for each nfl stat
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [5]:
# subset RB position by choosing only RBs with at least 8 rushing attempts
# or receiving targets in a given nfl week.
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 8]
}

In [136]:
def get_players_thatweek(all_games, all_players, position, year, nfl_week, subset_position, specific_players=None):
    if specific_players != None:
        ids = specific_players
    else:
        ids = all_players[all_players.position == position].index
        
    # return pandas df with player_id as index and column 'name' as player name
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == nfl_week) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, ['name', 'draft_position']]
    for_df.fillna(255, inplace=True)
    for_df['log_draft_position'] = np.log(for_df.draft_position)
    return pd.DataFrame(data=for_df)

In [137]:
# example usage
a = get_players_thatweek(all_games, all_players,'RB',2016,5,subset_position,[160,15340])
a

Unnamed: 0_level_0,name,draft_position,log_draft_position
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
160,Jay Ajayi,149.0,5.003946
15340,Lamar Miller,97.0,4.574711


In [138]:
def get_team_rush_perc(games, year, nfl_week):
    games = games[(games.passing_attempts > 0)]
    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < nfl_week)]
    qbs = pd.concat((prev_years, current_year))
    qbs.sort_values(by=['year', 'game_number'], axis=0,ascending=True, inplace=True)
    qbs = qbs.groupby(['team', 'year', 'game_number'])
    pass_attempts = qbs.passing_attempts.sum()
    games = all_games[(all_games.rushing_attempts > 0)]
    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < nfl_week)]
    rbs = pd.concat((prev_years, current_year))
    rbs.sort_values(by=['year', 'game_number'], axis=0, ascending=True, inplace=True)
    rbs = rbs.groupby(['team', 'year', 'game_number'])
    rush_attempts = rbs.rushing_attempts.sum()
    rush_perc = rush_attempts/(rush_attempts+pass_attempts)
    rush_perc = pd.DataFrame(rush_perc, columns=['rush_perc'])
    ewma_rush_perc = rush_perc.groupby('team').apply(lambda x: x['rush_perc'].ewm(span=16).mean()).droplevel(0)
    rush_perc['ewma_rush_perc'] = ewma_rush_perc
    return(rush_perc)

In [154]:
def get_features_response(players, all_games, year, nfl_week, points_dict):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value
    
    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < nfl_week)]
    rest_year = games[(games.year == year) &
                     (games.game_number>=nfl_week)]
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=True, inplace=True)
    # test df of games
    for_testing = df
    
    # get rush_perc of teams
    rush_perc = get_team_rush_perc(games = all_games, year=year, nfl_week=nfl_week)
    rush_perc = rush_perc.loc[zip(df.team, df.year, df.game_number)]
    df = df.join(rush_perc, how='outer', on=['team', 'year', 'game_number']).drop_duplicates()
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=True, inplace=True)
    print(df)
    print('______________________________')
    
    # group dataframe by index
    df['num_prev'] = df.groupby(df.index).cumcount().rename('num_prev')
    log_num_prev = np.log(df.fpts.agg('count').rename('log_num_prev'))
    last = df.nth([0]).groupby('player_id').agg('mean').fpts.rename('last')
    next_3 = df.nth(list(range(1,4))).groupby('player_id').agg('mean').fpts.rename('next_3')
    next_15 = df.nth(list(range(4,19))).groupby('player_id').agg('mean').fpts.rename('next_15')
    to_debut = df.nth(list(range(19,300))).groupby('player_id').agg('mean').fpts.rename('to_debut')
    
    # get response variables, the rest of season ppg or next game ppg or rest of season games played 
    ros_ppg = rest_year.groupby('player_id').fpts.mean().rename('ros_ppg', inplace=True)
    #ros_games = rest_year.groupby('player_id').fpts.count().rename('ros_games', inplace=True)

    for_return = players.join([num_prev, log_num_prev, last, next_3, next_15, to_debut, ros_ppg], how='left')#,last,next_3,next_15, to_debut])
    return for_return

In [155]:
# example usage
get_features_response(a, all_games, 2016, 5, half_ppr)

              age        date  defense_interception_touchdowns  \
player_id                                                        
160        22-146  2015-11-08                                0   
160        22-153  2015-11-15                                0   
160        22-160  2015-11-22                                0   
160        22-167  2015-11-29                                0   
160        22-174  2015-12-06                                0   
160        22-182  2015-12-14                                0   
160        22-188  2015-12-20                                0   
160        22-195  2015-12-27                                0   
160        22-202  2016-01-03                                0   
160        23-095  2016-09-18                                0   
160        23-102  2016-09-25                                0   
160        23-106  2016-09-29                                0   
15340      21-144  2012-09-16                                0   
15340     

In [None]:
## Pretend its 2016, nfl week 5

# Get all features and responses for Running Backs from START_YEAR to YEAR before NFL_WEEK
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2016
NFL_WEEK = 5
# features 
FEATURES = ['log_draft_position','log_num_prev', 'last', 'next_3', 'next_15', 'to_debut']
# response
RESPONSE = ['ros_ppg']

# append features and response each week to these lists
feature_list = []
response_list = []
lr = None
r_sqr_list = []
mean_abs_err_list = []
mean_test_fpts_list = []

# train mlr if no training yet
if lr is None:

    for train_year in range(START_YEAR, YEAR+1):   

        # if current year dont go past nfl week
        if year == YEAR:
            week_limit = NFL_WEEK-1
        else: # if previous year don't go past regular season (nfl week 16)
            week_limit = 16

        for train_week in range(1,week_limit+1):
            players = get_players_thatweek(all_games, all_players, POSITION, train_year, train_week, subset_position)
            train = get_features_response(players, all_games, train_year, train_week, points_dict=half_ppr)
            feature = train[FEATURES]
            response = train[RESPONSE]
            feature_list.append(feature)
            response_list.append(response)

    ## After all feature and response training lists have been created
    #  Concat lists into train dataframes
    train_x = pd.concat(feature_list)
    train_y = pd.concat(response_list)

    # fill NaN in train_x with zeros
    train_x = train_x.fillna(0)

    # Train a linear regression model to predict rest of season ppg for RBs 
    lr = LinearRegression()
    lr = lr.fit(train_x, train_y)
    print('Model trained now on data through ' + str(YEAR) + ' Week ' + str(NFL_WEEK))
    print('Intercept:')
    print(lr.intercept_)
    print(FEATURES)
    print(lr.coef_)

for test_week in range(NFL_WEEK,16+1):
    # Get test_x and test_y
    players = get_players_thatweek(all_games, all_players, POSITION, YEAR, test_week, subset_position)
    test = get_features_response(players, all_games, YEAR, test_week, points_dict=half_ppr)
    test_x = test[FEATURES]
    test_y = test[RESPONSE]

    # fill NaN in test_x with zeros
    test_x = test_x.fillna(0)

    # Score gives R^2 of prediction of test_x wrt test_y
    # Note: R^2 is correlated to how many easy predictions
    #  (players with low fantasy points are easy to predict).
    #  R^2 decreases when predicting on fewer but better running backs.
    print('Year: ' +str(YEAR) + '  Week: '+ str(test_week))
    r_sqr = lr.score(test_x, test_y)
    r_sqr_list.append(r_sqr)
    mean_abs_err = round(mean_absolute_error(test_y, lr.predict(test_x)),2)
    mean_abs_err_list.append(mean_abs_err)
    mean_test_fpts = np.mean(test_y)
    mean_test_fpts_list.append(mean_test_fpts)
    
    print('R^2: ' + str(round(r_sqr,2)))
    print('Mean Abs Error: ' + str(mean_abs_err))

print('Mean R^2: ' + str(round(np.mean(r_sqr_list), 2)))
print('Overall mean abs err: ' + str(round(np.mean(mean_abs_err_list), 2)))
print('Overall mean test fpts:' + str(round(np.mean(mean_test_fpts_list),2)))

In [49]:
print('number of running back games: ' + str(len(train_x)))

number of running back games: 3026


In [145]:
# exponential weighted moving average
POSITION = 'RB'
year = 2016
week = 5
players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
train, ewm_games = get_features_response(players, all_games, year, week, points_dict=half_ppr)
        

In [146]:
df = ewm_games
df.head()

Unnamed: 0_level_0,age,date,defense_interception_touchdowns,defense_interception_yards,defense_interceptions,defense_sacks,defense_safeties,defense_tackle_assists,defense_tackles,field_goal_attempts,...,receiving_receptions,receiving_targets,receiving_touchdowns,receiving_yards,rushing_attempts,rushing_touchdowns,rushing_yards,team,year,fpts
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
160,22-146,2015-11-08,0,0,0,0.0,0,0,0,0,...,0,0,0,0,5,0,41,MIA,2015,4.1
160,22-153,2015-11-15,0,0,0,0.0,0,0,0,0,...,0,0,0,0,6,0,48,MIA,2015,4.8
160,22-160,2015-11-22,0,0,0,0.0,0,0,0,0,...,2,2,0,23,4,0,13,MIA,2015,4.6
160,22-167,2015-11-29,0,0,0,0.0,0,0,0,0,...,4,6,0,52,3,0,6,MIA,2015,7.8
160,22-174,2015-12-06,0,0,0,0.0,0,0,0,0,...,0,0,0,0,4,0,12,MIA,2015,1.2


In [147]:
df['ma'] = df.groupby('player_id').apply(lambda x: x['fpts'].ewm(span=6).mean()).values

In [150]:
df[['fpts','ma']]

KeyError: 0