In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv',index_col='player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')

In [175]:
all_games.columns

Index(['age', 'date', 'defense_interception_touchdowns',
       'defense_interception_yards', 'defense_interceptions', 'defense_sacks',
       'defense_safeties', 'defense_tackle_assists', 'defense_tackles',
       'field_goal_attempts', 'field_goal_makes', 'game_location',
       'game_number', 'game_won', 'kick_return_attempts',
       'kick_return_touchdowns', 'kick_return_yards', 'opponent',
       'opponent_score', 'passing_attempts', 'passing_completions',
       'passing_interceptions', 'passing_rating', 'passing_sacks',
       'passing_sacks_yards_lost', 'passing_touchdowns', 'passing_yards',
       'player_team_score', 'point_after_attemps', 'point_after_makes',
       'punt_return_attempts', 'punt_return_touchdowns', 'punt_return_yards',
       'punting_attempts', 'punting_blocked', 'punting_yards',
       'receiving_receptions', 'receiving_targets', 'receiving_touchdowns',
       'receiving_yards', 'rushing_attempts', 'rushing_touchdowns',
       'rushing_yards', 'team', 'ye

In [3]:
# dictionary that has fantasy value for each nfl stat
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [45]:
# subset RB position by choosing only RBs with at least 8 rushing attempts
# or receiving targets in a given nfl week.
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 8]
}

In [165]:
def get_players_thatweek(all_games, all_players, position, year, nfl_week, subset_position):
    # return pandas df with player_id as index and column 'name' as player name
    ids = all_players[all_players.position == position].index
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == nfl_week) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, ['name', 'draft_position']]
    for_df.fillna(255, inplace=True)
    for_df['log_draft_position'] = np.log(for_df.draft_position)
    return pd.DataFrame(data=for_df)

In [166]:
# example usage
a = get_players_thatweek(all_games, all_players,'RB',2016,5,subset_position)
a

Unnamed: 0_level_0,name,draft_position,log_draft_position
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17858,Bilal Powell,126.0,4.836282
10586,Carlos Hyde,57.0,4.043051
11664,Matt Jones,95.0,4.553877
23267,Spencer Ware,194.0,5.267858
8152,Frank Gore,65.0,4.174387
1999,Devontae Booker,136.0,4.912655
16086,DeMarco Murray,71.0,4.26268
18654,Theo Riddick,199.0,5.293305
12749,Eddie Lacy,61.0,4.110874
8145,Melvin Gordon,15.0,2.70805


In [170]:
def get_features_response(players, all_games, year, nfl_week, points_dict):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value
    
    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < nfl_week)]
    rest_year = games[(games.year == year) &
                     (games.game_number>=nfl_week)]
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=True, inplace=True)
    # test df of games
    for_testing = df
    
    # get Passing stats of that team
    
    
    # group dataframe by index
    df = df.groupby(df.index)
    num_prev = df.fpts.agg('count').rename('num_prev')
    log_num_prev = np.log(df.fpts.agg('count').rename('log_num_prev'))
    last = df.nth([0]).groupby('player_id').agg('mean').fpts.rename('last')
    next_3 = df.nth(list(range(1,4))).groupby('player_id').agg('mean').fpts.rename('next_3')
    next_15 = df.nth(list(range(4,19))).groupby('player_id').agg('mean').fpts.rename('next_15')
    to_debut = df.nth(list(range(19,300))).groupby('player_id').agg('mean').fpts.rename('to_debut')
    
    # get response variable, the rest of season ppg 
    ros_ppg = rest_year.groupby('player_id').fpts.mean().rename('ros_ppg', inplace=True)
    #ros_games = rest_year.groupby('player_id').fpts.count().rename('ros_games', inplace=True)

    for_return = players.join([num_prev, log_num_prev, last, next_3, next_15, to_debut, ros_ppg], how='left')#,last,next_3,next_15, to_debut])
    return for_return, for_testing

In [171]:
# example usage
#get_features_response(a, all_games, 2011, 5, half_ppr)

In [172]:
# Pretend its 2016, nfl week 5
# Get all features and responses for Running Backs from START_YEAR to YEAR before NFL_WEEK
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2016
NFL_WEEK = 5
# features 
FEATURES = ['log_draft_position','log_num_prev', 'last', 'next_3', 'next_15', 'to_debut']
# response
RESPONSE = ['ros_ppg']

# append features and response each week to these lists
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):   
    
    # if current year dont go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train, _ = get_features_response(players, all_games, year, week, points_dict=half_ppr)
        feature = train[FEATURES]
        response = train[RESPONSE]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists 
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

# fill NaN in train_x with zeros
train_x = train_x.fillna(0)

# Get test_x and test_y
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
test, _ = get_features_response(players, all_games, YEAR, NFL_WEEK, points_dict=half_ppr)
test_x = test[FEATURES]
test_y = test[RESPONSE]

# fill NaN in test_x with zeros
test_x = test_x.fillna(0)

In [173]:
# Train a linear regression model to predict rest of season ppg for RBs
lr = LinearRegression()
lr = lr.fit(train_x, train_y)
print('Intercept:')
print(lr.intercept_)
print(FEATURES)
print(lr.coef_)

Intercept:
[8.46365903]
['log_draft_position', 'log_num_prev', 'last', 'next_3', 'next_15', 'to_debut']
[[ 0.06567182 -0.76261883  0.08269373 -0.01331628  0.16360388  0.29163449]]


In [174]:
# Score gives R^2 of prediction of test_x wrt test_y
# Note: R^2 is correlated to how many easy predictions
#  (players with low fantasy points are easy to predict).
#  R^2 decreases when predicting on fewer but better running backs.
lr.score(test_x, test_y)

-0.05387609053773157

In [145]:
# exponential weighted moving average
POSITION = 'RB'
year = 2016
week = 5
players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
train, ewm_games = get_features_response(players, all_games, year, week, points_dict=half_ppr)
        

In [146]:
df = ewm_games
df.head()

Unnamed: 0_level_0,age,date,defense_interception_touchdowns,defense_interception_yards,defense_interceptions,defense_sacks,defense_safeties,defense_tackle_assists,defense_tackles,field_goal_attempts,...,receiving_receptions,receiving_targets,receiving_touchdowns,receiving_yards,rushing_attempts,rushing_touchdowns,rushing_yards,team,year,fpts
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
160,22-146,2015-11-08,0,0,0,0.0,0,0,0,0,...,0,0,0,0,5,0,41,MIA,2015,4.1
160,22-153,2015-11-15,0,0,0,0.0,0,0,0,0,...,0,0,0,0,6,0,48,MIA,2015,4.8
160,22-160,2015-11-22,0,0,0,0.0,0,0,0,0,...,2,2,0,23,4,0,13,MIA,2015,4.6
160,22-167,2015-11-29,0,0,0,0.0,0,0,0,0,...,4,6,0,52,3,0,6,MIA,2015,7.8
160,22-174,2015-12-06,0,0,0,0.0,0,0,0,0,...,0,0,0,0,4,0,12,MIA,2015,1.2


In [147]:
df['ma'] = df.groupby('player_id').apply(lambda x: x['fpts'].ewm(span=6).mean()).values

In [150]:
df[['fpts','ma']]

KeyError: 0