In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the per-game stats and the player table; both are keyed by player_id
# so rows in one can be looked up from the other with .loc.
all_games = pd.read_csv(
    'nfl-football-player-stats/games_1995.csv',
    index_col='player_id',
)
all_players = pd.read_csv(
    'nfl-football-player-stats/players_1995.csv',
    index_col='player_id',
)

In [20]:
# Scoring table: fantasy points awarded per unit of each nfl stat
# (a reception is worth half a point).
half_ppr = dict(
    rushing_yards=0.1,
    rushing_touchdowns=6,
    receiving_receptions=0.5,
    receiving_yards=0.1,
    receiving_touchdowns=6,
)

In [21]:
# Per-position volume filter: position -> [stat columns to add up, minimum total].
# An RB is kept for a given nfl week when rushing attempts plus receiving
# targets sum to at least 5.
subset_position = dict(
    RB=[['rushing_attempts', 'receiving_targets'], 5],
)

In [22]:
def get_players_thatweek(all_games, all_players, position, year, nfl_week, subset_position):
    """Return the players at `position` who cleared the volume threshold in one week.

    Parameters
    ----------
    all_games : pandas.DataFrame
        Indexed by player_id, with 'year', 'game_number' and every stat column
        named in ``subset_position[position][0]``.
    all_players : pandas.DataFrame
        Indexed by player_id, with 'position' and 'name' columns.
    position : str
        Matched against ``all_players.position`` and used as key into
        ``subset_position``.
    year, nfl_week : int
        Compared for equality with ``games.year`` / ``games.game_number``.
    subset_position : dict
        Maps position -> [list of stat columns, minimum summed value].

    Returns
    -------
    pandas.DataFrame
        Indexed by player_id with a single 'name' column, one row per
        qualifying game row.
    """
    stats, sum_threshold = subset_position[position][:2]

    ids = all_players[all_players.position == position].index
    # A player with no rows in all_games would make .loc raise KeyError on
    # current pandas; drop such ids up front (order of the rest is unchanged).
    ids = ids[ids.isin(all_games.index)]
    games = all_games.loc[ids]

    # keep only game rows from the requested week whose summed stats reach the threshold
    volume = games[stats].sum(axis=1)
    worth_predicting = games[(games.year == year)
                             & (games.game_number == nfl_week)
                             & (volume >= sum_threshold)]

    names = all_players.loc[worth_predicting.index, 'name']
    return pd.DataFrame(data=names)

In [16]:
# example usage: RBs that cleared the volume threshold in 2011, nfl week 5
# (the function is named get_players_thatweek; the old get_playerids_thatweek
# name no longer exists and raises NameError on a fresh kernel)
a = get_players_thatweek(all_games, all_players,'RB',2011,5,subset_position)
a.shape

(57, 1)

In [26]:
# scratch cell: display a 55-row, 1-column array of zeros (result is not assigned)
np.zeros(shape=(55, 1))

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [47]:
def get_features_response(players, all_games, year, nfl_week, points_dict):
    """Build per-player history features and the rest-of-season response.

    Parameters
    ----------
    players : pandas.DataFrame
        Indexed by player_id; its columns are carried through to the result.
    all_games : pandas.DataFrame
        Indexed by player_id, with 'year', 'game_number' and every stat column
        named in ``points_dict``. It is not modified.
    year, nfl_week : int
        Games before (year, nfl_week) feed the features; games of ``year`` with
        ``game_number >= nfl_week`` feed the response.
    points_dict : dict
        Maps stat column -> fantasy points per unit of that stat.

    Returns
    -------
    pandas.DataFrame
        ``players`` left-joined with these columns (NaN where a player has no
        games in the corresponding window):

        - num_prev : number of earlier games
        - last     : fantasy points in the most recent earlier game
        - next_3   : mean fantasy points over the 3 games before that
        - next_15  : mean over the 15 games before those
        - to_debut : mean over all older games (positions 19..299)
        - ros_ppg  : mean fantasy points from ``nfl_week`` onwards in ``year``
    """
    # explicit copy so that adding 'fpts' can never write through to all_games
    games = all_games.loc[players.index].copy()

    # fantasy points for each game row: weighted sum of the scored stats
    games['fpts'] = games['game_number'] * 0
    for stat, value in points_dict.items():
        games['fpts'] = games['fpts'] + games[stat] * value

    in_year = games.year == year
    prev_years = games[(games.year < year) & (games.game_number <= 16)]
    current_year = games[in_year & (games.game_number < nfl_week)]
    rest_year = games[in_year & (games.game_number >= nfl_week)]

    # newest game first within each player, so position 0 is the latest game
    history = pd.concat((prev_years, current_year)).sort_values(
        by=['player_id', 'year', 'game_number'], axis=0, ascending=False)
    by_player = history.groupby(history.index)

    def window_mean(positions, label):
        # Mean fpts over the given positions of each player's history.
        # Aggregating only 'fpts' avoids averaging unrelated (possibly
        # non-numeric) columns.
        picked = by_player.nth(positions)
        return picked.groupby('player_id').fpts.mean().rename(label)

    num_prev = by_player.fpts.agg('count').rename('num_prev')
    last = window_mean([0], 'last')
    next_3 = window_mean(list(range(1, 4)), 'next_3')
    next_15 = window_mean(list(range(4, 19)), 'next_15')
    to_debut = window_mean(list(range(19, 300)), 'to_debut')

    # response variable: rest-of-season points per game
    # (rename without inplace: the inplace form is documented to return None)
    ros_ppg = rest_year.groupby('player_id').fpts.mean().rename('ros_ppg')

    return players.join([num_prev, last, next_3, next_15, to_debut, ros_ppg], how='left')

In [40]:
# example usage
#get_features_response(a, all_games, 2011, 5, half_ppr)

In [48]:
# Build the training set for a model that predicts rest of season ppg for RBs
# Pretend it's 2018, nfl week 7

# get all features and responses for Running Backs from 2017 to 2018 week 7
POSITION = 'RB'
START_YEAR = 2017
YEAR = 2018
NFL_WEEK = 7
# features
feat_list = ['num_prev', 'last', 'next_3', 'next_15', 'to_debut']
# response
resp_list = ['ros_ppg']

# Accumulators must be created ONCE, before the year loop. Creating them
# inside the loop threw away every year except the last one, so the concat
# below only ever saw the final year's frames.
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):

    # if current year don't go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train = get_features_response(players, all_games, year, week, points_dict=half_ppr)
        print(train.head())
        feature = train[feat_list]
        response = train[resp_list]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

                           name  num_prev  last     next_3    next_15  \
player_id                                                               
17858              Bilal Powell      74.0  21.2  20.833333   8.793333   
10586               Carlos Hyde      34.0  13.7  15.000000  11.233333   
24148          Kerwynn Williams      22.0  12.0   5.600000   2.633333   
14460      Christian McCaffrey        NaN   NaN        NaN        NaN   
8152                 Frank Gore     180.0   8.6  12.266667  12.553333   

            to_debut    ros_ppg  
player_id                        
17858       5.730909   8.320000  
10586       6.753333  12.900000  
24148       3.333333   1.972727  
14460            NaN  12.100000  
8152       13.431677   9.272727  
                           name  num_prev  last     next_3    next_15  \
player_id                                                               
17858              Bilal Powell        75   6.4  17.100000   9.840000   
10586               Carlos Hyde

Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []
Empty DataFrame
Columns: [name, num_prev, last, next_3, next_15, to_debut, ros_ppg]
Index: []


In [49]:
# NOTE: concatenating the lists is not working right now,
#       but the features look good