In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [16]:
np.unique(all_games.year)

array([1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017])

In [2]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv',index_col='player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')

In [3]:
# dictionary that has fantasy value for each nfl stat
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [4]:
# subset RB position by choosing only RBs with at least 5 rushing attempts
# or receiving targets in a given nfl week.
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 5]
}

In [5]:
def get_players_thatweek(all_games, all_players, position, year, nfl_week, subset_position):
    # return pandas df with player_id as index and column 'name' as player name
    ids = all_players[all_players.position == position].index
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == nfl_week) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, 'name']
    return pd.DataFrame(data=for_df)

In [8]:
# example usage
a = get_players_thatweek(all_games, all_players,'RB',2011,5,subset_position)
a.shape

(57, 1)

In [9]:
def get_features_response(players, all_games, year, nfl_week, points_dict):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value
    
    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < nfl_week)]
    rest_year = games[(games.year == year) &
                     (games.game_number>=nfl_week)]
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=False, inplace=True)
    
    
    # group dataframe by index
    df = df.groupby(df.index)
    num_prev = df.fpts.agg('count').rename('num_prev')
    last = df.nth([0]).groupby('player_id').agg('mean').fpts.rename('last')
    next_3 = df.nth(list(range(1,4))).groupby('player_id').agg('mean').fpts.rename('next_3')
    next_15 = df.nth(list(range(4,19))).groupby('player_id').agg('mean').fpts.rename('next_15')
    to_debut = df.nth(list(range(19,300))).groupby('player_id').agg('mean').fpts.rename('to_debut')
    
    # get response variable, the rest of season ppg 
    ros_ppg = rest_year.groupby('player_id').fpts.mean().rename('ros_ppg', inplace=True)
    #ros_games = rest_year.groupby('player_id').fpts.count().rename('ros_games', inplace=True)

    for_return = players.join([num_prev,last, next_3, next_15, to_debut, ros_ppg], how='left')#,last,next_3,next_15, to_debut])
    return for_return

In [10]:
# example usage
#get_features_response(a, all_games, 2011, 5, half_ppr)

In [78]:
# Pretend its 2016, nfl week 5
# Get all features and responses for Running Backs from START_YEAR to YEAR before NFL_WEEK
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2016
NFL_WEEK = 5
# features 
FEATURES = ['num_prev', 'last', 'next_3', 'next_15', 'to_debut']
# response
RESPONSE = ['ros_ppg']

# append features and response each week to these lists
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):   
    
    # if current year dont go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train = get_features_response(players, all_games, year, week, points_dict=half_ppr)
        feature = train[FEATURES]
        response = train[RESPONSE]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists 
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

# fill NaN in train_x with zeros
train_x = train_x.fillna(0)

# Get test_x and test_y
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
test = get_features_response(players, all_games, YEAR, NFL_WEEK, points_dict=half_ppr)
test_x = test[FEATURES]
test_y = test[RESPONSE]

# fill NaN in test_x with zeros
test_x = test_x.fillna(0)

In [85]:
# Train a linear regression model to predict rest of season ppg for RBs
lr = LinearRegression()
lr = lr.fit(train_x, train_y)
print('Intercept:')
print(lr.intercept_)
print(feat_list)
print(lr.coef_)

Intercept:
[4.37520197]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut']
[[-0.01226013  0.15071756  0.20960431  0.20734425  0.06766796]]


In [86]:
# score gives R^2 of prediction of test_x wrt test_y
lr.score(test_x, test_y)

0.3857776972616249