# Overview

This notebook will explore the `pybaseball` package and see if it can be used to create features for a regression model.

In [15]:
import datetime

import pandas as pd
from pybaseball import batting_stats_range, pitching_stats

### Loading stats accumulated over a date range

In [16]:
# Start and end of the range are the same date, so this requests a single day.
single_day = '2017-06-11'
data = batting_stats_range(single_day, single_day)
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
1,Jose Abreu,30,342,MLB-AL,Chicago,1,4,3,1,0,...,1,0,0,2,0,0,0.0,0.25,0.0,0.25
2,Matt Adams,28,342,MLB-NL,Atlanta,1,4,4,0,0,...,0,0,0,1,0,0,0.0,0.0,0.0,0.0
3,Tim Adleman,29,342,MLB-NL,Cincinnati,1,3,3,0,1,...,0,0,0,0,0,0,0.333,0.333,0.333,0.667
4,Ehire Adrianza,27,342,MLB-AL,Minnesota,1,1,0,0,0,...,0,0,0,0,0,0,,1.0,,
5,Jesus Aguilar,27,342,MLB-NL,Milwaukee,1,1,0,0,0,...,0,0,0,0,0,0,,1.0,,


### Calculating FD points for a given range

Create a function that takes a DataFrame of batting stats and returns the FanDuel (FD) score for each player in it.

In [17]:
def get_fantasy_points(stats):
    """Compute FanDuel (FD) hitter points for every row of ``stats``.

    Parameters
    ----------
    stats : pandas.DataFrame
        Batting stats containing a "Name" column and the counting columns
        "H", "2B", "3B", "HR", "R", "RBI", "BB", "HBP" and "SB".

    Returns
    -------
    pandas.DataFrame
        A frame with two columns, "Name" and "FD Points", holding one row
        per row of ``stats``.
    """
    # "H" counts every hit, so singles are what is left after removing the
    # extra-base hits.
    singles = stats["H"] - stats["2B"] - stats["3B"] - stats["HR"]

    # Intentional walks are deliberately NOT added as a separate term: the
    # "BB" total is assumed to already include them (standard box-score
    # convention -- confirm against the data source), so adding 3 * IBB on
    # top would score every intentional walk twice.
    points = (
        3 * singles
        + 3.2 * stats["R"]
        + 6 * stats["2B"]
        + 9 * stats["3B"]
        + 12 * stats["HR"]
        + 3.5 * stats["RBI"]
        + 3 * stats["BB"]
        + 3 * stats["HBP"]
        + 6 * stats["SB"]
    )
    return pd.DataFrame({"Name": stats["Name"], "FD Points": points})

In [18]:
# Index both the raw stats and the computed points by player name, then join
# them so the points appear as an extra column next to the stats.
stats_by_name = data.set_index("Name")
points_by_name = get_fantasy_points(data).set_index("Name")
stats_by_name.join(points_by_name).head()

Unnamed: 0_level_0,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Altherr,26,342,MLB-NL,Philadelphia,1,1,1,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
Aaron Hicks,27,342,MLB-AL,New York,1,5,3,2,1,1,...,0,0,0,0,0,0.333,0.6,0.667,1.267,25.4
Aaron Hill,35,342,MLB-NL,San Francisco,1,5,4,1,1,0,...,0,1,0,0,0,0.25,0.2,0.25,0.45,9.7
Aaron Judge,25,342,MLB-AL,New York,1,5,4,4,4,1,...,0,0,0,0,0,1.0,1.0,2.75,3.75,59.3
Aaron Nola,24,342,MLB-NL,Philadelphia,1,2,2,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


### Easily getting stats within a range

Rather than passing date strings to `batting_stats_range()` directly, we write a wrapper function that takes a `datetime.date` and a `timedelta` and gets the stats accumulated within that duration.

In [19]:
# Reference date used by the examples below, plus the two look-back windows.
cur_day = datetime.date(year=2017, month=6, day=11)
day, week = datetime.timedelta(days=1), datetime.timedelta(weeks=1)
start = cur_day - week
# Show the reference date and the date one week earlier, one per line.
print(cur_day, start, sep="\n")

2017-06-11
2017-06-04


In [39]:
def get_batting_stats(cur_day, duration, forecasting=False):
    """Fetch batting stats accumulated over a window defined relative to ``cur_day``.

    The window starts at ``cur_day - duration``. It ends on ``cur_day`` itself
    when ``forecasting`` is True and on the day before ``cur_day`` otherwise.

    Parameters
    ----------
    cur_day : datetime.date
        Reference date for the window.
    duration : datetime.timedelta
        How far before ``cur_day`` the window starts. When ``forecasting`` is
        False this should be at least one day; a shorter duration puts the
        start date after the end date.
    forecasting : bool, default False
        If True, the window includes ``cur_day`` and the column names are
        left untouched. If False, the window stops the day before ``cur_day``
        and every column except the first is prefixed with
        ``"<duration.days>-"`` (e.g. ``"7-HR"``).

    Returns
    -------
    pandas.DataFrame
        The stats returned by ``batting_stats_range`` without the "Age",
        "#days", "Lev" and "Tm" columns.
    """
    offset = datetime.timedelta(days=0 if forecasting else 1)
    window_start = (cur_day - duration).strftime("%Y-%m-%d")
    window_end = (cur_day - offset).strftime("%Y-%m-%d")

    # Drop into a new frame rather than mutating in place, so the object
    # handed back by the library is never modified.
    results = batting_stats_range(window_start, window_end).drop(
        columns=["Age", "#days", "Lev", "Tm"]
    )

    if not forecasting:
        prefix = str(duration.days) + "-"
        # The first column is skipped so it keeps its original label
        # (presumably "Name", which callers use as the join key).
        results = results.rename(
            columns={col: prefix + col for col in results.columns[1:]}
        )

    return results

In [22]:
# Fetch the trailing-week stats first and the trailing-day stats second.
week_df, day_df = (get_batting_stats(cur_day, window) for window in (week, day))

In [40]:
def construct_multi_duration_dataframe(cur_day, durations, pitchers=None):
    """Join the batting stats for several look-back windows into one frame.

    Parameters
    ----------
    cur_day : datetime.date
        Reference date passed to ``get_batting_stats`` for every window.
    durations : sequence of datetime.timedelta
        Look-back windows; one set of prefixed stat columns is produced for
        each. Must contain at least one entry.
    pitchers : list of str, optional
        Player names to remove from the result. Names that do not appear in
        the result are skipped instead of raising ``KeyError``. Defaults to
        removing nobody.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "Name": the first window's stats joined with the
        stats of every remaining window, minus the rows named in ``pitchers``.
    """
    frames = [
        get_batting_stats(cur_day, duration, forecasting=False).set_index("Name")
        for duration in durations
    ]
    combined = frames[0].join(frames[1:])

    # A None default avoids sharing one mutable list object between calls.
    names_to_drop = [] if pitchers is None else pitchers
    # errors="ignore": a pitcher list built from another source can contain
    # names that never batted in these windows.
    return combined.drop(names_to_drop, errors="ignore")

In [41]:
# Build the feature frame from the 1-day and 7-day look-back windows.
lookback_windows = [day, week]
results = construct_multi_duration_dataframe(cur_day, lookback_windows)

In [43]:
# Preview the first rows only instead of rendering the whole frame.
results.head(10)

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-HBP,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Ellis,,,,,,,,,,,...,1,0,0,0,0,0,0.429,0.556,0.429,0.984
Aaron Altherr,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0.292,0.320,0.583,0.903
Aaron Hicks,1.0,5.0,4.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0.292,0.370,0.583,0.954
Aaron Hill,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.273,0.333,0.636,0.970
Aaron Judge,1.0,5.0,4.0,3.0,3.0,1.0,0.0,1.0,3.0,1.0,...,0,0,0,0,1,0,0.375,0.483,0.583,1.066
Aaron Nola,,,,,,,,,,,...,0,0,0,0,0,0,0.000,0.000,0.000,0.000
Adam Duvall,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.353,0.353,0.471,0.824
Adam Engel,,,,,,,,,,,...,0,0,0,0,3,0,0.357,0.357,0.357,0.714
Adam Frazier,1.0,5.0,5.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,0.261,0.308,0.391,0.699
Adam Jones,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0.250,0.250,0.400,0.650


In [26]:
def add_fantasy_points(cur_day, dataframe):
    """Join the FD points scored on ``cur_day`` onto ``dataframe``.

    The stats for ``cur_day`` alone are fetched (zero-length duration with
    ``forecasting=True``), converted to FD points, indexed by "Name" and
    joined onto ``dataframe`` by index.

    Parameters
    ----------
    cur_day : datetime.date
        The day whose FD points are attached.
    dataframe : pandas.DataFrame
        Frame to extend; its index is matched against player names.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with an additional "FD Points" column.
    """
    same_day_stats = get_batting_stats(
        cur_day, datetime.timedelta(0), forecasting=True
    )
    fd_points = get_fantasy_points(same_day_stats).set_index("Name")
    return dataframe.join(fd_points)

In [27]:
def remove_nan_fd_points(dataframe):
    """Return the rows of ``dataframe`` whose "FD Points" value is not missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing an "FD Points" column.

    Returns
    -------
    pandas.DataFrame
        The subset of rows where "FD Points" is not NaN, in original order.
    """
    # Test for missing values explicitly; a ``>= 0`` comparison would remove
    # NaN only as a side effect and would also discard any negative score.
    return dataframe[dataframe["FD Points"].notna()]

In [28]:
# Attach the FD points for cur_day, then pass the result through the
# FD-points row filter.
new_results = remove_nan_fd_points(add_fantasy_points(cur_day, results))

In [29]:
# Preview the first five rows of the filtered frame.
new_results.head(n=5)

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Altherr,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.292,0.32,0.583,0.903,0.0
Aaron Hicks,1.0,5.0,4.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0.292,0.37,0.583,0.954,25.4
Aaron Hill,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.273,0.333,0.636,0.97,9.7
Aaron Judge,1.0,5.0,4.0,3.0,3.0,1.0,0.0,1.0,3.0,1.0,...,0,0,0,1,0,0.375,0.483,0.583,1.066,59.3
Aaron Nola,,,,,,,,,,,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [None]:
# TODO: set NaN to 0
# TODO: for one day, build the FD dataframe; repeat for the next day, then reset_index, drop the names, and append the dataframes

## Excluding pitchers

In [35]:
# Take the pitcher list from the same season as the batting data (the year of
# cur_day) rather than a hard-coded season, so the names describe the same
# player pool. qual=10 is passed through unchanged.
pitchers = pitching_stats(cur_day.year, qual=10)["Name"].tolist()

In [36]:
# errors="ignore": the pitcher list comes from a separate query, so many of its
# names are not in this frame's index; without it, drop() raises KeyError for
# any missing label.
pruned_df = new_results.drop(pitchers, errors="ignore")
# Preview the first rows only instead of rendering the whole frame.
pruned_df.head(10)

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Altherr,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.292,0.320,0.583,0.903,0.0
Aaron Hicks,1.0,5.0,4.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0.292,0.370,0.583,0.954,25.4
Aaron Hill,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.273,0.333,0.636,0.970,9.7
Aaron Judge,1.0,5.0,4.0,3.0,3.0,1.0,0.0,1.0,3.0,1.0,...,0,0,0,1,0,0.375,0.483,0.583,1.066,59.3
Adam Duvall,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.353,0.353,0.471,0.824,18.7
Adam Frazier,1.0,5.0,5.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0.261,0.308,0.391,0.699,15.0
Adam Jones,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.250,0.250,0.400,0.650,0.0
Adam Lind,1.0,5.0,5.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,...,0,1,0,0,0,0.286,0.267,0.500,0.767,3.0
Adam Rosales,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.167,0.167,0.250,0.417,0.0
Addison Russell,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.111,0.200,0.111,0.311,21.7
