# Overview

This notebook explores the `pybaseball` package to see whether it can be used to create features for a regression model.

In [56]:
from pybaseball import batting_stats_range
import datetime
import pandas as pd

### Loading stats accumulated over a date range

In [93]:
# The same date is used for both ends of the requested range.
game_day = '2017-06-12'
data = batting_stats_range(game_day, game_day)
data.head(n=5)

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
1,Jose Abreu,30,339,MLB-AL,Chicago,1,5,5,2,2,...,0,0,0,0,0,0,0.4,0.4,0.6,1.0
2,Lane Adams,27,339,MLB-NL,Atlanta,1,1,1,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,Matt Adams,28,339,MLB-NL,Atlanta,1,5,4,3,3,...,0,0,0,0,0,0,0.75,0.8,2.25,3.05
4,Ehire Adrianza,27,339,MLB-AL,Minnesota,1,3,3,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
5,Arismendy Alcantara,25,339,MLB-NL,Cincinnati,1,1,1,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


### Calculating FD points for a given range

Create a function that takes a DataFrame of batting stats (one row per player) and returns a DataFrame with each player's name and FanDuel (FD) score.

In [57]:
def get_fantasy_points(stats):
    """Compute the FanDuel (FD) score for every row of a batting-stats frame.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain the columns "Name", "H", "2B", "3B", "HR", "R", "RBI",
        "BB", "HBP" and "SB".

    Returns
    -------
    pandas.DataFrame
        Two columns: "Name" (copied from ``stats``) and "FD Points".
    """
    # Hits that were not doubles, triples or home runs.
    singles = stats["H"] - stats["2B"] - stats["3B"] - stats["HR"]

    # Intentional walks ("IBB") are deliberately NOT scored separately: they
    # are already counted in "BB", so adding them again would pay out twice
    # for the same walk.
    points = (3 * singles
              + 3.2 * stats["R"]
              + 6 * stats["2B"]
              + 9 * stats["3B"]
              + 12 * stats["HR"]
              + 3.5 * stats["RBI"]
              + 3 * stats["BB"]
              + 3 * stats["HBP"]
              + 6 * stats["SB"])

    return pd.DataFrame({"Name": stats["Name"],
                         "FD Points": points})

In [108]:
# Index both frames by player name so the points line up with the raw stats.
stats_by_name = data.set_index("Name")
points_by_name = get_fantasy_points(data).set_index("Name")
stats_by_name.join(points_by_name).head(n=5)

Unnamed: 0_level_0,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jose Abreu,30,339,MLB-AL,Chicago,1,5,5,2,2,1,...,0,0,0,0,0,0.4,0.4,0.6,1.0,18.9
Lane Adams,27,339,MLB-NL,Atlanta,1,1,1,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
Matt Adams,28,339,MLB-NL,Atlanta,1,5,4,3,3,0,...,0,0,0,0,0,0.75,0.8,2.25,3.05,53.6
Ehire Adrianza,27,339,MLB-AL,Minnesota,1,3,3,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
Arismendy Alcantara,25,339,MLB-NL,Cincinnati,1,1,1,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


### Easily getting stats within a range

Rather than passing date strings to `batting_stats_range()`, we write a wrapper function that takes an end date (`datetime.date`) and a `timedelta` and returns the stats accumulated over the window that ends on that date.

In [41]:
# Reference end date and the two window lengths used in the cells below.
end = datetime.date(2017, 6, 11)
day, week = datetime.timedelta(days=1), datetime.timedelta(weeks=1)
start = end - week
print(start, end, sep="\n")

2017-06-04
2017-06-11


In [47]:
def get_batting_stats(end_ts, duration, rename=False):
    """Fetch batting stats for the window that ends on ``end_ts``.

    Parameters
    ----------
    end_ts : datetime.date
        Last date passed to ``batting_stats_range``.
    duration : datetime.timedelta
        Subtracted from ``end_ts`` to obtain the first date of the request.
    rename : bool, default False
        When True, every column except "Name" is prefixed with
        ``"<duration.days>-"`` (e.g. "7-HR") so frames for different
        durations can be joined without column clashes.

    Returns
    -------
    pandas.DataFrame
        The stats with the "Age", "#days", "Lev" and "Tm" columns dropped.

    NOTE(review): the request spans ``end_ts - duration`` through ``end_ts``.
    If ``batting_stats_range`` treats both dates as inclusive, that covers
    ``duration.days + 1`` calendar days -- TODO confirm this is intended.
    """
    date_format = "%Y-%m-%d"
    start_ts = end_ts - duration
    results = batting_stats_range(start_ts.strftime(date_format),
                                  end_ts.strftime(date_format))
    results = results.drop(columns=["Age", "#days", "Lev", "Tm"])

    if rename:
        prefix = str(duration.days) + "-"
        # Pick the join key out by label rather than by position, so "Name"
        # is left untouched even if it is not the first column.
        results = results.rename(
            columns={col: prefix + col
                     for col in results.columns if col != "Name"}
        )

    return results

In [22]:
# Fetch the stats for the `week` and `day` windows that end on `end`.
week_df, day_df = (get_batting_stats(end, window) for window in (week, day))

In [95]:
def construct_multi_duration_dataframe(end_ts, durations):
    """Build one wide frame with batting stats for several look-back windows.

    Parameters
    ----------
    end_ts : datetime.date
        End date shared by every window.
    durations : sequence of datetime.timedelta
        One window per entry; each is fetched with ``rename=True`` so its
        columns carry a ``"<days>-"`` prefix.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Name", holding the columns of every window side by side.
        A player missing from a window gets NaN in that window's columns.

    Raises
    ------
    IndexError
        If ``durations`` is empty.
    """
    dfs = [get_batting_stats(end_ts, duration, rename=True).set_index("Name")
           for duration in durations]

    # Spell out the join type: the output recorded in this notebook keeps
    # players who have no stats for the first window (NaN in its columns),
    # i.e. an outer join. The default for joining a list of frames should
    # not be relied on to give that result.
    return dfs[0].join(dfs[1:], how="outer")

In [96]:
# Combine the `day` and `week` windows ending on `end` into one frame.
results = construct_multi_duration_dataframe(end_ts=end, durations=[day, week])

In [97]:
# Preview the first five rows of the combined frame.
results.head(n=5)

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-HBP,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Ellis,,,,,,,,,,,...,1,0,0,0,0,0,0.429,0.556,0.429,0.984
Aaron Altherr,2.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0.28,0.308,0.56,0.868
Aaron Hicks,2.0,10.0,7.0,4.0,2.0,2.0,0.0,0.0,2.0,3.0,...,0,0,0,0,0,0,0.296,0.406,0.593,0.999
Aaron Hill,2.0,9.0,8.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0.267,0.294,0.533,0.827
Aaron Judge,2.0,10.0,8.0,7.0,7.0,2.0,0.0,3.0,6.0,2.0,...,0,0,0,0,1,0,0.464,0.559,0.893,1.452


In [98]:
def add_fantasy_points(end_ts, dataframe):
    """Append an "FD Points" column computed from the day after ``end_ts``.

    Stats are requested for ``end_ts`` plus one day with a zero-length
    duration, scored with ``get_fantasy_points``, indexed by "Name" and
    joined onto ``dataframe``.

    Parameters
    ----------
    end_ts : datetime.date
        Last date covered by the feature columns in ``dataframe``.
    dataframe : pandas.DataFrame
        Frame to extend; the points are joined on its index.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with the joined points columns added.
    """
    target_day = end_ts + datetime.timedelta(1)
    no_lookback = datetime.timedelta(0)

    target_stats = get_batting_stats(target_day, no_lookback, rename=False)
    target_points = get_fantasy_points(target_stats).set_index("Name")

    return dataframe.join(target_points)

In [103]:
def remove_nan_fd_points(dataframe):
    """Return the rows of ``dataframe`` whose "FD Points" value is not NaN.

    Missing values are tested for directly rather than with a ``>= 0``
    comparison, so a valid negative score is not discarded with the NaNs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an "FD Points" column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataframe`` itself is left unchanged.
    """
    return dataframe.dropna(subset=["FD Points"])

In [105]:
# Attach the FD points to the features, then filter with remove_nan_fd_points.
new_results = remove_nan_fd_points(add_fantasy_points(end, results))

In [107]:
# Preview the first five rows of the filtered frame.
new_results.head(n=5)

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Altherr,2.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.28,0.308,0.56,0.868,9.2
Aaron Hicks,2.0,10.0,7.0,4.0,2.0,2.0,0.0,0.0,2.0,3.0,...,0,0,0,0,0,0.296,0.406,0.593,0.999,9.2
Aaron Judge,2.0,10.0,8.0,7.0,7.0,2.0,0.0,3.0,6.0,2.0,...,0,0,0,1,0,0.464,0.559,0.893,1.452,34.6
Adam Duvall,2.0,6.0,6.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0.318,0.318,0.545,0.864,12.2
Adam Jones,2.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.217,0.217,0.348,0.565,6.5
