# Overview

This notebook will explore the `pybaseball` package and see if it can be used to create features for a regression model.

In [4]:
from pybaseball import batting_stats_range
import datetime

### Loading stats accumulated over a date range

In [5]:
data = batting_stats_range('2017-05-01', '2017-06-01')
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
1,Jose Abreu,30,350,MLB-AL,Chicago,29,130,122,22,36,...,1,0,0,5,0,0,0.295,0.338,0.541,0.879
2,Lane Adams,27,374,MLB-NL,Atlanta,6,6,6,0,2,...,0,0,0,1,1,0,0.333,0.333,0.333,0.667
3,Matt Adams,28,351,MLB-NL,"Atlanta,St. Louis",21,56,55,8,15,...,0,0,0,2,0,0,0.273,0.286,0.527,0.813
4,Jim Adduci,32,371,MLB-AL,Detroit,7,27,24,3,6,...,0,0,0,0,0,0,0.25,0.333,0.375,0.708
5,Tim Adleman,29,355,MLB-NL,Cincinnati,4,9,9,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


### Calculating FD points for a given range

Create a function that will take a single row and output the Fanduel score for that player.

In [6]:
def get_fantasy_points(stats):
    singles = stats["H"]-stats["2B"]-stats["3B"]-stats["HR"]
    points = 3*singles+3.2*stats["R"]+6*stats["2B"]+9*stats["3B"]+12*stats["HR"]\
            +3.5*stats["RBI"]+3*stats["BB"]+3*stats["IBB"]+3*stats["HBP"]+6*stats["SB"]
    return points    

In [7]:
data["FD Points"] = get_fantasy_points(data)
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,FD Points
1,Jose Abreu,30,350,MLB-AL,Chicago,29,130,122,22,36,...,0,0,5,0,0,0.295,0.338,0.541,0.879,358.9
2,Lane Adams,27,374,MLB-NL,Atlanta,6,6,6,0,2,...,0,0,1,1,0,0.333,0.333,0.333,0.667,15.5
3,Matt Adams,28,351,MLB-NL,"Atlanta,St. Louis",21,56,55,8,15,...,0,0,2,0,0,0.273,0.286,0.527,0.813,146.6
4,Jim Adduci,32,371,MLB-AL,Detroit,7,27,24,3,6,...,0,0,0,0,0,0.25,0.333,0.375,0.708,59.6
5,Tim Adleman,29,355,MLB-NL,Cincinnati,4,9,9,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


### Easily getting stats within a range

Rather than passing strings to the argument of `batting_stats_range()`, a wrapper function will be written that takes a `datetime` and a `timedelta` and gets the stats accumulated within that duration.

In [8]:
start = datetime.date(2017, 6, 11)
day = datetime.timedelta(days=1)
week = datetime.timedelta(days=7)
end = start+week
print(start)
print(end)

2017-06-11
2017-06-18


In [20]:
def get_batting_stats(start_ts, duration):
    results = batting_stats_range(start_ts.strftime("%Y-%m-%d"), 
                                 (start_ts+duration).strftime("%Y-%m-%d"))
    results = results.drop(columns=["Age", "#days", "Lev", "Tm"])
    
    new_names = [(i,str(duration.days)+"-"+i) for i in results.iloc[:, 1:].columns.values]
    
    results = results.rename(columns = dict(new_names))
    
    return results

In [21]:
df = get_batting_stats(start, week)
df

Unnamed: 0,Name,7-G,7-PA,7-AB,7-R,7-H,7-2B,7-3B,7-HR,7-RBI,...,7-HBP,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS
1,Jose Abreu,8,36,32,8,12,2,1,1,9,...,1,0,1,2,0,0,0.375,0.417,0.594,1.010
2,Lane Adams,5,4,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0.000,0.000,0.000,0.000
3,Matt Adams,7,31,26,8,11,3,0,3,10,...,1,0,1,2,0,0,0.423,0.484,0.885,1.368
4,Tim Adleman,2,4,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0.250,0.250,0.250,0.500
5,Ehire Adrianza,6,10,8,0,2,1,0,0,2,...,0,0,0,0,1,0,0.250,0.400,0.375,0.775
6,Jesus Aguilar,6,11,8,3,3,0,0,1,1,...,0,0,0,0,0,0,0.375,0.545,0.750,1.295
7,Nick Ahmed,3,12,11,1,2,0,0,0,0,...,0,0,0,2,0,1,0.182,0.250,0.182,0.432
8,Arismendy Alcantara,5,10,10,1,2,1,0,0,0,...,0,0,0,0,0,0,0.200,0.200,0.300,0.500
9,Scott Alexander,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.000,0.000,0.000,0.000
10,Albert Almora,5,17,16,2,5,1,0,0,0,...,0,0,0,0,0,0,0.313,0.353,0.375,0.728
