## Overview

This notebook will explore the `pybaseball` package and see if it can be used to create features for a regression model.

In [1]:
import datetime

import pandas as pd
    
from pybaseball import batting_stats_range_by_id, pitching_stats_by_id

## Loading stats accumulated over a date range

In [2]:
data = batting_stats_range_by_id('2017-06-11', '2017-06-11')
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
1,547989,30,355,MLB-AL,Chicago,1,4,3,1,0,...,1,0,0,2,0,0,0.0,0.25,0.0,0.25
2,571431,28,355,MLB-NL,Atlanta,1,4,4,0,0,...,0,0,0,1,0,0,0.0,0.0,0.0,0.0
3,534947,29,355,MLB-NL,Cincinnati,1,3,3,0,1,...,0,0,0,0,0,0,0.333,0.333,0.333,0.667
4,501303,27,355,MLB-AL,Minnesota,1,1,0,0,0,...,0,0,0,0,0,0,,1.0,,
5,542583,27,355,MLB-NL,Milwaukee,1,1,0,0,0,...,0,0,0,0,0,0,,1.0,,


## Calculating FD points for a given range

Create a function that will take a single row and output the Fanduel score for that player.

In [3]:
def get_fantasy_points(stats):
    singles = stats["H"]-stats["2B"]-stats["3B"]-stats["HR"]
    points = 3*singles+3.2*stats["R"]+6*stats["2B"]+9*stats["3B"]+12*stats["HR"]\
            +3.5*stats["RBI"]+3*stats["BB"]+3*stats["IBB"]+3*stats["HBP"]+6*stats["SB"]
    return pd.DataFrame({"Name": stats["Name"],
                         "FD Points": points})

In [4]:
data.set_index("Name").join(get_fantasy_points(data).set_index("Name")).head()

Unnamed: 0_level_0,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
547989,30,355,MLB-AL,Chicago,1,4,3,1,0,0,...,0,0,2,0,0,0.0,0.25,0.0,0.25,6.2
571431,28,355,MLB-NL,Atlanta,1,4,4,0,0,0,...,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0
534947,29,355,MLB-NL,Cincinnati,1,3,3,0,1,0,...,0,0,0,0,0,0.333,0.333,0.333,0.667,3.0
501303,27,355,MLB-AL,Minnesota,1,1,0,0,0,0,...,0,0,0,0,0,,1.0,,,3.0
542583,27,355,MLB-NL,Milwaukee,1,1,0,0,0,0,...,0,0,0,0,0,,1.0,,,3.0


### Easily getting stats within a range

Rather than passing strings to the argument of `batting_stats_range_by_id()`, a wrapper function will be written that takes a `datetime` and a `timedelta` and gets the stats accumulated within that duration.

In [5]:
cur_day = datetime.date(2017, 6, 11)
day = datetime.timedelta(days=1)
week = datetime.timedelta(days=7)
month = datetime.timedelta(days=30)
start = cur_day-week
print(cur_day)
print(start)

2017-06-11
2017-06-04


In [46]:
def get_batting_stats(cur_day, duration, forecasting=False):
    try:
        yesterday = cur_day-datetime.timedelta(days=1)
        if forecasting:
            offset = datetime.timedelta(0)
        else:
            offset = datetime.timedelta(1)

        results = batting_stats_range_by_id((cur_day-duration).strftime("%Y-%m-%d"),
                                            (cur_day-offset).strftime("%Y-%m-%d")
                                           )
        results.drop(columns=["Tm", "Age", "#days", "Lev"], inplace=True)
        
    except IndexError as err:
        print("{}: Unable to retrieve stats for {} and {} range.".format(err, cur_day, duration))
        cols = ['Name', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 
                'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS']
        results = pd.DataFrame(columns=cols)
    
    if not forecasting:
        new_names = [(i,str(duration.days)+"-"+i) for i in results.iloc[:, 1:].columns.values]
        results = results.rename(columns = dict(new_names))
        
    return results

In [47]:
week_df = get_batting_stats(cur_day, week)
day_df = get_batting_stats(cur_day, day)

In [48]:
from functools import reduce
def construct_multi_duration_dataframe(cur_day, durations):
    dfs = [get_batting_stats(cur_day, duration, forecasting=False).set_index("Name") for duration in durations]
    return reduce(lambda left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), dfs)

In [49]:
results = construct_multi_duration_dataframe(cur_day, [day])

In [50]:
results.head()

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,1-HBP,1-SH,1-SF,1-GDP,1-SB,1-CS,1-BA,1-OBP,1-SLG,1-OPS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
547989,1,5,5,1,3,1,0,0,1,0,...,0,0,0,1,0,0,0.6,0.6,0.8,1.4
572669,2,2,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.5,0.5,0.5,1.0
571431,2,8,7,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0.143,0.25,0.143,0.393
501303,1,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
542583,1,4,3,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0.0,0.25,0.0,0.25


In [51]:
def add_fantasy_points(cur_day, dataframe):
    points = get_fantasy_points(get_batting_stats(cur_day,
                                                  datetime.timedelta(0), 
                                                  forecasting=True)).set_index("Name")
    return dataframe.join(points)

In [52]:
def remove_nan_fd_points(dataframe):
    return dataframe[dataframe["FD Points"]>= 0]

In [53]:
new_results = add_fantasy_points(cur_day, results)
new_results = remove_nan_fd_points(new_results)

In [54]:
print(len(new_results))
new_results.head()

268


Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,1-SH,1-SF,1-GDP,1-SB,1-CS,1-BA,1-OBP,1-SLG,1-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
547989,1,5,5,1,3,1,0,0,1,0,...,0,0,1,0,0,0.6,0.6,0.8,1.4,6.2
571431,2,8,7,0,1,0,0,0,0,1,...,0,0,0,0,0,0.143,0.25,0.143,0.393,0.0
501303,1,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0
542583,1,4,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.25,0.0,0.25,3.0
475174,2,10,10,1,7,2,0,0,2,0,...,0,0,0,0,0,0.7,0.7,0.9,1.6,12.4


In [55]:
# TODO: set NaN to 0
# TODO: for one day, get fd_dataframe. do it again for the next day, then reset_index, remove names, and append dfs

## Excluding pitchers

In [57]:
pitchers_17 = pitching_stats_by_id(2017, qual=3)["Name"].tolist()
pitchers_18 = pitching_stats_by_id(2018, qual=3)["Name"].tolist()

In [67]:
def drop_pitchers(dataframe, pitchers):
    """
    :param dataframe: dataframe of players and stats
    :param pitchers: list of BBRef IDs for pitchers
    """
    return dataframe.drop(pitchers, errors='ignore')

In [68]:
today = datetime.date(2017, 7, 20)
yesterday = datetime.date(2017, 7, 19)

df2 = construct_multi_duration_dataframe(today, [day, week])
print(len(df2))
df2 = add_fantasy_points(today, df2)
df2 = remove_nan_fd_points(df2)
df2 = drop_pitchers(df2, pitchers_17)
print(len(df2))

468
192


In [69]:
df2.head()

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
134181,1.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.227,0.261,0.409,0.67,12.7
400121,1.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0.348,0.444,0.391,0.836,6.2
400284,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.167,0.286,0.167,0.452,6.0
407812,,,,,,,,,,,...,0,0,2,0,0,0.107,0.138,0.214,0.352,0.0
408234,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,1,0,0,0.24,0.31,0.36,0.67,6.5


## Setting NaN to 0

In [70]:
def replace_nan(dataframe):
    return dataframe.fillna(0)

In [71]:
df2 = replace_nan(df2)
df2.head()

Unnamed: 0_level_0,1-G,1-PA,1-AB,1-R,1-H,1-2B,1-3B,1-HR,1-RBI,1-BB,...,7-SH,7-SF,7-GDP,7-SB,7-CS,7-BA,7-OBP,7-SLG,7-OPS,FD Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
134181,1.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.227,0.261,0.409,0.67,12.7
400121,1.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0.348,0.444,0.391,0.836,6.2
400284,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.167,0.286,0.167,0.452,6.0
407812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,2,0,0,0.107,0.138,0.214,0.352,0.0
408234,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,1,0,0,0.24,0.31,0.36,0.67,6.5


## Getting and combining dataframes

In [73]:
def get_dataframe(current_day, durations, pitchers):
    df = construct_multi_duration_dataframe(current_day, durations)
    df = add_fantasy_points(current_day, df)
    df = remove_nan_fd_points(df)
    df = drop_pitchers(df, pitchers)
    df = replace_nan(df)
    return df

In [72]:
def combine_dataframes(dataframes):
    return pd.concat(dataframes).reset_index()

In [80]:
df1 = get_dataframe(today, [day, week], pitchers_17)
df2 = get_dataframe(today+day, [day, week], pitchers_17)
df3 = get_dataframe(today+day+day, [day, week], pitchers_17)
print("Length of df1: {}".format(len(df1)))
print("Length of df2: {}".format(len(df2)))
print("Length of df3: {}".format(len(df3)))

Length of df1: 192
Length of df2: 317
Length of df3: 313


In [81]:
len(combine_dataframes([df1, df2, df3]))

822

## Creating train/test set

In [151]:
def retrieve_data(years):
    dfs = []
    day = datetime.timedelta(days=1)
    week = datetime.timedelta(days=7)
    month = datetime.timedelta(days=30)
    for year in years:
        print("Downloading pitchers for {} season.".format(year))
        pitchers = pitching_stats_by_id(year, qual=3)["Name"].tolist()
        print("Downloaded list of {} pitchers.".format(len(pitchers)))
        starting_date = datetime.date(year, 4, 20)
        date_list = [starting_date + datetime.timedelta(days=x) 
                     for x in range(0, (datetime.date(year, 9, 30)-starting_date).days)]
#         date_list = [starting_date + datetime.timedelta(days=x) 
#                      for x in range(0, (datetime.date(year, 6, 27)-starting_date).days)]
        for idx, today in enumerate(date_list):
            print("{}/{} - Downloading data for {}".format(idx+1, len(date_list), today))
            try:
                df = get_dataframe(today, [day, week, month], pitchers)
                dfs.append(df)
            except Exception as err:
                print("{} - {}".format(today, err))
    return combine_dataframes(dfs)

In [152]:
data = retrieve_data([2017])

Downloading pitchers for 2017 season.
Downloaded list of 701 pitchers.
1/163 - Downloading data for 2017-04-20
2/163 - Downloading data for 2017-04-21
3/163 - Downloading data for 2017-04-22
4/163 - Downloading data for 2017-04-23
5/163 - Downloading data for 2017-04-24
6/163 - Downloading data for 2017-04-25
7/163 - Downloading data for 2017-04-26
8/163 - Downloading data for 2017-04-27
9/163 - Downloading data for 2017-04-28
10/163 - Downloading data for 2017-04-29
11/163 - Downloading data for 2017-04-30
12/163 - Downloading data for 2017-05-01
13/163 - Downloading data for 2017-05-02
14/163 - Downloading data for 2017-05-03
15/163 - Downloading data for 2017-05-04
16/163 - Downloading data for 2017-05-05
17/163 - Downloading data for 2017-05-06
18/163 - Downloading data for 2017-05-07
19/163 - Downloading data for 2017-05-08
20/163 - Downloading data for 2017-05-09
21/163 - Downloading data for 2017-05-10
22/163 - Downloading data for 2017-05-11
23/163 - Downloading data for 2017-0

In [153]:
len(data)

45294

In [154]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2)

## Training and testing Linear Regression

In [159]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression(n_jobs=-1)
regr.fit(train.values[:,1:len(train.values[0])-1], train.values[:,-1])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [156]:
# Make predictions using the testing set
preds = regr.predict(test.values[:,1:len(test.values[0])-1])

from sklearn.metrics import mean_squared_error, r2_score

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(test.values[:,-1], preds))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test.values[:,-1], preds))

Coefficients: 
 [ 5.11707653e-02 -2.70180726e+00  2.59498989e+00  2.22597722e-01
  2.18767165e-01  8.43613198e-02  4.51923780e-01  1.71378920e-01
  1.69095917e-01  2.99907827e+00  2.93833064e-01 -1.83055893e-02
  2.63991312e+00  3.62856272e+00  3.23932483e+00  5.62950374e-02
  1.40792152e-01 -6.62955412e-01  2.27443360e+00 -8.63662329e-01
  2.02649529e-01 -1.09711691e+00 -9.51320326e-02  1.09411872e+00
 -9.77868936e-01  1.29272479e-02 -6.83136120e-02  7.72860233e-02
 -7.59101724e-02 -1.60089569e-01  4.37396372e-02 -1.00285404e+00
 -5.89618700e-02 -3.82624020e-02 -9.83499206e-01 -1.33136108e+00
 -8.88842820e-01 -1.85102948e-01 -2.69514296e-02  1.52452530e-01
  2.45623396e-02  5.42800115e-01  8.71400722e-01 -1.89387746e-01
 -2.02598894e-01  1.94782389e-01 -1.17256560e-01 -1.53335150e-02
 -2.87797272e-02 -8.59246946e-03 -7.93341716e-02  1.50329723e-01
 -3.04213099e-02 -1.14154921e-01  9.84109533e-02  2.00700589e-02
 -1.64825655e-01 -5.92292295e-01 -1.56223887e-01  1.74394185e-01
  1.45856

In [157]:
from sklearn.svm import SVR

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

X = train.values[:,1:len(train.values[0])-1]
y = train.values[:,-1]
test_X = test.values[:,1:len(test.values[0])-1]
true_labels = test.values[:,-1]

svr_rbf.fit(X, y)

y_rbf = svr_rbf.predict(test_X)

# The coefficients
# print('Coefficients: \n', svr_rbf)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(true_labels, y_rbf))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(true_labels, y_rbf))

Mean squared error: 95.94
Variance score: 0.01


In [173]:
from beakerx import TableDisplay
TableDisplay(data)