In [1]:
import sqlite3
import pandas as pd

# Open the SQLite database that holds the `players` table queried below.
# The path is relative, so it is resolved against the kernel's working directory.
con = sqlite3.connect('data/hockey.db')

In [2]:
# Load every row of `players`, turn `date` into real timestamps, and order the
# frame chronologically so the rolling windows built later run oldest -> newest.
df = pd.read_sql('select * from players', con)
# Parse the whole column in one vectorized call instead of invoking
# pd.to_datetime once per row through .apply().
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date', ascending=True)

In [3]:
df.date.max()

Timestamp('2019-02-14 00:00:00')

In [4]:
# Keep only the rows dated on or before 2018-12-31; later rows are dropped.
df = df.loc[df['date'] <= '2018-12-31']

In [5]:
df.shape

(437, 13)

In [6]:
df.sample(5)

Unnamed: 0,id,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
58,59,bergepa01,Patrice Bergeron,C,2018-10-13,BOS,Home,DET,W,0,3,1,14.0
113,114,karlser01,Erik Karlsson,D,2018-10-26,SJS,Away,CAR,L-SO,0,1,0,25.0
380,381,burnsbr01,Brent Burns,D/RW,2018-12-20,SJS,Home,WPG,L,0,0,5,27.0
182,183,tavarjo01,John Tavares,C,2018-11-10,TOR,Away,BOS,L,1,0,7,18.0
98,99,mcdavco01,Connor McDavid,C,2018-10-23,EDM,Home,PIT,L-OT,1,1,8,28.0


In [7]:
# df = df[df['name'] == 'Sidney Crosby']
df.head(5)

Unnamed: 0,id,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
0,1,ovechal01,Alex Ovechkin,LW,2018-10-03,WSH,Home,BOS,W,1,1,5,16.0
7,8,dermotr01,Travis Dermott,D,2018-10-03,TOR,Home,MTL,W,0,1,1,17.0
6,7,bergepa01,Patrice Bergeron,C,2018-10-03,BOS,Away,WSH,L,0,0,3,15.0
5,6,kapanka01,Kasperi Kapanen,RW,2018-10-03,TOR,Home,MTL,W,0,0,2,11.0
8,9,hymanza01,Zach Hyman,C,2018-10-03,TOR,Home,MTL,W,0,0,2,14.0


In [8]:
# Build the model features: for every row, the mean of each stat over the
# five rows that precede it within the same (player_id, position) group.
stat_cols = ['goals', 'assists', 'shots', 'ice_time']
player_keys = ['player_id', 'position']

# Five-row trailing mean inside each group (NaN until five rows are available).
trailing_means = df.groupby(player_keys)[stat_cols].rolling(5).mean()

# Shift by one row within each group so a row never sees its own stats:
# the value attached to a row is the average that ended on the previous row.
prior_means = trailing_means.groupby(player_keys).shift(1)

# The group keys come back as columns; the unnamed third index level
# (`level_2`) carries df's original row labels, so restore it as the index
# to allow an index-on-index merge with df.
rolling = (
    prior_means
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
)

In [9]:
# Join each row of df to its rolling features on the shared row index.
# Overlapping column names from df get the `_next` suffix (the values of the
# row itself), while the rolling averages keep the bare names.
train = df.merge(
    rolling,
    left_index=True,
    right_index=True,
    suffixes=('_next', ''),
)

In [10]:
train.head()

Unnamed: 0,id,player_id_next,name,position_next,date,team,venue,opponent,outcome,goals_next,assists_next,shots_next,ice_time_next,player_id,position,goals,assists,shots,ice_time
0,1,ovechal01,Alex Ovechkin,LW,2018-10-03,WSH,Home,BOS,W,1,1,5,16.0,ovechal01,LW,,,,
7,8,dermotr01,Travis Dermott,D,2018-10-03,TOR,Home,MTL,W,0,1,1,17.0,dermotr01,D,,,,
6,7,bergepa01,Patrice Bergeron,C,2018-10-03,BOS,Away,WSH,L,0,0,3,15.0,bergepa01,C,,,,
5,6,kapanka01,Kasperi Kapanen,RW,2018-10-03,TOR,Home,MTL,W,0,0,2,11.0,kapanka01,RW,,,,
8,9,hymanza01,Zach Hyman,C,2018-10-03,TOR,Home,MTL,W,0,0,2,14.0,hymanza01,C,,,,


In [11]:
# Discard rows whose rolling `goals` average is missing, i.e. rows without
# enough earlier rows in their group to fill the window.
train = train.loc[train['goals'].notna()]

In [12]:
# Features: position plus the four rolling averages.
X = train.loc[:, ['position', 'goals', 'assists', 'shots', 'ice_time']]

# Target: the `goals` value of the row itself (suffixed `_next` by the merge).
y = train.loc[:, 'goals_next']

In [13]:
X[:5]

Unnamed: 0,position,goals,assists,shots,ice_time
61,C,0.0,0.8,1.2,16.4
60,D,0.0,0.2,1.0,16.0
57,RW,0.2,0.6,2.6,14.0
53,C,1.8,0.6,3.4,18.0
56,C,1.2,0.8,4.2,18.8


In [14]:
y[:5]

61    0
60    0
57    1
53    1
56    0
Name: goals_next, dtype: int64

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer

In [23]:
# Column-wise preprocessing:
#   * `position` -> impute missing categories, then binarize the labels
#   * each numeric stat -> impute missing values, then standardize
# Every numeric column gets its own freshly constructed imputer/scaler pair.
mapper = DataFrameMapper(
    [('position', [CategoricalImputer(), LabelBinarizer()])]
    + [
        ([stat], [SimpleImputer(), StandardScaler()])
        for stat in ('goals', 'assists', 'shots', 'ice_time')
    ],
    df_out=True,
)

In [24]:
# Fit the mapper's transformers on the training features and encode them in
# the same call; later cells reuse this fitted mapper via `mapper.transform`.
Z = mapper.fit_transform(X)

In [25]:
# Fit a linear regression of the target `y` on the encoded feature matrix `Z`.
model = LinearRegression()
model.fit(Z, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [26]:
# Score on the same rows the model was fitted on: this is an in-sample
# figure, not an estimate of performance on unseen games.
model.score(Z, y)

0.12812000807969037

In [27]:
# now to pipeline?

In [None]:
# Pull up to five rows for one player, ordered oldest first.
# NOTE(review): `order by date asc limit 5` selects the player's EARLIEST five
# rows, not the most recent five -- confirm that is what the forecast should use.
name = 'Alex Ovechkin'
# Bind `name` as a query parameter rather than formatting it into the SQL
# text: a name containing a double quote would otherwise break (or alter)
# the statement.
test = pd.read_sql(
    '''
    select
    *
    from players
    where name = ?
    order by date asc
    limit 5
    ''',
    con,
    params=(name,),
)

test

In [None]:
# Collapse the player's rows into a single feature row: the five-row mean of
# each stat, keeping `position` and discarding the other index columns.
# Note: this rebinds `rolling`, which earlier held the training features.
rolling = (
    test
    .groupby(['name', 'position'])[['goals', 'assists', 'shots', 'ice_time']]
    .rolling(window=5)
    .mean()
    .reset_index()
    .drop(columns=['name', 'level_2'])
    .tail(1)
)

In [None]:
# Encode the single feature row with the already-fitted mapper (transform
# only, no refit) and show the first prediction rounded to two decimals.
# Note: this rebinds `Z`, which earlier held the encoded training matrix.
Z = mapper.transform(rolling)
round(model.predict(Z)[0], 2)

Wrap up attempt #1

In [None]:
def read_data(name, con):
    """Return up to five `players` rows for one player, oldest first.

    Parameters
    ----------
    name : str
        Exact value to match against the `name` column.
    con : sqlite3.Connection
        Open database connection handed to ``pd.read_sql``. The query uses
        the qmark (``?``) placeholder style that sqlite3 understands.

    Returns
    -------
    pandas.DataFrame
        At most five rows ordered by ascending `date`; empty when no row
        matches `name`.
    """
    # `name` is bound as a parameter instead of being formatted into the SQL
    # text, so quotes inside the value can neither break nor rewrite the query.
    query = '''
    select
    *
    from players
    where name = ?
    order by date asc
    limit 5
    '''
    player = pd.read_sql(query, con, params=(name,))
    return player

In [None]:
data = read_data('Alex Ovechkin', con)

In [None]:
def prepare_data(data, window=5):
    """Collapse a player's rows into one row of trailing-average features.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for a single player, already in the order the window should
        run over. Must contain `position`, `goals`, `assists`, `shots` and
        `ice_time`.
    window : int, default 5
        Number of trailing rows averaged. The default matches the previously
        hard-coded window length.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with `position` and the four averaged stats, taken
        from the last row of `data`. The averages are NaN when `data` has
        fewer than `window` rows; the frame is empty when `data` is empty.
    """
    stat_columns = ['goals', 'assists', 'shots', 'ice_time']
    rolling = (
        data
        # Park `position` in the index so it survives the numeric rolling mean.
        .set_index('position')
        [stat_columns]
        .rolling(window)
        .mean()
        .reset_index()
        # Only the final row reflects the most recent `window` rows.
        .iloc[-1:]
    )
    return rolling

In [None]:
# NOTE: this rebinds `data` from the raw player rows to the one-row feature
# frame, so the cell is not idempotent -- re-running it without first
# re-running the `read_data` cell feeds a single row into rolling(5),
# which yields NaN averages.
data = prepare_data(data)

In [None]:
def predict(data):
    """Encode prepared features and return the first prediction, rounded.

    Uses the notebook-level `mapper` (transform only) and `model`; `data` is
    the feature frame to encode. Returns the prediction for the first row,
    rounded to two decimal places.
    """
    encoded = mapper.transform(data)
    first_prediction = model.predict(encoded)[0]
    return round(first_prediction, 2)

In [None]:
predict(data)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CleanAndRoll(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a player's rows into one feature row.

    Parameters
    ----------
    window : int, default 5
        Number of trailing rows averaged for each stat.
    features : sequence of str, default ('goals', 'assists', 'shots', 'ice_time')
        Numeric columns to average.

    Both values are stored unmodified on the instance, following the
    scikit-learn estimator convention, so `get_params`, `set_params` and
    cloning see them. The defaults reproduce the previously hard-coded
    behaviour, so `CleanAndRoll()` works exactly as before.
    """

    def __init__(self, window=5, features=('goals', 'assists', 'shots', 'ice_time')):
        self.window = window
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X):
        """Return a one-row frame: `position` plus the trailing mean of each feature.

        `X` must contain `position` and every column named in `features`.
        The averages are NaN when `X` has fewer than `window` rows.
        """
        X = (
            X
            # Park `position` in the index so it survives the numeric rolling mean.
            .set_index('position')
            [list(self.features)]
            .rolling(self.window)
            .mean()
            .reset_index()
            # Only the final row reflects the most recent `window` rows.
            .iloc[-1:]
        )
        return X

car = CleanAndRoll()

# Quick check of the transformer on freshly read player rows; `transform` is
# called directly because `fit` does nothing but return the instance.
data = read_data('Alex Ovechkin', con)
car.transform(data)

In [None]:
from sklearn.pipeline import make_pipeline

# Chain the three steps so raw player rows go in and a goals prediction comes out.
data = read_data('Alex Ovechkin', con)
pipe = make_pipeline(
    car,
    mapper,
    model
)
# Inference only: `mapper` and `model` were fitted in earlier cells and `car`
# learns nothing, so the pipeline reuses those fitted steps as they are.
# The previous `pipe.fit()` call raised a TypeError (fit requires X), and
# fitting through `car` could not work anyway, because it reduces X to a
# single row that no longer lines up with a per-row target.
round(pipe.predict(data)[0], 2)