# Part I: Model Extending

Need data to talk about data, and a model to talk about models...

### Import Data

In [1]:
import pandas as pd

df = pd.read_csv('data/hockey.csv')
df['date'] = df['date'].apply(pd.to_datetime)

Let's pretend that it's Valentine's Day, 2018... (this will come in handy later)

In [2]:
df.date.max()

Timestamp('2019-02-14 00:00:00')

Train-test-split on time series data is a little different...

In [3]:
df = df[df.date <= '2018-12-31']

In [4]:
df.shape

(437, 13)

In [5]:
df.sample(5)

Unnamed: 0,id,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
395,396,kapanka01,Kasperi Kapanen,RW,2018-12-22,TOR,Home,NYR,W,0,0,1,17.0
104,105,tavarjo01,John Tavares,C,2018-10-24,TOR,Away,WPG,W,1,0,5,17.0
160,161,mcdavco01,Connor McDavid,C,2018-11-06,EDM,Away,TBL,L,0,0,5,18.0
429,430,dermotr01,Travis Dermott,D,2018-12-29,TOR,Home,NYI,L,0,0,4,21.0
345,346,karlser01,Erik Karlsson,D,2018-12-13,SJS,Home,DAL,W,0,1,1,23.0


### Model

Predict \*goals\* next game based on the {goals, assists, ice time} rolling average for the last five games...

In [6]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [7]:
# need to shift games by one to predict next game
# less we be hit with target leakage
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

In [8]:
# re-align index
train = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

In [9]:
# (re-)identify our X and y matrices
target = 'goals_next'
X_train = train.drop(target, axis=1)
y_train = train[target]

In [10]:
print(X_train.shape)
X_train[:5]

(372, 5)


Unnamed: 0,position,goals,assists,shots,ice_time
58,C,1.0,1.2,2.8,15.8
73,C,1.2,1.4,3.6,16.6
81,C,1.0,1.2,3.8,17.8
93,C,0.4,1.0,3.4,18.8
101,C,0.4,1.2,3.0,18.6


In [11]:
print(y_train.shape)
y_train[:5]

(372,)


58     1.0
73     0.0
81     0.0
93     1.0
101    0.0
Name: goals_next, dtype: float64

### Model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer

In [13]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [14]:
Z_train = mapper.fit_transform(X_train)

In [15]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
model.score(Z_train, y_train)

0.1281200080796906

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [17]:
demo = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'C']
})

pd.get_dummies(demo)

Unnamed: 0,position_C,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [18]:
demo_2 = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'D']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_D,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [19]:
new = pd.DataFrame({
    'position': ['üçî']
})

pd.get_dummies(new)

Unnamed: 0,position_üçî
0,1


### What to do instead...

In [20]:
X_train.sample(5)

Unnamed: 0,position,goals,assists,shots,ice_time
270,D/RW,0.0,0.8,2.0,22.2
259,LW,1.0,0.4,3.0,21.0
151,C,0.2,1.0,3.2,20.2
356,D,0.4,0.8,1.4,23.0
269,C,0.4,0.2,4.2,14.8


In [21]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [22]:
lb.classes_

array(['C', 'D', 'D/RW', 'LW', 'RW'], dtype='<U4')

In [23]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0]])

In [24]:
new = pd.DataFrame({
    'position': [None]
})

# lb.transform(new['position'])

In [25]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [26]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_C,position_D,position_D/RW,position_LW,position_RW,goals,assists,shots,ice_time
58,1,0,0,0,0,1.69057,1.75374,-0.036663,-1.438185
73,1,0,0,0,0,2.224888,2.285436,0.70056,-1.152887
81,1,0,0,0,0,1.69057,1.75374,0.884866,-0.72494
93,1,0,0,0,0,0.087617,1.222043,0.516254,-0.368318
101,1,0,0,0,0,0.087617,1.75374,0.147643,-0.439642
110,1,0,0,0,0,0.087617,0.158651,0.516254,0.059629
119,1,0,0,0,0,-0.446701,-0.373045,-0.036663,0.130953
132,1,0,0,0,0,-0.446701,1.222043,-0.036663,0.059629
151,1,0,0,0,0,-0.446701,1.222043,0.331949,0.130953
157,1,0,0,0,0,-0.981019,0.690347,0.884866,0.344927


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [27]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('dataframemapper', DataFrameMapper(default=False, df_out=True,
        features=[('position', [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
          strategy='most_frequent'), LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)]), (['goals'], [SimpleImputer(...ression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

And pickle it!

In [28]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [29]:
del pipe

In [30]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [31]:
pipe.score(X_train, y_train)

0.1281200080796906

In [32]:
pipe.predict(X_train)[:10]

array([0.52168727, 0.56945698, 0.55943287, 0.57367113, 0.59062786,
       0.48563282, 0.43943013, 0.55553534, 0.57661568, 0.59564802])

In [33]:
X_train.sample(1).to_dict(orient='list')

{'position': ['RW'],
 'goals': [0.2],
 'assists': [0.6],
 'shots': [1.8],
 'ice_time': [16.8]}

In [34]:
new = pd.DataFrame({
    'position': ['RW'],
    'goals': [0.7],
    'assists': [0.0],
    'shots': [3],
    'ice_time': [20.0]
})

In [35]:
pipe.predict(new)

array([0.30075699])

### Test

Test the model on unseen data...

In [36]:
df = pd.read_csv('data/hockey.csv')
df['date'] = df['date'].apply(pd.to_datetime)
df = df[df.date > '2018-12-31']

In [37]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [38]:
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

test = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

target = 'goals_next'
X_test = train.drop(target, axis=1)
y_test = train[target]

In [39]:
score = pipe.score(X_test, y_test)
print(score)

0.1281200080796906


This actually isn't terrible...

In [40]:
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)