### 1. Imports

In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict

from luther_code import var_to_pickle, read_pickle

%matplotlib inline

### 2. Define List of Season Parameters

In [2]:
# Season labels covered by the analysis (2010 through 2019 inclusive)
season_years = [*range(2010, 2020)]

# Month and day of the cutoff date used to split one season from the next
season_month, season_day = 9, 1

# Games per season: 82 for any season not listed, 48 for lockout-shortened 2013
season_length = defaultdict(lambda: 82, {2013: 48})

### 3. Add Season Column to Injuries Dataframe

In [3]:
injuries_df = read_pickle('../data/injuries_df.pickle')

# 0 is a placeholder; any row still at 0 after the loop matched no season window
injuries_df['Season'] = 0
min_year = injuries_df['Injury_Date'].min().year
max_year = injuries_df['Injury_Date'].max().year + 1

# season[y] is the cutoff date that closes season y. An injury belongs to season y
# when its date falls in the half-open window (season[y-1], season[y]].
season = {min_year - 1: datetime.datetime(min_year - 1, season_month, season_day)}

# Iterate through max_year inclusive. Stopping one year earlier would leave every
# injury dated after the cutoff in the final calendar year of the data unassigned,
# because no window would extend past that year's cutoff.
for year in range(min_year, max_year + 1):
    season[year] = datetime.datetime(year, season_month, season_day)
    mask = ((injuries_df['Injury_Date'] > season[year - 1])
            & (injuries_df['Injury_Date'] <= season[year]))
    injuries_df.loc[mask, 'Season'] = year

### 4. Make DataFrame of Games Missed due to Injury by Player and Season

In [4]:
# Total games missed per player-season. Select Games_Missed explicitly so the
# aggregation never tries to sum non-numeric columns such as Injury_Date
# (an error in pandas versions where sum() no longer drops such columns silently).
missed_df = (injuries_df
             .groupby(['Name', 'Birth_Date', 'Season'], as_index=False)['Games_Missed']
             .sum())

# Cap games missed by injury to season length (seasons may be shortened by lockouts).
# Look lengths up with .get() and the default factory so that reading the default
# does not insert new keys into the shared season_length mapping.
default_length = season_length.default_factory()
games_cap = missed_df['Season'].map(lambda yr: season_length.get(yr, default_length))
missed_df['Games_Missed'] = missed_df['Games_Missed'].mask(
    missed_df['Games_Missed'] > games_cap, games_cap)

### 5. Change Games Missed Names to Match Stats Names

In [5]:
# (spelling used in the games-missed data, spelling used in the stats data)
name_changes = [
    ('Alex Burmistrov', 'Alexander Burmistrov'),
    ('Alexander Petrovic', 'Alex Petrovic'),
    ('Alexei Marchenko', 'Alexey Marchenko'),
    ('Matt Benning', 'Matthew Benning'),
    ('Michael Cammalleri', 'Mike Cammalleri'),
    ('Mike Sauer', 'Michael Sauer'),
    ('Mike Zigomanis', 'Michael Zigomanis'),
    ('P.A. Parenteau', 'PA Parenteau'),
    ('T.J. Brodie', 'TJ Brodie'),
    ('T.J. Galiardi', 'TJ Galiardi'),
]

# No replacement spelling is itself an old spelling, so one lookup pass gives the
# same result as applying the pairs one after another.
rename_lookup = dict(name_changes)
missed_df['Name'] = missed_df['Name'].map(
    lambda player: rename_lookup.get(player, player))

### 6. Merge Stats and Games Missed DataFrames

In [6]:
stats_df = read_pickle('../data/stats_df.pickle')

# Not all player birthdays are consistent between DataFrames, so only use Birth_Date as
# a merge condition if multiple players share the same name
bdays_per_name = stats_df.groupby('Name')['Birth_Date'].nunique()
multiple_names = bdays_per_name[(bdays_per_name > 1)].index
multiples_df = stats_df[stats_df['Name'].isin(multiple_names)]
multiples_df = multiples_df.merge(missed_df, how='left', on=['Name', 'Birth_Date', 'Season'])

# Names that map to a single birthday are matched on Name and Season only
df = stats_df[~stats_df['Name'].isin(multiple_names)]
df = df.merge(missed_df[['Name', 'Season', 'Games_Missed']], how='left', on=['Name', 'Season'])

# Recombine multiple name and single name DataFrames.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, multiples_df])
df = df.sort_values(by=['Name', 'Birth_Date', 'Season']).reset_index(drop=True)

# Player-seasons with no match in missed_df come out of the left merges as NaN;
# record them as zero games missed
df['Games_Missed'] = df['Games_Missed'].fillna(0).astype(int)

* Add back players who missed whole seasons
* Compute per-game-played averages of TOI, PIMs, Majors, Penalties Drawn, Hits, Hits Taken, Blocks
* Calculate Age by season
* Shift to get games missed previous season
* Calculate average games missed in all previous seasons

In [9]:
# Spot-check the merged frame: display ten randomly drawn rows
df.sample(10)

Unnamed: 0,Name,Team,Position,Games_Played,Time_On_Ice,Penalty_Minutes,Major_Penalties,Penalties_Drawn,Hits,Hits_Taken,Shots_Blocked,Season,Birth_Date,Nationality,Height,Weight,European,Russian,Games_Missed
8430,Tyler Ennis,BUF,W,51,654.633333,12,0,2,45,45,23,2017,1989-10-06,CAN,69,161,False,False,30
5932,Mike Cammalleri,MTL,W,65,1268.933333,16,0,16,18,50,27,2010,1982-06-08,CAN,69,185,False,False,17
2057,Craig Rivet,BUF,D,78,1421.133333,100,6,16,61,84,56,2010,1974-09-13,CAN,74,207,False,False,0
8014,Teemu Laakso,NSH,D,7,75.6,2,0,0,6,6,5,2010,1987-08-27,FIN,73,215,True,False,0
813,Blair Jones,PHI,C,4,27.666667,2,0,1,7,4,1,2015,1986-09-27,CAN,74,216,False,False,0
3770,Jayce Hawryluk,FLA,C,42,396.1,16,0,15,101,57,25,2019,1996-01-01,CAN,71,196,False,False,4
705,Auston Matthews,TOR,C,82,1445.75,14,0,22,21,114,61,2017,1997-09-17,USA,75,223,False,False,0
5274,Mark Recchi,BOS,W,81,1304.483333,35,1,23,81,81,25,2011,1968-02-01,CAN,70,195,False,False,0
1150,Brenden Dillon,S.J,D,81,1402.9,60,4,13,197,132,107,2018,1990-11-13,CAN,76,225,False,False,0
6892,Phillip Di Giuseppe,CAR,W,36,442.65,15,1,2,91,37,10,2017,1993-10-09,CAN,72,192,False,False,16
