In [1]:
import pandas as pd
import numpy as np

import re

## Game Data from NHL Database

We will start with getting the game data the Kaggle NHL Database. The only real reason to do this is to get game id's that we can then use to merge the Natural Stat Trick data with the gambling data. We could also just assign our own game id, which, may be easier.

In [2]:
path = "/home/mark/Dodd/software/Python/data/nhl/nhl-game-data/"

teams = pd.read_csv(path + "team_info.csv")
games_all = pd.read_csv(path + "game.csv")

In [3]:
#display(teams.head())
def get_games(season):
    # convert season into a season number in the same format as the datafile
    season = season * 10000 + (season + 1)
    
    # filter for regular season games of that season
    games = games_all[(games_all['season'] == season) & (games_all['type'] == 'R')].copy()

    # create a small lambda to merge, drop, and rename
    merge_games = lambda df, team: (df.merge(right = teams[['team_id','teamName']], left_on = team + '_id',
                                                right_on = 'team_id')
                                    .drop(columns = ['team_id'])
                                    .rename(columns = {'teamName':team}))
    
    games = merge_games(games, 'away_team')
    games = merge_games(games, 'home_team')
    
    #convert datatypes
    games['date_time'] = pd.to_datetime(games['date_time'], format = '%Y-%m-%d')
    games['outcome'] = games['outcome'].astype('category')
    
    #filter for useful columns
    cols = ['game_id', 'season', 'date_time', 'away_team', 'home_team',
            'away_goals', 'home_goals', 'outcome', 'home_team_id', 'away_team_id']
    games_reduced = games[cols].sort_values(by='game_id')
    
    return games_reduced

In [4]:
g2016 = get_games(2016)
g2017 = get_games(2017)
g2018 = get_games(2018)

## Natural Stat Trick Data

Now, it is time to deal with the Natural Stat Trick Data

Create a number of procedures to make the cleaning more compartmentatlized and repeatable.

Skip to the next cell to actually get going.

In [5]:
def natstat_to_numeric(df_in):
    # '-' were causing some columns to be treated as strings so let's replace and convert
    df = df_in.drop(columns = ["Unnamed: 2", "Attendance"]).replace('-', np.nan)
    df.loc[:,'TOI':] = df.loc[:,'TOI':].apply(pd.to_numeric)
    
    # I ultimately want to remove this and fix it properly, but I need to build a model
    null_columns = df.columns[df.isnull().any()]
    df = df.drop(columns = null_columns)
    
    return df


def natstat_add_ave_stats(df_in, cols = None):
    ''' we want to create running averages for all of the numberic values of interest'''
    df = df_in.copy()
    
    # get cols to create averages of
    if cols is None:
        cols = df.loc[:, 'CF':].columns

    for c in cols:
        # create a column header name
        new_col = c + '_ave'
        
        # create the new column and fill with 0.0
        df[new_col] = 0.0
        
        # filter for each team and create the running average for each team individually
        for team in df.Team.sort_values().unique():
            df.loc[df['Team'] == team, new_col] = df.loc[df['Team'] == team, c].expanding(2).mean().shift(1)
            
    return df


def natstat_split_game_info(df_in):
    '''Split "Game" column into date, home team, away team, home goals, away goals'''
    df = df_in.copy()
    
    # get date and add it to the dataframe
    temp = df['Game'].str.split(' - ', n=1, expand = True)
    
    # temp[0] contains the date
    df.insert(0, 'date', pd.to_datetime(temp[0], format = '%Y-%m-%d'))
    
    # temp[1] contains the game information, but it is messy
    temp = temp[1].str.split(',', n=1, expand = True)
    
    # now temp[0] contains the away team information and temp[1] contains the home team information
    df.insert(1, 'home', temp[1].str.replace('\d+','').str.strip())
    df.insert(2, 'home_goals', temp[1].str.extract(r'(\d+)'))
    df.insert(3, 'away', temp[0].str.replace('\d+','').str.strip())
    df.insert(4, 'away_goals', temp[0].str.extract(r'(\d+)'))
    
    # in case we want to easily filter for home and away games we set H or A for each row
    hoa = ['H' if home_team in full_name else 'A' for home_team, full_name in zip(df['home'], df['Team'])]
    df.insert(5, 'hoa', hoa)
    
    # in case we want to easily filter for home and away games we set H or A for each row
    #['yes' if v == 1 else 'no' if v == 2 else 'idle' for v in l]
    #d = {1: 'yes', 2: 'no'}
    #results = {''}
    #winner = ['H_reg' if h and t <= 60.0
           #for h, a, t in zip(df['home_goals'], df['away_goals'], df['TOI'])]
    winner = []
    for h, a, t in zip(df['home_goals'], df['away_goals'], df['TOI']):
        result = ('H_reg' if h > a else 'A_reg' if t <= 60.0 else 
                  'H_OT' if h > a else 'A_OT' if t < 65.0 else
                  'H_SO' if h > a else 'A_SO')
        winner.append(result)
    df.insert(6, 'result', winner)
    
    
    return df


def natstat_home_away(df_in, hoa, col_suffix):
    '''
        Filter the original dataframe for home team or away team information that can be used to flatten each game
        
        Return two dataframes:
            header = data, home, away, home_goals, away_goals, hoa, Game, Team
            stats = all of the raw data
        
    '''
    df = df_in[df_in['hoa'] == hoa].copy()
    
    header = df.loc[:, 'date':'TOI'].reset_index(drop=True)
    stats = df.loc[:, 'CF':].reset_index(drop=True)

    header.columns = [col + '_' + col_suffix for col in header.columns]
    stats.columns = [col + '_' + col_suffix for col in stats.columns]
    
    return header, stats


def natstat_flatten(df_in):
    ''' combine all stats into a single row for each game  '''
    h_head, h_stats = natstat_home_away(df_in, 'H', 'home')
    a_head, a_stats = natstat_home_away(df_in, 'A', 'away')
    
    h_head.columns = np.char.replace(h_head.columns.values.astype(str), '_home', '')
    
    return pd.concat([h_head, h_stats, a_stats], axis = 1)


def natstat_build_df(df_in):
    df = natstat_to_numeric(df_in)
    df = natstat_add_ave_stats(df)
    df = natstat_split_game_info(df)
    df = natstat_flatten(df).reset_index(drop = True)
    
    return df

In [6]:
natstat2016 = pd.read_csv("Games - Natural Stat TrickTeam Season Totals - 2016.csv")
natstat2017 = pd.read_csv("Games - Natural Stat TrickTeam Season Totals - 2017.csv")
natstat2018 = pd.read_csv("Games - Natural Stat TrickTeam Season Totals - 2018.csv")

natstat2016_flat = natstat_build_df(natstat2016)
#natstat2017_flat = natstat_build_df(natstat2017)
#natstat2018_flat = natstat_build_df(natstat2018)

In [7]:
natstat2016_flat[natstat2016_flat.Team == 'Calgary Flames']

Unnamed: 0,date,home,home_goals,away,away_goals,hoa,result,Game,Team,TOI,...,LDSF_ave_away,LDSA_ave_away,LDSF%_ave_away,LDGF_ave_away,LDGA_ave_away,LDSH%_ave_away,LDSV%_ave_away,SH%_ave_away,SV%_ave_away,PDO_ave_away
15,2016-10-14,Flames,3,Oilers,5,H,A_reg,"2016-10-14 - Oilers 5, Flames 3",Calgary Flames,60.0,...,,,,,,,,,,
45,2016-10-18,Flames,4,Sabres,3,H,H_reg,"2016-10-18 - Sabres 3, Flames 4",Calgary Flames,62.433333,...,12.0,10.5,51.85,0.5,0.0,6.25,100.0,14.115,88.635,1.0275
56,2016-10-20,Flames,2,Hurricanes,4,H,A_reg,"2016-10-20 - Hurricanes 4, Flames 2",Calgary Flames,60.0,...,15.666667,15.666667,48.636667,0.333333,1.333333,1.15,93.4,11.503333,85.72,0.972
66,2016-10-22,Flames,4,Blues,6,H,A_reg,"2016-10-22 - Blues 6, Flames 4",Calgary Flames,60.0,...,10.6,8.6,52.148,0.6,0.0,4.038,100.0,9.618,90.702,1.0034
111,2016-10-28,Flames,5,Senators,2,H,H_reg,"2016-10-28 - Senators 2, Flames 5",Calgary Flames,60.0,...,13.666667,16.833333,46.428333,0.333333,1.0,2.556667,94.081667,9.773333,89.018333,0.987667
122,2016-10-30,Flames,1,Capitals,3,H,A_reg,"2016-10-30 - Capitals 3, Flames 1",Calgary Flames,60.0,...,14.0,10.714286,55.295714,0.142857,0.142857,0.751429,98.571429,8.228571,92.067143,1.003143
203,2016-11-10,Flames,2,Stars,4,H,A_reg,"2016-11-10 - Stars 4, Flames 2",Calgary Flames,60.0,...,14.384615,13.769231,51.072308,0.692308,0.769231,5.139231,93.295385,9.319231,87.816154,0.971462
217,2016-11-12,Flames,1,Rangers,4,H,A_reg,"2016-11-12 - Rangers 4, Flames 1",Calgary Flames,60.0,...,10.642857,11.642857,47.912143,0.571429,0.214286,6.014286,97.962857,14.654286,90.53,1.051643
242,2016-11-16,Flames,2,Coyotes,1,H,H_reg,"2016-11-16 - Coyotes 1, Flames 2",Calgary Flames,60.816667,...,14.5,15.928571,47.907143,0.642857,0.857143,4.069286,94.920714,9.346429,89.572143,0.989214
254,2016-11-18,Flames,2,Blackhawks,3,H,A_reg,"2016-11-18 - Blackhawks 3, Flames 2",Calgary Flames,60.0,...,13.352941,14.176471,48.154706,0.588235,0.529412,4.757647,95.99,11.214706,91.685294,1.028941


### TO - DO
Deal with missing values - mostly in the percentage columns, but these are derived from other columns
I know that for some, like GF% the values are simply incorrectly recorded

skipping ahead for now - i want to build a model

In [8]:
#natstat2016_flat.loc[natstat2016_flat['GF%'].isnull(), ['GF', 'GA', 'GF%']]
#natstat2016_flat.loc[natstat2016_flat['GF%'].isnull(), :]
#natstat2016_flat.loc[natstat2016_flat.isnull(), :]
null_columns=natstat2016_flat.columns[natstat2016_flat.isnull().any()]

display(natstat2016_flat[natstat2016_flat.isnull().any(axis=1)][null_columns])
print(null_columns)

Unnamed: 0,CF_ave_home,CA_ave_home,CF%_ave_home,FF_ave_home,FA_ave_home,FF%_ave_home,SF_ave_home,SA_ave_home,SF%_ave_home,GF_ave_home,...,LDSF_ave_away,LDSA_ave_away,LDSF%_ave_away,LDGF_ave_away,LDGA_ave_away,LDSH%_ave_away,LDSV%_ave_away,SH%_ave_away,SV%_ave_away,PDO_ave_away
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


Index(['CF_ave_home', 'CA_ave_home', 'CF%_ave_home', 'FF_ave_home',
       'FA_ave_home', 'FF%_ave_home', 'SF_ave_home', 'SA_ave_home',
       'SF%_ave_home', 'GF_ave_home', 'GA_ave_home', 'xGF_ave_home',
       'xGA_ave_home', 'SCF_ave_home', 'SCA_ave_home', 'SCF%_ave_home',
       'HDCF_ave_home', 'HDCA_ave_home', 'HDSF_ave_home', 'HDSA_ave_home',
       'HDGF_ave_home', 'HDGA_ave_home', 'MDCF_ave_home', 'MDCA_ave_home',
       'MDCF%_ave_home', 'MDSF_ave_home', 'MDSA_ave_home', 'MDSF%_ave_home',
       'MDGF_ave_home', 'MDGA_ave_home', 'LDCF_ave_home', 'LDCA_ave_home',
       'LDCF%_ave_home', 'LDSF_ave_home', 'LDSA_ave_home', 'LDSF%_ave_home',
       'LDGF_ave_home', 'LDGA_ave_home', 'LDSH%_ave_home', 'LDSV%_ave_home',
       'SH%_ave_home', 'SV%_ave_home', 'PDO_ave_home', 'CF_ave_away',
       'CA_ave_away', 'CF%_ave_away', 'FF_ave_away', 'FA_ave_away',
       'FF%_ave_away', 'SF_ave_away', 'SA_ave_away', 'SF%_ave_away',
       'GF_ave_away', 'GA_ave_away', 'xGF_ave_away', 'xGA_av

In [9]:
display(natstat2016_flat.head(6))

Unnamed: 0,date,home,home_goals,away,away_goals,hoa,result,Game,Team,TOI,...,LDSF_ave_away,LDSA_ave_away,LDSF%_ave_away,LDGF_ave_away,LDGA_ave_away,LDSH%_ave_away,LDSV%_ave_away,SH%_ave_away,SV%_ave_away,PDO_ave_away
0,2016-10-12,Blackhawks,2,Blues,5,H,A_reg,"2016-10-12 - Blues 5, Blackhawks 2",Chicago Blackhawks,60.0,...,,,,,,,,,,
1,2016-10-12,Oilers,7,Flames,4,H,H_reg,"2016-10-12 - Flames 4, Oilers 7",Edmonton Oilers,60.0,...,,,,,,,,,,
2,2016-10-12,Sharks,2,Kings,1,H,H_reg,"2016-10-12 - Kings 1, Sharks 2",San Jose Sharks,60.0,...,,,,,,,,,,
3,2016-10-12,Senators,5,Maple Leafs,4,H,H_reg,"2016-10-12 - Maple Leafs 4, Senators 5",Ottawa Senators,60.616667,...,,,,,,,,,,
4,2016-10-13,Blue Jackets,3,Bruins,6,H,A_reg,"2016-10-13 - Bruins 6, Blue Jackets 3",Columbus Blue Jackets,60.0,...,,,,,,,,,,
5,2016-10-13,Sabres,1,Canadiens,4,H,A_reg,"2016-10-13 - Canadiens 4, Sabres 1",Buffalo Sabres,60.0,...,,,,,,,,,,


### Find Marginal Stats for Each Game

So far we have:
* loaded the dataframes
* calculated the running average for each metric for each team
* converted each game into a flat structure

Now we can calculate the marginal stats for the difference between these stats.

In [10]:
h = np.core.defchararray.find(natstat2016_flat.columns.values.astype(str), 'ave_home') >= 0
a = np.core.defchararray.find(natstat2016_flat.columns.values.astype(str), 'ave_away') >= 0

homestats_2016 = natstat2016_flat.loc[:, h]
awaystats_2016 = natstat2016_flat.loc[:, a]

homestats_2016.columns = np.char.replace(homestats_2016.columns.values.astype(str), '_home', '')
awaystats_2016.columns = np.char.replace(homestats_2016.columns.values.astype(str), '_away', '')

marginal_stats_2016 = homestats_2016 - awaystats_2016
marginal_stats_2016

Unnamed: 0,CF_ave,CA_ave,CF%_ave,FF_ave,FA_ave,FF%_ave,SF_ave,SA_ave,SF%_ave,GF_ave,...,LDSF_ave,LDSA_ave,LDSF%_ave,LDGF_ave,LDGA_ave,LDSH%_ave,LDSV%_ave,SH%_ave,SV%_ave,PDO_ave
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,-5.456790,6.543210,-5.313333,-3.000000,5.666667,-5.384198,-1.654321,3.901235,-4.776420,0.246914,...,-2.432099,1.543210,-8.060247,-0.111111,0.037037,-0.406173,-0.516049,1.397531,1.294938,0.026901
1226,-0.148148,0.419753,-0.595432,-1.876543,-1.839506,-0.180741,-2.049383,-3.666667,1.283827,0.716049,...,-2.111111,-2.135802,0.529506,0.358025,-0.395062,3.286296,2.270617,3.203951,1.391728,0.045975
1227,-3.691358,-2.197531,-0.703457,-2.679012,-1.296296,-0.765062,-3.876543,-2.530864,-0.899383,-0.320988,...,-0.222222,-0.629630,1.307778,0.024691,0.074074,0.295309,-0.722469,0.392716,-0.481358,-0.001012
1228,2.222222,-6.395062,3.593210,0.975309,-5.098765,3.399753,-0.580247,-3.888889,2.513086,0.358025,...,-0.925926,-3.333333,3.874444,-0.012346,0.222222,0.484444,-2.907778,1.444568,-1.085926,0.003630


### Should we try a very simple model?

I think so.

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
log = LogisticRegression(penalty='l1', solver='liblinear')

In [13]:
X_train = marginal_stats_2016.dropna().iloc[:-100]
X_test = marginal_stats_2016.dropna().iloc[-100:]

y_train = natstat2016_flat.iloc[X_train.index.values].result.apply(lambda x: 1 if x[0] == 'H' else 0)
y_test = natstat2016_flat.iloc[X_test.index.values].result.apply(lambda x: 1 if x[0] == 'H' else 0)

In [14]:
natstat2016_flat[natstat2016_flat['result'].isna()]

Unnamed: 0,date,home,home_goals,away,away_goals,hoa,result,Game,Team,TOI,...,LDSF_ave_away,LDSA_ave_away,LDSF%_ave_away,LDGF_ave_away,LDGA_ave_away,LDSH%_ave_away,LDSV%_ave_away,SH%_ave_away,SV%_ave_away,PDO_ave_away


In [15]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1097, 43) (1097,)
(100, 43) (100,)


In [16]:
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
log.score(X_train, y_train)

0.6071103008204193

In [18]:
log.score(X_test, y_test)

0.67

In [19]:
log.coef_

array([[ 0.        , -0.07096338,  0.        , -0.12938085,  0.02108885,
         0.        ,  0.09731925,  0.07647506,  0.        ,  0.        ,
         0.        ,  0.02662042,  0.19013263,  0.        ,  0.        ,
        -0.21082546,  0.33409212, -0.24714166,  0.        ,  0.01437741,
         0.        ,  0.        , -0.02108709,  0.21899701,  0.21518136,
        -0.16612123,  0.00343231,  0.05267563, -0.44530277,  0.        ,
         0.13324657,  0.02914943,  0.00794614, -0.1483903 ,  0.        ,
         0.03301258,  0.        ,  0.        ,  0.0709623 , -0.02566601,
         0.04702725,  0.0583532 ,  0.        ]])

In [20]:
temp = natstat2016_flat.iloc[X_test.index.values].copy()
temp['predict'] = log.predict(X_test)
display(temp[['date', 'home', 'away', 'home_goals', 'away_goals', 'result', 'predict']])

Unnamed: 0,date,home,away,home_goals,away_goals,result,predict
1130,2017-03-28,Bruins,Predators,4,1,H_reg,1
1131,2017-03-28,Sharks,Rangers,5,4,H_reg,1
1132,2017-03-28,Hurricanes,Red Wings,4,1,H_reg,1
1133,2017-03-28,Blue Jackets,Sabres,3,1,H_reg,1
1134,2017-03-28,Flyers,Senators,3,2,H_reg,1
...,...,...,...,...,...,...,...
1225,2017-04-09,Ducks,Kings,4,3,H_reg,1
1226,2017-04-09,Capitals,Panthers,0,2,A_reg,1
1227,2017-04-09,Rangers,Penguins,3,2,H_reg,0
1228,2017-04-09,Lightning,Sabres,4,2,H_reg,1
