## Game Data from NHL Database

We will start with getting the game data the Kaggle NHL Database. The only real reason to do this is to get game id's that we can then use to merge the Natural Stat Trick data with the gambling data. We could also just assign our own game id, which, may be easier.

In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.options.display.width = 0

In [205]:
path = "data/original/kaggle/"

teams = pd.read_csv(path + "team_info.csv")
games_all = pd.read_csv(path + "game.csv")
team_stats = pd.read_csv(path + "game_teams_stats.csv")

In [5]:
#display(teams.head())
def get_games(season):
    # convert season into a season number in the same format as the datafile
    season = season * 10000 + (season + 1)
    
    # filter for regular season games of that season
    games = games_all[(games_all['season'] == season) & (games_all['type'] == 'R')].copy()

    # create a small lambda to merge, drop, and rename
    merge_games = lambda df, team: (df.merge(right = teams[['team_id','teamName']], left_on = team + '_id',
                                                right_on = 'team_id')
                                    .drop(columns = ['team_id'])
                                    .rename(columns = {'teamName':team}))
    
    games = merge_games(games, 'away_team')
    games = merge_games(games, 'home_team')
    
    #convert datatypes
    games['date_time'] = pd.to_datetime(games['date_time'], format = '%Y-%m-%d')
    games['outcome'] = games['outcome'].astype('category')
    
    #filter for useful columns
    cols = ['game_id', 'season', 'date_time', 'away_team', 'home_team',
            'away_goals', 'home_goals', 'outcome', 'home_team_id', 'away_team_id']
    games_reduced = games[cols].sort_values(by='game_id')
    
    return games_reduced

In [170]:
g2014 = get_games(2014)
g2015 = get_games(2015)
g2016 = get_games(2016)
g2017 = get_games(2017)
g2018 = get_games(2018)

In [49]:
g2016.columns

Index(['game_id', 'season', 'date_time', 'away_team', 'home_team',
       'away_goals', 'home_goals', 'outcome', 'home_team_id', 'away_team_id'],
      dtype='object')

In [54]:
(g2016[g2016.duplicated(subset=['home_team', 'home_goals', 'away_team', 'away_goals', 'date_time'], keep = False)]
     .sort_values(by='home_team'))

Unnamed: 0,game_id,season,date_time,away_team,home_team,away_goals,home_goals,outcome,home_team_id,away_team_id


In [7]:
for t in g2016.home_team.sort_values().unique():
    print(t)

Avalanche
Blackhawks
Blue Jackets
Blues
Bruins
Canadiens
Canucks
Capitals
Coyotes
Devils
Ducks
Flames
Flyers
Hurricanes
Islanders
Jets
Kings
Lightning
Maple Leafs
Oilers
Panthers
Penguins
Predators
Rangers
Red Wings
Sabres
Senators
Sharks
Stars
Wild


## Natural Stat Trick Data

Now, it is time to deal with the Natural Stat Trick Data

Create a number of procedures to make the cleaning more compartmentatlized and repeatable.

Skip to the next cell to actually get going.

In [185]:
def natstat_to_numeric(df_in):
    # '-' were causing some columns to be treated as strings so let's replace and convert
    df = df_in.drop(columns = ["Unnamed: 2", "Attendance"]).replace('-', np.nan)
    df.loc[:,'TOI':] = df.loc[:,'TOI':].apply(pd.to_numeric)
    
    
    #########################################################################################
    # we have to do something so let's fill with mean as we are calculating means
    #########################################################################################
    df = df.fillna(df.mean())
    
    # calculate blocked shots
    df['blocks'] = df['CA'] - df['FA']

    return df


def natstat_add_ave_stats(df_in, cols = None):
    ''' we want to create running averages for all of the numberic values of interest'''
    df = df_in.copy()
    df = df.sort_values(by='date')
    
    # get cols to create averages of
    if cols is None:
        cols = df.iloc[:, (df.columns.get_loc('TOI') + 1):].columns

    for c in cols:
        # create a column header name
        new_col = c + '_avg'
        
        # create the new column and fill with 0.0
        df[new_col] = 0.0
        
        # filter for each team and create the running average for each team individually
        for team in df.Team.sort_values().unique():
            f = df['Team'] == team
            df.loc[f, new_col] = df.loc[f, c].expanding(min_periods=1).mean().shift(1)
            
    return df


def natstat_split_game_info(df_in):
    '''Split "Game" column into date, home team, away team, home goals, away goals'''
    df = df_in.copy()
    
    # get date and add it to the dataframe
    temp = df['Game'].str.split(' - ', n=1, expand = True)
    
    # temp[0] contains the date
    df.insert(0, 'date', pd.to_datetime(temp[0], format = '%Y-%m-%d'))
    
    # temp[1] contains the game information, but it is messy
    temp = temp[1].str.split(',', n=1, expand = True)
    
    # now temp[0] contains the away team information and temp[1] contains the home team information
    df.insert(1, 'home', temp[1].str.replace('\d+','').str.strip())
    df.insert(2, 'home_goals', temp[1].str.extract(r'(\d+)').astype('int64'))
    df.insert(3, 'away', temp[0].str.replace('\d+','').str.strip())
    df.insert(4, 'away_goals', temp[0].str.extract(r'(\d+)').astype('int64'))
    
    # in case we want to easily filter for home and away games we set H or A for each row
    hoa = ['H' if home_team in full_name else 'A' for home_team, full_name in zip(df['home'], df['Team'])]
    df.insert(5, 'hoa', hoa)
    
    # in case we want to easily filter for home and away games we set H or A for each row
    winner = []
    for h, a, t in zip(df['home_goals'], df['away_goals'], df['TOI']):
        result = ('H_reg' if h > a else 'A_reg' if t <= 60.0 else 
                  'H_OT' if h > a else 'A_OT' if t < 65.0 else
                  'H_SO' if h > a else 'A_SO')
        winner.append(result)
    df.insert(6, 'result', winner)
    df.insert(7, 'result_bool', df.result.apply(lambda x: 1 if x[0] == 'H' else 0))
    
    return df


def natstat_home_away(df_in, hoa, col_suffix):
    '''
        Filter the original dataframe for home team or away team information that can be used to flatten each game
        
        Return two dataframes:
            header = data, home, away, home_goals, away_goals, hoa, Game, Team
            stats = all of the raw data
        
    '''
    df = df_in[df_in['hoa'] == hoa].copy()
    
    header = df.loc[:, 'date':'TOI'].reset_index(drop=True)
    stats = df.iloc[:, (df.columns.get_loc('TOI') + 1):].reset_index(drop=True)

    header.columns = [col + '_' + col_suffix for col in header.columns]
    stats.columns = [col + '_' + col_suffix for col in stats.columns]
    
    return header, stats


def natstat_flatten(df_in):
    ''' combine all stats into a single row for each game  '''
    h_head, h_stats = natstat_home_away(df_in, 'H', 'home')
    a_head, a_stats = natstat_home_away(df_in, 'A', 'away')
    
    h_head.columns = np.char.replace(h_head.columns.values.astype(str), '_home', '')
    
    return pd.concat([h_head, h_stats, a_stats], axis = 1)

def natstat_clean(df_in):
    ''' apply the cleaning functions to all data frames'''
    df = natstat_split_game_info(df_in)
    df = natstat_to_numeric(df)
    
    return df

def add_game_ids(nat_stat, nhl_df):
    ''' Add the NHL Game ID (from the Kaggle files) to the Nat Stat Data '''
    result = nat_stat.copy()
    game_ids = []
    try:
        for index, row in nat_stat.iterrows():
            i = nhl_df[(nhl_df.home_team == row.home) &
                            (nhl_df.away_team == row.away) &
                            (nhl_df.home_goals == row.home_goals) &
                            (nhl_df.away_goals == row.away_goals)].game_id.values[0]
            #print(, row.home, row.away, row.home_goals, row.away_goals)
            game_ids.append(i)
    except:
        x = nhl_df[(nhl_df.home_team == row.home) &
                            (nhl_df.away_team == row.away) &
                            (nhl_df.home_goals == row.home_goals) &
                            (nhl_df.away_goals == row.away_goals)].game_id
        print(x)
        print(row.date, row.home, row.away, row.home_goals, row.away_goals)
#     game_ids = [ nhl_df[(nhl_df.home_team == row.home) &
#                         (nhl_df.away_team == row.away) &
#                         (nhl_df.home_goals == row.home_goals) &
#                         (nhl_df.away_goals == row.away_goals)].game_id.values[0]
#                  for index, row in nat_stat.iterrows()]
    result.insert(0, 'game_id', game_ids)
    
    return result

Now we can build the flattened data frames. I've also kept the unflattened data frames as they can be useful for data inspection.

### Find Marginal Stats for Each Game

So far we have:
* loaded the dataframes
* calculated the running average for each metric for each team
* converted each game into a flat structure

Now we can calculate the marginal stats for the difference between these stats.

In [9]:
def natstat_marginal(df):
    header = df.loc[:, 'game_id':'TOI'].reset_index(drop=True)
    
    h = np.core.defchararray.find(df.columns.values.astype(str), 'avg_home') >= 0
    a = np.core.defchararray.find(df.columns.values.astype(str), 'avg_away') >= 0

    homestats = df.loc[:, h]
    awaystats = df.loc[:, a]

    homestats.columns = np.char.replace(homestats.columns.values.astype(str), '_home', '')
    awaystats.columns = np.char.replace(homestats.columns.values.astype(str), '_away', '')

    marginal_stats = homestats - awaystats
    
    return pd.concat([header, marginal_stats], axis = 1)

def natstat_percents(df):
    header = df.loc[:, 'game_id':'TOI'].reset_index(drop=True)
    p = np.core.defchararray.find(df.columns.values.astype(str), '%') >= 0
    percent = df.loc[:, p]
    return pd.concat([header, percent], axis = 1)

Now, we can build a data frame with marginal stats that we will use to build the model.

In [222]:
def natstat_build_df(df_in, nhl_in, cols = None):
    df = natstat_clean(df_in)
    df_long = natstat_add_ave_stats(df)
    df_flat = natstat_flatten(df).reset_index(drop = True)
    df_flat = add_game_ids(df_flat, nhl_in).sort_values(by = 'game_id')
    team_stats_ha = pd.merge(left = team_stats[team_stats['HoA'] == 'home'],
                         right = team_stats[team_stats['HoA'] == 'away'],
                         on = "game_id")
    temp = pd.merge(left = df_flat, right = team_stats_ha, on = "game_id")

    df_marg = natstat_marginal(df_flat).sort_values(by = 'game_id')
    
    return df_long, df_flat, df_marg

In [223]:
natstat_path = "data/original/natstat/"
natstat2014 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2014.csv")

natstat_build_df(natstat2014, g2014)

['game_id', 'date', 'home', 'home_goals', 'away', 'away_goals', 'hoa', 'result', 'result_bool', 'Game', 'Team', 'TOI', 'CF_home', 'CA_home', 'CF%_home', 'FF_home', 'FA_home', 'FF%_home', 'SF_home', 'SA_home', 'SF%_home', 'GF_home', 'GA_home', 'GF%_home', 'xGF_home', 'xGA_home', 'xGF%_home', 'SCF_home', 'SCA_home', 'SCF%_home', 'HDCF_home', 'HDCA_home', 'HDCF%_home', 'HDSF_home', 'HDSA_home', 'HDSF%_home', 'HDGF_home', 'HDGA_home', 'HDGF%_home', 'HDSH%_home', 'HDSV%_home', 'MDCF_home', 'MDCA_home', 'MDCF%_home', 'MDSF_home', 'MDSA_home', 'MDSF%_home', 'MDGF_home', 'MDGA_home', 'MDGF%_home', 'MDSH%_home', 'MDSV%_home', 'LDCF_home', 'LDCA_home', 'LDCF%_home', 'LDSF_home', 'LDSA_home', 'LDSF%_home', 'LDGF_home', 'LDGA_home', 'LDGF%_home', 'LDSH%_home', 'LDSV%_home', 'SH%_home', 'SV%_home', 'PDO_home', 'blocks_home', 'CF_away', 'CA_away', 'CF%_away', 'FF_away', 'FA_away', 'FF%_away', 'SF_away', 'SA_away', 'SF%_away', 'GF_away', 'GA_away', 'GF%_away', 'xGF_away', 'xGA_away', 'xGF%_away', 'SC

In [213]:
print(m_combined_red.shape, test.shape)

(3461, 67) (3461, 95)


In [214]:
test.columns

Index(['game_id', 'date', 'home', 'home_goals', 'away', 'away_goals', 'hoa',
       'result', 'result_bool', 'Game', 'Team', 'TOI', 'CF_avg', 'CA_avg',
       'CF%_avg', 'FF_avg', 'FA_avg', 'FF%_avg', 'SF_avg', 'SA_avg', 'SF%_avg',
       'GF_avg', 'GA_avg', 'GF%_avg', 'xGF_avg', 'xGA_avg', 'xGF%_avg',
       'SCF_avg', 'SCA_avg', 'SCF%_avg', 'HDCF_avg', 'HDCA_avg', 'HDCF%_avg',
       'HDSF_avg', 'HDSA_avg', 'HDSF%_avg', 'HDGF_avg', 'HDGA_avg',
       'HDGF%_avg', 'HDSH%_avg', 'HDSV%_avg', 'MDCF_avg', 'MDCA_avg',
       'MDCF%_avg', 'MDSF_avg', 'MDSA_avg', 'MDSF%_avg', 'MDGF_avg',
       'MDGA_avg', 'MDGF%_avg', 'MDSH%_avg', 'MDSV%_avg', 'LDCF_avg',
       'LDCA_avg', 'LDCF%_avg', 'LDSF_avg', 'LDSA_avg', 'LDSF%_avg',
       'LDGF_avg', 'LDGA_avg', 'LDGF%_avg', 'LDSH%_avg', 'LDSV%_avg',
       'SH%_avg', 'SV%_avg', 'PDO_avg', 'blocks_avg', 'team_id_x', 'HoA_x',
       'won_x', 'settled_in_x', 'head_coach_x', 'goals_x', 'shots_x', 'hits_x',
       'pim_x', 'powerPlayOpportunities_x', 'p

In [227]:
natstat_path = "data/original/natstat/"
natstat2014 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2014.csv")
natstat2015 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2015.csv")
natstat2016 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2016.csv")
natstat2017 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2017.csv")
natstat2018 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2018.csv")

natstat2014, natstat2014_flat, natstat2014_marg = natstat_build_df(natstat2014, g2014)
#natstat2015, natstat2015_flat, natstat2015_marg = natstat_build_df(natstat2015, g2015)
#natstat2016, natstat2016_flat, natstat2016_marg = natstat_build_df(natstat2016, g2016)
#natstat2017, natstat2017_flat, natstat2017_marg = natstat_build_df(natstat2017, g2017)
#natstat2018, natstat2018_flat, natstat2018_marg = natstat_build_df(natstat2018, g2018)

['game_id', 'date', 'home', 'home_goals', 'away', 'away_goals', 'hoa', 'result', 'result_bool', 'Game', 'Team', 'TOI', 'CF_home', 'CA_home', 'CF%_home', 'FF_home', 'FA_home', 'FF%_home', 'SF_home', 'SA_home', 'SF%_home', 'GF_home', 'GA_home', 'GF%_home', 'xGF_home', 'xGA_home', 'xGF%_home', 'SCF_home', 'SCA_home', 'SCF%_home', 'HDCF_home', 'HDCA_home', 'HDCF%_home', 'HDSF_home', 'HDSA_home', 'HDSF%_home', 'HDGF_home', 'HDGA_home', 'HDGF%_home', 'HDSH%_home', 'HDSV%_home', 'MDCF_home', 'MDCA_home', 'MDCF%_home', 'MDSF_home', 'MDSA_home', 'MDSF%_home', 'MDGF_home', 'MDGA_home', 'MDGF%_home', 'MDSH%_home', 'MDSV%_home', 'LDCF_home', 'LDCA_home', 'LDCF%_home', 'LDSF_home', 'LDSA_home', 'LDSF%_home', 'LDGF_home', 'LDGA_home', 'LDGF%_home', 'LDSH%_home', 'LDSV%_home', 'SH%_home', 'SV%_home', 'PDO_home', 'blocks_home', 'CF_away', 'CA_away', 'CF%_away', 'FF_away', 'FA_away', 'FF%_away', 'SF_away', 'SA_away', 'SF%_away', 'GF_away', 'GA_away', 'GF%_away', 'xGF_away', 'xGA_away', 'xGF%_away', 'SC

TypeError: cannot unpack non-iterable NoneType object

In [None]:
for c

In [192]:
g2015[g2015.date_time == '2016-02-07']

Unnamed: 0,game_id,season,date_time,away_team,home_team,away_goals,home_goals,outcome,home_team_id,away_team_id
557,2015020779,20152016,2016-02-07,Sabres,Bruins,1,2,home win OT,6,7
1069,2015020780,20152016,2016-02-07,Maple Leafs,Senators,1,6,home win REG,9,10
744,2015020781,20152016,2016-02-07,Penguins,Panthers,3,2,away win OT,13,5
386,2015020782,20152016,2016-02-07,Wild,Blues,1,4,home win REG,19,30
588,2015020783,20152016,2016-02-07,Sharks,Predators,2,6,home win REG,18,28
732,2015020784,20152016,2016-02-07,Blackhawks,Stars,5,1,away win REG,25,16
1189,2015020785,20152016,2016-02-07,Jets,Avalanche,4,2,away win REG,21,52
310,2015020786,20152016,2016-02-07,Flames,Canucks,4,1,away win REG,23,20
226,2015020787,20152016,2016-02-07,Flyers,Capitals,2,3,home win REG,15,4
523,2015020788,20152016,2016-02-07,Hurricanes,Canadiens,1,2,home win SO,8,12


In [55]:
natstat2016_flat[natstat2016_flat.duplicated(subset=['home', 'home_goals', 'away', 'away_goals', 'date'], keep = False)].sort_values(by='game_id')

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,LDSF%_avg_away,LDGF_avg_away,LDGA_avg_away,LDGF%_avg_away,LDSH%_avg_away,LDSV%_avg_away,SH%_avg_away,SV%_avg_away,PDO_avg_away,blocks_avg_away


In [12]:
display(natstat2016_flat.tail())
display(natstat2016_marg.tail())

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,LDSF%_avg_away,LDGF_avg_away,LDGA_avg_away,LDGF%_avg_away,LDSH%_avg_away,LDSV%_avg_away,SH%_avg_away,SV%_avg_away,PDO_avg_away,blocks_avg_away
1222,2016021226,2017-04-09,Rangers,3,Penguins,2,H,H_reg,1,"2017-04-09 - Penguins 2, Rangers 3",...,51.88,0.407407,0.592593,42.38679,2.48963,96.186296,7.970123,90.820617,0.987926,11.209877
1224,2016021227,2017-04-09,Flyers,3,Hurricanes,4,H,A_SO,0,"2017-04-09 - Hurricanes 4, Flyers 3",...,46.872099,0.37037,0.481481,46.275679,2.515185,96.881235,8.150741,90.411605,0.985605,12.617284
1223,2016021228,2017-04-09,Capitals,0,Panthers,2,H,A_reg,0,"2017-04-09 - Panthers 2, Capitals 0",...,56.732716,0.555556,0.358025,56.131728,3.915185,96.969012,7.938395,90.255926,0.981988,13.111111
1220,2016021229,2017-04-09,Ducks,4,Kings,3,H,H_reg,1,"2017-04-09 - Kings 3, Ducks 4",...,45.244568,0.358025,0.444444,46.707778,2.298025,97.284938,8.311852,91.634568,0.999444,13.691358
1227,2016021230,2017-04-09,Oilers,5,Canucks,2,H,H_reg,1,"2017-04-09 - Canucks 2, Oilers 5",...,47.260617,0.45679,0.580247,44.526667,3.621111,96.089259,7.363827,89.045679,0.964099,13.08642


Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,LDSF%_avg,LDGF_avg,LDGA_avg,LDGF%_avg,LDSH%_avg,LDSV%_avg,SH%_avg,SV%_avg,PDO_avg,blocks_avg
1225,2016021226,2017-04-09,Rangers,3,Penguins,2,H,H_reg,1,"2017-04-09 - Penguins 2, Rangers 3",...,-2.050247,0.197531,0.185185,-0.308519,1.507037,-1.58963,1.380123,0.335802,0.017173,4.049383
1226,2016021227,2017-04-09,Flyers,3,Hurricanes,4,H,A_SO,0,"2017-04-09 - Hurricanes 4, Flyers 3",...,-1.164568,-0.259259,0.037037,-5.267407,-1.817531,0.128519,-1.552593,-2.163333,-0.037235,-1.358025
1227,2016021228,2017-04-09,Capitals,0,Panthers,2,H,A_reg,0,"2017-04-09 - Panthers 2, Capitals 0",...,3.434815,-0.024691,-0.148148,5.041235,-0.345556,0.120494,2.518025,2.039136,0.04563,2.740741
1228,2016021229,2017-04-09,Ducks,4,Kings,3,H,H_reg,1,"2017-04-09 - Kings 3, Ducks 4",...,-5.957654,0.123457,-0.08642,6.059753,1.550247,0.812593,1.595679,1.121975,0.02716,1.481481
1229,2016021230,2017-04-09,Oilers,5,Canucks,2,H,H_reg,1,"2017-04-09 - Canucks 2, Oilers 5",...,0.812716,-0.160494,-0.234568,2.777901,-0.64321,1.770123,1.455926,-0.826049,0.006346,-2.666667


Write resulting dataframes to csv files

In [226]:
path_result = "data/wrangled/"
# natstat2016_flat.to_csv(path_result + "natstat2016_flat.csv")
# natstat2016_marg.to_csv(path_result + "natstat2016_marg.csv")
# natstat2017_flat.to_csv(path_result + "natstat2017_flat.csv")
# natstat2017_marg.to_csv(path_result + "natstat2017_marg.csv")
# natstat2018_flat.to_csv(path_result + "natstat2018_flat.csv")
# natstat2018_marg.to_csv(path_result + "natstat2018_marg.csv")
natstat2014_marg.to_csv(path_result + "natstat2014_marg.csv")
natstat2015_marg.to_csv(path_result + "natstat2015_marg.csv")

In [13]:
natstat2016_flat[natstat2016_flat.Team == 'Calgary Flames']

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,LDSF%_avg_away,LDGF_avg_away,LDGA_avg_away,LDGF%_avg_away,LDSH%_avg_away,LDSV%_avg_away,SH%_avg_away,SV%_avg_away,PDO_avg_away,blocks_avg_away
13,2016020015,2016-10-14,Flames,3,Oilers,5,H,A_reg,0,"2016-10-14 - Oilers 5, Flames 3",...,27.59,0.0,1.0,0.0,0.0,95.24,25.0,90.24,1.152,10.0
38,2016020046,2016-10-18,Flames,4,Sabres,3,H,H_reg,1,"2016-10-18 - Sabres 3, Flames 4",...,51.85,0.5,0.0,75.0,6.25,100.0,14.115,88.635,1.0275,11.5
54,2016020059,2016-10-20,Flames,2,Hurricanes,4,H,A_reg,0,"2016-10-20 - Hurricanes 4, Flames 2",...,49.226667,1.0,0.333333,83.333333,9.286667,97.436667,7.516667,85.713333,0.932333,13.0
72,2016020075,2016-10-22,Flames,4,Blues,6,H,A_reg,0,"2016-10-22 - Blues 6, Flames 4",...,52.36,1.0,0.0,75.0,5.8325,100.0,13.325,95.495,1.088,12.75
107,2016020110,2016-10-28,Flames,5,Senators,2,H,H_reg,1,"2016-10-28 - Senators 2, Flames 5",...,46.428333,0.333333,1.0,33.333333,2.556667,94.081667,9.773333,89.018333,0.987667,17.166667
129,2016020130,2016-10-30,Flames,1,Capitals,3,H,A_reg,0,"2016-10-30 - Capitals 3, Flames 1",...,53.425,0.625,0.375,56.25,5.3025,95.80125,10.3825,88.97625,0.993625,9.875
197,2016020204,2016-11-10,Flames,2,Stars,4,H,A_reg,0,"2016-11-10 - Stars 4, Flames 2",...,44.76,0.727273,0.636364,50.0,6.726364,96.01,12.179091,92.793636,1.049909,13.363636
215,2016020222,2016-11-12,Flames,1,Rangers,4,H,A_reg,0,"2016-11-12 - Rangers 4, Flames 1",...,44.11,0.285714,0.642857,35.714286,3.293571,94.775714,9.66,88.809286,0.984714,15.857143
243,2016020244,2016-11-16,Flames,2,Coyotes,1,H,H_reg,1,"2016-11-16 - Coyotes 1, Flames 2",...,47.907143,0.642857,0.857143,42.857143,4.069286,94.920714,9.346429,89.572143,0.989214,14.714286
257,2016020259,2016-11-18,Flames,2,Blackhawks,3,H,A_reg,0,"2016-11-18 - Blackhawks 3, Flames 2",...,48.496471,0.705882,0.352941,60.784118,5.128235,97.592941,12.253529,93.338235,1.055824,17.058824


In [194]:
corr = natstat_percents(natstat2016_marg).corr()
# Generate a mask for the upper triangle
df = corr.where(np.tril(np.ones(corr.shape)).astype(np.bool)).fillna(0.0)
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

df.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,game_id,home_goals,away_goals,result_bool,TOI,CF%_avg,FF%_avg,SF%_avg,GF%_avg,xGF%_avg,SCF%_avg,HDCF%_avg,HDSF%_avg,HDGF%_avg,HDSH%_avg,HDSV%_avg,MDCF%_avg,MDSF%_avg,MDGF%_avg,MDSH%_avg,MDSV%_avg,LDCF%_avg,LDSF%_avg,LDGF%_avg,LDSH%_avg,LDSV%_avg,SH%_avg,SV%_avg
game_id,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
home_goals,-0.016,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
away_goals,0.034,-0.055,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
result_bool,-0.057,0.63,-0.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
TOI,-0.028,-0.061,0.0022,-0.012,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
CF%_avg,0.0087,0.011,-0.026,0.032,0.0023,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
FF%_avg,0.0086,-0.0099,-0.031,0.016,0.012,0.94,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
SF%_avg,0.022,-0.0076,-0.02,0.0054,0.0034,0.86,0.95,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GF%_avg,0.0016,-0.025,-0.028,0.013,-0.016,0.11,0.18,0.23,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
xGF%_avg,0.027,-0.027,-0.0097,-0.0076,0.018,0.68,0.78,0.76,0.46,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Betting Odds Cleanup

We have to cleanup the betting odds data files.


In [97]:
team_map = {
    "Anaheim" : "Ducks",
    "Arizona" : "Coyotes",
    "Boston" : "Bruins",
    "Buffalo" : "Sabres",
    "Calgary" : "Flames",
    "Carolina" : "Hurricanes",
    "Chicago" : "Blackhawks",
    "Colorado" : "Avalanche",
    "Columbus" : "Blue Jackets",
    "Dallas" : "Stars",
    "Detroit" : "Red Wings",
    "Edmonton" : "Oilers",
    "Florida" : "Panthers",
    "LosAngeles" : "Kings",
    "Minnesota" : "Wild",
    "Montreal" : "Canadiens",
    "NYIslanders" : "Islanders",
    "NYRangers" : "Rangers",
    "Nashville" : "Predators",
    "NewJersey" : "Devils",
    "Ottawa" : "Senators",
    "Philadelphia" : "Flyers",
    "Pittsburgh" : "Penguins",
    "SanJose" : "Sharks",
    "St.Louis" : "Blues",
    "TampaBay" : "Lightning",
    "Toronto" : "Maple Leafs",
    "Vancouver" : "Canucks",
    "Washington" : "Capitals",
    "Winnipeg" : "Jets"
}


betting_path = "data/original/betting/"
bet2016 = (pd.read_excel(betting_path + "nhl odds 2016-17_mrd.xlsx")
                .rename(columns = {'Unnamed: 12':'Puck Line Odds',
                                    'Unnamed: 14':'OU Open Odds',
                                    'Unnamed: 16':'OU Close Odds'
                                   })
                .drop(columns = ['1st', '2nd', '3rd'])
                .replace(team_map)
          )

bet2016['Date'] = bet2016.Date.apply(lambda x: 17*10000 + x if x < 900 else 16*10000 + x)
bet2016['Date'] = pd.to_datetime(bet2016.Date, format = "%y%m%d")


In [98]:
bet2016.head(20)

Unnamed: 0,Date,Rot,Temp_Index,VH,Team,Final,Open,Close,Puck Line,Puck Line Odds,Open OU,OU Open Odds,Close OU,OU Close Odds
0,2016-10-12,1,1,V,Maple Leafs,4,114,121,1.5,-245,5.5,-110,5.5,105
1,2016-10-12,2,1,H,Senators,5,-134,-141,-1.5,205,5.5,-110,5.5,-125
2,2016-10-12,3,2,V,Blues,5,108,101,1.5,-270,5.0,-120,5.0,-120
3,2016-10-12,4,2,H,Blackhawks,2,-128,-121,-1.5,220,5.0,100,5.0,100
4,2016-10-12,5,3,V,Flames,4,108,103,1.5,-295,5.5,100,5.5,115
5,2016-10-12,6,3,H,Oilers,7,-128,-123,-1.5,235,5.5,-120,5.5,-135
6,2016-10-12,7,4,V,Kings,1,104,143,1.5,-240,5.0,-120,5.0,-120
7,2016-10-12,8,4,H,Sharks,2,-124,-163,-1.5,200,5.0,100,5.0,100
8,2016-10-13,51,5,V,Bruins,6,-105,120,1.5,-260,5.5,115,5.5,110
9,2016-10-13,52,5,H,Blue Jackets,3,-115,-140,-1.5,210,5.5,-135,5.5,-130


In [92]:
h = bet2016[bet2016['VH'] == 'H']
a = bet2016[bet2016['VH'] == 'V']
print(h.shape, a.shape)

x = pd.merge(left = h, right = a, left_on = 'Temp_Index', right_on = 'Temp_Index')
print(x.shape)
display(x)

(1230, 14) (1230, 14)
(1230, 27)


Unnamed: 0,Date_x,Rot_x,Temp_Index,VH_x,Team_x,Final_x,Open_x,Close_x,Puck Line_x,Puck Line Odds_x,...,Team_y,Final_y,Open_y,Close_y,Puck Line_y,Puck Line Odds_y,Open OU_y,OU Open Odds_y,Close OU_y,OU Close Odds_y
0,2016-10-12,2,1,H,Senators,5,-134,-141,-1.5,205,...,Maple Leafs,4,114,121,1.5,-245,5.5,-110,5.5,105
1,2016-10-12,4,2,H,Blackhawks,2,-128,-121,-1.5,220,...,Blues,5,108,101,1.5,-270,5.0,-120,5.0,-120
2,2016-10-12,6,3,H,Oilers,7,-128,-123,-1.5,235,...,Flames,4,108,103,1.5,-295,5.5,100,5.5,115
3,2016-10-12,8,4,H,Sharks,2,-124,-163,-1.5,200,...,Kings,1,104,143,1.5,-240,5.0,-120,5.0,-120
4,2016-10-13,52,5,H,Blue Jackets,3,-115,-140,-1.5,210,...,Bruins,6,-105,120,1.5,-260,5.5,115,5.5,110
5,2016-10-13,54,6,H,Sabres,1,-115,111,1.5,-270,...,Canadiens,4,-105,-131,-1.5,220,5.5,115,5.5,120
6,2016-10-13,56,7,H,Rangers,5,-135,-130,-1.5,205,...,Islanders,3,115,110,1.5,-245,5.5,120,5.0,-140
7,2016-10-13,58,8,H,Lightning,6,-170,-173,-1.5,170,...,Red Wings,4,150,153,1.5,-200,5.0,-135,5.0,-110
8,2016-10-13,60,9,H,Predators,2,-145,-153,-1.5,200,...,Devils,1,125,133,1.5,-240,5.0,105,5.0,125
9,2016-10-13,62,10,H,Jets,5,-140,-124,-1.5,240,...,Hurricanes,4,120,104,1.5,-300,5.0,-130,5.0,-135


In [104]:
from datetime import timedelta
print(bet2016.iloc[0].Date)
print(bet2016.iloc[0].Date + timedelta(days=1))

2016-10-12 00:00:00
2016-10-13 00:00:00


In [99]:
def bet_home_away(df_in, hoa, col_suffix):
    '''
        Filter the original dataframe for home team or away team information that can be used to flatten each game
        
        Return two dataframes:
            header 
            odds
        
    '''
    df = df_in[df_in['VH'] == hoa]
    
    header =  df.loc[:, 'Date':'VH'].copy().reset_index(drop=True)
    odds   = df.iloc[:, df.columns.get_loc('Team'):].copy().reset_index(drop=True)

    header.columns = [col + '_' + col_suffix for col in header.columns]
    odds.columns = [col + '_' + col_suffix for col in odds.columns]
    #display(header.head())
    #display(odds.head())
    odds['Temp_Index'] = header['Temp_Index_' + col_suffix]
    
    return header, odds


def bet_flatten(df_in):
    ''' combine betting  '''
    h_head, h_odds = bet_home_away(df_in, 'H', 'home')
    a_head, a_odds = bet_home_away(df_in, 'V', 'away')
    
    h_head.columns = np.char.replace(h_head.columns.values.astype(str), '_home', '')
    
    merged = pd.merge(left = h_odds, right = a_odds, on = 'Temp_Index')
    merged = pd.merge(left = h_head, right = merged, on = 'Temp_Index')
    #return pd.concat([h_head, h_odds, a_odds], axis = 1)
    return merged


def add_game_ids(bet, nhl_df):
    ''' Add the NHL Game ID (from the Kaggle files) to the Betting '''
    result = bet.copy()
#     game_ids = [ (nhl_df[(nhl_df.home_team == row.Team_home) &
#                         (nhl_df.away_team == row.Team_away) &
#                         (nhl_df.home_goals == row.Final_home) &
#                         (nhl_df.away_goals == row.Final_away)].game_id).values[0] for index, row in bet.iterrows()]
    game_ids = []
    for index, row in bet.iterrows():        
        v =      nhl_df[(nhl_df.home_team  == row.Team_home) &
                        (nhl_df.away_team  == row.Team_away) &
                        (nhl_df.home_goals == row.Final_home) &
                        (nhl_df.away_goals == row.Final_away) &
                        (nhl_df.date_time.dt.month == row.Date.month)].game_id.values
        if len(v) != 1:
            print(v, row.Date, row.Team_home, row.Team_away, row.Final_home, row.Final_away)
#         #display(v)
#         #print(v)
#         game_ids.append(v)
    
    
    
    
    
    #for g in game_ids:
    #    print(g)
    #print(v)
    #print(game_ids[0])
    #print(game_ids[0][0])

    #result.insert(0, 'game_id', game_ids)
    
    #return result

bet2016_flat = bet_flatten(bet2016)
#display(bet2016_flat)
bet2016_flat = add_game_ids(bet2016_flat, g2016)


# display(bet2016.head())
# display(bet2016.tail())

# display(bet2016_flat.head())
# display(bet2016_flat.tail())

#for t in bet2016.Team.sort_values().unique():
#    print(t)

[2016020135 2016020314] 2016-11-01 00:00:00 Senators Hurricanes 2 1
[2016020141 2016020269] 2016-11-01 00:00:00 Coyotes Sharks 3 2
[2016020141 2016020269] 2016-11-19 00:00:00 Coyotes Sharks 3 2
[2016020135 2016020314] 2016-11-26 00:00:00 Senators Hurricanes 2 1
[] 2016-11-30 00:00:00 Islanders Penguins 5 3
[] 2016-11-30 00:00:00 Flames Maple Leafs 3 0
[] 2016-11-30 00:00:00 Kings Sharks 1 4
[] 2016-12-12 00:00:00 Canadiens Bruins 0 1
[] 2016-12-31 00:00:00 Lightning Hurricanes 3 1
[] 2016-12-31 00:00:00 Penguins Canadiens 4 3
[] 2016-12-31 00:00:00 Jets Islanders 2 6
[] 2016-12-31 00:00:00 Avalanche Rangers 2 6
[] 2016-12-31 00:00:00 Stars Panthers 1 3
[] 2016-12-31 00:00:00 Oilers Canucks 2 3
[] 2016-12-31 00:00:00 Flames Coyotes 4 2
[] 2016-12-31 00:00:00 Kings Sharks 3 2
[] 2017-01-31 00:00:00 Rangers Blue Jackets 4 6
[] 2017-01-31 00:00:00 Penguins Predators 4 2
[] 2017-01-31 00:00:00 Islanders Capitals 3 2
[] 2017-01-31 00:00:00 Hurricanes Flyers 5 1
[] 2017-01-31 00:00:00 Red Win

In [85]:
g2016[(g2016['home_team'] == 'Predators') & (g2016['away_team'] == 'Devils')]

Unnamed: 0,game_id,season,date_time,away_team,home_team,away_goals,home_goals,outcome,home_team_id,away_team_id
204,2016020362,20162017,2016-12-03,Devils,Predators,5,4,away win OT,18,1


### TO - DO
Deal with missing values - mostly in the percentage columns, but these are derived from other columns
I know that for some, like GF% the values are simply incorrectly recorded

skipping ahead for now - i want to build a model

### Should we try a very simple model?

I think so.

In [108]:
# logistic regression model and parameters to test
# log = LogisticRegression(penalty='l1', solver='liblinear')
# param_grid = {'C': np.arange(0.025, 0.1, 0.005)}

# let's also build a random foreset classifier and parameters
# rf = RandomForestClassifier()
# param_grid = {'n_estimators': [100],
#               #'max_leaf_nodes': [4, 8, 16],
#               'max_leaf_nodes': [2,3,4],
#               'max_depth': [2,3,4],
#               'random_state': [42]
#              }

classifiers = {
    'log': {'clf': LogisticRegression(penalty='l1', solver='liblinear'),
            'params': {'C': np.arange(0.025, 0.1, 0.005)}
           },
    'rf': {'clf': RandomForestClassifier(),
           'params': {'n_estimators': [100],
                      #'max_leaf_nodes': [4, 8, 16],
                      'max_leaf_nodes': [2,3,4],
                      'max_depth': [2,3,4],
                      'random_state': [42]
                     }
          }
}

Set up our data for training, validation and testing.

In [132]:
# marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
#                              .reset_index(drop=True))
# natstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
#                       .reset_index(drop=True))

# n_test = 200
# X_train = marginal_stats_combined.dropna().iloc[:-n_test].loc[:,'CF%_avg':]
# X_test = marginal_stats_combined.dropna().iloc[-n_test:].loc[:,'CF%_avg':]
# #X_train['h'] = 1
# #X_test['h'] = 1

# y_train = natstat_combined.iloc[X_train.index.values].result_bool
# y_test = natstat_combined.iloc[X_test.index.values].result_bool

# X_test_act = natstat2018_marg.loc[500:, 'CF%_avg':].dropna().copy()
# #X_test_act['h'] = 1
# y_test_act = natstat2018_flat.iloc[X_test_act.index.values].result_bool

In [142]:
# marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
#                              .reset_index(drop=True))
# natstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
#                       .reset_index(drop=True))

In [206]:
test = pd.merge(left = m_combined_red, right = team_stats, on = "game_id")
display(test.head())
display(test.tail())

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,2014020369,2014-12-02,Flames,5,Coyotes,2,H,H_reg,1,"2014-12-02 - Coyotes 2, Flames 5",...,Dave Tippett,2,29,24,6,1,0,51.4,1,7
1,2014020369,2014-12-02,Flames,5,Coyotes,2,H,H_reg,1,"2014-12-02 - Coyotes 2, Flames 5",...,Bob Hartley,5,24,23,2,3,1,48.6,19,15
2,2014020370,2014-12-02,Sharks,2,Flyers,1,H,H_reg,1,"2014-12-02 - Flyers 1, Sharks 2",...,Craig Berube,1,29,27,7,2,1,50.0,16,6
3,2014020370,2014-12-02,Sharks,2,Flyers,1,H,H_reg,1,"2014-12-02 - Flyers 1, Sharks 2",...,Todd McLellan,2,27,38,9,1,0,50.0,12,14
4,2014020371,2014-12-02,Kings,2,Bruins,0,H,H_reg,1,"2014-12-02 - Bruins 0, Kings 2",...,Claude Julien,0,31,36,8,0,0,50.0,9,0


Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
6917,2017021269,2018-04-07,Oilers,3,Canucks,2,H,H_reg,1,"2018-04-07 - Canucks 2, Oilers 3",...,Todd McLellan,2,33,12,8,2,1,53.6,13,12
6918,2017021270,2018-04-07,Kings,2,Stars,4,H,A_reg,0,"2018-04-07 - Stars 4, Kings 2",...,Ken Hitchcock,4,18,11,6,1,0,58.5,6,2
6919,2017021270,2018-04-07,Kings,2,Stars,4,H,A_reg,0,"2018-04-07 - Stars 4, Kings 2",...,John Stevens,2,36,15,2,3,0,41.5,11,2
6920,2017021271,2018-04-07,Sharks,3,Wild,6,H,A_reg,0,"2018-04-07 - Wild 6, Sharks 3",...,Bruce Boudreau,6,24,15,4,1,0,48.1,11,9
6921,2017021271,2018-04-07,Sharks,3,Wild,6,H,A_reg,0,"2018-04-07 - Wild 6, Sharks 3",...,Peter DeBoer,3,30,16,2,2,0,51.9,4,5


In [200]:
n_skip = int(25/82*1230)
m_combined_red = (pd.concat([natstat2014_marg.iloc[n_skip:], 
                             natstat2015_marg.iloc[n_skip:], 
                             natstat2016_marg.iloc[n_skip:], 
                             natstat2017_marg.iloc[n_skip:]], sort = False)
                             .reset_index(drop=True))

X = m_combined_red.dropna().loc[:,'CF%_avg':]
y = m_combined_red.iloc[X.index.values].result_bool

X_test_act = natstat2018_marg.iloc[n_skip:].loc[:, 'CF%_avg':].dropna().copy()
y_test_act = natstat2018_marg.iloc[X_test_act.index.values].result_bool

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [225]:
m_combined_red.result_bool.mean()

0.5483964172204565

In [201]:
X_home= X.copy()
X_home['h'] = 1
y_home = y.copy()

X_test_home_act = X_test_act.copy()
X_test_home_act['h'] = 1
y_test_home_act = y_test_act.copy()

X_home_train, X_home_test, y_home_train, y_home_test = train_test_split(X_home, y_home, test_size=0.25, random_state=42)

In [None]:
classifiers = {
    'log': {'clf': LogisticRegression(penalty='l1', solver='liblinear'),
            'params': {'C': np.arange(0.025, 0.1, 0.005)}
           },
    'rf': {'clf': RandomForestClassifier(),
           'params': {'n_estimators': [100],
                      #'max_leaf_nodes': [4, 8, 16],
                      'max_leaf_nodes': [2,3,4],
                      'max_depth': [2,3,4],
                      'random_state': [42]
                     }
          }
}

X = m_combined_red.dropna().loc[:,'CF%_avg':]
y = m_combined_red.iloc[X.index.values].result_bool

X_home_train, X_home_test, y_home_train, y_home_test = train_test_split(X, y, test_size=0.25, random_state=42)

grid_search = GridSearchCV(classifiers['log']['clf'],
                           classifiers['log']['params'],
                           cv = 10, scoring = 'neg_log_loss', return_train_score=True)

In [202]:
#logit = LogisticRegression(penalty='l1', solver='liblinear')
#tscv = TimeSeriesSplit(n_splits=5)
#for key in classifiers:
grid_search = GridSearchCV(classifiers['log']['clf'],
                           classifiers['log']['params'],
                           cv = 10, scoring = 'neg_log_loss', return_train_score=True)
grid_normal = grid_search.fit(X_train, y_train)

grid_search = GridSearchCV(classifiers['log']['clf'],
                           classifiers['log']['params'],
                           cv = 10, scoring = 'neg_log_loss', return_train_score=True)
grid_home   = grid_search.fit(X_home_train, y_home_train)

In [203]:
best_log_normal = grid_normal.best_estimator_
best_log_home   = grid_home.best_estimator_
#print(best_log)
print(best_log_normal.coef_)
print(best_log_home.coef_)
print(best_log_normal.score(X_test, y_test))
print(best_log_home.score(X_home_test, y_home_test))
print(best_log_normal.score(X_test_act, y_test_act))
print(best_log_home.score(X_test_home_act, y_test_home_act))
#print(best_log_normal.score(X_test, y_test), best_log_home.score(X_test_home_act, y_test_home_act))

[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.00775313  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.01857455  0.          0.          0.          0.
   0.00211877  0.01657215  0.          0.          0.         -0.00521471
   0.          0.          0.          0.          0.         -0.00118291
   0.         -0.01446943  0.          0.          0.          0.
   0.          0.          0.          0.         -0.0035819   0.
   0.          0.          0.          0.          0.01615391]]
[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.00861535  0.          0.
   0.          0.          0.          0.          0.          0.
  -0.0008769   0.02830926  0.          0.          0.          0.
   0.00115913  0.01800868  0.         -0.00407061  0.         -0.00384609
   0.          0.          0.          0.          0. 

In [130]:
#print(X_train.columns[best_log.coef_[0,:]==0.0])
print(X_train.columns[best_log.coef_[0,:]!=0.0])
#print(best_log.coef_[0, best_log.coef_[0,:] != 0.0])

Index(['GF%_avg', 'xGF%_avg', 'SCF_avg', 'SCA_avg', 'HDCF%_avg', 'HDGF%_avg',
       'HDSH%_avg', 'MDSF%_avg', 'MDSH%_avg', 'MDSV%_avg', 'LDCF%_avg',
       'LDSF%_avg', 'LDGF%_avg', 'LDSV%_avg', 'blocks_avg'],
      dtype='object')


### Let's Refine the Model

We will use:
* TimeSeriesSplit
* cross_val_score
* GridSearchCV

To refine the hyperparamters and get the best model


In [46]:
logit = LogisticRegression(penalty='l1', solver='liblinear')
tscv = TimeSeriesSplit(n_splits=5)


In [48]:
marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
                             .reset_index(drop=True))
natstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
                      .reset_index(drop=True))

X = marginal_stats_combined.dropna().loc[:,'CF%_avg':]
y = natstat_combined.iloc[X.index.values].result_bool

#X_train = marginal_stats_combined.dropna().iloc[:-n_test].loc[:,'CF%_avg':]
#X_test = marginal_stats_combined.dropna().iloc[-n_test:].loc[:,'CF%_avg':]
#X_train['h'] = 1
#X_test['h'] = 1

#y_train = natstat_combined.iloc[X_train.index.values].result_bool
#y_test = natstat_combined.iloc[X_test.index.values].result_bool

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
X = marginal_stats_combined.dropna().loc[:,'CF%_avg':]
y = natstat_combined.iloc[X.index.values].result_bool

#X_train = marginal_stats_combined.dropna().iloc[:-n_test].loc[:,'CF%_avg':]
#X_test = marginal_stats_combined.dropna().iloc[-n_test:].loc[:,'CF%_avg':]
#X_train['h'] = 1
#X_test['h'] = 1

#y_train = natstat_combined.iloc[X_train.index.values].result_bool
#y_test = natstat_combined.iloc[X_test.index.values].result_bool

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [49]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2265, 19) (2265,)
(200, 19) (200,)


In [53]:
#param_grid = [{'C': [0.01, 0.1, 1, 10, 100]}]
param_grid = [{'C': np.arange(0.025, 0.1, 0.005)}]

grid_search = GridSearchCV(logit, param_grid, cv = tscv, scoring = 'neg_log_loss', return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l1',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': array([0.025, 0.03 , 0.035, 0.04 , 0.045, 0.05 , 0.055, 0.06 , 0.065,
       0.07 , 0.075, 0.08 , 0.085, 0.09 , 0.095, 0.1  ])}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_log_loss', verbo

In [54]:
display(grid_search.best_params_)
display(grid_search.best_estimator_)

cvres = grid_search.cv_results_
for mean_score, params in zip(-cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

{'C': 0.04000000000000001}

LogisticRegression(C=0.04000000000000001, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

0.689847709673148 {'C': 0.025}
0.6893430598398959 {'C': 0.030000000000000002}
0.6892274355267735 {'C': 0.035}
0.6891958487926442 {'C': 0.04000000000000001}
0.6892350266285665 {'C': 0.045000000000000005}
0.689304542982544 {'C': 0.05}
0.6893576689862118 {'C': 0.05500000000000001}
0.6893872269699224 {'C': 0.060000000000000005}
0.6894060951277035 {'C': 0.065}
0.6894373831073888 {'C': 0.07}
0.6894801122234456 {'C': 0.07500000000000001}
0.6895119354038882 {'C': 0.08000000000000002}
0.6895428542685524 {'C': 0.08500000000000002}
0.6895732663883434 {'C': 0.09000000000000002}
0.6896021974068658 {'C': 0.095}
0.6896313419237595 {'C': 0.1}


In [55]:
best_log_reg = grid_search.best_estimator_
best_log_reg.coef_

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.49354669e-02,
         2.03395701e-03,  7.45119038e-03,  1.15679084e-02,
         2.52204708e-03,  5.00598306e-03, -7.04762245e-03,
         0.00000000e+00,  1.63254557e-03,  2.51229234e-02,
        -4.26535685e-02,  0.00000000e+00,  0.00000000e+00,
         3.77974382e-09]])

Some of these will obviously be collinear and we need to deal with this.

In [56]:
X_train.columns[best_log_reg.coef_[0,:]!=0.0]

Index(['SCF%_avg', 'HDSF%_avg', 'HDSH%_avg', 'HDSV%_avg', 'MDSF%_avg',
       'MDSH%_avg', 'MDSV%_avg', 'LDSH%_avg', 'LDSV%_avg', 'SV%_avg', 'h'],
      dtype='object')

In [57]:
best_log_reg.score(X_test, y_test)

0.575

In [59]:
n_test = 200
X_train = natstat2017_marg.dropna().iloc[:-n_test].loc[:,'CF%_avg':]
X_test = natstat2017_marg.dropna().iloc[n_test:].loc[:,'CF%_avg':]
X_train['h'] = 1
X_test['h'] = 1

y_train = natstat2017_flat.iloc[X_train.index.values].result_bool
y_test = natstat2017_flat.iloc[X_test.index.values].result_bool

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1053, 19) (1053,)
(1053, 19) (1053,)


In [60]:
best_log_reg.score(X_test, y_test)

0.5707502374169041

In [65]:
best_log_reg_act = LogisticRegression(C=0.03, penalty='l1',  solver='liblinear')

marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
                             .reset_index(drop=True).loc[:,'CF%_avg':])
netstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
                      .reset_index(drop=True))

x = marginal_stats_combined.dropna().copy()
x['h'] = 1
y = netstat_combined.iloc[x.index.values].result_bool
print(x.shape, y.shape)
best_log_reg_act.fit(x, y)

(2465, 19) (2465,)


LogisticRegression(C=0.03, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
X_test_act = natstat2018_marg.loc[:, 'CF%_avg':].dropna().copy()
X_test_act['h'] = 1
y_test_act = natstat2018_flat.iloc[X_test_act.index.values].result_bool

print(best_log_reg.score(X_test_act, y_test_act))
print(best_log_reg_act.score(X_test_act, y_test_act))

0.529505582137161
0.5374800637958532


### Just testing out a random forest

No tuning atm

In [72]:
n_test = 200
X_train = marginal_stats_combined.dropna().iloc[:-n_test]
X_test = marginal_stats_combined.dropna().iloc[-n_test:]

y_train = natstat_combined.iloc[X_train.index.values].result_bool
y_test = natstat_combined.iloc[X_test.index.values].result_bool

**WARNING** Next Cell takes a LONG time to run

In [78]:
rf = RandomForestClassifier()

param_grid = {'n_estimators': [100],
              #'max_leaf_nodes': [4, 8, 16],
              'max_leaf_nodes': [2,3,4],
              'max_depth': [2,3,4],
              'random_state': [42]
             }

grid_search = GridSearchCV(rf, param_grid, cv = tscv, scoring = 'neg_log_loss', return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                     

In [79]:
print(grid_search.best_params_)
print(grid_search.best_estimator_)

rf_best = grid_search.best_estimator_

cvres = grid_search.cv_results_
for mean_score, params in zip(-cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

{'max_depth': 2, 'max_leaf_nodes': 2, 'n_estimators': 100, 'random_state': 42}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=2,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
0.6915870804170436 {'max_depth': 2, 'max_leaf_nodes': 2, 'n_estimators': 100, 'random_state': 42}
0.6920059998477489 {'max_depth': 2, 'max_leaf_nodes': 3, 'n_estimators': 100, 'random_state': 42}
0.6916786960656084 {'max_depth': 2, 'max_leaf_nodes': 4, 'n_estimators': 100, 'random_state': 42}
0.6915870804170436 {'max_depth': 3, 'max_leaf_nodes': 2, 'n_estimators': 100, 'random_state': 42}
0.6920059998477489 {'max_depth': 3, 'max_leaf_

In [81]:
rf_best.score(X_test, y_test)

0.575

In [83]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols': df.columns,'imp' :
    m.feature_importances_}).sort_values('imp',ascending = False)

In [85]:
rf_feat_importance(rf_best, X_test)

Unnamed: 0,cols,imp
2,GF_avg,0.11
4,xGF%_avg,0.1
7,HDSH%_avg,0.09
5,SCF%_avg,0.08
8,HDSV%_avg,0.08
15,SV%_avg,0.07
3,GA_avg,0.06
11,MDSV%_avg,0.06
9,MDSF%_avg,0.05
10,MDSH%_avg,0.05


In [82]:
best_log_reg.score(X_test, y_test)

ValueError: X has 18 features per sample; expecting 19

In [45]:
# X_test_act = marginal_stats_2018.dropna()
# y_test_act = natstat2018_flat.iloc[X_test_act.index.values].result_bool

In [46]:
# best_log_reg.score(X_test_act, y_test_act)

In [47]:
# rf_best.score(X_test_act, y_test_act)

In [48]:
# from sklearn.naive_bayes import GaussianNB

# nb = GaussianNB()
# nb.fit(X_train, y_train)

In [49]:
# nb.score(X_test, y_test)

In [50]:
# nb.score(X_test_act, y_test_act)

In [51]:
# from xgboost import XGBClassifier

# reg = XGBClassifier(n_estimators=1000)
# reg.fit(X_train, y_train) # Change verbose to True if you want to see it train

In [52]:
# reg.score(X_test_act, y_test_act)

In [53]:
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB

# svc_clf = SVC(gamma='scale', probability = True)
# #svc_clf.fit(X_train, y_train)

In [54]:
# param_grid = {'C': [10, 50, 100],
#               'kernel': ['rbf', 'linear'], # 'poly', 
#               #'degree': [3, 4, 5, 6],
#               'gamma': [0.01, 0.05, 0.1],
#               'random_state': [42]
#              }

# grid_search = GridSearchCV(svc_clf, 
#                            param_grid, 
#                            cv = tscv, 
#                            scoring = 'neg_log_loss',
#                            return_train_score=True)

# grid_search.fit(X_train, y_train)

In [55]:
# svc_best = grid_search.best_estimator_
# svc_best

In [56]:
# grid_search.best_params_

In [57]:
# cvres = grid_search.cv_results_
# for mean_score, params in zip(-cvres['mean_test_score'], cvres['params']):
#     print(mean_score, params)

In [58]:
# svc_best.score(X_test_act, y_test_act)

In [59]:
# svc_best.score(X_test, y_test)