## Game Data from NHL Database

We will start with getting the game data the Kaggle NHL Database. The only real reason to do this is to get game id's that we can then use to merge the Natural Stat Trick data with the gambling data. We could also just assign our own game id, which, may be easier.

In [272]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import log_loss

%matplotlib inline
pd.options.display.width = 0

In [273]:
path = "data/original/kaggle/"

teams = pd.read_csv(path + "team_info.csv")
games_all = pd.read_csv(path + "game.csv")
team_stats = pd.read_csv(path + "game_teams_stats.csv")

In [274]:
#display(teams.head())
def get_games(season):
    # convert season into a season number in the same format as the datafile
    season = season * 10000 + (season + 1)
    
    # filter for regular season games of that season
    games = games_all[(games_all['season'] == season) & (games_all['type'] == 'R')].copy()

    # create a small function to merge, drop, and rename
    def merge_games(df, team):
        x = (pd
             .merge(left = df,
                    right = teams[['team_id','teamName']],
                    left_on = team + '_id',
                    right_on = 'team_id')
             .drop(columns = ['team_id'])
             .rename(columns = {'teamName':team}))
        return x
        
#     merge_games = lambda df, team: (df.merge(right = teams[['team_id','teamName']], left_on = team + '_id',
#                                                 right_on = 'team_id')
#                                     .drop(columns = ['team_id'])
#                                     .rename(columns = {'teamName':team}))
    
    games = merge_games(games, 'away_team')
    games = merge_games(games, 'home_team')
    
    #convert datatypes
    games['date_time'] = pd.to_datetime(games['date_time'], format = '%Y-%m-%d')
    games['outcome'] = games['outcome'].astype('category')
    
    #filter for useful columns
    cols = ['game_id', 'season', 'date_time', 'away_team', 'home_team',
            'away_goals', 'home_goals', 'outcome', 'home_team_id', 'away_team_id']
    games_reduced = games[cols].sort_values(by='game_id')
    
    return games_reduced

In [275]:
g2014 = get_games(2014)
g2015 = get_games(2015)
g2016 = get_games(2016)
g2017 = get_games(2017)
g2018 = get_games(2018)

In [277]:
(g2016[g2016.duplicated(subset=['home_team', 'home_goals', 'away_team', 'away_goals', 'date_time'], keep = False)]
     .sort_values(by='home_team'))

Unnamed: 0,game_id,season,date_time,away_team,home_team,away_goals,home_goals,outcome,home_team_id,away_team_id


## Natural Stat Trick Data

Now, it is time to deal with the Natural Stat Trick Data

Create a number of procedures to make the cleaning more compartmentatlized and repeatable.

Skip to the next cell to actually get going.

In [278]:
def natstat_to_numeric(df_in):
    # '-' were causing some columns to be treated as strings so let's replace and convert
    df = df_in.drop(columns = ["Unnamed: 2", "Attendance"]).replace('-', np.nan)
    df.loc[:,'TOI':] = df.loc[:,'TOI':].apply(pd.to_numeric)
    
    
    #########################################################################################
    # we have to do something so let's fill with mean as we are calculating means
    #########################################################################################
    df = df.fillna(df.mean())
    
    # calculate blocked shots
    df['blocks'] = df['CA'] - df['FA']

    return df


def natstat_add_ave_stats(df_in, cols = None):
    ''' we want to create running averages for all of the numberic values of interest'''
    df = df_in.copy()
    df = df.sort_values(by='date')
    
    # get cols to create averages of
    if cols is None:
        cols = df.iloc[:, (df.columns.get_loc('TOI') + 1):].columns

    for c in cols:
        # create a column header name
        new_col = c + '_avg'
        
        # create the new column and fill with 0.0
        df[new_col] = 0.0
        
        # filter for each team and create the running average for each team individually
        for team in df.Team.sort_values().unique():
            f = df['Team'] == team
            df.loc[f, new_col] = df.loc[f, c].expanding(min_periods=2).mean().shift(1)
            
    return df


def natstat_split_game_info(df_in):
    '''Split "Game" column into date, home team, away team, home goals, away goals'''
    df = df_in.copy()
    
    # get date and add it to the dataframe
    temp = df['Game'].str.split(' - ', n=1, expand = True)
    
    # temp[0] contains the date
    df.insert(0, 'date', pd.to_datetime(temp[0], format = '%Y-%m-%d'))
    
    # temp[1] contains the game information, but it is messy
    temp = temp[1].str.split(',', n=1, expand = True)
    
    # now temp[0] contains the away team information and temp[1] contains the home team information
    df.insert(1, 'home', temp[1].str.replace('\d+','').str.strip())
    df.insert(2, 'home_goals', temp[1].str.extract(r'(\d+)').astype('int64'))
    df.insert(3, 'away', temp[0].str.replace('\d+','').str.strip())
    df.insert(4, 'away_goals', temp[0].str.extract(r'(\d+)').astype('int64'))
    
    # in case we want to easily filter for home and away games we set H or A for each row
    hoa = ['H' if home_team in full_name else 'A' for home_team, full_name in zip(df['home'], df['Team'])]
    df.insert(5, 'hoa', hoa)
    
    # in case we want to easily filter for home and away games we set H or A for each row
    winner = []
    for h, a, t in zip(df['home_goals'], df['away_goals'], df['TOI']):
        result = ('H_reg' if h > a else 'A_reg' if t <= 60.0 else 
                  'H_OT' if h > a else 'A_OT' if t < 65.0 else
                  'H_SO' if h > a else 'A_SO')
        winner.append(result)
    df.insert(6, 'result', winner)
    df.insert(7, 'result_bool', df.result.apply(lambda x: 1 if x[0] == 'H' else 0))
    
    return df


def natstat_home_away(df_in, hoa, col_suffix):
    '''
        Filter the original dataframe for home team or away team information that can be used to flatten each game
        
        Return two dataframes:
            header = data, home, away, home_goals, away_goals, hoa, Game, Team
            stats = all of the raw data
        
    '''
    df = df_in[df_in['hoa'] == hoa].copy()
    
    header = df.loc[:, 'date':'TOI'].reset_index(drop=True)
    stats = df.iloc[:, (df.columns.get_loc('TOI') + 1):].reset_index(drop=True)

    header.columns = [col + '_' + col_suffix for col in header.columns]
    stats.columns = [col + '_' + col_suffix for col in stats.columns]
    
    return header, stats


def natstat_flatten(df_in):
    ''' combine all stats into a single row for each game  '''
    h_head, h_stats = natstat_home_away(df_in, 'H', 'home')
    a_head, a_stats = natstat_home_away(df_in, 'A', 'away')
    
    h_head.columns = np.char.replace(h_head.columns.values.astype(str), '_home', '')
    
    return pd.concat([h_head, h_stats, a_stats], axis = 1)


def natstat_flatten2(df_in):
    ''' combine all stats into a single row for each game  '''
    df_all = []
    for t in df_in.home.unique():
        temp = df_in[(df_in.home == t) | (df_in.away == t)].copy()
        df_all.append(natstat_flatten(temp))
        
    return pd.concat(df_all)


def natstat_clean(df_in):
    ''' apply the cleaning functions to all data frames'''
    df = natstat_split_game_info(df_in)
    df = natstat_to_numeric(df)
    
    return df

def add_game_ids(nat_stat, nhl_df):
    ''' Add the NHL Game ID (from the Kaggle files) to the Nat Stat Data '''
    result = nat_stat.copy()
    game_ids = []
    
    for index, row in nat_stat.iterrows():
        try:
            f = ((nhl_df.home_team == row.home) &
                (nhl_df.away_team == row.away) &
                (nhl_df.home_goals == row.home_goals) &
                (nhl_df.away_goals == row.away_goals) &
                (
                    (nhl_df.date_time >= (row.date - timedelta(days=1))) &
                    (nhl_df.date_time <= (row.date + timedelta(days=1)))
                ))
            game_id = nhl_df[f].game_id.values[0]
            game_ids.append(game_id)
        except Exception as e:
            x = nhl_df[ (nhl_df.home_team == row.home) &
                        (nhl_df.away_team == row.away) &
                        (nhl_df.home_goals == row.home_goals) &
                        (nhl_df.away_goals == row.away_goals) &
                        ((nhl_df.date_time >= (row.date - timedelta(days=1))) | (nhl_df.date_time <= (row.date + timedelta(days=1))))]
            print(e)
            print(row.date, row.home, row.away, row.home_goals, row.away_goals)

    result.insert(0, 'game_id', game_ids)
    
    return result

Now we can build the flattened data frames. I've also kept the unflattened data frames as they can be useful for data inspection.

### Find Marginal Stats for Each Game

So far we have:
* loaded the dataframes
* calculated the running average for each metric for each team
* converted each game into a flat structure

Now we can calculate the marginal stats for the difference between these stats.

In [309]:
def natstat_marginal(df):
    header = df.loc[:, 'game_id':'TOI'].reset_index(drop=True)
    
    h = np.core.defchararray.find(df.columns.values.astype(str), 'avg_home') >= 0
    a = np.core.defchararray.find(df.columns.values.astype(str), 'avg_away') >= 0

    homestats = df.loc[:, h]
    awaystats = df.loc[:, a]
    #display(header)
    #display(homestats)
    #display(awaystats)

    homestats.columns = np.char.replace(homestats.columns.values.astype(str), '_home', '')
    awaystats.columns = np.char.replace(homestats.columns.values.astype(str), '_away', '')

    marginal_stats = homestats - awaystats
    
    return pd.concat([header.reset_index(drop=True), marginal_stats.reset_index(drop=True)], axis = 1)

def natstat_percents(df):
    header = df.loc[:, 'game_id':'TOI'].reset_index(drop=True)
    p = np.core.defchararray.find(df.columns.values.astype(str), '%') >= 0
    percent = df.loc[:, p]
    return pd.concat([header, percent], axis = 1)

Now, we can build a data frame with marginal stats that we will use to build the model.

In [280]:
def natstat_build_df(df_in, nhl_in, cols = None):
    df = natstat_clean(df_in)
    df_long = natstat_add_ave_stats(df)
    df_flat = natstat_flatten(df_long.sort_values(by="Game")).reset_index(drop = True)
    df_flat = add_game_ids(df_flat, nhl_in).sort_values(by = 'game_id')
    
#     team_stats_ha = pd.merge(left = team_stats[team_stats['HoA'] == 'home'],
#                          right = team_stats[team_stats['HoA'] == 'away'],
#                          on = "game_id")
#     temp = pd.merge(left = df_flat, right = team_stats_ha, on = "game_id")

    df_marg = natstat_marginal(df_flat).sort_values(by = 'game_id')
    
    return df_long, df_flat, df_marg

In [310]:
natstat_path = "data/original/natstat/"
natstat2014 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2014.csv")
natstat2015 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2015.csv")
natstat2016 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2016.csv")
natstat2017 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2017.csv")
natstat2018 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2018.csv")

natstat2014, natstat2014_flat, natstat2014_marg = natstat_build_df(natstat2014, g2014)
natstat2015, natstat2015_flat, natstat2015_marg = natstat_build_df(natstat2015, g2015)
natstat2016, natstat2016_flat, natstat2016_marg = natstat_build_df(natstat2016, g2016)
natstat2017, natstat2017_flat, natstat2017_marg = natstat_build_df(natstat2017, g2017)
natstat2018, natstat2018_flat, natstat2018_marg = natstat_build_df(natstat2018, g2018)

In [311]:
x2017 = pd.read_csv(natstat_path + "Games - Natural Stat TrickTeam Season Totals - 2017.csv")
x2017 = natstat_clean(x2017)

x2017_ave = natstat_add_ave_stats(x2017)
boston_stat = x2017_ave[x2017_ave.Game.str.contains('Bruins')].copy()
#display(boston_stat)
x2017_flat = natstat_flatten(x2017_ave.sort_values(by='Game')).reset_index(drop = True)
boston_flat = natstat_flatten(boston_stat).reset_index(drop = True)
#x2017_flat.columns

In [312]:
display(boston_stat[boston_stat.Team == 'Boston Bruins'][['CF','CF_avg', 'CA', 'CA_avg']].head(10))

Unnamed: 0,CF,CF_avg,CA,CA_avg
20,54,,46,
62,59,,44,
90,52,56.5,63,45.0
131,59,55.0,69,51.0
152,44,56.0,54,55.5
190,64,53.6,56,55.2
236,59,55.333333,71,55.333333
292,56,55.857143,71,57.571429
322,58,55.875,63,59.25
338,63,56.111111,57,59.666667


In [313]:
#display(boston_stat[boston_stat.Team == 'Boston Bruins'].head(6))
#display(boston_stat[boston_stat.Team == 'Boston Bruins'][['CF','CF_avg', 'CA', 'CA_avg']].head(6))

#boston_stat.count()
x, y = natstat_home_away(boston_stat, 'H', 'home')
x2, y2 = natstat_home_away(boston_stat, 'A', 'away')
#boston_flat[['home', 'away', 'CF_home', 'CA_home', 'CF_away', 'CA_away', 'CF_avg_home', 'CA_avg_home', 'CF_avg_away', 'CA_avg_away']]
#display(pd.concat([x, y, y2], axis=1)[['home_home', 'away_home', 'CF_home', 'CA_home', 'CF_away', 'CA_away', 'CF_avg_home', 'CA_avg_home', 'CF_avg_away', 'CA_avg_away']])


In [315]:
#x2017_flat[(x2017_flat.home == "Bruins") | (x2017_flat.away == "Bruins")][['home', 'away', 'CF_home', 'CA_home', 'CF_away', 'CA_away', 'CF_avg_home', 'CA_avg_home', 'CF_avg_away', 'CA_avg_away']]

In [316]:
#df[df['A'].str.contains("hello")]
temp = x2017_ave[x2017_ave.Team.str.contains("Bruins")][['date', 'Game', 'Team', 'xGF%', 'xGF%_avg']]
#display(temp.head())
#temp.count()

In [317]:
#print((49.8+49)/2)
#print((49.8+49+36.69)/3)
#print((49.8+49+36.69+52.23)/4)

In [318]:
#temp2 = x2017_flat[x2017_flat.Game.str.contains("Bruins")][['date', 'Game', 'Team', 'xGF%_home', 'xGF%_avg_home', 'xGF%_away', 'xGF%_avg_away']]
#display(temp2.head())
#temp2.count()

In [322]:
#natstat2017_flat.iloc[100:102]

In [321]:
# temp = natstat2017_flat.iloc[100:101]
# display(temp)
# display(natstat_marginal(temp))

In [323]:
#natstat2017_marg.iloc[100:102]

In [324]:
#temp[temp.Team == 'Boston Bruins'].head()

In [325]:
# display(natstat2014_marg[natstat2014_marg.duplicated(['game_id'], keep=False)])
# display(natstat2015_marg[natstat2015_marg.duplicated(['game_id'], keep=False)])
# display(natstat2016_marg[natstat2016_marg.duplicated(['game_id'], keep=False)])
# display(natstat2017_marg[natstat2017_marg.duplicated(['game_id'], keep=False)])
# display(natstat2018_marg[natstat2018_marg.duplicated(['game_id'], keep=False)])

In [326]:
#natstat2018_marg[natstat2018_marg.duplicated(['game_id'], keep=False)]

In [327]:
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
#display(natstat2016_flat.head())
#display(natstat2016_marg.tail())

In [328]:
path_result = "data/wrangled/"
natstat2016_flat.to_csv(path_result + "natstat2016_flat3.csv")
natstat2016_marg.to_csv(path_result + "natstat2016_marg3.csv")
natstat2017_flat.to_csv(path_result + "natstat2017_flat3.csv")
natstat2017_marg.to_csv(path_result + "natstat2017_marg3.csv")
natstat2018_flat.to_csv(path_result + "natstat2018_flat3.csv")
natstat2018_marg.to_csv(path_result + "natstat2018_marg3.csv")
natstat2014_marg.to_csv(path_result + "natstat2014_marg3.csv")
natstat2014_flat.to_csv(path_result + "natstat2014_flat3.csv")
natstat2015_marg.to_csv(path_result + "natstat2015_marg3.csv")
natstat2015_flat.to_csv(path_result + "natstat2015_flat3.csv")

Write resulting dataframes to csv files

In [297]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [329]:
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
#natstat2016_flat[natstat2016_flat.Team == 'Calgary Flames'].sort_values(by='game_id')

In [299]:
corr = natstat_percents(natstat2016_marg).corr()
# Generate a mask for the upper triangle
df = corr.where(np.tril(np.ones(corr.shape)).astype(np.bool)).fillna(0.0)
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

df.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,game_id,home_goals,away_goals,result_bool,TOI,CF%_avg,FF%_avg,SF%_avg,GF%_avg,xGF%_avg,SCF%_avg,HDCF%_avg,HDSF%_avg,HDGF%_avg,HDSH%_avg,HDSV%_avg,MDCF%_avg,MDSF%_avg,MDGF%_avg,MDSH%_avg,MDSV%_avg,LDCF%_avg,LDSF%_avg,LDGF%_avg,LDSH%_avg,LDSV%_avg,SH%_avg,SV%_avg
game_id,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
home_goals,-0.017,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
away_goals,0.029,-0.055,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
result_bool,-0.048,0.63,-0.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
TOI,-0.027,-0.061,0.0022,-0.012,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
CF%_avg,0.043,0.098,-0.088,0.1,0.041,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
FF%_avg,0.032,0.096,-0.084,0.11,0.04,0.94,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
SF%_avg,0.035,0.091,-0.1,0.11,0.032,0.88,0.95,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GF%_avg,-0.0097,0.084,-0.16,0.15,-0.041,0.13,0.21,0.27,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
xGF%_avg,0.042,0.076,-0.13,0.11,0.01,0.69,0.8,0.79,0.47,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Betting Odds Cleanup

We have to cleanup the betting odds data files.


In [300]:
# team_map = {
#     "Anaheim" : "Ducks",
#     "Arizona" : "Coyotes",
#     "Boston" : "Bruins",
#     "Buffalo" : "Sabres",
#     "Calgary" : "Flames",
#     "Carolina" : "Hurricanes",
#     "Chicago" : "Blackhawks",
#     "Colorado" : "Avalanche",
#     "Columbus" : "Blue Jackets",
#     "Dallas" : "Stars",
#     "Detroit" : "Red Wings",
#     "Edmonton" : "Oilers",
#     "Florida" : "Panthers",
#     "LosAngeles" : "Kings",
#     "Minnesota" : "Wild",
#     "Montreal" : "Canadiens",
#     "NYIslanders" : "Islanders",
#     "NYRangers" : "Rangers",
#     "Nashville" : "Predators",
#     "NewJersey" : "Devils",
#     "Ottawa" : "Senators",
#     "Philadelphia" : "Flyers",
#     "Pittsburgh" : "Penguins",
#     "SanJose" : "Sharks",
#     "St.Louis" : "Blues",
#     "TampaBay" : "Lightning",
#     "Toronto" : "Maple Leafs",
#     "Vancouver" : "Canucks",
#     "Washington" : "Capitals",
#     "Winnipeg" : "Jets"
# }


# betting_path = "data/original/betting/"
# bet2016 = (pd.read_excel(betting_path + "nhl odds 2016-17_mrd.xlsx")
#                 .rename(columns = {'Unnamed: 12':'Puck Line Odds',
#                                     'Unnamed: 14':'OU Open Odds',
#                                     'Unnamed: 16':'OU Close Odds'
#                                    })
#                 .drop(columns = ['1st', '2nd', '3rd'])
#                 .replace(team_map)
#           )

# bet2016['Date'] = bet2016.Date.apply(lambda x: 17*10000 + x if x < 900 else 16*10000 + x)
# bet2016['Date'] = pd.to_datetime(bet2016.Date, format = "%y%m%d")


In [301]:
#bet2016.head(20)

In [302]:
# h = bet2016[bet2016['VH'] == 'H']
# a = bet2016[bet2016['VH'] == 'V']
# #print(h.shape, a.shape)

# x = pd.merge(left = h, right = a, left_on = 'Temp_Index', right_on = 'Temp_Index')
# #print(x.shape)
# #display(x)

In [303]:
# def bet_home_away(df_in, hoa, col_suffix):
#     '''
#         Filter the original dataframe for home team or away team information that can be used to flatten each game
        
#         Return two dataframes:
#             header 
#             odds
        
#     '''
#     df = df_in[df_in['VH'] == hoa]
    
#     header =  df.loc[:, 'Date':'VH'].copy().reset_index(drop=True)
#     odds   = df.iloc[:, df.columns.get_loc('Team'):].copy().reset_index(drop=True)

#     header.columns = [col + '_' + col_suffix for col in header.columns]
#     odds.columns = [col + '_' + col_suffix for col in odds.columns]
#     #display(header.head())
#     #display(odds.head())
#     odds['Temp_Index'] = header['Temp_Index_' + col_suffix]
    
#     return header, odds


# def bet_flatten(df_in):
#     ''' combine betting  '''
#     h_head, h_odds = bet_home_away(df_in, 'H', 'home')
#     a_head, a_odds = bet_home_away(df_in, 'V', 'away')
    
#     h_head.columns = np.char.replace(h_head.columns.values.astype(str), '_home', '')
    
#     merged = pd.merge(left = h_odds, right = a_odds, on = 'Temp_Index')
#     merged = pd.merge(left = h_head, right = merged, on = 'Temp_Index')
#     #return pd.concat([h_head, h_odds, a_odds], axis = 1)
#     return merged


# def add_game_ids(bet, nhl_df):
#     ''' Add the NHL Game ID (from the Kaggle files) to the Betting '''
#     result = bet.copy()
# #     game_ids = [ (nhl_df[(nhl_df.home_team == row.Team_home) &
# #                         (nhl_df.away_team == row.Team_away) &
# #                         (nhl_df.home_goals == row.Final_home) &
# #                         (nhl_df.away_goals == row.Final_away)].game_id).values[0] for index, row in bet.iterrows()]
#     game_ids = []
#     for index, row in bet.iterrows():        
#         v =      nhl_df[(nhl_df.home_team  == row.Team_home) &
#                         (nhl_df.away_team  == row.Team_away) &
#                         (nhl_df.home_goals == row.Final_home) &
#                         (nhl_df.away_goals == row.Final_away) &
#                         (nhl_df.date_time.dt.month == row.Date.month)].game_id.values
#         if len(v) != 1:
#             print(v, row.Date, row.Team_home, row.Team_away, row.Final_home, row.Final_away)
# #         #display(v)
# #         #print(v)
# #         game_ids.append(v)
    
    
    
    
    
#     #for g in game_ids:
#     #    print(g)
#     #print(v)
#     #print(game_ids[0])
#     #print(game_ids[0][0])

#     #result.insert(0, 'game_id', game_ids)
    
#     #return result

# bet2016_flat = bet_flatten(bet2016)
# #display(bet2016_flat)
# bet2016_flat = add_game_ids(bet2016_flat, g2016)


# # display(bet2016.head())
# # display(bet2016.tail())

# # display(bet2016_flat.head())
# # display(bet2016_flat.tail())

# #for t in bet2016.Team.sort_values().unique():
# #    print(t)

In [331]:
#g2016[(g2016['home_team'] == 'Predators') & (g2016['away_team'] == 'Devils')]

### TO - DO
Deal with missing values - mostly in the percentage columns, but these are derived from other columns
I know that for some, like GF% the values are simply incorrectly recorded

skipping ahead for now - i want to build a model

### Should we try a very simple model?

I think so.

In [332]:
# logistic regression model and parameters to test
# log = LogisticRegression(penalty='l1', solver='liblinear')
# param_grid = {'C': np.arange(0.025, 0.1, 0.005)}

# let's also build a random foreset classifier and parameters
# rf = RandomForestClassifier()
# param_grid = {'n_estimators': [100],
#               #'max_leaf_nodes': [4, 8, 16],
#               'max_leaf_nodes': [2,3,4],
#               'max_depth': [2,3,4],
#               'random_state': [42]
#              }

# classifiers = {
#     'log': {'clf': LogisticRegression(penalty='l1', solver='liblinear'),
#             'params': {'C': np.arange(0.025, 0.1, 0.005)}
#            },
#     'rf': {'clf': RandomForestClassifier(),
#            'params': {'n_estimators': [100],
#                       #'max_leaf_nodes': [4, 8, 16],
#                       'max_leaf_nodes': [2,3,4],
#                       'max_depth': [2,3,4],
#                       'random_state': [42]
#                      }
#           }
# }

Set up our data for training, validation and testing.

In [306]:
# marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
#                              .reset_index(drop=True))
# natstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
#                       .reset_index(drop=True))

# n_test = 200
# X_train = marginal_stats_combined.dropna().iloc[:-n_test].loc[:,'CF%_avg':]
# X_test = marginal_stats_combined.dropna().iloc[-n_test:].loc[:,'CF%_avg':]
# #X_train['h'] = 1
# #X_test['h'] = 1

# y_train = natstat_combined.iloc[X_train.index.values].result_bool
# y_test = natstat_combined.iloc[X_test.index.values].result_bool

# X_test_act = natstat2018_marg.loc[500:, 'CF%_avg':].dropna().copy()
# #X_test_act['h'] = 1
# y_test_act = natstat2018_flat.iloc[X_test_act.index.values].result_bool

In [307]:
# marginal_stats_combined = (pd.concat([natstat2016_marg, natstat2017_marg], sort = False)
#                              .reset_index(drop=True))
# natstat_combined = (pd.concat([natstat2016_flat, natstat2017_flat], sort = False)
#                       .reset_index(drop=True))

In [333]:
# test = pd.merge(left = m_combined_red, right = team_stats, on = "game_id")
# display(test.head())
# display(test.tail())

In [None]:
# X_test_act = marginal_stats_2018.dropna()
# y_test_act = natstat2018_flat.iloc[X_test_act.index.values].result_bool

In [None]:
# best_log_reg.score(X_test_act, y_test_act)

In [None]:
# rf_best.score(X_test_act, y_test_act)

In [None]:
# from sklearn.naive_bayes import GaussianNB

# nb = GaussianNB()
# nb.fit(X_train, y_train)

In [None]:
# nb.score(X_test, y_test)

In [None]:
# nb.score(X_test_act, y_test_act)

In [None]:
# from xgboost import XGBClassifier

# reg = XGBClassifier(n_estimators=1000)
# reg.fit(X_train, y_train) # Change verbose to True if you want to see it train

In [None]:
# reg.score(X_test_act, y_test_act)

In [None]:
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB

# svc_clf = SVC(gamma='scale', probability = True)
# #svc_clf.fit(X_train, y_train)

In [None]:
# param_grid = {'C': [10, 50, 100],
#               'kernel': ['rbf', 'linear'], # 'poly', 
#               #'degree': [3, 4, 5, 6],
#               'gamma': [0.01, 0.05, 0.1],
#               'random_state': [42]
#              }

# grid_search = GridSearchCV(svc_clf, 
#                            param_grid, 
#                            cv = tscv, 
#                            scoring = 'neg_log_loss',
#                            return_train_score=True)

# grid_search.fit(X_train, y_train)

In [None]:
# svc_best = grid_search.best_estimator_
# svc_best

In [None]:
# grid_search.best_params_

In [None]:
# cvres = grid_search.cv_results_
# for mean_score, params in zip(-cvres['mean_test_score'], cvres['params']):
#     print(mean_score, params)

In [None]:
# svc_best.score(X_test_act, y_test_act)

In [None]:
# svc_best.score(X_test, y_test)

In [334]:
def natstat_add_ave_stats_new(df_in, cols = None):
    ''' we want to create running averages for all of the numberic values of interest'''
    df = df_in.copy()
    
    # get cols to create averages of
    if cols is None:
        #cols = df.loc[:, 'CF':].columns
        cols = df.iloc[:, (df.columns.get_loc('TOI') + 1):].columns

    for c in cols:
        # create a column header name
        new_col = c + '_avg'
        
        # create the new column and fill with 0.0
        df[new_col] = 0.0
        
        # filter for each team and create the running average for each team individually
        for team in df.team_id.sort_values().unique():
            f = df['team_id'] == team
            df.loc[f, new_col] = df.loc[f, c].expanding(min_periods=2).mean().shift(1)
            
    return df

def natstat_marginal_new(df):
    header = df.loc[:, 'game_id'].reset_index(drop=True)
    
    h = np.core.defchararray.find(df.columns.values.astype(str), 'avg_home') >= 0
    a = np.core.defchararray.find(df.columns.values.astype(str), 'avg_away') >= 0

    homestats = df.loc[:, h]
    #display(homestats.tail())
    awaystats = df.loc[:, a]
    #display(awaystats.tail())

    homestats.columns = np.char.replace(homestats.columns.values.astype(str), '_home', '')
    awaystats.columns = np.char.replace(homestats.columns.values.astype(str), '_away', '')

    marginal_stats = homestats - awaystats
    #display(marginal_stats.tail())
    
    z = pd.concat([header, marginal_stats], axis = 1)
    #display(z.head())
    
    return z


In [335]:
natstat2018 = natstat2018_marg.copy()
natstat2017 = natstat2017_marg.copy()
natstat2016 = natstat2016_marg.copy()
natstat2015 = natstat2015_marg.copy()
natstat2014 = natstat2014_marg.copy()

natastat = pd.concat([natstat2014_marg.copy(),
                      natstat2015_marg.copy(),
                      natstat2016_marg.copy(),
                      natstat2017_marg.copy()])

In [336]:
game_teams_stats = team_stats.copy()
#display(game_teams_stats.head(1))

get_year = lambda df, year:df[(df.game_id >= (year * 1000000 + 20001)) &
                              (df.game_id <  (year * 1000000 + 30001))].sort_values(by='game_id')

cols_team = ['goals', 'shots',
             'hits', 'pim', 'powerPlayOpportunities', 'powerPlayGoals',
             'faceOffWinPercentage', 'giveaways', 'takeaways']

teams_game_stats = dict()
dfTeam = dict()
dfMarg = dict()
for i in range(2014,2019):
    teams_game_stats[i] = get_year(game_teams_stats, i)
    dfTeam[i] = natstat_add_ave_stats_new(teams_game_stats[i], cols_team)
    away_team = dfTeam[i][dfTeam[i]['HoA'] == 'away']
    home_team = dfTeam[i][dfTeam[i]['HoA'] == 'home']
    dfResult = pd.merge(away_team, home_team, on='game_id', suffixes = ['_away', '_home'])
    dfMarg[i] = natstat_marginal_new(dfResult.reset_index())

# do some qa on this shit

In [337]:
dfTeam[2016].iloc[100:105]

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
11086,2016020051,1,away,False,REG,John Hynes,1,29,20,8,3,1,52.2,14,11,1.666667,28.666667,20.333333,12.333333,4.0,0.666667,47.7,8.0,7.0
11087,2016020051,6,home,True,REG,Claude Julien,2,36,29,6,4,0,47.8,4,3,3.666667,29.0,17.666667,12.0,3.333333,0.333333,49.5,9.0,8.0
9493,2016020052,4,home,False,REG,Dave Hakstol,2,22,22,6,7,1,39.1,7,2,3.666667,29.0,26.666667,15.333333,3.333333,0.666667,47.866667,8.333333,4.0
9492,2016020052,24,away,True,REG,Randy Carlyle,3,26,28,16,2,0,60.9,9,6,1.75,30.25,24.75,10.75,4.0,0.5,58.7,2.25,3.25
10930,2016020053,28,away,False,REG,Peter DeBoer,2,34,14,20,3,0,47.5,5,5,3.0,31.75,16.25,5.0,3.25,0.75,43.5,9.5,6.5


In [338]:
dfTeam[2016][dfTeam[2016].team_id == 1].iloc[:8]

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
13952,2016020010,1,away,False,OT,John Hynes,1,24,18,16,2,0,46.4,15,8,,,,,,,,,
12500,2016020019,1,away,False,REG,John Hynes,2,34,27,8,3,0,48.2,2,7,,,,,,,,,
11007,2016020037,1,home,True,REG,John Hynes,2,28,16,13,7,2,48.5,7,6,1.5,29.0,22.5,12.0,2.5,0.0,47.3,8.5,7.5
11086,2016020051,1,away,False,REG,John Hynes,1,29,20,8,3,1,52.2,14,11,1.666667,28.666667,20.333333,12.333333,4.0,0.666667,47.7,8.0,7.0
10021,2016020069,1,home,True,OT,John Hynes,2,32,27,6,3,0,50.0,12,4,1.5,28.75,20.25,11.25,3.75,0.75,48.825,9.5,8.0
11617,2016020084,1,home,True,REG,John Hynes,5,35,14,6,3,2,52.5,8,3,1.6,29.4,21.6,10.2,3.6,0.6,49.06,10.0,7.2
7801,2016020107,1,home,False,OT,John Hynes,2,32,22,10,4,1,43.3,7,7,2.166667,30.333333,20.333333,9.5,3.5,0.833333,49.633333,9.666667,6.5
10101,2016020116,1,home,True,REG,John Hynes,3,23,22,13,1,0,54.2,10,10,2.142857,30.571429,20.571429,9.571429,3.571429,0.857143,48.728571,9.285714,6.571429


In [339]:
dfTeam[2016][dfTeam[2016].game_id == 2016020101]

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
11884,2016020101,13,away,False,REG,Gerard Gallant,2,31,29,6,3,1,47.4,12,17,3.0,31.833333,18.333333,7.5,3.666667,0.5,51.933333,8.0,10.833333
11885,2016020101,10,home,True,REG,Mike Babcock,3,28,19,8,2,0,52.6,9,11,3.5,35.333333,21.833333,10.166667,3.166667,0.833333,50.966667,8.333333,6.5


In [340]:
dfMarg[2016].iloc[100:105]
#dfMarg[2016][dfMarg[2016].home == 1].iloc[:8]

Unnamed: 0,game_id,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
100,2016020101,0.5,3.5,3.5,2.666667,-0.5,0.333333,-0.966667,0.333333,-4.333333
101,2016020102,-0.119048,-0.095238,1.238095,0.690476,-0.119048,-0.452381,-2.133333,3.595238,-2.47619
102,2016020103,-0.571429,1.285714,1.285714,-7.285714,0.0,0.142857,0.185714,-1.714286,0.142857
103,2016020104,-0.166667,1.666667,2.666667,-2.833333,-0.166667,-0.166667,-6.75,2.666667,0.5
104,2016020105,0.0,-1.0,9.5,-1.5,-1.0,-1.166667,4.1,2.0,0.0


## I Found a fucking critical mistake with our data processing of averages

I had changed min_periods from 2 to 1 and that messes up all of the calcs

In [341]:
dfMarg_combined = pd.concat([dfMarg[2017], dfMarg[2016], dfMarg[2015], dfMarg[2014]])
dfTeam_combined = pd.concat([teams_game_stats[i] for i in range(2014,2018)])
dfCombine = pd.merge(natastat, dfMarg_combined, on = 'game_id', suffixes = ['_natastat', '_dfResult_tmp'])
dfCombine_2018 = pd.merge(natstat2018, dfMarg[2018], on='game_id', suffixes = ['_natastat', '_dfResult_tmp'])

In [239]:
#display(natstat2018.head())
#display(dfMarg[2018].head())

In [342]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
display(natstat2017[natstat2017.game_id == 2017021000][['GF_avg', 'GA_avg']].head())
display(dfTeam[2017][dfTeam[2017].game_id == 2017021000]['goals_avg'].head())
display(dfCombine[dfCombine.game_id == 2017021000][['GF_avg', 'GA_avg', 'goals_avg']].head())
display(dfCombine[dfCombine.game_id == 2017021000]['GF_avg'] - dfCombine[dfCombine.game_id == 2017021000]['GA_avg'])
display(dfTeam[2017].columns)
display(natstat2017.columns)

Unnamed: 0,GF_avg,GA_avg
999,0.585577,-0.088702


12044    2.937500
12045    3.523077
Name: goals_avg, dtype: float64

Unnamed: 0,GF_avg,GA_avg,goals_avg
4689,0.585577,-0.088702,0.585577


4689    0.674279
dtype: float64

Index(['game_id', 'team_id', 'HoA', 'won', 'settled_in', 'head_coach', 'goals', 'shots', 'hits', 'pim', 'powerPlayOpportunities', 'powerPlayGoals', 'faceOffWinPercentage', 'giveaways', 'takeaways', 'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg', 'powerPlayOpportunities_avg', 'powerPlayGoals_avg', 'faceOffWinPercentage_avg', 'giveaways_avg', 'takeaways_avg'], dtype='object')

Index(['game_id', 'date', 'home', 'home_goals', 'away', 'away_goals', 'hoa', 'result', 'result_bool', 'Game', 'Team', 'TOI', 'CF_avg', 'CA_avg', 'CF%_avg', 'FF_avg', 'FA_avg', 'FF%_avg', 'SF_avg', 'SA_avg', 'SF%_avg', 'GF_avg', 'GA_avg', 'GF%_avg', 'xGF_avg', 'xGA_avg', 'xGF%_avg', 'SCF_avg', 'SCA_avg', 'SCF%_avg', 'HDCF_avg', 'HDCA_avg', 'HDCF%_avg', 'HDSF_avg', 'HDSA_avg', 'HDSF%_avg', 'HDGF_avg', 'HDGA_avg', 'HDGF%_avg', 'HDSH%_avg', 'HDSV%_avg', 'MDCF_avg', 'MDCA_avg', 'MDCF%_avg', 'MDSF_avg', 'MDSA_avg', 'MDSF%_avg', 'MDGF_avg', 'MDGA_avg', 'MDGF%_avg', 'MDSH%_avg', 'MDSV%_avg', 'LDCF_avg', 'LDCA_avg', 'LDCF%_avg', 'LDSF_avg', 'LDSA_avg', 'LDSF%_avg', 'LDGF_avg', 'LDGA_avg', 'LDGF%_avg', 'LDSH%_avg', 'LDSV%_avg', 'SH%_avg', 'SV%_avg', 'PDO_avg', 'blocks_avg'], dtype='object')

In [161]:
display(natstat2017[natstat2017.game_id == 2017021000])
display(dfTeam[2017][dfTeam[2017].game_id == 2017021000].head())

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,Team,TOI,CF_avg,CA_avg,CF%_avg,FF_avg,FA_avg,FF%_avg,SF_avg,SA_avg,SF%_avg,GF_avg,GA_avg,GF%_avg,xGF_avg,xGA_avg,xGF%_avg,SCF_avg,SCA_avg,SCF%_avg,HDCF_avg,HDCA_avg,HDCF%_avg,HDSF_avg,HDSA_avg,HDSF%_avg,HDGF_avg,HDGA_avg,HDGF%_avg,HDSH%_avg,HDSV%_avg,MDCF_avg,MDCA_avg,MDCF%_avg,MDSF_avg,MDSA_avg,MDSF%_avg,MDGF_avg,MDGA_avg,MDGF%_avg,MDSH%_avg,MDSV%_avg,LDCF_avg,LDCA_avg,LDCF%_avg,LDSF_avg,LDSA_avg,LDSF%_avg,LDGF_avg,LDGA_avg,LDGF%_avg,LDSH%_avg,LDSV%_avg,SH%_avg,SV%_avg,PDO_avg,blocks_avg
999,2017021000,2018-03-03,Lightning,7,Flyers,6,H,H_reg,1,"2018-03-03 - Flyers 6, Lightning 7",Tampa Bay Lightning,65.0,7.271394,-3.388702,4.294356,4.290385,-3.972596,4.519255,2.444712,-3.082452,4.1922,-0.034135,0.189423,-4.327325,0.098476,-0.257675,2.73105,2.853365,-2.126683,4.375363,-0.267067,-1.276683,2.727832,-0.474279,-0.939423,1.576288,-0.269471,0.289183,-9.919046,-2.945945,-5.14493,3.120433,-0.85,6.218558,2.033413,-0.319712,7.783856,0.259135,-0.232933,12.917661,1.377337,3.566344,4.459856,-1.804327,6.399697,1.492308,-1.949038,7.210445,0.021635,0.13149,-1.732875,-0.203228,-1.557007,-0.9623,-1.33068,-0.022902,0.583894


Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
12044,2017021000,4,away,False,SO,Dave Hakstol,6,43,16,4,5,1,59.7,10,3,2.9375,31.46875,20.84375,8.703125,3.296875,0.6875,53.382812,9.296875,5.796875
12045,2017021000,14,home,True,SO,Jon Cooper,6,35,16,12,1,1,40.3,6,12,3.523077,32.230769,20.553846,9.015385,3.476923,0.830769,47.64,8.461538,7.061538


In [36]:
0.352823+0.265625	

0.618448

In [117]:
dfCombine.columns

Index(['game_id', 'date', 'home', 'home_goals', 'away', 'away_goals', 'hoa',
       'result', 'result_bool', 'Game', 'Team', 'TOI', 'CF_avg', 'CA_avg',
       'CF%_avg', 'FF_avg', 'FA_avg', 'FF%_avg', 'SF_avg', 'SA_avg', 'SF%_avg',
       'GF_avg', 'GA_avg', 'GF%_avg', 'xGF_avg', 'xGA_avg', 'xGF%_avg',
       'SCF_avg', 'SCA_avg', 'SCF%_avg', 'HDCF_avg', 'HDCA_avg', 'HDCF%_avg',
       'HDSF_avg', 'HDSA_avg', 'HDSF%_avg', 'HDGF_avg', 'HDGA_avg',
       'HDGF%_avg', 'HDSH%_avg', 'HDSV%_avg', 'MDCF_avg', 'MDCA_avg',
       'MDCF%_avg', 'MDSF_avg', 'MDSA_avg', 'MDSF%_avg', 'MDGF_avg',
       'MDGA_avg', 'MDGF%_avg', 'MDSH%_avg', 'MDSV%_avg', 'LDCF_avg',
       'LDCA_avg', 'LDCF%_avg', 'LDSF_avg', 'LDSA_avg', 'LDSF%_avg',
       'LDGF_avg', 'LDGA_avg', 'LDGF%_avg', 'LDSH%_avg', 'LDSV%_avg',
       'SH%_avg', 'SV%_avg', 'PDO_avg', 'blocks_avg', 'goals_avg', 'shots_avg',
       'hits_avg', 'pim_avg', 'powerPlayOpportunities_avg',
       'powerPlayGoals_avg', 'faceOffWinPercentage_avg', 'give

In [343]:
natstat2018_marg.to_csv('temp2.csv')
dfCombine.to_csv('Tradition_stat_mrd10.csv')
dfCombine_2018.to_csv('Tradition_stat_2018_mrd10.csv')

In [165]:
get_start_id = lambda year: year*1000000+20001

dfs = dict()
for year in range(2014, 2019):
    temp = dfCombine[(dfCombine.game_id >= get_start_id(year)) &(dfCombine.game_id <= get_start_id(year+1))]
    dfs[year] = temp.iloc[int(25/82*len(temp.index)):]
dfs[2018] = dfCombine_2018.iloc[int(25/82*len(dfCombine_2018.index)):].reset_index(drop=True)

nhl2 = pd.concat([dfs[k] for k in range(2014, 2018)]).reset_index(drop=True)
nhl2.to_csv('Tradition_stat_mrd5.csv')
dfs[2018].to_csv('Tradition_stat_2018_mrd5.csv')

In [164]:
natstat2014_marg.to_csv('temp2014.csv')

In [120]:
display(dfCombine.head())
display(dfCombine.tail())

Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,blocks_avg,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
0,2017020001,2017-10-04,Jets,2,Maple Leafs,7,H,A_reg,0,"2017-10-04 - Maple Leafs 7, Jets 2",...,,,,,,,,,,
1,2017020002,2017-10-04,Penguins,4,Blues,5,H,A_OT,0,"2017-10-04 - Blues 5, Penguins 4",...,,,,,,,,,,
2,2017020003,2017-10-04,Oilers,3,Flames,0,H,H_reg,1,"2017-10-04 - Flames 0, Oilers 3",...,,,,,,,,,,
3,2017020004,2017-10-04,Sharks,3,Flyers,5,H,A_reg,0,"2017-10-04 - Flyers 5, Sharks 3",...,,,,,,,,,,
4,2017020005,2017-10-05,Bruins,4,Predators,3,H,H_reg,1,"2017-10-05 - Predators 3, Bruins 4",...,,,,,,,,,,


Unnamed: 0,game_id,date,home,home_goals,away,away_goals,hoa,result,result_bool,Game,...,blocks_avg,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
4956,2014021226,2015-04-11,Blues,4,Wild,2,H,H_reg,1,"2015-04-11 - Wild 2, Blues 4",...,-3.938272,0.123457,0.185185,6.320988,2.123457,0.08642,0.209877,3.967901,-1.222222,-0.580247
4957,2014021227,2015-04-11,Stars,4,Predators,1,H,H_reg,1,"2015-04-11 - Predators 1, Stars 4",...,3.358025,0.345679,-0.691358,-0.469136,1.209877,0.37037,0.148148,2.648148,1.925926,1.358025
4958,2014021228,2015-04-11,Avalanche,3,Blackhawks,2,H,H_reg,1,"2015-04-11 - Blackhawks 2, Avalanche 3",...,3.975309,-0.148148,-5.987654,7.962963,2.209877,-0.197531,-0.135802,-1.248148,-1.123457,0.753086
4959,2014021229,2015-04-11,Coyotes,1,Ducks,2,H,A_reg,0,"2015-04-11 - Ducks 2, Coyotes 1",...,0.950617,-0.765432,-0.62963,2.320988,-1.37037,-0.061728,0.111111,0.333333,-2.592593,0.135802
4960,2014021230,2015-04-11,Canucks,6,Oilers,5,H,H_reg,1,"2015-04-11 - Oilers 5, Canucks 6",...,-0.765432,0.518519,1.580247,-4.925926,2.703704,0.049383,0.061728,-1.238272,-2.54321,-1.246914


In [121]:
display(dfMarg[2015].head())
display(dfMarg[2015].tail())

Unnamed: 0,game_id,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
0,2015020001,,,,,,,,,
1,2015020002,,,,,,,,,
2,2015020003,,,,,,,,,
3,2015020004,,,,,,,,,
4,2015020005,,,,,,,,,


Unnamed: 0,game_id,goals_avg,shots_avg,hits_avg,pim_avg,powerPlayOpportunities_avg,powerPlayGoals_avg,faceOffWinPercentage_avg,giveaways_avg,takeaways_avg
1225,2015021226,-0.308642,-0.246914,-0.790123,1.358025,-0.185185,-0.049383,1.803704,-3.45679,-0.790123
1226,2015021227,0.493827,1.54321,-4.061728,-1.123457,0.037037,0.08642,1.355556,2.925926,1.17284
1227,2015021228,-0.160494,-0.814815,-3.481481,0.209877,0.123457,-0.037037,-3.158025,-2.728395,-1.580247
1228,2015021229,0.135802,2.444444,4.814815,-0.740741,-0.234568,0.111111,3.658025,-0.481481,-2.506173
1229,2015021230,0.345679,2.62963,-1.506173,-3.308642,-0.333333,0.098765,-3.851852,5.691358,1.333333


In [198]:
classifiers = {
    'log': {'clf': LogisticRegression(penalty='l1', solver='liblinear'),
            'params': {'C': np.arange(0.05, .5, 0.005)}
           },
    'rf': {'clf': RandomForestClassifier(),
           'params': {'n_estimators': [100, 500],
                      #'max_leaf_nodes': [4, 8, 16],
                      'max_leaf_nodes': [2,4, 8, 16],
                      'max_depth': [2,4, 8, 16],
                      'random_state': [42]
                     }
          }
}

In [203]:
# def our_interaction2(X):
#     X2 = X.copy()
#     #X2['int'] = 1
#     #X2['G_rat'] = np.where(X2['GA_avg'] != 0.0, X2['GF_avg'] / X2['GA_avg'], 0.0)
#     X2 = X2.drop(columns = ['GF_avg', 'GA_avg'])
#     X2['blocks-faceoff'] = X2['blocks_avg']*X2['faceOffWinPercentage_avg']
#     #X2['shots_avg-faceokk_avg'] = X2['shots_avg'] * X2['faceOffWinPercentage_avg']
#     #X2['SCF_avg-giveaways_avg'] = X2['SCF_avg'] * X2['giveaways_avg']
#     X2['xGF%_avg-takeaways_avg'] = X2['xGF%_avg'] * X2['takeaways_avg']
#     #X2['xGF%_avg-goals_avg'] = X2['xGF%_avg'] * X2['goals_avg']
    
#     return X2

def our_interaction(X):
    X2 = X.copy()
    #X2['a'] = X2['xGF%_avg']*X2['SCF_avg']
    X2['b'] = X2['goals_avg']*X2['SCF_avg']
    #X2['c'] = X2['blocks_avg']*X2['giveaways_avg']
    X2['d'] = X2['goals_avg']*X2['giveaways_avg']
    #X2['e'] = X2['shots_avg']*X2['giveaways_avg']
    #X2['f'] = X2['takeaways_avg']*X2['giveaways_avg']
    #X2['shots_avg-faceokk_avg'] = X2['shots_avg'] * X2['faceOffWinPercentage_avg']
    #X2['SCF_avg-giveaways_avg'] = X2['SCF_avg'] * X2['giveaways_avg']
    #X2['xGF%_avg-takeaways_avg'] = X2['xGF%_avg'] * X2['takeaways_avg']
    #X2['xGF%_avg-goals_avg'] = X2['xGF%_avg'] * X2['goals_avg']
    
    return X2

In [207]:
best_cols2 = ['GF_avg', 'GA_avg','xGF%_avg','SCF_avg','HDSF%_avg','HDGF%_avg','HDSH%_avg','HDSV%_avg','MDCA_avg','MDSF_avg','MDSF%_avg','MDGF%_avg','MDSH%_avg','LDCA_avg','LDCF%_avg','LDSF_avg','LDGF%_avg','LDSH%_avg','SH%_avg','SV%_avg','blocks_avg','goals_avg','shots_avg','hits_avg','pim_avg','faceOffWinPercentage_avg','giveaways_avg','takeaways_avg']
best_cols22 = ['xGF%_avg','SCF_avg','HDSF%_avg','HDGF%_avg','HDSH%_avg','HDSV%_avg','MDCA_avg','MDSF_avg','MDSF%_avg','MDGF%_avg','MDSH%_avg','LDCA_avg','LDCF%_avg','LDSF_avg','LDGF%_avg','SH%_avg','SV%_avg','blocks_avg','goals_avg','shots_avg','pim_avg','faceOffWinPercentage_avg','giveaways_avg','takeaways_avg']
best_cols23 = ['xGF%_avg','SCF_avg','SV%_avg','blocks_avg','goals_avg','shots_avg','pim_avg','faceOffWinPercentage_avg','giveaways_avg','takeaways_avg']
best_cols24 = ['GF_avg', 'GA_avg', 'xGF%_avg','SCF_avg', 'SV%_avg', 'blocks_avg','goals_avg','shots_avg','pim_avg','faceOffWinPercentage_avg','giveaways_avg','takeaways_avg']
best_cols = ['GF_avg', 'GA_avg', 'xGF%_avg', 'SCF_avg', 'blocks_avg', 'goals_avg', 'shots_avg', 'pim_avg', 'faceOffWinPercentage_avg', 'giveaways_avg', 'takeaways_avg']

best_cols_new_1 = ['CF_avg', 'FF_avg', 'FA_avg', 'GF%_avg', 'xGF%_avg', 'SCF_avg',
       'SCA_avg', 'SCF%_avg', 'HDCF_avg', 'HDCF%_avg', 'HDSF_avg', 'HDGF%_avg',
       'HDSH%_avg', 'HDSV%_avg', 'MDCA_avg', 'MDSF_avg', 'MDSF%_avg',
       'MDGF%_avg', 'MDSH%_avg', 'LDCA_avg', 'LDSF_avg', 'LDGF%_avg',
       'LDSH%_avg', 'LDSV%_avg', 'SH%_avg', 'SV%_avg', 'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'powerPlayOpportunities_avg', 'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']
best_cols_new_2 = ['xGF%_avg', 'SCF_avg',
       'HDSF_avg', 'HDGF%_avg',
       'HDSH%_avg', 'HDSV%_avg', 'MDCA_avg', 'MDSF_avg', 'MDSF%_avg',
       'MDGF%_avg', 'MDSH%_avg', 'LDCA_avg', 'LDSF_avg', 'LDGF%_avg',
       'LDSH%_avg', 'LDSV%_avg', 'SH%_avg', 'SV%_avg', 'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'powerPlayOpportunities_avg', 'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']

best_cols_new_3 = ['xGF%_avg', 'SCF_avg',
       'SH%_avg', 'SV%_avg', 'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'powerPlayOpportunities_avg', 'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']

best_cols_new_4 = ['xGF%_avg', 'SCF_avg',
       'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'powerPlayOpportunities_avg', 'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']

best_cols_new_5 = ['xGF%_avg', 'SCF_avg',
       'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']

best_cols_new_6 = ['xGF%_avg', 'SCF_avg',
       'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'faceOffWinPercentage_avg',
       'giveaways_avg', 'takeaways_avg']

best_cols_new_7 = ['xGF%_avg', 'SCF_avg',
       'blocks_avg',
       'goals_avg', 'shots_avg', 'hits_avg', 'pim_avg',
       'faceOffWinPercentage_avg',
       'giveaways_avg']

best_cols_new_8 = ['xGF%_avg', 'SCF_avg',
       'blocks_avg',
       'goals_avg', 'shots_avg', 'pim_avg',
       'faceOffWinPercentage_avg',
       'giveaways_avg']


X = dfCombine.dropna().loc[:, best_cols_new_8]
X['h'] = 1
y = dfCombine.iloc[X.index.values].result_bool

#pf = PolynomialFeatures(degree = 2, interaction_only = True)
#tdf = pd.DataFrame(pf.fit_transform(X))
#print(t.shape, tdf.shape)
X2 = our_interaction(X)
#X2 = tdf

#print(X.shape, X2.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.25, random_state=42)

grid_search = GridSearchCV(classifiers['log']['clf'],
                           classifiers['log']['params'],
                           cv = 10, scoring = 'neg_log_loss', return_train_score=True)


In [208]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l1',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([0.0...
       0.23 , 0.235, 0.24 , 0.245, 0.25 , 0.255, 0.26 , 0.265, 0.27 ,
       0.275, 0.28 , 0.285, 0.29 , 0.295, 0.3  , 0.305, 0.31 , 0.315,
       0.32 , 0.325, 0.33 , 0.335, 0.34 , 0.345, 0.35 , 0.355, 0.36 ,
       0.365, 0.37 , 0.375, 0.38 , 0.385, 0.39 , 0.395, 0.4  , 0.405,
       0.41 ,

In [209]:
log_best2 = grid_search.best_estimator_
print(log_best2)

X_test_act = dfCombine_2018.loc[:, best_cols_new_8].dropna().copy()
X_test_act['h'] = 1
y_test_act = dfCombine_2018.iloc[X_test_act.index.values].result_bool

t2df = pd.DataFrame(pf.fit_transform(X_test_act))
X2_test_act = pd.concat([X_test_act.reset_index(drop=True), t2df.reset_index(drop=True)], axis = 1)
X2_test_act = t2df
X2_test_act = our_interaction(X_test_act)




print(log_best2.coef_)
print(X_train.columns[log_best2.coef_[0,:]!=0.0])
print(X_train.columns[log_best2.coef_[0,:]==0.0])

print(log_best2.score(X_test, y_test), log_loss(y, log_best2.predict_proba(X2)[:,1]))
print(log_best2.score(X2_test_act, y_test_act), log_loss(y_test_act, log_best2.predict_proba(X2_test_act)[:,1]))
#print(log_loss(y, log_best2.predict_proba(X)[:,1]))
#print(log_loss(y_test_act, log_best2.predict_proba(X_test_act)[:,1]))

LogisticRegression(C=0.2649999999999999, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
[[-0.01466674  0.02172763  0.01374207  0.26670216  0.03668615  0.02216411
  -0.00149777  0.01393147  0.01234436 -0.03144612 -0.03484016]]
Index(['xGF%_avg', 'SCF_avg', 'blocks_avg', 'goals_avg', 'shots_avg',
       'pim_avg', 'faceOffWinPercentage_avg', 'giveaways_avg', 'h', 'b', 'd'],
      dtype='object')
Index([], dtype='object')
0.5729508196721311 0.679244005244263
0.547923322683706 0.6926238940204316
