In [1]:
import requests
import json
import pandas
import numpy
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Setup
### Retrieving the data from the API
We will start this notebook by using the same function as in the previous post to obtain game data from the live feed endpoint of the NHL API, and do so for the first 200 games of the 2019-2020 season (the retrieval loops below request game numbers 0001 through 0200). 

This time, we will define a function to process each game output:

In [2]:
def processApiOutput(responseRaw):
    """Flatten the ``allPlays`` list of one live-feed response into a dataframe.

    Parameters
    ----------
    responseRaw : list of dict
        Play entries, as read from ``['liveData']['plays']['allPlays']``.
        Entries may hold nested dictionaries (expanded into ``<key>_<subkey>``
        columns) and an optional ``players`` list whose first two elements
        are split out as ``player1`` / ``player2``.

    Returns
    -------
    pandas.DataFrame
        One row per play, restricted to the fixed columns in ``outputColumns``.
        A column that cannot be derived from the input (e.g. no play lists a
        second player) is returned filled with NaN, and an empty input yields
        an empty dataframe carrying those same columns.
    """
    outputColumns = ['result_event','result_eventCode', 'result_eventTypeId',
                     'result_description','about_eventIdx', 'about_eventId',
                     'about_period', 'about_periodType','about_ordinalNum',
                     'about_periodTime', 'about_periodTimeRemaining',
                     'about_dateTime', 'about_goals', 'coordinates_x', 'coordinates_y',
                     'team_name', 'team_triCode', 'player1_id','player1_fullName',
                     'player1_type','player2_id', 'player2_fullName','player2_type',]

    def _asDict(value):
        #missing entries show up as NaN/None; treat them as an empty dictionary
        return value if isinstance(value, dict) else {}

    def _playerEntry(players, position):
        #return the entry at `position` of a play's players list, or {} when absent
        if isinstance(players, (list, tuple)) and len(players) > position:
            return _asDict(players[position])
        return {}

    #convert full response to dataframe
    responseFinal = pandas.DataFrame(responseRaw).reset_index(drop=True)
    if responseFinal.empty:
        return pandas.DataFrame(columns=outputColumns)

    #split out player info into 2 columns per player (details dictionary + player type)
    if 'players' in responseFinal.columns:
        playersColumn = responseFinal['players'].tolist()
    else:
        playersColumn = [None] * len(responseFinal)

    for position, label in enumerate(['player1', 'player2']):
        entries = [_playerEntry(players, position) for players in playersColumn]
        responseFinal[label] = pandas.Series([_asDict(entry.get('player')) for entry in entries],
                                             index=responseFinal.index, dtype=object)
        responseFinal[label + '_type'] = pandas.Series([entry.get('playerType', float('nan')) for entry in entries],
                                                       index=responseFinal.index, dtype=object)

    #expand every column that holds dictionaries into one prefixed column per key;
    #the whole column is inspected (not a single row) so short responses work too
    pieces = [responseFinal]
    for column in responseFinal.columns:
        values = responseFinal[column].tolist()
        if not any(isinstance(value, dict) for value in values):
            continue
        tempData = pandas.DataFrame([_asDict(value) for value in values])
        tempData.columns = [str(column) + '_' + str(key) for key in tempData.columns]
        pieces.append(tempData)

    responseFinal = pandas.concat(pieces, axis=1)

    #reindex (rather than select) so that columns absent from this game come back as NaN
    return responseFinal.reindex(columns=outputColumns)

We then use this function to retrieve the live feed data for the first 200 games: 

In [3]:
#collect one processed dataframe per game, then combine them in a single step
#(DataFrame.append was removed in pandas 2.0 and re-copies all rows on every call)
gameFrames = []

for i in range(1,201):
    #fill game id with necessary number of leading 0s
    gameNum = str(i).zfill(4)

    #make API call and retrieve response; fail loudly on HTTP errors or a hung connection
    url = 'https://statsapi.web.nhl.com/api/v1/game/201902' + gameNum + '/feed/live'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    responseRaw = response.json()['liveData']['plays']['allPlays']

    #process output
    tempData = processApiOutput(responseRaw)

    #tag every play with the game it belongs to
    tempData['gameId'] = gameNum
    gameFrames.append(tempData)

#one row per play across all retrieved games
gameData = pandas.concat(gameFrames, ignore_index=True)
    

### Additional data

To the live feed data, we add some information taken from other sections of the output, mainly from the player listing (e.g. player position) and the game information (e.g. home vs away team). 

#### Game information

In [4]:
#collect one record per game, then build the dataframe in a single step
teamRecords = []

for i in range(1,201):
    #fill game id with necessary number of leading 0s
    gameNum = str(i).zfill(4)

    #retrieve API response once per game (both team codes come from the same payload)
    url = 'https://statsapi.web.nhl.com/api/v1/game/201902' + gameNum + '/feed/live'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    gameTeams = response.json()['gameData']['teams']

    #get team info for each game
    teamRecords.append({
        'gameId': gameNum,
        'awayTeam': gameTeams['away']['abbreviation'],
        'homeTeam': gameTeams['home']['abbreviation']
    })

teams = pandas.DataFrame(teamRecords, columns=['gameId', 'awayTeam', 'homeTeam'])

#join team data onto main dataframe
gameData = pandas.merge(gameData,teams,how='inner',left_on='gameId',right_on='gameId')

#keep only shot and goal data; copy so that later column assignments do not act on a slice
gameData = gameData[gameData['result_eventTypeId'].isin(['SHOT','GOAL'])].copy()

#### Player information

In [5]:
#unique ids of every player credited as first player on a shot or goal
allPlayers = gameData['player1_id'].dropna().unique().tolist()

#collect one record per player, then build the dataframe in a single step
playerRecords = []

for playerId in allPlayers:
    #retrieve API response (a single request per player)
    url = 'https://statsapi.web.nhl.com/api/v1/people/' + str(int(playerId))
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    #get info for each player
    playerRecords.append({
        'playerId': playerId,
        'playerPosition': res.json()['people'][0]['primaryPosition']['abbreviation']
    })

players = pandas.DataFrame(playerRecords, columns=['playerId', 'playerPosition'])

#join player data onto main dataframe (inner join: events without a player1_id are dropped)
gameData = pandas.merge(gameData,players,how='inner',left_on='player1_id',right_on='playerId')


### Data overview

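With the data from these 50 games, we have games from all 31 teams, with a total of 3,193 shot events (goals and shots on goal). Missed and blocked shots have been purposefully left out as the event description does not specify what kind of shot was taken (e.g. backhand vs tip-in vs slapshot). Note: the figures quoted in this section appear to date from an earlier 50-game run; the retrieval loops above request 200 games, and the shot-type counts displayed further below sum to 12,680 events. 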

Of the 3193 shots, 2872 were shots on goal and 321 were actual goals (or almost exactly 10%).

In [6]:
#preview the first rows of the merged shot/goal data (play, team and player columns)
gameData.head()

Unnamed: 0,result_event,result_eventCode,result_eventTypeId,result_description,about_eventIdx,about_eventId,about_period,about_periodType,about_ordinalNum,about_periodTime,...,player1_fullName,player1_type,player2_id,player2_fullName,player2_type,gameId,awayTeam,homeTeam,playerId,playerPosition
0,Goal,TOR10,GOAL,"Brady Tkachuk (1) Tip-In, assists: Connor Brow...",4,10,1,REGULAR,1st,00:25,...,Brady Tkachuk,Scorer,8477015.0,Connor Brown,Assist,1,OTT,TOR,8480801.0,LW
1,Shot,TOR210,SHOT,Brady Tkachuk Wrist Shot saved by Frederik And...,75,210,1,REGULAR,1st,13:27,...,Brady Tkachuk,Shooter,8475883.0,Frederik Andersen,Goalie,1,OTT,TOR,8480801.0,LW
2,Shot,OTT22,SHOT,Brady Tkachuk Wrap-around saved by Alexandar G...,19,22,1,REGULAR,1st,03:10,...,Brady Tkachuk,Shooter,8480382.0,Alexandar Georgiev,Goalie,20,NYR,OTT,8480801.0,LW
3,Shot,OTT39,SHOT,Brady Tkachuk Snap Shot saved by Alexandar Geo...,42,39,1,REGULAR,1st,07:41,...,Brady Tkachuk,Shooter,8480382.0,Alexandar Georgiev,Goalie,20,NYR,OTT,8480801.0,LW
4,Goal,OTT202,GOAL,"Brady Tkachuk (2) Deflected, assists: Thomas C...",71,202,1,REGULAR,1st,12:14,...,Brady Tkachuk,Scorer,8478469.0,Thomas Chabot,Assist,20,NYR,OTT,8480801.0,LW


### Defining the dependent variable

The NHL tracks 7 different types of shots: tip-in, wrist shot, slap shot, snap shot, backhand, wrap-around, and deflected. These are specified in the event description, in different formats depending on whether the event is a shot or a goal. 

To extract the shot type from each description, we look for a partial string match: 

In [7]:
#categorize each shot based on the description field
#(substring searched for, shot type label), listed in matching priority order
shotTypePatterns = [('Slap', 'Slap shot'),
                    ('Snap', 'Snap shot'),
                    ('Backhand', 'Backhand'),
                    ('Tip-In', 'Tip-in'),
                    ('Wrist Shot', 'Wrist Shot'),
                    ('Deflected', 'Deflected'),
                    ('Wrap-around', 'Wrap-around')]

#na=False: a missing description matches no pattern and therefore falls through to 'Other'
#(without it the mask holds NaN, which numpy treats as True and labels as the first shot type)
shotDescriptions = gameData['result_description']
shotTypeMatches = [shotDescriptions.str.contains(pattern, regex=False, na=False).to_numpy(dtype=bool)
                   for pattern, _ in shotTypePatterns]
shotTypeLabels = [label for _, label in shotTypePatterns]

#numpy.select keeps the first matching pattern for each row
gameData['shotType'] = numpy.select(shotTypeMatches, shotTypeLabels, default='Other')

#validate that no shot ended up being categorized as other
set(gameData['shotType'])

{'Backhand',
 'Deflected',
 'Slap shot',
 'Snap shot',
 'Tip-in',
 'Wrap-around',
 'Wrist Shot'}

## Exploratory data analysis and feature engineering
### Motivations
The following subsections have one primary purpose: to get a basic understanding of how shot type relates to the other variables present in the data set. This exercise will help determine which variables we expect to be valuable predictors in the model, and how we expect them to influence the model. 

For example, we might make the assumption that shots taken from near the blue line are more likely to be slap shots than other shot types. Making a chart of shot type vs coordinates can help confirm whether that intuition is valid or not. 

Once we have a model, we can then also look at the sign and magnitude of the relevant coefficient, and see whether it falls in line with the initial assumption (and if not, try to understand why - whether the issue is with the variable definition, the model itself, or simply our assumption being partially incorrect).   

At the same time, this exploratory analysis will help us think of variables that are not currently included in the data set but that we would expect to be predictive of shot type. 

Similarly to the blue line example mentioned above, we could imagine player position mattering (e.g. defensemen are more likely to take slap shots than make tip-ins). While player position is not currently in our data set, it's something that we can obtain. 

Other variables can also be constructed using already available data (e.g. using coordinates to identify the zone the shot was taken in, or score to determine whether the team taking the shot was leading or trailing). All of these fall under feature engineering. 

### Event breakdown by shot type
Wrist shots are by far the most common shot type, followed quite distantly by slap shots and snap shots. 

In [8]:
#number of events recorded for each shot type
gameData.groupby('shotType')['about_eventId'].count().reset_index()

Unnamed: 0,shotType,about_eventId
0,Backhand,1034
1,Deflected,247
2,Slap shot,1720
3,Snap shot,1677
4,Tip-in,707
5,Wrap-around,114
6,Wrist Shot,7181


### Shot type by period
Quick observations:
* More shots taken in the 2nd period than 1st and 3rd
* In all cases, wrist shots account for over 50% of all shots taken
* Looks like the proportion of snap shots taken in the 3rd period is lower than in the other (regular time) periods
* Deflected shots occurred more often in the 2nd period than in the 1st and 3rd

In [9]:
#label every event with its period type and period number joined by a dash
periodNumber = gameData['about_period'].astype(str)
gameData['about_periodDetailed'] = gameData['about_periodType'].str.cat(periodNumber, sep='-')

#drop shootout and overtime events
excludedPeriodTypes = ['SHOOTOUT', 'OVERTIME']
gameData = gameData[~gameData['about_periodType'].isin(excludedPeriodTypes)]

#count events per (period, shot type) and pivot shot types into columns
periodCounts = gameData.groupby(['about_periodDetailed','shotType'])[['about_eventId']].count()
graphData = periodCounts.unstack().fillna(0)

#stacked bar chart, with the legend moved to the right of the axes
periodAxes = graphData.plot(kind='bar',stacked=True)
periodAxes.legend(bbox_to_anchor=(1.2, 0.5))


<matplotlib.legend.Legend at 0x12e68c990>

### Shot type by score (leading vs trailing vs tied)
Quick observations: 
* Slapshots were more commonly taken when the team is trailing, rather than leading or tied
* Fewer wrist shots were taken when teams were tied, but comparable proportions when leading or trailing
* The frequency of certain shot types, such as backhands and tip-ins, appears to be virtually identical across score states

In [10]:
#pull the home and away goal totals out of the per-event score dictionary
gameData['about_homeGoals'] = gameData['about_goals'].map(lambda goals: goals['home'])
gameData['about_awayGoals'] = gameData['about_goals'].map(lambda goals: goals['away'])

#which side is credited with the event, and whether the event is a goal
isGoal = gameData['result_eventTypeId']=='GOAL'
shooterIsHome = gameData['team_triCode']==gameData['homeTeam']
shooterIsAway = gameData['team_triCode']==gameData['awayTeam']

#a goal event already includes itself in the score: subtract it to get the score just before the goal
gameData['about_homeGoals'] = numpy.where(isGoal & shooterIsHome,
                                          gameData['about_homeGoals']-1,
                                          gameData['about_homeGoals'])
gameData['about_awayGoals'] = numpy.where(isGoal & shooterIsAway,
                                          gameData['about_awayGoals']-1,
                                          gameData['about_awayGoals'])

#score state from the point of view of the team taking the shot
homeGoals = gameData['about_homeGoals']
awayGoals = gameData['about_awayGoals']
scoreConditions = [(homeGoals==awayGoals).to_numpy(),
                   ((homeGoals>awayGoals) & shooterIsHome).to_numpy(),
                   ((homeGoals<awayGoals) & shooterIsAway).to_numpy()]
gameData['team_scoreState'] = numpy.select(scoreConditions,
                                           ['Tied', 'Leading', 'Leading'],
                                           default='Trailing')


In [11]:
#get counts by score state and shot type (margins=True adds the "All" row and column)
graphData = pandas.crosstab(index=gameData["shotType"],
                            columns=gameData["team_scoreState"],
                            margins=True)

#display as proportions of each column total
#(.loc replaces the .ix indexer, which was deprecated and then removed from pandas)
graphData/graphData.loc["All"]


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  import sys


team_scoreState,Leading,Tied,Trailing,All
shotType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Backhand,0.08465,0.081316,0.076575,0.080632
Deflected,0.021966,0.019506,0.017671,0.019594
Slap shot,0.128851,0.131375,0.148618,0.136752
Snap shot,0.132065,0.140306,0.123924,0.131995
Tip-in,0.055451,0.054759,0.059357,0.056604
Wrap-around,0.010715,0.009871,0.00725,0.009192
Wrist Shot,0.566301,0.562867,0.566606,0.565231
All,1.0,1.0,1.0,1.0


### Shot type by location
For the sake of simplicity, we simply initially look at the average X and Y coordinates of each shot type. In a later step, we use those coordinates to create new features that categorize each shot more broadly (e.g. below the face-off dots, below the goal line, in the neutral zone). 

For context, the y-axis ranges from -42 to +42 (left to right when facing the goalie), and the x-axis from -97 to +97. 

Because teams switch the end they shoot at between periods, we'll first need to adjust the data slightly, so that the coordinates are consistent across periods, but also directly comparable between home and away teams. To make matters more complicated, different arenas use different coordinate systems (x-axis oriented in the opposite direction, for example). 

We first determine which arenas/teams used which system: 

In [12]:
#data used to determine which coordinate system each team uses in their home arena
#NOTE(review): the filtered view on the next-but-one line is not the last expression of the cell,
#so it is not displayed; it looks like a leftover of a manual, team-by-team inspection ('WSH' here)
a = gameData[['gameId','about_period','team_triCode','homeTeam','awayTeam','result_description','shotType','coordinates_x','coordinates_y']]
a[(a['shotType']=='Slap shot') & (a['homeTeam']=='WSH') & (a['homeTeam']==a['team_triCode']) & (a['about_period'].isin([1,3]))][['gameId','shotType','homeTeam','coordinates_x']]

#home team shoots negative in first period
#NOTE(review): 'COL' is listed twice (harmless for isin, but possibly a typo for another team)
xHomeNegative = ['CAR','COL','DET','FLA','PHI','COL','TOR','VAN','VGK','MTL','NYR','CHI','LAK','MIN','WPG','WSH']

#home team shoots positive in first period
#NOTE(review): the two lists hold 28 distinct codes while the text mentions 31 teams;
#BOS, CGY and DAL seem to be in neither list, so their home games would not be adjusted - confirm
xHomePositive = ['BUF','CBJ','EDM','NJD','NYI','PIT','STL','ANA','ARI','NSH','OTT','SJS','TBL']

And then we adjust the data to have consistent coordinate references: 

In [13]:
#building blocks of the flip rule: who is shooting, in which period, in which kind of arena
shooterIsHome = gameData['team_triCode']==gameData['homeTeam']
shooterIsAway = gameData['team_triCode']==gameData['awayTeam']
secondPeriod = gameData['about_period']==2
arenaNegative = gameData['homeTeam'].isin(xHomeNegative)
arenaPositive = gameData['homeTeam'].isin(xHomePositive)

#a shot is mirrored when it falls in exactly one of these four cases:
# - xHomeNegative arena: home team outside the 2nd period, away team in the 2nd period
# - xHomePositive arena: home team in the 2nd period, away team outside the 2nd period
flipCoordinates = ((~secondPeriod & arenaNegative & shooterIsHome)
                   | (secondPeriod & arenaNegative & shooterIsAway)
                   | (secondPeriod & arenaPositive & shooterIsHome)
                   | (~secondPeriod & arenaPositive & shooterIsAway))

#mirror both axes for the selected shots
#(the columns are overwritten in place, so running this cell twice would flip them back)
for axisColumn in ['coordinates_x', 'coordinates_y']:
    gameData[axisColumn] = numpy.where(flipCoordinates, (-1*gameData[axisColumn]), gameData[axisColumn])

#average adjusted location of each shot type
gameData.groupby('shotType')[['coordinates_x','coordinates_y']].mean()

Unnamed: 0_level_0,coordinates_x,coordinates_y
shotType,Unnamed: 1_level_1,Unnamed: 2_level_1
Backhand,67.519,-2.079
Deflected,61.238683,-0.72428
Slap shot,40.364976,0.376769
Snap shot,53.403177,0.114233
Tip-in,65.910256,0.066952
Wrap-around,73.859649,-0.263158
Wrist Shot,51.383452,0.466049


Quick observations:
* In terms of left-to-right, most shot types have an average y coordinate very close to 0, which makes sense. The closest thing to an exception are backhands, which on average are taken from slightly more to the left. 
* In terms of distance across the rink, slap shots came the furthest away from the goal (or closest to centre ice, which is the axis origin). Wrap-arounds, also unsurprisingly, came from closest to the goal line, while other shot types sit in between those extremes.  

In addition to this, we also create a few new categorical variables, based on the adjusted coordinates: 

In [14]:
##Y-axis: lane of the shot (left / middle / right), split at y = -14 and y = 14
yCoord = gameData['coordinates_y']
gameData['yCoordCategory'] = numpy.select([(yCoord<(-14)).to_numpy(), (yCoord<14).to_numpy()],
                                          ['Left', 'Middle'],
                                          default='Right')

##X-axis: depth of the shot; the first matching rule wins
## x < 25 -> above the blue line, x >= 89 -> below the goal line,
## x >= 69 -> below the face-off dots, anything else -> below the blue line
xCoord = gameData['coordinates_x']
gameData['xCoordCategory'] = numpy.select([(xCoord<25).to_numpy(), (xCoord>=89).to_numpy(), (xCoord>=69).to_numpy()],
                                          ['Above blue line', 'Below goal line', 'Below face-off dots'],
                                          default='Below blue line')

### Shot type by player position

Quick observations: 
* Unsurprisingly, snap shots and slapshots were much more commonly taken by defensemen than forwards
* Conversely, forwards (whether centers or wingers) took a greater proportion of the wrist and backhanded shots
* No significant difference between left and right wing, some smaller ones between wingers and centers (e.g. smaller proportion of slapshots taken by centers)

In [59]:
#get counts by player position and shot type (margins=True adds the 'All' row and column)
graphData = pandas.crosstab(index=gameData['shotType'],
                            columns=gameData['playerPosition'],
                            margins=True)

#display as proportions of each column total
#(.loc replaces the .ix indexer, which was deprecated and then removed from pandas)
graphData/graphData.loc['All']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  import sys


playerPosition,C,D,G,LW,RW,All
shotType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Backhand,0.101474,0.03137,0.0,0.100738,0.100705,0.080632
Deflected,0.027273,0.003853,0.0,0.02952,0.019134,0.019594
Slap shot,0.065602,0.289763,0.0,0.078229,0.082578,0.136752
Snap shot,0.12973,0.124656,0.0,0.137269,0.143001,0.131995
Tip-in,0.07543,0.009081,0.0,0.07786,0.076032,0.056604
Wrap-around,0.012285,0.002201,0.0,0.014022,0.009063,0.009192
Wrist Shot,0.588206,0.539075,1.0,0.562362,0.569486,0.565231
All,1.0,1.0,1.0,1.0,1.0,1.0


## Data pre-processing
Before training the models, we'll clean up the data slightly to remove some of the unnecessary variables, and reformat others. 

We'll start by transforming and/or reformatting a few variables:

In [63]:
#work on an independent copy so that the original gameData is left untouched
#(a plain assignment would only create a second name for the same dataframe)
fullData = gameData.copy()

#create function to convert MM:SS timestamp to minutes with decimals
def convert_time(string):
    """Convert a ``MM:SS`` timestamp into minutes expressed as a float.

    The minutes are everything before the first colon, so non-padded values
    such as ``'5:30'`` are accepted as well; the seconds are the two
    characters that follow the colon.

    Parameters
    ----------
    string : str
        Timestamp such as ``'13:27'``.

    Returns
    -------
    float
        Minutes plus seconds/60, e.g. ``'13:27'`` -> ``13.45``.

    Raises
    ------
    ValueError
        If the minutes or seconds part is not an integer (including a
        timestamp without a colon).
    """
    minutes, _, seconds = string.partition(':')
    return float(int(minutes) + int(seconds[:2]) / 60)

#time remaining in period (in minutes)
#values still in MM:SS text form are converted; values that are already numeric are left as-is,
#so the cell works on a fresh kernel and can also be re-run safely
fullData['about_periodTimeRemaining'] = fullData['about_periodTimeRemaining'].map(
    lambda value: convert_time(value) if isinstance(value, str) else value)

#time remaining in game (in minutes): 20 minutes for every full regulation period still to be played,
#plus the time left in the current period (about_ordinalNum holds '1st' / '2nd' / '3rd')
fullData['about_gameTimeRemaining'] = numpy.where(fullData['about_ordinalNum']=='1st',40+fullData['about_periodTimeRemaining'],
            numpy.where(fullData['about_ordinalNum']=='2nd',20+fullData['about_periodTimeRemaining'],
                        fullData['about_periodTimeRemaining']))

#goal difference from the point of view of the team taking the shot
fullData['goalDiff'] = numpy.where(fullData['team_triCode']==fullData['homeTeam'],fullData['about_homeGoals']-fullData['about_awayGoals'],
                                   fullData['about_awayGoals']-fullData['about_homeGoals'])


We'll then convert all categorical variables to numeric, using dummy variables: 

In [67]:
#columns holding categories that need to be turned into indicator (dummy) columns
catVars = ['about_ordinalNum','team_scoreState','yCoordCategory','xCoordCategory','playerPosition']

#attach the indicator columns of each categorical column, prefixed with the column name
for catCol in catVars:
    fullData = fullData.join(pandas.get_dummies(fullData[catCol], prefix=catCol))

#drop the original categorical columns now that they are encoded
fullData = fullData[[colName for colName in fullData.columns if colName not in catVars]]

#column names: spaces become underscores, dashes are removed
fullData.columns = [colName.replace(' ','_').replace('-','') for colName in fullData.columns]

Then we'll subset the dataframe to keep only the columns corresponding to our dependent variable, and the predictors we will be using in the model: 

In [69]:
#predictors fed to the model (listed once and reused below)
predictorCols = ['about_periodTimeRemaining','about_gameTimeRemaining',
                 'coordinates_x', 'coordinates_y',
                 'about_homeGoals', 'about_awayGoals','goalDiff',
                 'about_ordinalNum_1st','about_ordinalNum_2nd', 'about_ordinalNum_3rd',
                 'team_scoreState_Leading', 'team_scoreState_Tied',
                 'team_scoreState_Trailing', 'yCoordCategory_Left',
                 'yCoordCategory_Middle', 'yCoordCategory_Right',
                 'xCoordCategory_Above_blue_line', 'xCoordCategory_Below_blue_line',
                 'xCoordCategory_Below_faceoff_dots', 'xCoordCategory_Below_goal_line',
                 'playerPosition_C', 'playerPosition_D', 'playerPosition_G',
                 'playerPosition_LW', 'playerPosition_RW']

#keep the dependent variable, two reference columns and the predictors
fullData = fullData[['shotType', 'result_description', 'about_eventIdx'] + predictorCols]

#dependent variable: the shot type label
depVar = fullData['shotType']

#independent variables
predVar = fullData[predictorCols]

And finally split the data into training and testing dataset: 

In [77]:
#convert predictors (2-D) and target (1-D) to numpy arrays, then hold out 30% of the rows for testing
#each object's own to_numpy() is used: depVar is a Series, so routing it through
#DataFrame.to_numpy depends on pandas internals
xTrain, xTest, yTrain, yTest = train_test_split(predVar.to_numpy(),
                                                depVar.to_numpy(),
                                                test_size=0.3, random_state=0)