In [1]:
import pandas as pd
import numpy as np
import math
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F

import matplotlib.pyplot as plt
import time

In [2]:
# Root folder that contains the per-season PitchFX CSV directories.
base_dir = "Data/"

# Umpire-call label -> integer class id. Id 3 ('<PAD>') is the filler value
# used for time steps that do not correspond to a real pitch.
label2one = dict(zip(('B', 'S', 'X', '<PAD>'), range(4)))
# Inverse lookup, derived from label2one so the two can never disagree.
one2label = {class_id: label for label, class_id in label2one.items()}

def normalize(data):
    """Standardize ``data`` along axis 0.

    Args:
        data: array-like of numbers; statistics are computed over axis 0.

    Returns:
        Tuple ``(mu, std, normalized)`` where ``mu`` and ``std`` are the
        mean and standard deviation along axis 0 and ``normalized`` is
        ``(data - mu) / std``. Columns whose standard deviation is exactly
        zero are only centred (they come out as zeros) instead of being
        divided by zero, which previously produced NaN/inf values.
    """
    mu = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Divide constant columns by 1 so they stay finite; the returned ``std``
    # is still the true (zero) value.
    safe_std = np.where(std == 0, 1.0, std)
    return mu, std, (data - mu) / safe_std

def _label_to_index(label):
    """Return the integer class id stored in ``label2one`` for one label."""
    return label2one[label]


# Element-wise version of the lookup, applied to whole label arrays.
vfunc = np.vectorize(_label_to_index)

# Columns read from every PitchFX CSV, grouped by theme. The order matters:
# later cells address these columns by position.
# Other CSV columns that appeared in earlier, now disabled, selections
# (for example 'pitch_type', 'px', 'pz', 'start_speed', 'spin_rate',
# 'break_angle') are not selected here.
input_labels = (
    ['date', 'stadium', 'inning', 'side']
    + ['pitcher', 'batter']
    + ['on_1b', 'on_2b', 'on_3b']
    + ['pitch_count', 'balls', 'strikes']
)
feature_length = len(input_labels) - 3
print("Feature length:{}".format(feature_length))

# Last digit of each season year; it is substituted into the
# "MLB_201{0}" path template when the training files are loaded.
train_years = [4, 5, 6]
# NOTE(review): dev_years is not referenced anywhere else in the visible notebook.
dev_years = [7]


def _read_pitch_csv(path):
    """Read one PitchFX CSV file.

    Returns:
        ``(frame, features, labels)`` where ``frame`` is the full DataFrame,
        ``features`` holds the ``input_labels`` columns as a NumPy array and
        ``labels`` is the 'umpcall' column mapped to integer ids by ``vfunc``.
    """
    print("Loading {}".format(path))
    frame = pd.read_csv(path)
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0
    # and returns the same array on older versions.
    features = frame[input_labels].values
    labels = vfunc(frame['umpcall'].values)
    return frame, features, labels


x_parts = []
y_parts = []
rows_loaded = 0
for y in train_years:
    # Regular season first, then post season, exactly as before.
    for season in ("RegularSeason", "PostSeason"):
        filename = base_dir + "MLB_201{0}/MLB_PitchFX_201{0}_{1}.csv".format(y, season)
        # ``f`` keeps pointing at the most recently loaded frame.
        f, tmp_x, tmp_y = _read_pitch_csv(filename)

        if x_parts:
            # Same progress report as before: accumulated shape, then the
            # shape of the chunk that is about to be appended.
            print((rows_loaded, tmp_x.shape[1]))
            print(tmp_x.shape)

        x_parts.append(tmp_x)
        y_parts.append(tmp_y)
        rows_loaded += tmp_x.shape[0]

# Concatenate once at the end instead of re-copying the growing array for
# every file.
train_x = np.concatenate(x_parts, axis=0)
train_y = np.concatenate(y_parts, axis=0)



Feature length:9
Loading Data/MLB_2014/MLB_PitchFX_2014_RegularSeason.csv
Loading Data/MLB_2014/MLB_PitchFX_2014_PostSeason.csv
(658428, 12)
(8837, 12)
Loading Data/MLB_2015/MLB_PitchFX_2015_RegularSeason.csv
(667265, 12)
(672271, 12)
Loading Data/MLB_2015/MLB_PitchFX_2015_PostSeason.csv
(1339536, 12)
(10277, 12)
Loading Data/MLB_2016/MLB_PitchFX_2016_RegularSeason.csv
(1349813, 12)
(703809, 12)
Loading Data/MLB_2016/MLB_PitchFX_2016_PostSeason.csv
(2053622, 12)
(10076, 12)


In [3]:
# Load the 2017 regular-season and post-season files and stack them into a
# single test set (regular season first).
test_parts_x = []
test_parts_y = []
for season in ("RegularSeason", "PostSeason"):
    filename = base_dir + "MLB_2017/MLB_PitchFX_2017_{}.csv".format(season)
    print("Loading test file {}".format(filename))
    f2 = pd.read_csv(filename)

    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    tmp_x = f2[input_labels].values
    # Map the umpire-call labels to integer class ids.
    tmp_y = vfunc(f2['umpcall'].values)

    test_parts_x.append(tmp_x)
    test_parts_y.append(tmp_y)

test_x = np.concatenate(test_parts_x, axis=0)
test_y = np.concatenate(test_parts_y, axis=0)

Loading test file Data/MLB_2017/MLB_PitchFX_2017_RegularSeason.csv
Loading test file Data/MLB_2017/MLB_PitchFX_2017_PostSeason.csv


In [4]:
# Number of time steps allotted to every game when the per-game arrays are
# allocated; games with fewer pitches leave the remaining steps as padding.
# NOTE(review): a game with more pitches than this would not fit into the
# allocated arrays - confirm 597 covers every season that is loaded.
MAX_GAME_LEN = 597

# String-to-Index Vocabularies

In [5]:
class Lang:
    """Vocabulary that assigns consecutive integer indices to tokens.

    Index 0 is reserved for '<UNK>' and index 1 for '<EMP>'; every other
    token receives the next free index the first time it is added.
    """

    def __init__(self, name):
        """Init Lang with a name."""
        self.name = name
        self.word2index = {"<UNK>": 0, '<EMP>': 1}
        self.word2count = {}
        self.index2word = {0: "<UNK>", 1: '<EMP>'}
        self.n_words = len(self.word2index)  # counts the two reserved tokens

    def addword(self, word):
        """Add a word to the dict, or bump its count if it is already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # The reserved tokens are in word2index but start without a count
        # entry, so use .get() instead of ``+= 1`` (which raised KeyError
        # when '<UNK>' or '<EMP>' was added).
        self.word2count[word] = self.word2count.get(word, 0) + 1

In [6]:
# Show the selected columns; the cells below address them by position.
input_labels

['date',
 'stadium',
 'inning',
 'side',
 'pitcher',
 'batter',
 'on_1b',
 'on_2b',
 'on_3b',
 'pitch_count',
 'balls',
 'strikes']

In [7]:
# One vocabulary per categorical field.
players = Lang('players')
loc = Lang('stadium')
dt = Lang('date')
sd = Lang('side')

# Register every stadium, side, pitcher and batter seen in the training
# rows. Column positions follow input_labels: 1 = stadium, 3 = side,
# 4 = pitcher, 5 = batter.
for row in train_x:
    stadium, side = row[1], row[3]
    pitcher, batter = row[4], row[5]
    loc.addword(stadium)
    sd.addword(side)
    players.addword(pitcher)
    players.addword(batter)

In [8]:
def mat2ind(x):
    """Encode a raw feature matrix as a numeric matrix.

    ``x`` is indexed by position: 0 = date string, 1 = stadium, 2 = inning,
    3 = side, 4 = pitcher, 5 = batter, 6-8 = the three base-runner columns.

    The result has ``x.shape[1] + 2`` columns:
        0  month, parsed from characters 5:7 of the date string
        1  day, parsed from character 8 onwards
        2  stadium index from ``loc``
        3  inning, copied as is
        4  1 when side == 'top', otherwise -1
        5  pitcher index from ``players``
        6  batter index from ``players``
        7-9  base-runner indices from ``players`` ('<EMP>' for a NaN float)
    Columns from 10 onwards are never written and therefore stay zero.
    Tokens missing from a vocabulary map to its '<UNK>' index.
    """
    def lookup(token, vocab):
        # Unknown tokens fall back to the '<UNK>' entry.
        table = vocab.word2index
        return table[token] if token in table else table['<UNK>']

    def runner_index(value):
        # An empty base shows up as a float NaN in the raw data.
        if isinstance(value, float) and math.isnan(value):
            return lookup('<EMP>', players)
        return lookup(value, players)

    encoded = np.zeros((x.shape[0], x.shape[1] + 2))
    for r in range(x.shape[0]):
        date = x[r, 0]
        encoded[r, 0] = int(date[5:7])  # month
        encoded[r, 1] = int(date[8:])  # day
        encoded[r, 2] = lookup(x[r, 1], loc)  # stadium

        encoded[r, 3] = x[r, 2]  # inning

        if x[r, 3] == 'top':  # side
            encoded[r, 4] = 1
        else:
            encoded[r, 4] = -1
        encoded[r, 5] = lookup(x[r, 4], players)  # pitcher
        encoded[r, 6] = lookup(x[r, 5], players)  # batter
        # Runners on first, second and third base, in that order.
        encoded[r, 7] = runner_index(x[r, 6])
        encoded[r, 8] = runner_index(x[r, 7])
        encoded[r, 9] = runner_index(x[r, 8])
    return encoded

In [9]:
# Numeric encoding of the raw training rows (one row per pitch).
nx = mat2ind(train_x)

In [10]:
# Numeric encoding of the raw test rows, using the vocabularies built from the training data.
ntx = mat2ind(test_x)

# Group Pitches into Games

In [11]:
def getNextGame(f):
    """Yield ``(start, stop)`` row ranges of consecutive rows forming one game.

    A game is a maximal run of rows that share columns 0, 1 and 2 with the
    first row of the run and whose column 3 never decreases. ``stop`` is
    exclusive, so ``f[start:stop]`` is the game.

    Args:
        f: 2-D array indexed as ``f[row, col]`` with at least four columns.

    Yields:
        ``(start, stop)`` index pairs covering every row exactly once.

    The first row of every span is always consumed. Previously a first row
    that failed its own test (for example a NaN or negative value in
    column 3, or a NaN key column) left the cursor in place, so the
    generator yielded empty spans forever.
    """
    n_rows = f.shape[0]
    start = 0
    while start < n_rows:
        key0, key1, key2 = f[start, 0], f[start, 1], f[start, 2]
        prev_inn = f[start, 3]
        stop = start + 1
        while (stop < n_rows
               and f[stop, 0] == key0
               and f[stop, 1] == key1
               and f[stop, 2] == key2
               and f[stop, 3] >= prev_inn):
            prev_inn = f[stop, 3]
            stop += 1
        yield start, stop
        start = stop

In [12]:
# Locate every game once; each span is a half-open [start, stop) row range
# of nx (previously the generator was traversed twice).
train_spans = list(getNextGame(nx))
game_ctr = len(train_spans)
print(game_ctr)

# Fail with a clear message instead of an opaque broadcasting error when a
# game does not fit into the fixed-length arrays.
longest_game = max((stop - start for start, stop in train_spans), default=0)
if longest_game > MAX_GAME_LEN:
    raise ValueError("longest training game has {} pitches but MAX_GAME_LEN is {}".format(
        longest_game, MAX_GAME_LEN))

# Features are zero-padded; labels are padded with the '<PAD>' class id.
ntrain_x = np.zeros((game_ctr, MAX_GAME_LEN, nx.shape[1]))
ntrain_y = np.full((game_ctr, MAX_GAME_LEN, 1), float(label2one['<PAD>']))
for ctr, (c, p) in enumerate(train_spans):
    ntrain_x[ctr, :p - c, :] = nx[c:p, :]
    ntrain_y[ctr, :p - c, 0] = train_y[c:p]

7388


In [13]:
# Locate every test game once; each span is a half-open [start, stop) row
# range of ntx (previously the generator was traversed twice).
test_spans = list(getNextGame(ntx))
game_ctr = len(test_spans)
print(game_ctr)

# Fail with a clear message instead of an opaque broadcasting error when a
# game does not fit into the fixed-length arrays.
longest_game = max((stop - start for start, stop in test_spans), default=0)
if longest_game > MAX_GAME_LEN:
    raise ValueError("longest test game has {} pitches but MAX_GAME_LEN is {}".format(
        longest_game, MAX_GAME_LEN))

# Features are zero-padded; labels are padded with the '<PAD>' class id.
ntest_x = np.zeros((game_ctr, MAX_GAME_LEN, ntx.shape[1]))
ntest_y = np.full((game_ctr, MAX_GAME_LEN, 1), float(label2one['<PAD>']))
for ctr, (c, p) in enumerate(test_spans):
    ntest_x[ctr, :p - c, :] = ntx[c:p, :]
    ntest_y[ctr, :p - c, 0] = test_y[c:p]

2466


# RNN Model

In [14]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
def data_gen(dx, dy, batch_size=100):
    """Yield consecutive ``(dx_batch, dy_batch)`` slices along the first axis.

    Args:
        dx: inputs; anything with ``.shape`` that supports slicing.
        dy: targets, sliced with the same row ranges as ``dx``.
        batch_size: rows per batch; the last batch may be shorter.

    Yields:
        ``(dx[start:stop], dy[start:stop])`` in order, without shuffling.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when the
            generator is first advanced). Previously ``batch_size=0``
            produced an endless stream of empty batches.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {}".format(batch_size))
    for start in range(0, dx.shape[0], batch_size):
        stop = start + batch_size
        # Slicing past the end is clipped, which gives the short final batch.
        yield dx[start:stop], dy[start:stop]

In [16]:
class PlayerEmbedding(nn.Module):
    """Embed the categorical pitch features and concatenate the results.

    Pitcher, batter and the three base-runner slots share a single player
    embedding table; month, day and stadium each have their own table.
    """

    def __init__(self, stadium_size, player_size, emb_dim, sta_dim,
                 mon_size=100, day_size=100):
        """
        Args:
            stadium_size: number of rows of the stadium table.
            player_size: number of rows of the shared player table.
            emb_dim: width of one player embedding.
            sta_dim: width of the stadium embedding.
            mon_size: width of the month embedding (default 100, the value
                that used to be hard-coded).
            day_size: width of the day embedding (default 100, likewise).
        """
        super(PlayerEmbedding, self).__init__()
        # 13 and 32 rows respectively, so indices 0..12 and 0..31 are valid.
        self.emb_mon = nn.Embedding(13, mon_size)
        self.emb_day = nn.Embedding(32, day_size)
        self.emb_player = nn.Embedding(player_size, emb_dim)
        self.emb_stadium = nn.Embedding(stadium_size, sta_dim)

        # Width of the concatenated vector returned by forward(): month, day
        # and stadium plus five player embeddings.
        self.emb_dim = mon_size + day_size + sta_dim + 5 * emb_dim

    def forward(self, mon, day, stadium, p, b, onb1, onb2, onb3):
        """Look up every index tensor and concatenate along the last axis.

        All arguments are integer index tensors of the same shape. The
        result has that shape plus a trailing axis of size ``self.emb_dim``.
        """
        parts = [
            self.emb_mon(mon),
            self.emb_day(day),
            self.emb_stadium(stadium),
            self.emb_player(p),
            self.emb_player(b),
            self.emb_player(onb1),
            self.emb_player(onb2),
            self.emb_player(onb3),
        ]
        # dim=-1 equals the former dim=2 for (batch, seq) indices and also
        # works for index tensors of any other rank.
        return torch.cat(parts, dim=-1)

    def init_weights(self):
        """Re-initialise every embedding table uniformly in [-0.5, 0.5]."""
        initrange = 0.5
        for layer in (self.emb_mon, self.emb_day, self.emb_player, self.emb_stadium):
            layer.weight.data.uniform_(-initrange, initrange)

In [17]:
class LSTM(nn.Module):
    """Unidirectional multi-layer LSTM that emits 3-class log-probabilities
    for every time step of an embedded sequence."""

    def __init__(self, hidden_size, embedding_layer, dp=0.1, n_layers=2):
        """
        Args:
            hidden_size: size of the LSTM hidden state.
            embedding_layer: module called with eight index tensors that
                exposes the width of its output as ``emb_dim``.
            dp: dropout passed to ``nn.LSTM``.
            n_layers: number of stacked LSTM layers.
        """
        super(LSTM, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(self.embedding.emb_dim, hidden_size, num_layers=n_layers,
                            dropout=dp, bidirectional=False)
        # lin1 is not used by forward(); it is kept so that the parameter
        # set and the printed architecture stay unchanged.
        self.lin1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.lin2 = nn.Linear(self.hidden_size, 3)

    def forward(self, inp, hidden):
        '''
        inputs: (batch_size, seq_len, feature_dim) integer tensor; feature
        columns 0, 1, 2 and 5-9 are passed to the embedding layer.
        hidden: (h0, c0) tuple as returned by initHidden.
        returns: (batch_size, seq_len, 3) log-probabilities.
        '''
        embedded = self.embedding(inp[:, :, 0], inp[:, :, 1], inp[:, :, 2],
                                  inp[:, :, 5], inp[:, :, 6],
                                  inp[:, :, 7], inp[:, :, 8], inp[:, :, 9])

        # nn.LSTM is built without batch_first, so it wants (seq, batch, feat).
        embedded = embedded.permute(1, 0, 2)
        lstm_out, _ = self.lstm(embedded, hidden)

        # Back to (batch, seq_len, hidden).
        output = lstm_out.permute(1, 0, 2)
        output = self.lin2(output)
        # (batch, seq_len, 3)
        return F.log_softmax(output, dim=2)

    def initHidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for ``batch_size`` sequences.

        The tensors are created with the device and dtype of the model's
        own parameters, so they match wherever the model lives instead of
        depending on a module-level device constant.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        h0 = reference.new_zeros(shape)
        c0 = reference.new_zeros(shape)
        return (h0, c0)
        #return forward

In [18]:
def train(train_x, train_y, dev_x, dev_y, model, optimizer, criterion, batch_size=512,
          max_epoch = 512, validation_interv=1000, show_iter=10):
    """Train ``model`` for ``max_epoch`` passes and evaluate after each one.

    Args:
        train_x, train_y: training inputs and targets, batched along the
            first axis by ``data_gen``. Targets equal to 3 are excluded
            from both the loss and the accuracy.
        dev_x, dev_y: held-out data passed to ``calPerf`` after every epoch.
        model: network exposing ``initHidden(batch_size)`` and called as
            ``model(batch_x, hidden)``.
        optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(predictions, targets)``.
        batch_size: rows per batch.
        max_epoch: number of passes over the training data.
        validation_interv: currently unused; kept for compatibility.
        show_iter: print running averages every ``show_iter`` batches.

    The end-of-epoch line reports the training loss averaged over the whole
    epoch. Previously it divided the running-window totals, which raised
    ZeroDivisionError whenever the number of batches was a multiple of
    ``show_iter`` (the window had just been reset) and otherwise covered
    only the last few batches.
    """
    def mean(total, count):
        # Report NaN instead of crashing when nothing was accumulated.
        return total / count if count else float('nan')

    start = time.time()
    for ep in range(max_epoch):
        print("Epoch {}".format(ep))
        # calPerf() switches to eval mode at the end of each epoch.
        model.train()

        # Totals for the progress window (reset after every report) ...
        win_loss, win_correct, win_count = 0.0, 0, 0
        # ... and totals for the whole epoch.
        ep_loss, ep_count = 0.0, 0

        batches = data_gen(train_x, train_y, batch_size=batch_size)
        for iteration, (bx, by) in enumerate(batches, start=1):
            optimizer.zero_grad()
            hid = model.initHidden(bx.shape[0])
            y_pred = model(bx, hid)

            # Flatten (batch, seq) into one axis so padded steps can be dropped.
            y_pred = y_pred.view(y_pred.shape[0] * y_pred.shape[1], -1)
            by = by.view(by.shape[0] * by.shape[1])

            keep = (by != 3)
            y_pred = y_pred[keep, :]
            by = by[keep]
            n_valid = by.shape[0]

            loss = criterion(y_pred, by)
            loss.backward()
            optimizer.step()

            _, lab_y = torch.max(y_pred, 1)
            # Weight the batch loss by its number of labels so that the
            # averages below are per-label averages.
            batch_loss = loss.item() * n_valid
            win_loss += batch_loss
            win_correct += torch.sum(lab_y == by).item()
            win_count += n_valid
            ep_loss += batch_loss
            ep_count += n_valid

            if show_iter and iteration % show_iter == 0:
                print("Time: {}, iter: {}, avg. loss: {}, avg.acc: {}".format(time.time() - start,
                                                                              iteration,
                                                                              mean(win_loss, win_count),
                                                                              mean(win_correct, win_count)))
                win_loss, win_correct, win_count = 0.0, 0, 0

        lo, ac = calPerf(dev_x, dev_y, model, criterion, batch_size)

        print("Time: {}, loss:{} dev_loss:{}, dev_acc:{}".format(time.time() - start,
                                                                 mean(ep_loss, ep_count), lo, ac))

In [19]:
def calPerf(dev_x, dev_y, model, criterion, batch_size=16):
    """Evaluate ``model`` on ``(dev_x, dev_y)`` without tracking gradients.

    The model is put into eval mode and the data is processed in batches
    from ``data_gen``. Targets equal to 3 are excluded.

    Returns:
        ``(loss, accuracy)``: the per-label average of ``criterion`` over
        all kept targets (the batch losses are accumulated as returned by
        ``criterion``, i.e. not converted with ``.item()``) and the
        fraction of kept targets whose highest-scoring class is correct.
    """
    total_loss = 0
    n_labels = 0
    n_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_x, batch_y in data_gen(dev_x, dev_y, batch_size=batch_size):
            state = model.initHidden(batch_x.shape[0])
            scores = model(batch_x, state)

            # Merge the batch and sequence axes.
            n_steps = scores.shape[0] * scores.shape[1]
            scores = scores.view(n_steps, -1)
            targets = batch_y.view(batch_y.shape[0] * batch_y.shape[1])

            # Drop the padded time steps.
            keep = (targets != 3)
            scores = scores[keep, :]
            targets = targets[keep]
            n_kept = targets.shape[0]

            total_loss += criterion(scores, targets) * n_kept
            n_labels += n_kept

            predicted = scores.max(1)[1]
            n_correct += torch.sum(predicted == targets).item()
    return total_loss / n_labels, n_correct / n_labels

In [20]:
# Hold out 10% of the games as a development set (shuffled, fixed seed).
# NOTE: this rebinds train_x / train_y from the raw per-pitch arrays to the
# padded per-game arrays. The earlier cells that read train_x / train_y
# (vocabulary building, mat2ind, game grouping) must not be re-run after
# this cell without reloading the data first.
from sklearn.model_selection import train_test_split
train_x, dev_x, train_y, dev_y = train_test_split(ntrain_x, ntrain_y, test_size=0.1, random_state=0, shuffle=True)

In [21]:
def _as_long_tensor(array):
    """Return ``array`` as an int64 (torch.long) tensor placed on DEVICE."""
    # np.int64 replaces np.long, which was deprecated in NumPy 1.20 and
    # removed in 1.24. The Variable wrapper is dropped as well: since
    # PyTorch 0.4 it simply returns the tensor, and tensors created with
    # from_numpy do not require gradients.
    return torch.from_numpy(array.astype(np.int64)).to(DEVICE)


vtrainx = _as_long_tensor(train_x)
vtrainy = _as_long_tensor(train_y)

vdevx = _as_long_tensor(dev_x)
vdevy = _as_long_tensor(dev_y)
vtestx = _as_long_tensor(ntest_x)
vtesty = _as_long_tensor(ntest_y)


In [22]:
# Sizes of the network: player / stadium embedding widths, LSTM hidden
# units and number of stacked LSTM layers.
player_dim, stadium_dim = 256, 10
hidden_units, lstm_layers = 1024, 2

emb = PlayerEmbedding(loc.n_words, players.n_words, player_dim, stadium_dim)
emb = emb.to(DEVICE)
model = LSTM(hidden_units, emb, n_layers=lstm_layers)
model = model.to(DEVICE)
print(model)

# Alternative kept for reference: torch.optim.Adam(model.parameters(), lr=1e-3)
opt = torch.optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0, weight_decay=0)
# The model returns log-probabilities (log_softmax), hence NLLLoss.
crit = nn.NLLLoss()

LSTM(
  (embedding): PlayerEmbedding(
    (emb_mon): Embedding(13, 100)
    (emb_day): Embedding(32, 100)
    (emb_player): Embedding(1880, 256)
    (emb_stadium): Embedding(34, 10)
  )
  (lstm): LSTM(1490, 1024, num_layers=2, dropout=0.1)
  (lin1): Linear(in_features=1024, out_features=1024, bias=True)
  (lin2): Linear(in_features=1024, out_features=3, bias=True)
)


In [26]:
# Train for 3 epochs with 32 games per batch, printing running averages every 10 batches.
train(vtrainx, vtrainy, vdevx, vdevy, model, opt, crit, batch_size=32, max_epoch=3, show_iter=10)

Epoch 0
Time: 8.100097179412842, iter: 10, avg. loss: 1.0192512735363184, avg.acc: 0.46384446465408247
Time: 16.190094470977783, iter: 20, avg. loss: 1.0165523647985344, avg.acc: 0.4672524314765694
Time: 24.287427186965942, iter: 30, avg. loss: 1.020094162184382, avg.acc: 0.46290904594354576
Time: 32.388426065444946, iter: 40, avg. loss: 1.0208407819634029, avg.acc: 0.46146184666229034
Time: 40.49259281158447, iter: 50, avg. loss: 1.019300871879167, avg.acc: 0.46260222059703476
Time: 48.588518142700195, iter: 60, avg. loss: 1.0205040306542321, avg.acc: 0.46408578943818707
Time: 56.673378467559814, iter: 70, avg. loss: 1.0178048831726099, avg.acc: 0.46380967315576066
Time: 64.75835752487183, iter: 80, avg. loss: 1.0190694754530807, avg.acc: 0.46221002182570936
Time: 72.8707582950592, iter: 90, avg. loss: 1.018124907389625, avg.acc: 0.4625176491351924
Time: 81.1646990776062, iter: 100, avg. loss: 1.0190161175873342, avg.acc: 0.4631992149165849
Time: 89.48227906227112, iter: 110, avg. los

In [None]:
def calPred(dev_x, dev_y, model, criterion, batch_size=16):
    """Return ``(loss, accuracy)`` of ``model`` on ``(dev_x, dev_y)``.

    This function was a line-for-line copy of ``calPerf``; it now delegates
    to it so the two cannot drift apart. Despite its name it returns the
    same metrics as ``calPerf``, not the individual predictions.
    """
    return calPerf(dev_x, dev_y, model, criterion, batch_size=batch_size)

In [28]:
# Loss and accuracy on the 2017 test games (default batch size of calPerf).
calPerf(vtestx, vtesty,model,crit)

(tensor(1.0353, device='cuda:0'), 0.4543989762782762)

In [64]:
# NOTE(review): countMaxLen is not defined anywhere in the visible part of
# this notebook, and the execution count of this cell is out of sequence -
# presumably it relied on a cell that was later removed. Confirm or delete.
countMaxLen(f)

424

In [50]:
# Scratch check: compares columns 0-1 of row 291 of f with column 2 of row 292.
# NOTE(review): leftover debugging cell with an out-of-sequence execution count.
f.iloc[291, 0:2] == f.iloc[292, 2]

date       True
stadium    True
dtype: bool

In [52]:
# Scratch check: value in column 2 of row 292 of f.
f.iloc[292, 2]

1