# 2015 MLB Baseball Sample Questions

### Questions

1) Modify **GetCounts** and **ToProbs** to create the new HMM. How many nodes are there?

2) Find the *rarest transition event*. Determine the date, game, and inning it came from. Take a screenshot of this event from the Baseball Reference Website.

3) What is the rarest inning?

4) What percentage of runs (R) were scored when the team had 2 outs?

5) Come up with one interesting question that your group will ask of the HMM. Answer that question.


## Some code blocks take a couple of minutes to run because of how many files they have to go through.
**The 'bb2015' directory should be in the current working directory.**

In [2]:
import baseball
import os

### Reading In Files:

In [64]:
#Use new GetOutsBases()

import os
import copy
from lxml import html, etree
import requests
import math
import numpy as np
import urllib

def GetOutsBasesOR(fname):
    """Read one saved game page and pull out three parallel play-by-play columns.

    The file is read as text, HTML comment markers are stripped so that
    commented-out markup becomes part of the parsed document, and empty
    ``runs_outs_result`` cells are filled with the placeholder ``'z'`` so they
    still yield a text node.

    Returns a tuple ``(outs, onbase, ro)`` holding the text of the ``outs``,
    ``runners_on_bases_pbp`` and ``runs_outs_result`` cells, each with its
    first five entries dropped.
    """
    with open(fname) as handle:
        markup = handle.read()

    # Uncomment hidden markup, then give empty result cells a 'z' placeholder.
    markup = (
        markup.replace('<!--', '')
        .replace('-->','')
        .replace('data-stat="runs_outs_result" ></td>', 'data-stat = "runs_outs_result" >z</td>')
    )
    doc = html.fromstring(markup)

    queries = (
        '//td[@data-stat="outs"]/text()',
        '//td[@data-stat = "runners_on_bases_pbp"]/text()',
        '//td[@data-stat= "runs_outs_result"]/text()',
    )
    # remove the top 5 plays (the first five entries of every column)
    outs, onbase, ro = (doc.xpath(query)[5:] for query in queries)
    return outs, onbase, ro

# Modified ToInnings()

def ToInningsOR(outs,onbase, ro):
    """Group parallel play-by-play columns into per-inning lists of event labels.

    Each event label is ``outs[i] + onbase[i]`` followed by ``ro[i]``, unless
    ``ro[i]`` is the placeholder ``'z'`` (empty result cell), in which case the
    result part is omitted. A new inning starts whenever the out count returns
    to ``'0'`` after a non-zero value; the finished inning is closed with
    ``'3---'`` and the new one is opened with ``'0---'``.

    Parameters:
        outs:   sequence of out-count strings, one per play.
        onbase: sequence of base-occupancy strings, one per play.
        ro:     sequence of result strings, ``'z'`` where the cell was empty.

    Returns:
        A list of innings, each a list of label strings ending in ``'3---'``.
        An empty ``outs`` yields an empty list.
    """
    def _label(i):
        # Apply the 'z' placeholder rule to every event, including the first.
        state = outs[i] + onbase[i]
        return state if ro[i] == 'z' else state + ro[i]

    # Nothing to group: avoid indexing into empty columns.
    if not outs:
        return []

    innings = []
    st = [_label(0)]
    for i in range(1, len(outs)):
        if outs[i] == '0' and outs[i-1] != '0':
            # The out count reset, so close the inning and start the next one.
            st.append('3---')
            innings.append(st)
            st = ['0---']
        else:
            st.append(_label(i))
    # last atbats
    st.append('3---')
    innings.append(st)
    return innings

#Modify GetCounts()

def GetCountsOR(mydir):
    """Accumulate event counts over every game found under ``mydir``.

    Each game returned by ``baseball.GetGames(mydir)`` is parsed with
    ``GetOutsBasesOR`` and split into innings with ``ToInningsOR``; the innings
    are then passed to ``baseball.EventCounts`` together with one shared
    dictionary, which is returned at the end.
    """
    counts = {}
    for game in baseball.GetGames(mydir):
        outs, onbase, ro = GetOutsBasesOR(game)
        # EventCounts is handed the same dict every time (presumably updating it in place).
        baseball.EventCounts(ToInningsOR(outs, onbase, ro), counts)
    return counts

In [65]:
# Build the event-count dictionary from the game files in the 'bb2015' directory.
mydir = 'bb2015'
dct = GetCountsOR(mydir)

### Question 1: How many nodes are there?

In [66]:
# The number of keys in dct is reported as the number of nodes.
nodes = len(dct)
print('There are', nodes ,'nodes.')

There are 158 nodes.


### Question 2: Find the *rarest transition event*.

In [20]:
# Rarest Transition Event

# Determines the rarest transition event
def RarestTransition(probs, preProb):
    """Find the lowest-probability transition strictly above ``preProb``.

    ``probs`` is a two-level mapping ``probs[first][second] -> probability``.
    Only probabilities below 1.0 and above ``preProb`` are considered; when
    several share the minimum, the first one encountered wins.

    Returns ``(probability, [first, second])``, or ``(1.0, [])`` when no
    transition qualifies.
    """
    candidates = (
        (prob, [first, second])
        for first, row in probs.items()
        for second, prob in row.items()
        if prob < 1.0 and prob > preProb
    )
    # min() keeps the earliest of equal minima, matching a strict "<" scan.
    return min(candidates, key=lambda entry: entry[0], default=(1.0, []))

# Searches all games to find occurrences of a transition event
def RTGame(rTransition, datadir):
    """Collect every inning in which the given transition occurs.

    ``rTransition`` is a two-item sequence ``[first, second]``. Every game
    listed by ``baseball.GetGames(datadir)`` is parsed and split into innings;
    each pair of consecutive events equal to ``first`` then ``second`` adds one
    ``(game, inning)`` tuple to the result, so an inning that contains the
    transition more than once is listed more than once.
    """
    matches = []
    for game in baseball.GetGames(datadir):
        outs, onbase, ro = GetOutsBasesOR(game)
        for inning in ToInningsOR(outs, onbase, ro):
            # Walk consecutive (event, next event) pairs within the inning.
            for current, following in zip(inning, inning[1:]):
                if current == rTransition[0] and following == rTransition[1]:
                    matches.append((game, inning))
    return matches


In [29]:
probs = baseball.ToProbs(dct)
# Build the data directory portably instead of hard-coding a '\\' separator.
datadir = os.path.join(os.getcwd(), 'bb2015')

# Start from the rarest transition and move to the next-rarest one until a
# transition is actually found in the game files.
rProb, rTransition = RarestTransition(probs, 0)
rtgames = []
# RarestTransition returns an empty list once no candidate is left; stop then
# instead of indexing into it inside RTGame.
while rTransition and not rtgames:
    rtgames = RTGame(rTransition, datadir=datadir)
    if not rtgames:
        # NOTE: this skips any other transitions tied at exactly rProb.
        rProb, rTransition = RarestTransition(probs, rProb)
rTransition

['0---', '0--3RR']

In [68]:
# Rarest transition (lowest probability above 0), without searching the game files

rProb, rTransition = RarestTransition(probs, 0)
rTransition

['0---', '0--3RR']

### Question 3: Find the rarest inning.

In [61]:
# Table produced by baseball.ToProbs from the event counts in dct.
probs = baseball.ToProbs(dct)

# Game files to scan, as listed by baseball.GetGames for the data directory.
games = baseball.GetGames(mydir)

def RareInningsOR(games, logodds):
    """Score every inning of every game and return the results in ascending order.

    Each game file is parsed and split into innings; every inning is scored
    with ``baseball.TrailLogOdds(inning, logodds)``.

    Returns a list of ``(score, inning_index, game)`` tuples ordered by score
    using NumPy's ``argsort``, lowest score first.
    """
    scored = []
    for game in games:
        outs, onbase, ro = GetOutsBasesOR(game)
        for index, inning in enumerate(ToInningsOR(outs,onbase, ro)):
            scored.append((baseball.TrailLogOdds(inning, logodds), index, game))

    # sort by the score column
    order = np.array([entry[0] for entry in scored]).argsort()
    return [scored[position] for position in order]

# Derive the log-odds table from probs, then score and sort every inning.
logodds = baseball.ToLogOdds(probs)
answ = RareInningsOR(games, logodds)

answ[0:16] # shows the 16 tuples with the lowest scores; drop the [0:16] slice to get the entire list.

[(-4.136415776142788, 21, 'bb2015/BAL201507270.shtml'),
 (-4.136415776142788, 19, 'bb2015/TOR201504180.shtml'),
 (-4.136415776142788, 21, 'bb2015/BAL201509020.shtml'),
 (-4.136415776142788, 19, 'bb2015/ATL201508160.shtml'),
 (-4.136415776142788, 21, 'bb2015/CHA201507080.shtml'),
 (-4.136415776142788, 21, 'bb2015/MIN201504170.shtml'),
 (-4.136415776142788, 15, 'bb2015/KCA201506040.shtml'),
 (-4.136415776142788, 21, 'bb2015/CHA201506210.shtml'),
 (-4.136415776142788, 21, 'bb2015/DET201509190.shtml'),
 (-4.136415776142788, 17, 'bb2015/BAL201508190.shtml'),
 (-4.136415776142788, 17, 'bb2015/TEX201506160.shtml'),
 (-4.136415776142788, 17, 'bb2015/ANA201509260.shtml'),
 (-4.136415776142788, 21, 'bb2015/DET201505210.shtml'),
 (-4.136415776142788, 17, 'bb2015/ANA201505050.shtml'),
 (-4.136415776142788, 21, 'bb2015/CHN201509280.shtml'),
 (-4.136415776142788, 21, 'bb2015/SEA201505080.shtml')]

In [63]:
# To check the events in one of the lowest-scoring innings:

# Build the path portably instead of hard-coding '\\' separators.
fname = os.path.join(os.getcwd(), 'bb2015', 'BAL201508190.shtml')
outs, onbase, ro = GetOutsBasesOR(fname)
# Pass the parsed `outs` column (the previous name `outs1403` was undefined).
innings = ToInningsOR(outs, onbase, ro)
innings[17]

['0---', '3---']

### Question 4: What percentage of runs (R) were scored when the team had 2 outs?

In [45]:
def CountRuns(datadir):
    """Count the 'R' characters in all event labels that start with '2'.

    Every game listed by ``baseball.GetGames(datadir)`` is parsed and split
    into innings. For each event label whose first character is ``'2'`` (the
    out count comes first in the label), the number of ``'R'`` characters in
    that label is added to the total.

    Returns the accumulated count as an int.
    """
    twoOutCount = 0
    for game in baseball.GetGames(datadir):
        outs, onbase, ro = GetOutsBasesOR(game)
        for inning in ToInningsOR(outs, onbase, ro):
            for event in inning:
                # startswith() is also safe for an empty label.
                if event.startswith('2'):
                    twoOutCount += event.count('R')
    return twoOutCount

# Build the data directory portably instead of hard-coding a '\\' separator.
datadir = os.path.join(os.getcwd(), 'bb2015')
twoOutCount = CountRuns(datadir)

In [51]:
# NOTE(review): 11357 is a hard-coded denominator, presumably the total number
# of runs in the data set -- confirm, or compute it from the data.
# The quotient is a fraction; it is not scaled to 100.
Percentage = twoOutCount/11357
Percentage

0.4040679756978075

### Question 5: How many times has a Grand Slam occurred in these games? (Grand Slam = when all bases are filled and the next batter hits a home run)

The results show the games and inning number when the grand slam occurred (and are probably the most interesting to watch!)

In [55]:
# GetGames() below is a tweaked version of the function from baseball.py.
# Grandslam() is a function I made from scratch.

# Directory that holds the game files.
mydir = 'bb2015'

def GetGames(mydir):
    """Return full paths of the ``.shtml`` files directly inside ``mydir``.

    Parameters:
        mydir: directory to list; a relative path is resolved against the
            current working directory, an absolute path is used as is.

    Returns:
        A list of path strings, in the order reported by ``os.listdir``.
    """
    # os.path.join picks the right separator for the platform instead of a
    # hard-coded '\\', and leaves an absolute mydir untouched.
    base = os.path.join(os.getcwd(), mydir)
    return [
        os.path.join(base, name)
        for name in os.listdir(mydir)
        if name.endswith('.shtml')
    ]

# Rebind games using the GetGames() defined in this cell (full paths).
games = GetGames(mydir)

# Input: the list of games from GetGames().
# The function goes through every event in every inning and checks for the events
# '0123RRRR' (it doesn't), '1123RRRR', or '2123RRRR', i.e. bases '123' occupied with 0, 1, or 2 outs and 'RRRR' as the result.
# AKA this is when a grand slam event occurs.
# When a grand slam occurs, the key is the game and the value is the index of the inning in which the grand slam occurred.

def Grandslam(games):
    """Map each game containing a grand-slam event to the inning index it occurred in.

    An event counts when its label equals '0123RRRR', '1123RRRR' or
    '2123RRRR'. The key is the last 18 characters of the game path and the
    value is the index of the inning within that game. A game contributes at
    most one entry: a later matching event overwrites an earlier one.
    """
    slam_labels = ('0123RRRR', '1123RRRR', '2123RRRR')
    found = {}
    for path in games:
        outs, onbase, ro = GetOutsBasesOR(path)
        for inning_index, inning in enumerate(ToInningsOR(outs, onbase, ro)):
            for event in inning:
                if event in slam_labels:
                    # Key on the trailing 18 characters of the path.
                    found[path[-18:]] = inning_index
    return found

# This takes a couple of minutes because every game file is parsed:
gsdct = Grandslam(games) 

In [57]:
# Number of entries in gsdct, i.e. the number of distinct game keys stored.
print(len(gsdct))
#gsdct

63
