In [1]:
import requests #apache http library
import xml.etree.ElementTree as ET #XML parsing
import os
import pandas as pd
import numpy as np
from matplotlib.patches import Rectangle
from numpy.random import random

%matplotlib inline

from termcolor import colored

In [2]:
url = 'http://gd2.mlb.com/components/game/mlb/year_2019/month_10/day_17/gid_2019_10_17_houmlb_nyamlb_1/players.xml'
resp = requests.get(url)
print(colored(resp, 'blue'))

[34m<Response [200]>[0m


In [3]:
xmlfile = 'HOU_NYY_Players.xml'
with open(xmlfile, 'wb') as f:
    f.write(resp.content)
statinfo = os.stat(xmlfile)


In [5]:
# pull in players <game><team><player>
tree = ET.parse(xmlfile)
game = tree.getroot()
teams = game.findall('./team')
teams

[<Element 'team' at 0x11ab1e110>, <Element 'team' at 0x11ab1eef0>]

In [6]:
PlayerDict = {}

for team in teams:
    print(team.attrib.get('name'))
    players = team.findall('player')
    for player in players:
        print('', player.attrib.get('id'), player.attrib.get('first'), player.attrib.get('last'))
        PlayerDict[player.attrib.get('id')] = player.attrib.get('first') + ' ' + player.attrib.get('last')

Houston Astros
 455139 Robinson Chirinos
 455117 Martin Maldonado
 650556 Bryan Abreu
 663656 Kyle Tucker
 425844 Zack Greinke
 608324 Alex Bregman
 493329 Yuli Gurriel
 444468 Hector Rondon
 501789 Will Harris
 502210 Josh Reddick
 514888 Jose Altuve
 532077 Roberto Osuna
 543807 George Springer
 501925 Joe Smith
 488726 Michael Brantley
 664353 Jose Urquidy
 649557 Aledmys Diaz
 621043 Carlos Correa
 519151 Ryan Pressly
 502748 Brad Peacock
 545350 Jake Marisnick
 543037 Gerrit Cole
 434378 Justin Verlander
 657624 Josh James
 670541 Yordan Alvarez
New York Yankees
 429665 Edwin Encarnacion
 518934 DJ LeMahieu
 622663 Luis Severino
 544928 Tyler Lyons
 519222 Austin Romine
 493603 Adam Ottavino
 592450 Aaron Judge
 642528 Jonathan Loaisiga
 572020 James Paxton
 570482 Gio Urshela
 543305 Aaron Hicks
 282332 CC Sabathia
 502154 Zack Britton
 547888 Masahiro Tanaka
 457727 Cameron Maybin
 650402 Gleyber Torres
 458731 Brett Gardner
 643338 Chad Green
 592454 Tommy Kahnle
 519317 Gianca

In [7]:
# get innings - all data

url = 'http://gd2.mlb.com/components/game/mlb/year_2019/month_10/day_17/gid_2019_10_17_houmlb_nyamlb_1/inning/inning_all.xml'
resp = requests.get(url)
print(colored(resp, 'blue'))

[34m<Response [200]>[0m


In [8]:
xmlfile = 'HOU_NYY_game.xml'
with open(xmlfile, 'wb') as f:
    f.write(resp.content)
statinfo = os.stat(xmlfile)

In [9]:
tree = ET.parse(xmlfile)
root = tree.getroot()

# unpack the game to get all the innings
for child in root:
    print(child.tag, child.attrib.get('num'))
    for frame in child:
        print('   ', frame.tag, frame.attrib)

inning 1
    top {}
    bottom {}
inning 2
    top {}
    bottom {}
inning 3
    top {}
    bottom {}
inning 4
    top {}
    bottom {}
inning 5
    top {}
    bottom {}
inning 6
    top {}
    bottom {}
inning 7
    top {}
    bottom {}
inning 8
    top {}
    bottom {}
inning 9
    top {}
    bottom {}


In [10]:
frames = ['top', 'bottom']

pitchDictionary = { "FA":"fastball", "FF":"4-seam fb", "FT": "2-seam fb", "FC": "fb-cutter", "":"unknown", None: "none",
                    "FS":"fb-splitter", "SL":"slider", "CH":"changeup","CU":"curveball","KC":"knuckle-curve",
                    "KN":"knuckleball","EP":"eephus", "UN":"unidentified", "PO":"pitchout", "SI":"sinker", "SF":"split-finger"
                    }

PitchDF = pd.DataFrame(columns = ['pitchIdx', 'inning', 'frame', 'ab', 'abIdx', 'batter', 'stand', 'speed', 
                                  'pitchtype', 'px', 'pz', 'szTop', 'szBottom', 'des'])

colors = {'R':'red', 'L':'black'}
markers = {'R': 'x', 'L': 'o'}

In [11]:
totalPitchCount = 0 
topPitchCount = 0 
bottomPitchCount = 0

innings = root.findall('./inning')
for inning in innings:
    for i in range(len(frames)):
        color = 'green' if i==0 else 'yellow' # show top in green and yellow for bottom
        print(colored('\nInning:' + inning.attrib.get('num') + '(', color, attrs=['reverse']))
        fr = inning.find(frames[i])
        if fr is not None:
            for ab in fr.iter('atbat'):
                battername = PlayerDict[ab.get('batter')]
                abPitchCount = 0 
                print(colored('   ' + battername, color, attrs=['bold']))
                
                pitches = ab.findall('pitch')
                #print(pitches)
                for pitch in pitches:
                    abPitchCount = abPitchCount + 1
                    totalPitchCount = totalPitchCount + 1
                    verbosePitch = pitchDictionary[pitch.get('pitch_type')]
                    print(colored('   ' + str(abPitchCount) + ': ' + verbosePitch))
                print('   ' + colored(ab.attrib.get('event'), color, attrs=['underline']))
print('Total Pitches:' + ' ' + str(totalPitchCount))

[7m[32m
Inning:1([0m
[1m[32m   George Springer[0m
   1: 4-seam fb[0m
   2: fb-splitter[0m
   3: slider[0m
   [4m[32mLineout[0m
[1m[32m   Jose Altuve[0m
   1: slider[0m
   2: 4-seam fb[0m
   3: slider[0m
   4: fb-splitter[0m
   5: fb-splitter[0m
   [4m[32mLineout[0m
[1m[32m   Michael Brantley[0m
   1: slider[0m
   2: fb-splitter[0m
   3: slider[0m
   4: 4-seam fb[0m
   5: fb-splitter[0m
   6: fb-splitter[0m
   7: slider[0m
   8: slider[0m
   [4m[32mWalk[0m
[1m[32m   Alex Bregman[0m
   1: slider[0m
   2: 4-seam fb[0m
   [4m[32mPop Out[0m
[7m[33m
Inning:1([0m
[1m[33m   DJ LeMahieu[0m
   1: 4-seam fb[0m
   2: 4-seam fb[0m
   3: 4-seam fb[0m
   4: 4-seam fb[0m
   [4m[33mWalk[0m
[1m[33m   Aaron Judge[0m
   1: 4-seam fb[0m
   2: curveball[0m
   3: changeup[0m
   4: changeup[0m
   5: changeup[0m
   6: slider[0m
   [4m[33mForceout[0m
[1m[33m   Aaron Hicks[0m
   1: changeup[0m
   2: changeup[0m
   3: curveball[0m
   4: c

In [12]:
for inning in innings:
    for i in range(len(frames)):
        fr = inning.find(frames[i])
        if fr is not None:
            for ab in fr.iter('atbat'):
                battername = PlayerDict[ab.get('batter')]
                standside = ab.get('stand')
                abIdx = ab.get('num')
                abPitchCount = 0 
                pitches = ab.findall('pitch')
                
                #print(pitches)
                for pitch in pitches:
                    if pitch.attrib.get('start_speed') is None:
                        speed = 0 
                    else:
                        speed = float(pitch.attrib.get('start_speed'))
                    pxfloat = 0.0 if pitch.attrib.get('px') == None else float('{0:.2f}'.format(float(pitch.attrib.get('px'))))
                    pzfloat = 0.0 if pitch.attrib.get('pz') == None else float('{0:.2f}'.format(float(pitch.attrib.get('pz'))))
                    szTop = 0.0 if pitch.attrib.get('sz_top') == None else float('{0:.2f}'.format(float(pitch.attrib.get('sz_top'))))
                    szBot = 0.0 if pitch.attrib.get('sz_bot') == None else float('{0:.2f}'.format(float(pitch.attrib.get('sz_bot'))))
                    print(pxfloat, pzfloat, szTop, szBot)
                    
                    abPitchCount = abPitchCount + 1
                    totalPitchCount = totalPitchCount + 1
                    
                    if frames[i] =='top':
                        topPitchCount = topPitchCount + 1
                    else:
                        bottomPitchCount = bottomPitchCount + 1
                    inn = inning.attrib.get('num')
                    des = pitch.get('des')    
                    verbosePitch = pitchDictionary[pitch.get('pitch_type')]
                    
                    PitchDF.loc[totalPitchCount] = [totalPitchCount, inn, frames[i], abIdx, abPitchCount, battername, 
                                                   standside, speed, verbosePitch, pxfloat, pzfloat, szTop, szBot, des]

print(str(totalPitchCount) + " pitches cataloged. Top: " + str(topPitchCount) + ". Bottom: " + str(bottomPitchCount) + ".")

0.02 2.47 3.37 1.53
-1.37 3.08 3.62 1.72
0.75 2.31 2.81 1.3
0.73 1.96 3.05 1.45
-1.22 4.44 3.08 1.39
1.27 0.7 3.05 1.48
-0.16 2.55 2.81 1.3
0.97 1.26 3.32 1.51
-0.32 1.76 3.32 1.52
-0.24 0.39 3.32 1.48
-0.25 2.97 3.41 1.52
-0.27 4.05 3.35 1.52
-1.29 1.99 3.35 1.52
0.22 1.94 3.32 1.51
0.89 1.96 3.32 1.51
0.73 1.21 3.35 1.52
0.92 2.3 3.13 1.45
0.05 3.55 3.11 1.48
1.03 2.25 3.53 1.75
0.69 1.46 3.5 1.75
0.28 1.19 3.5 1.69
1.44 1.91 3.53 1.75
-0.08 1.27 4.24 2.08
-0.41 2.21 4.03 2.05
-0.52 1.38 4.21 2.11
-0.07 1.96 3.94 1.88
-0.49 2.2 3.94 1.88
0.57 1.88 3.94 1.88
-0.04 1.59 3.5 1.54
-0.97 1.39 3.5 1.6
-0.46 1.82 3.47 1.57
0.12 1.96 3.2 1.51
-0.04 2.28 3.32 1.58
0.68 1.26 3.41 1.66
-0.13 2.57 3.32 1.59
0.06 2.52 3.5 1.59
-0.17 1.41 3.38 1.59
-1.02 2.12 3.32 1.59
-1.2 2.25 3.38 1.63
-0.96 2.4 2.84 1.24
-0.6 0.7 3.05 1.45
-0.78 0.92 3.14 1.49
-0.98 1.71 2.9 1.27
0.68 1.49 3.35 1.6
-0.32 2.69 3.26 1.59
0.19 0.72 3.45 1.65
0.11 1.52 3.47 1.63
-0.4 4.21 3.5 1.6
0.71 0.49 3.47 1.66
0.39 1.69 3.4 

In [13]:
PitchDF.to_csv('PitchDF.csv')