# MLB Data Exploring

Work in progress, but here are some examples of how to work with this data.

In [1]:
import api_helpers
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from datetime import datetime, timedelta
import pandas as pd
from IPython import display as ICD # just to display multiple dataframes in a loop
import json
import glob

date_format = "%Y-%m-%d"



In [None]:
def get_team_standings():
    """Build a standings table for the 2025 season, one row per team.

    Calls the `/standings` endpoint through `api_helpers.call_api` once for
    each of the league ids 103 and 104, and keeps a few fields from every
    entry under `records[*].team_records`.

    Returns:
        pandas.DataFrame indexed by `team_id` with the columns `season`,
        `games_played`, `wins` and `losses`.
    """
    rows = []
    for league_id in (103, 104):
        response = api_helpers.call_api(f'/standings?season=2025&leagueId={league_id}')
        for group in response['records']:
            rows.extend(
                {
                    'team_id': record['team']['id'],
                    'season': record['season'],
                    'games_played': record['games_played'],
                    'wins': record['league_record']['wins'],
                    'losses': record['league_record']['losses'],
                }
                for record in group['team_records']
            )

    standings_df = pd.DataFrame(rows)
    return standings_df.set_index('team_id')

def flatten_dict(og, flatten_keys):
    """Return a copy of `og` with selected nested dicts expanded one level.

    Args:
        og: the dict to flatten.
        flatten_keys: keys of `og` whose values are dicts to expand. Each
            inner entry becomes a top-level entry named `<key>__<inner key>`.

    Returns:
        A new dict. Keys that are not in `flatten_keys` are copied through
        unchanged; the original insertion order is kept.
    """
    flat = {}
    for key, value in og.items():
        if key not in flatten_keys:
            flat[key] = value
            continue
        flat.update({f"{key}__{inner_key}": inner_value for inner_key, inner_value in value.items()})
    return flat

In [None]:
class Team:
    """One side of a game: team id, name, abbreviation and (optionally) its score."""

    def __init__(self, data, season, score=None):
        """Create a team from a team dict.

        Args:
            data: team dict; its `id` and `name` entries are read.
            season: season handed to `api_helpers.get_teams` to look up the
                abbreviation.
            score: value stored as this team's score; defaults to None.

        Raises:
            IndexError: if no team returned for `season` has the same name.
        """
        self.id = data['id']
        self.name = data['name']
        self.abbr = self._get_team_abbr(data['name'], season)
        self.score = score

    def _get_team_abbr(self, name, season):
        """Return the abbreviation of the first team named `name` in `season`.

        Raises IndexError when nothing matches, which is the exception the
        previous inline `[...][0]` lookup produced, now with a readable message.
        """
        for team in api_helpers.get_teams(season):
            if team['name'] == name:
                return team['abbreviation']
        raise IndexError(f"no team named {name!r} found for season {season}")

In [None]:
class Game:
    """A single game plus its play-by-play data.

    Attributes:
        date: datetime parsed from the `game_date` entry.
        id: the `game_pk` entry.
        home_team, away_team: Team instances built from the `teams` entry.
        pbp: the play-by-play dict exactly as passed in.
    """

    def __init__(self, data, pbp):
        """Build a game from a game dict and its play-by-play dict.

        Args:
            data: game dict; reads `game_date` (format
                `%Y-%m-%dT%H:%M:%SZ`), `game_pk`, `season` and, for both
                `teams.home` and `teams.away`, the `team` and `score` entries.
            pbp: play-by-play dict; the methods below read its
                `scoring_plays`, `all_plays` and `plays_by_inning` entries.
        """
        self.date = datetime.strptime(data['game_date'], "%Y-%m-%dT%H:%M:%SZ")
        self.id = data['game_pk']

        home = data['teams']['home']
        away = data['teams']['away']
        self.home_team = Team(home['team'], data['season'], home['score'])
        self.away_team = Team(away['team'], data['season'], away['score'])
        self.pbp = pbp

    def __repr__(self):
        return "(game: {}) {} | {} @ {}".format(
            self.id, self.date.date().isoformat(), self.away_team.abbr, self.home_team.abbr
        )

    def __str__(self):
        return "{} | {} @ {} ({}-{})".format(
            self.date.strftime("%Y-%m-%d %H:%M"),
            self.away_team.name,
            self.home_team.name,
            self.away_team.score,
            self.home_team.score,
        )

    def get_winner(self):
        """Return the team with the higher score.

        Returns None when the scores are equal or when either score is None.
        """
        away, home = self.away_team, self.home_team
        if away.score is None or home.score is None:
            return None
        if away.score > home.score:
            return away
        if home.score > away.score:
            return home
        return None

    def get_teams(self):
        """Return both teams as `{'home': Team, 'away': Team}`."""
        return {'home': self.home_team, 'away': self.away_team}

    def get_scoring_summary(self):
        """Return a DataFrame with one row per scoring play.

        Columns: `id` (index into `all_plays`), `batting` (abbreviation of
        the team at bat), `inning`, `half`, `event` (the play description)
        and one column per team abbreviation holding the score recorded on
        that play.
        """
        scoring_plays = []
        for play_index in self.pbp['scoring_plays']:
            play = self.pbp['all_plays'][play_index]
            half = play['about']['half_inning']
            scoring_plays.append({
                'id': play_index,
                # the away team is recorded as batting in the 'top' half
                'batting': self.away_team.abbr if half == 'top' else self.home_team.abbr,
                'inning': play['about']['inning'],
                'half': half,
                'event': play['result']['description'],
                self.away_team.abbr: play['result']['away_score'],
                self.home_team.abbr: play['result']['home_score'],
            })

        return pd.DataFrame(scoring_plays)

    def get_box_score(self):
        """Return a box score DataFrame: one row per team, one column per inning.

        Columns are named "1", "2", ... in the order of `plays_by_inning`,
        followed by "R" holding each team's stored score. When an inning has
        no entries under `bottom`, the home team's cell is '-'.
        """
        away_name = self.away_team.name
        home_name = self.home_team.name
        box_score = {}
        prev_away = prev_home = 0

        for inning, inning_plays in enumerate(self.pbp['plays_by_inning'], start=1):
            # The inning's last play carries the running score at that point;
            # runs for the inning are the difference from the previous inning.
            result = self.pbp['all_plays'][inning_plays['end_index']]['result']
            column = {away_name: result['away_score'] - prev_away}
            if len(inning_plays['bottom']) > 0:
                column[home_name] = result['home_score'] - prev_home
            else:
                column[home_name] = '-'
            box_score[str(inning)] = column

            prev_away = result['away_score']
            prev_home = result['home_score']

        box_score['R'] = {away_name: self.away_team.score, home_name: self.home_team.score}
        return pd.DataFrame(box_score)


## Getting Data...

The API is free, but I'm pretty sure it will rate-limit and flag IPs if it sees too many requests, so the goal here is to make as few calls as possible. To help with that, I'm going to save the responses as JSON files. There will be another section on loading the files from JSON, so only use this section if you need new data that hasn't been saved yet.

I think the first step is to get and store the raw data. Game objects should only be created when I want to do analysis.

In [None]:
# The ids of games whose play-by-play is already saved can be recovered by
# glob'ing the pbp folder: the part of each file name before the first '.'
# is parsed as an int (presumably the game_pk - confirm against the save code).
from pathlib import Path  # cell-local import; only this cell needs it

# Path(...).name takes the last path component using the OS's own separator.
# Splitting on '/' fails on Windows, where glob joins the file name with '\'.
pbp_game_ids = [int(Path(g).name.split('.')[0]) for g in glob.glob('data/pbp/*.json')]

### get raw data
This gets all games from 2015 to the current season. It checks whether previous seasons have already been stored as JSON files so as not to waste API calls. It always refreshes the current season's data, though.

In [None]:
# 31181 games
current_year = datetime.today().year

all_games = api_helpers.get_games()
print(f"returned {len(all_games)} games")

# Keep games whose coded state is 'F', then narrow to the current year's
# games of type 'R'. `season` is compared as a string.
finished_games = [g for g in all_games if g['status']['coded_game_state'] == 'F']
reg_season_games = [g for g in finished_games if g['season'] == str(current_year) and g['game_type'] == 'R']

# games_dict = {g['game_pk']:g for g in reg_season_games}

print(f"total: {len(all_games)}")
print(f"completed: {len(finished_games)}")
# Label with the year actually used in the filter above rather than a fixed "2025".
print(f"{current_year} reg szn: {len(reg_season_games)}")

In [None]:
# Box score and scoring summary for every game in `games`
# (per the original note these are the 10 most recent games; `games` is
# built elsewhere in the notebook).

for g in games:
    at_bat_total = len(g.pbp['all_plays'])
    print(f"{g.date.date()} at bats: {at_bat_total}")

    box_score_df = g.get_box_score()
    ICD.display(box_score_df)

    scoring_df = g.get_scoring_summary()
    ICD.display(scoring_df)

    print()

## PBP stuff

In [None]:
def get_plays(g):
    """Flatten a game's play-by-play into one summary dict per play.

    Args:
        g: a Game; reads `id`, `date`, `away_team.abbr`, `home_team.abbr`
            and `pbp['all_plays']`.

    Returns:
        A tuple `(plays, extras)`:
        - plays: list of dicts, one per entry of `all_plays`, holding game,
          score, inning, result, batter/pitcher fields, event counts and
          `exit_velo` / `exit_angle` / `distance` (0 when unavailable).
        - extras: dict keyed by `ab_id` ("<game id>-<at bat index>") with the
          play's `pitches`, `actions` and `runners` lists.

    Side effect: every play event and runner dict inside `g.pbp` gets an
    `ab_id` key added in place; the lists in `extras` hold those same dicts.

    Relies on `event_types_map`, which is defined elsewhere in the notebook.
    """
    out_list = []
    extra_stuff = {}

    for play in g.pbp['all_plays']:
        about = play['about']
        result = play['result']
        matchup = play['matchup']
        ab_id = f"{g.id}-{about['at_bat_index']}"

        for p_e in play['play_events']:
            p_e['ab_id'] = ab_id
        for r_e in play['runners']:
            r_e['ab_id'] = ab_id

        # TODO filter out pickoff attempts from pitches
        pitches = [e for e in play['play_events'] if e['index'] in play['pitch_index']]
        # TODO filter out
        actions = [e for e in play['play_events'] if e['index'] in play['action_index']]
        runners = list(play['runners'])

        play_dict = {
            'g_id': g.id,
            'ab_id': ab_id,
            'date': g.date,
            'a_team': g.away_team.abbr,
            'a_score': result['away_score'],  # after conclusion of at bat
            'h_team': g.home_team.abbr,
            'h_score': result['home_score'],  # after conclusion of at bat
            'at_bat': about['at_bat_index'],
            'inning': about['inning'],
            'inning_id': f"{g.id}-{about['inning']}-{about['half_inning'][0:3]}",
            'half': about['half_inning'],
            'result': result['event_type'],
            'desc': result['description'],
            'is_hit': event_types_map[result['event_type']]['hit'],
            'is_score': about['is_scoring_play'],
            'b_id': matchup['batter']['id'],
            'b_name': matchup['batter']['full_name'],
            'b_hand': matchup['bat_side']['description'],
            'p_id': matchup['pitcher']['id'],
            'p_name': matchup['pitcher']['full_name'],
            'p_hand': matchup['pitch_hand']['description'],
            'p_count': len(pitches),
            'a_count': len(actions),
            'r_count': len(runners),
        }

        # Hit data, when present, is attached to the play's last pitch.
        # A play with no pitch events has no last pitch, so fall back to the
        # same zeros used when hit data is absent instead of raising IndexError.
        hit = (pitches[-1].get('hit_data') or {}) if pitches else {}
        play_dict['exit_velo'] = hit.get('launch_speed', 0)
        play_dict['exit_angle'] = hit.get('launch_angle', 0)
        play_dict['distance'] = hit.get('total_distance', 0)

        extra_stuff[ab_id] = {'pitches': pitches, 'actions': actions, 'runners': runners}
        out_list.append(play_dict)

    return out_list, extra_stuff

In [None]:
# Summarise the first game in `game_objects` and preview its first ten rows.
plays_0, extra_0 = get_plays(game_objects[0])
plays_0_df = pd.DataFrame(plays_0)
plays_0_df = plays_0_df.set_index('ab_id')
plays_0_df['date'] = pd.to_datetime(plays_0_df['date'])

# Columns left out of the preview.
plays_col_hide = ['g_id', 'b_id', 'b_hand', 'p_id', 'p_hand', 'inning', 'half']
plays_cols = [col for col in plays_0_df.columns if col not in plays_col_hide]

plays_0_df.loc[:, plays_cols].head(10)

In [None]:
# Summarise the second game in `game_objects` and preview its last fifteen rows.
plays_1, extra_1 = get_plays(game_objects[1])
plays_1_df = pd.DataFrame(plays_1)
plays_1_df = plays_1_df.set_index('ab_id')
plays_1_df['date'] = pd.to_datetime(plays_1_df['date'])

# Columns left out of the preview.
plays_col_hide = ['g_id', 'b_id', 'b_hand', 'p_id', 'p_hand', 'inning', 'half']
plays_cols = [col for col in plays_1_df.columns if col not in plays_col_hide]

plays_1_df.loc[:, plays_cols].tail(15)


### pitches

called strikes-swinging strikes-foul balls-in play strikes

In [None]:
def get_count(c):
    """Format a count dict as "<balls>-<strikes>", e.g. {'balls': 3, 'strikes': 2} -> "3-2"."""
    balls, strikes = c['balls'], c['strikes']
    return "{}-{}".format(balls, strikes)

# Pitch result codes, grouped by the bucket code_map_flip assigns them to.
# NOTE(review): 'H' is listed in both ball_codes and hbp_codes, and
# code_map_flip never consults hbp_codes, so 'H' is reported as a ball.
foul_codes = ['R', 'L', 'F', 'T', 'O']
strike_codes = ['A', 'K', 'W', 'M', 'Q', 'AB', 'AC']
ball_codes = ['VS', 'VP', 'VC', 'VB', '*B', 'P', 'I', 'H', 'V', 'B']
hit_codes = ['X', 'E', 'D', 'Y', 'J', 'Z']
hbp_codes = ['H']
called_strike_codes = ['C']
swinging_strike_codes = ['S']


def code_map_flip(c):
    """Map a pitch result code to a short bucket label.

    The groups are checked in a fixed order and the first one containing
    `c` wins:
        'SS' strike swinging, 'SL' strike looking, 'IP' in play,
        'FB' foul ball, 'S_' other strike, 'B' ball.
    A code found in no group is returned wrapped in question marks ("?c?").
    """
    # Built on every call so the lookup always sees the current module-level lists.
    ordered_groups = (
        (swinging_strike_codes, 'SS'),
        (called_strike_codes, 'SL'),
        (hit_codes, 'IP'),
        (foul_codes, 'FB'),
        (strike_codes, 'S_'),
        (ball_codes, 'B'),
    )
    for codes, label in ordered_groups:
        if c in codes:
            return label
    return f"?{c}?"

#stat_count = {"balls": 0, "fouls": 0, 'in_play': 0, "swinging": 0}
#outs = 0

# Walk every row of plays_1_df, print the batter and description, and
# collect one flat dict per pitch into pitches_out.
pitches_out = []
for i, a_b in plays_1_df.iterrows():
    # only events flagged as real pitches
    pitches = [ev for ev in extra_1[i]['pitches'] if ev['is_pitch']]
    print(i, a_b['b_name'], '-', a_b['desc'])
    outs = 0  # reset per row; nothing reads it yet
    for p in pitches:
        p_data = p['pitch_data']
        p_dict = {'ab_id': i, 'pitch_number': p['pitch_number']}

        details = p['details']
        p_dict.update({
            'code': details['code'],
            'my_code': code_map_flip(details['code']),
            'in_play': details['is_in_play'],
            'is_out': details['is_out'],
            'strike': details['is_strike'],
            'ball': details['is_ball'],
            'type': details['type']['description'],
        })

        p_dict.update({
            'start_speed': p_data['start_speed'],
            'end_speed': p_data['end_speed'],
            'zone_top': p_data['strike_zone_top'],
            'zone_bot': p_data['strike_zone_bottom'],
            # projected time from release to front of plate
            'plate_time': p_data['plate_time'],
        })

        breaks = p_data['breaks']
        # 360/0 is top-spin, 90 puts it to 1b side
        p_dict['spin_direction'] = breaks['spin_direction']
        # rpm
        p_dict['spin_rate'] = breaks['spin_rate']

        coords = p_data['coordinates']
        p_dict['acceleration'] = {axis: coords[f"a_{axis}"] for axis in ('x', 'y', 'z')}
        # movement in inches, x is horizontal, z is vertical, so y must be pitch to catch?
        p_dict['movement'] = {axis: coords[f"pfx_{axis}"] for axis in ('x', 'z')}
        # position in feet as it crosses plate
        p_dict['position_plate'] = {axis: coords[f"p_{axis}"] for axis in ('x', 'z')}
        # coordinate at plate
        p_dict['coordinate_plate'] = {axis: coords[axis] for axis in ('x', 'y')}
        # coordinate location at pitcher
        p_dict['coordinate_pitcher'] = {axis: coords[f"{axis}0"] for axis in ('x', 'y', 'z')}
        p_dict['velocity'] = {axis: coords[f"v_{axis}0"] for axis in ('x', 'y', 'z')}
        # zone is assigned from catcher's perspective 1 is high and away to a leftie,
        # area around 1 is 11, area around 9 is 14
        p_dict['zone'] = p_data['zone']

        pitches_out.append(p_dict)
        # Y is catcher toward pitcher

        print(p_dict['pitch_number'], p_dict['zone'], p_dict['position_plate'], p_dict['movement'])
        

In [None]:
# Print the zone bounds, plate position and zone number of every collected pitch.
for p in pitches_out:
    summary = [p[field] for field in ('zone_top', 'zone_bot', 'position_plate', 'zone')]
    print(*summary)

In [None]:
def get_box(z_min, z_max, x_min, x_max):
    """Return the closed outline of an axis-aligned rectangle.

    The result is a dict with two five-element lists, `x` and `z`, tracing
    the corners (x_min, z_min) -> (x_min, z_max) -> (x_max, z_max) ->
    (x_max, z_min) and back to the start, ready to hand to `plt.plot`.
    """
    corners = [
        (x_min, z_min),
        (x_min, z_max),
        (x_max, z_max),
        (x_max, z_min),
    ]
    corners.append(corners[0])  # repeat the first corner to close the outline
    xs, zs = zip(*corners)
    return {'x': list(xs), 'z': list(zs)}

#strike_zone = get_box(18/12, 42/12, (-10/12), (10/12)) # actual strike zone. 17 inches for the plate plus 3 to account for width of ball

# Two example zones of the same width, drawn side by side by shifting one
# 0.3 to the left and the other 0.3 to the right.
zone_half_width = 10/12
strike_zone = get_box(
    z_min=1.7898,
    z_max=4.0018,
    x_min=-zone_half_width - .3,
    x_max=zone_half_width - .3,
)
strike_zone_alt = get_box(
    z_min=1.3,
    z_max=2.9,
    x_min=-zone_half_width + .3,
    x_max=zone_half_width + .3,
)
#inside = get_box(22/12, 38/12, -6.7/12, 6.7/12)
#shadow = get_box(14/12, 46/12, -13.3/12, 13.3/12)
#chase = get_box(0, 5, -20/12, 20/12)

In [None]:
plt.rcParams["figure.figsize"] = (5,10)
plt.xlim(-1.75, 1.75)
plt.ylim(0, 7)

# Each entry is (x values, z values, colour). For both zones a full-width
# line, a half-width line and the zone outline are drawn, in that order.
outlines = (
    ([(-10/12)-.3, (10/12)-.3], [6.58, 6.58], 'navy'),
    ([(-10/24)-.3, (10/24)-.3], [3.29, 3.29], 'navy'),
    (strike_zone['x'], strike_zone['z'], 'navy'),
    ([(-10/12)+.3, (10/12)+.3], [5.5, 5.5], 'orange'),
    ([(-10/24)+.3, (10/24)+.3], [2.75, 2.75], 'orange'),
    (strike_zone_alt['x'], strike_zone_alt['z'], 'orange'),
)
for x_points, z_points, line_colour in outlines:
    plt.plot(x_points, z_points, color=line_colour)

#plt.plot(inside['x'], inside['z'], linestyle='dashed', color='grey')
#plt.plot(shadow['x'], shadow['z'], color='grey')

#plt.plot([-3.3/12,-3.3/12],[14/12,46/12], color='red') # vert left
#plt.plot([3.3/12,3.3/12],[14/12,46/12], color='red') # vert right
#plt.plot([-13.3/12, 13.3/12],[34/12,34/12], color='red') # horiz top
#plt.plot([-13.3/12, 13.3/12],[26/12,26/12], color='red') # horiz bot


#5.3333
#6.58

# 5'4"
# 6'7"
# 29 = lowest ()

#pitch_plt = pitches_out[16:22] # [15:27] # 22:27 is bregg's 5 pitches

#for p in pitch_plt:
#    print(p['zone_top'], p['zone_bot'], p['position_plate'], p['zone'])

#x_vals = [-p['position_plate']['x'] for p in pitch_plt]
#z_vals = [p['position_plate']['z'] for p in pitch_plt]

#plt.plot(x_vals, z_vals, 'o')

plt.show()

### base running and actions

`runners` has an entry for any time something happens to a base runner (including the batter).  
When runners advance due to a stolen base or a wild pitch, I think it's useful to show that this happened before the hit, so it's clear what the hit itself produced.  

`actions` has things like wild pitches (which you can also find in `runners` if someone advances) and stolen bases, i.e. the pre-hit stuff, as well as game advisories like status changes and pitching/offense subs.  
It's mostly not needed as far as I can tell.

stolen base at bats:
- '778319-24'
- '778319-29'

wild pitch at bats:
- '778319-79'
- '778319-2'


Play example: 778463-28  
I think the batter tried to get to second and was thrown out: 778463-28  
It still counts as a single, so there is one entry in the array for 0 -> 1st base where he was safe, then another one for 1st -> 2nd where he was out.  
The base running is going to be very confusing.  
I'm mostly interested in pitching for now, so I will likely not mess with this.

In [None]:
# Event types treated as happening before the ball is put in play.
pre_hit_event_types = ('wild_pitch', 'stolen_base_2b', 'stolen_base_3b', 'stolen_base_home')


def describe_movement(m):
    """Describe one runner movement: where the out was made, or start -> end."""
    if m['is_out']:
        return f"out #{m['out_number']} at {m['out_base']}"
    return f"{m['start']} -> {m['end']}"


# For every at bat print, in order: pre-hit runner events, the batter's own
# entries, then the remaining runner entries.
for ab_id, v in extra_1.items():
    a_b = plays_1_df.loc[ab_id]
    batter_name = a_b['b_name']
    print(ab_id, batter_name)
    print()

    # Split the runner entries into the batter's own, pre-hit and post-hit.
    r_batter, pre_hit, post_hit = [], [], []
    for entry in v['runners']:
        if entry['details']['runner']['full_name'] == batter_name:
            r_batter.append(entry)
        elif entry['details']['event_type'] in pre_hit_event_types:
            pre_hit.append(entry)
        else:
            post_hit.append(entry)

    for r_r in pre_hit:
        m = r_r['movement']
        r_d = r_r['details']
        out_string = describe_movement(m)
        print(f"runner: {r_d['runner']['full_name']} | {r_d['event']} ({r_d['event_type']}) {r_d['movement_reason']} | origin {m['origin_base']} | {out_string}")
    if pre_hit:
        print()

    for r_b in r_batter:
        m = r_b['movement']
        r_d = r_b['details']
        out_string = describe_movement(m)
        print(f"batter: {r_d['runner']['full_name']} | {r_d['event']} ({r_d['event_type']}) | {out_string}")

    for r_r in post_hit:
        m = r_r['movement']
        r_d = r_r['details']
        out_string = describe_movement(m)
        print(f"runner: {r_d['runner']['full_name']} | {r_d['movement_reason']} | origin {m['origin_base']} | {out_string}")

    # Disabled: printing the non-advisory actions for the at bat.
    # for a in v['actions']:
    #     d = a['details']
    #     if d['event_type'] not in ['game_advisory']:
    #         print()
    #         print(f"action: {d['description']} | {d['event']} | {d['event_type']} | {'score' if d['is_scoring_play'] else 'no score'} | {'out' if d['is_out'] else ''}")

    print()