In [1]:
import pybaseball as bb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
start_dt = '2021-04-01'
end_dt = '2021-10-3'
pitchers_2021 = bb.pitching_stats_bref(2021).sort_values(by='IP', ascending=False)

In [3]:
def on_base_count(row: pd.Series) -> int:
    return sum(np.isfinite([row.on_1b, row.on_2b, row.on_3b]))

def generate(player_id: np.float64) -> pd.DataFrame:
    data = bb.statcast_pitcher(start_dt, end_dt, player_id)
    data = data[['player_name', 'batter', 'game_date', 'at_bat_number', 
                 'balls', 'strikes', 'pitch_name', 'on_3b', 'on_2b', 'on_1b']]
    data['on_base_count'] = data.apply(lambda row: on_base_count(row), axis=1)
    return data

def get_data(first: str, last: str) -> pd.DataFrame:
    try:
        l = bb.playerid_lookup(last, first)
        return generate(l['key_mlbam'][0])
    except KeyError:
        return None

In [4]:
def count_transitions(count: list) -> np.array:
    it = reversed(count)
    prev = next(it)
    out = np.zeros((4,4))
    for item in it:
        out[prev][item] += 1
        prev = item
    return out

def limiting_distribution(count: pd.Series, num_simulations: int = 100000) -> np.array:
    if count is not None:
        transitions = count_transitions(count['on_base_count'])   
        transitions_sum = transitions.sum(axis=1)
        transitions_sum = np.array([value if value else 1.0 for value in transitions_sum])
        probs = transitions / transitions_sum[:, np.newaxis]
        initial_state = [1, 0, 0, 0]
    
        return simulate(initial_state, probs, num_simulations)
    else:
        return None

def simulate(state, pi, i):
    for _ in range(i):
        state = np.dot(state, pi)
    return state

class Player:
    
    def __init__(self, first, last):
        self.first = first
        self.last = last
        self.data = get_data(self.first, self.last)
        self.pi = limiting_distribution(self.data, num_simulations = 100000)

In [5]:
woodruff = Player('brandon', 'woodruff')
bauer = Player('trevor', 'bauer')
scherzer = Player('max', 'scherzer')
lyles = Player('jordan', 'lyles')
gant = Player('john', 'gant')
test = Player('kyle', 'hendricks')

Gathering player lookup table. This may take a moment.
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


In [6]:
names = [ tuple(name.split()) for name in pitchers_2021['Name'][:50] ]
pitchers = [ Player(first, last) for first, last in names ]
pitchers = np.array([ pitcher for pitcher in pitchers if pitcher.pi is not None ])

Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering 

In [7]:
pis = np.array([pitcher.pi for pitcher in pitchers])
scores = np.dot(homers, pis.T).flatten()

NameError: name 'homers' is not defined

In [None]:
hrv = {pitcher.first + ' ' + pitcher.last: value for pitcher, value in zip(list(pitchers), scores)}
HRV = pd.DataFrame(hrv.items(), columns = ['Name', 'HRM'])
players = pitchers_2021[:50]
players = players[players['Name'] != 'Kyle Hendricks']
HRV['HR'] = list(players['HR'])
HRV['IP'] = list(players['IP'])
HRV['HRM*HR'] = HRV.apply(lambda row: row.HRM * row.HR, axis = 1)

In [None]:
HRV.sort_values(by='HRM')

In [None]:
HRV.to_csv(f'/Users/mason/w/cs/fun/baseball/data/pitching/HRV_2021.csv')