In [624]:
import numpy as np
import pandas as pd
import pybaseball
import seaborn as sns
import matplotlib.pyplot as plot
from deepdiff import DeepDiff
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register
from pybaseball import pitching_stats
from pybaseball import batting_stats
from time import sleep

cache.enable()
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

# These are the people

In [625]:
people = pd.DataFrame(people())

sleep(1)  
# sleep calls to prevent the pybaseball scraper 
# from throwing errors when importing tons of stuff

people = people.convert_dtypes() # cleanup; thank you NumPy

people = people[people.weight.notna()]

print(people.shape)
print("---")
people.sample(3)

(19729, 24)
---


Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
4499,delgaje01,1984,4,19,Venezuela,Aragua,Maracay,,,,,,,Jesus,Delgado,Jesus Andres,225,72,R,R,2008-09-17,2008-09-23,delgj001,delgaje01
3632,conrowi01,1877,4,5,USA,PA,Philadelphia,1959.0,12.0,6.0,USA,NJ,Mount Holly,Wid,Conroy,William Edward,158,69,R,R,1901-04-25,1911-10-05,conrw101,conrowi01
16690,seelbch02,1972,12,18,USA,TX,Lufkin,,,,,,,Chris,Seelbach,Christopher Don,180,76,R,R,2000-09-09,2001-05-20,seelc001,seelbch02


We need to add more ID info about them so that we don't drown in merge errors.

In [626]:
rosetta = pd.DataFrame(chadwick_register()) 

sleep(1)

rosetta = rosetta.convert_dtypes()

rosetta = rosetta.dropna(how='any')

print(rosetta.shape)
print('---')
rosetta.sample(3)

(20506, 8)
---


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
15368,Mosolf,Jim,119441,mosoj101,mosolji01,1009247,1929,1933
16094,O'Connell,Jimmy,119917,oconj102,o'conji01,1009715,1923,1924
784,Bagwell,Jeff,110432,bagwj001,bagweje01,547,1991,2005


## Merge people into rosetta

In [627]:
everyone = pd.merge(rosetta, people, left_on='key_bbref', right_on='playerID', how='left')

print(everyone.shape)
print('---')
everyone.sample(3)

(20506, 32)
---


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
1568,Blair,Carson,542942,blaic001,blairca01,5429,2015,2015,blairca01,1989,10,18,USA,TX,Carrollton,,,,,,,Carson,Blair,Carson Reynolds,210,74,R,R,2015-09-06,2015-09-24,blaic001,blairca01
2292,Brown,Tom,111593,browt104,brownto04,1001565,1978,1978,brownto04,1949,8,10,USA,LA,Lafayette,,,,,,,Tom,Brown,Thomas Dale,170,73,R,R,1978-09-14,1978-10-01,browt104,brownto04
6065,Flowers,Tyler,452095,flowt001,flowety01,9134,2009,2020,flowety01,1986,1,24,USA,GA,Roswell,,,,,,,Tyler,Flowers,Cole Tyler,260,76,R,R,2009-09-03,2020-09-27,flowt001,flowety01


## Init Fielding Data

In [628]:
# fielding stats by year 
fielding = pd.DataFrame(fielding()) # this is fine

fielding = fielding.convert_dtypes()

In [629]:
fielding = fielding.rename(columns={"playerID": "key_bbref"})

In [630]:
everyone = everyone[
    [
        "name_last",
        "name_first",
        "key_mlbam",
        "key_retro",
        "key_bbref",
        "key_fangraphs",
        "birthYear",
        "deathYear",
        "weight",
        "height",
        "bats",
        "throws",
    ]
]

In [631]:
df = everyone
df = fielding.merge(df, on='key_bbref', how='outer', validate='many_to_one')

## Batting data

In [632]:
# batting stats by year 
batting = pd.DataFrame(batting()) # this is fine

batting = batting.convert_dtypes()
batting.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)


Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
12629,aitchra01,1911,1,BRO,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12700,camniha01,1911,1,SLN,NL,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12738,cottren01,1911,1,PIT,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12830,griffha01,1911,1,CHN,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13045,pucketr01,1911,1,PHI,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110490,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
110491,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110492,zimmery01,2021,1,WAS,NL,110,255,27,62,16,0,14,46,0,0,16,77,0,0,0,2,9
110493,zuberty01,2021,1,KCA,AL,31,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [633]:
batting = batting.rename(columns={"playerID": "key_bbref"})

### Merge Batting Data

In [634]:
a = np.intersect1d(df.columns, batting.columns)
print(list(a))

['CS', 'G', 'SB', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID']


In [635]:
df = df.merge(batting, on=['CS', 'G', 'SB', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator=True)


## Init Pitching data

In [636]:
# pitching stats by year 
pitching = pd.DataFrame(pitching()) # this is fine

sleep(1)

pitching = pitching.convert_dtypes()

In [637]:
pitching = pitching.rename(columns={"playerID": "key_bbref"})

In [638]:
a = np.intersect1d(df.columns, pitching.columns)
print(list(a))

['BB', 'G', 'GIDP', 'GS', 'H', 'HBP', 'HR', 'IBB', 'R', 'SF', 'SH', 'SO', 'WP', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID']


In [639]:
df = df.merge(pitching, on=['BB', 'G', 'GIDP', 'GS', 'H', 'HBP', 'HR', 'IBB', 'R', 'SF', 'SH', 'SO', 'WP', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="second_merge")



## FIP, wRC+

In [640]:
# bwar_bat stats by year 
bwar_bat = pd.DataFrame(bwar_bat()) # this is fine

bwar_bat = bwar_bat.convert_dtypes()


In [641]:
bwar_bat = bwar_bat.rename(columns={
    "year_ID": "yearID",
    "player_ID": "key_bbref",
    "team_ID": "teamID",
    "lg_ID": "lgID",
})

In [642]:
a = np.intersect1d(df.columns, bwar_bat.columns)
print(list(a))

['G', 'key_bbref', 'lgID', 'teamID', 'yearID']


In [643]:
df = df.merge(bwar_bat, on=['G', 'key_bbref', 'lgID', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="third_merge")

In [644]:
# bwar_pit stats by year 
bwar_pitch = pd.DataFrame(bwar_pitch()) # this is fine

bwar_pitch = bwar_pitch.convert_dtypes()


In [645]:
bwar_pitch = bwar_pitch.rename(columns={
    "year_ID": "yearID",
    "player_ID": "key_bbref",
    "team_ID": "teamID",
    "lg_ID": "lgID",
})

In [646]:
a = np.intersect1d(df.columns, bwar_pitch.columns)
print(list(a))

['G', 'GS', 'WAA', 'WAR', 'WAR_rep', 'key_bbref', 'lgID', 'mlb_ID', 'name_common', 'salary', 'stint_ID', 'teamID', 'yearID']


In [647]:
df = df.merge(bwar_pitch, on=['G', 'GS', 'WAA', 'WAR', 'WAR_rep', 'key_bbref', 'lgID', 'mlb_ID', 'name_common', 'salary', 'stint_ID', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="fourth_merge")
df

Unnamed: 0,key_bbref,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,name_last,name_first,key_mlbam,key_retro,key_fangraphs,birthYear,deathYear,weight,height,bats,throws,AB,R,H,2B,3B,HR,RBI,BB,SO,IBB,HBP,SH,SF,GIDP,_merge,W,L,CG,SHO,SV,IPouts,ER,BAOpp,ERA,BK,BFP,GF,second_merge,name_common,mlb_ID,stint_ID,pitcher,PA,salary,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAR_rep,WAA,WAR,third_merge,RA,xRA,BIP,BIP_perc,ERA_plus,WAA_adj,fourth_merge
0,abercda01,1871,1,TRO,,SS,1,1,24,1,3,2,0,,,,,,Abercrombie,Frank,110018,aberd101,1000017,,,,,,,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,Frank Abercrombie,110018,1,N,4,,-1.3,-1.3,0.1,0.01,-0.08,-0.07,both,,,,,,,left_only
1,abercda01,1871,1,TRO,,,1,,,,,,,,,0,0,,,,,,,,,,,,,4,0,0,0,0,0,0,0,0,,,,,0,right_only,,,,,,,,,,,,,left_only,Frank Abercrombie,110018,1,N,4,,-1.3,-1.3,0.1,0.01,-0.08,-0.07,both,,,,,,,left_only
2,addybo01,1871,1,RC1,,2B,22,22,606,67,72,42,5,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,L,L,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
3,addybo01,1871,1,RC1,,SS,3,3,96,8,14,7,0,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,L,L,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
4,addybo01,1873,1,PH2,,2B,10,,249,24,23,8,2,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,L,L,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430630,strasst01,2022,,WSN,NL,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stephen Strasburg,544931,1,,,35000000,,,,0.044,-0.3146,-0.28,,7,2.188,15,0.0065,31.714286,-0.0052,right_only
430631,tetreja01,2022,,WSN,NL,,4,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Jackson Tetreault,676194,1,,,,,,,0.1971,-0.3824,-0.21,,15,10.557,77,0.0335,78.308333,-0.0233,right_only
430632,thompma02,2022,,WSN,NL,,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mason Thompson,666168,1,,,,,,,0.0389,0.2085,0.15,,0,2.201,10,0.0044,,-0.096,right_only
430633,vothau01,2022,,WSN,NL,,19,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Austin Voth,608723,1,,,875000,,,,0.1801,-1.2559,-0.81,,22,9.4,70,0.0305,39.866667,0.2644,right_only


### Clean-up

In [648]:
everyone = df[
    [
        "key_bbref",
        "name_last",
        "name_first",
        "weight",
        "height",
        "yearID",
        "POS",
        "WAR",
        "WAA",
        "pitcher",
        "ERA_plus",
        "birthYear",
        "deathYear",
        "teamID",
        "lgID",
        "runs_above_avg",
        "runs_above_avg_def",
        "WAA_adj",
        "G",
        "GS",
        "InnOuts",
        "PO",
        "A",
        "E",
        "DP",
        "PB",
        "WP",
        "SB",
        "CS",
        "ZR",
        "bats",
        "throws",
        "AB",
        "R",
        "H",
        "2B",
        "3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "IBB",
        "HBP",
        "SH",
        "SF",
        "GIDP",
        "W",
        "L",
        "CG",
        "SHO",
        "SV",
        "IPouts",
        "ER",
        "BAOpp",
        "ERA",
        "BK",
        "BFP",
        "GF",
        "name_common",
        "mlb_ID",
        "stint_ID",
        "PA",
        "salary",
        "runs_above_avg_off",
        "WAR_rep",
        "RA",
        "xRA",
        "BIP",
        "BIP_perc",
        "_merge",
        "second_merge",
        "third_merge",
        "fourth_merge",
        "key_mlbam",
        "key_retro",
        "key_fangraphs",
    ]
]

In [649]:
df = everyone[
    [
        "key_bbref",
        "weight",
        "height",
        "name_last",
        "name_first",
        "POS",
        "yearID",
        "birthYear",
        "deathYear",
        "teamID",
        "lgID",
        "AB",
        "R",
        "H",
        "2B",
        "3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "WAR",
        "WAA",
        "pitcher",
        "ERA_plus",
    ]
]

# Here it is:

In [650]:
df = df.sort_values('weight', ascending=False)

In [651]:
df = df.drop_duplicates(subset=['key_bbref'])

In [652]:
# Constants
KG_TO_LB = 0.453592
M_TO_IN = 0.0254

In [653]:
# BMI Calculations
df['KG'] = df['weight'] * KG_TO_LB
df['meters'] = df['height'] * M_TO_IN
df['BMI'] = df['KG'] / df['meters'] ** 2
df['ratio'] = df['meters'] * df['BMI']


After some noodling, this is the best combo of BMI:height that I could find such that we have enough players at each position to fill out a roster. 2B and 1B ended up being a bottleneck, which isn't surprising. Second-basemen are typically the smallest player on the team, and First-basemen are almost universally the tallest fielder on the roster.

In [654]:
tryouts = df[ # Let's separate the meat from the chaff
    (df.BMI >= df.BMI.quantile(0.99))
    & (df.height <= df.height.quantile(0.23))]

In [655]:
tryouts.POS.value_counts()

P     17
C     15
1B     8
OF     7
2B     3
SS     1
3B     1
Name: POS, dtype: Int64

In [656]:
# Probably easiest if we make ourselves a dataframe for every position, since we'll need to fill up a 26-man roster, which is parsed into several discrete roles that each require a particular number of players. 

# These role counts aren't codified, but teams have nearly always carried the same distribution of player-roles.

# The standard breakdown is:
# 13 Pitchers (5x SP, 7x RP)
# 5 OF
# 2 C
# 4 IF
# 2 Utility

# For Utility players, we'll likely take two extra middle infielders, because middle infielders can usually play any field position without looking like they have no idea what they're doing. E.g., you can put a shortstop at first base, and he'll be a pretty mediocre-to-bad first baseman — but if you put a first baseman at shortstop, he's likely to spend the entire game crying.

tryouts_P = tryouts[tryouts['POS'] == "P"]
tryouts_C = tryouts[tryouts['POS'] == "C"]
tryouts_1B = tryouts[tryouts['POS'] == "1B"]
tryouts_2B = tryouts[tryouts['POS'] == "2B"]
tryouts_3B = tryouts[tryouts['POS'] == "3B"]
tryouts_SS = tryouts[tryouts['POS'] == "SS"]
tryouts_OF = tryouts[tryouts['POS'] == "OF"]

In [657]:
test1 = tryouts_P.sort_values(["BMI"], ascending=False).nlargest(13, "BMI")
test1.head(7)

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
182432,colonba01,285,71,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,,129.27372,1.8034,39.748992,71.683331
243208,moronre01,265,70,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,,120.20188,1.778,38.02312,67.605107
216625,mijarjo01,265,71,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,,120.20188,1.8034,36.959589,66.652922
3994,healeto01,155,55,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,,70.30676,1.397,36.025023,50.326958
229336,machije01,257,71,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,,116.573144,1.8034,35.843828,64.640759
199068,rodnefe01,240,71,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
236741,castran01,240,71,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911


In [658]:
huskies_P = (
    tryouts_P.sort_values("BMI", ascending=False)
    .nlargest(13, "BMI")
)  # 13 heaviest P
huskies_P

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
182432,colonba01,285,71,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,,129.27372,1.8034,39.748992,71.683331
243208,moronre01,265,70,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,,120.20188,1.778,38.02312,67.605107
216625,mijarjo01,265,71,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,,120.20188,1.8034,36.959589,66.652922
3994,healeto01,155,55,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,,70.30676,1.397,36.025023,50.326958
229336,machije01,257,71,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,,116.573144,1.8034,35.843828,64.640759
199068,rodnefe01,240,71,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
236741,castran01,240,71,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
248116,colined01,240,71,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
245959,valdefr01,239,71,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,,108.408488,1.8034,33.333365,60.11339
224531,carigan01,235,71,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,,106.59412,1.8034,32.775484,59.107308


In [659]:
huskies_P = (
    tryouts_P.sort_values("BMI", ascending=False)
    .nlargest(13, "BMI")
)  # 13 heaviest P
huskies_P

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
182432,colonba01,285,71,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,,129.27372,1.8034,39.748992,71.683331
243208,moronre01,265,70,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,,120.20188,1.778,38.02312,67.605107
216625,mijarjo01,265,71,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,,120.20188,1.8034,36.959589,66.652922
3994,healeto01,155,55,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,,70.30676,1.397,36.025023,50.326958
229336,machije01,257,71,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,,116.573144,1.8034,35.843828,64.640759
199068,rodnefe01,240,71,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
236741,castran01,240,71,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
248116,colined01,240,71,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
245959,valdefr01,239,71,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,,108.408488,1.8034,33.333365,60.11339
224531,carigan01,235,71,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,,106.59412,1.8034,32.775484,59.107308


In [660]:
huskies_C = (
    tryouts_C.sort_values("BMI", ascending=False)
    .nlargest(2, "BMI")
)  # 2 heaviest C
huskies_C

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
248415,kirkal01,245,68,Kirk,Alejandro,C,2021,1998,,TOR,AL,,,,,,,,,,,,,,111.13004,1.7272,37.25171,64.341153
207632,penabr01,240,69,Pena,Brayan,C,2016,1982,,SLN,NL,,,,,,,,,,,,,,108.86208,1.7526,35.441412,62.114618


In [661]:
huskies_1B = (
    tryouts_1B.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest 1B
huskies_1B

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
217568,sandopa01,268,70,Sandoval,Pablo,1B,2010,1986,,SFN,NL,,,,,,,,,,,,,,121.562656,1.778,38.45357,68.370448


In [662]:
huskies_2B = (
    tryouts_2B.sort_values("BMI", ascending=False)
    .nlargest(3, "BMI")
)  # 3 heaviest 2B (1x starter, 2x utility)
huskies_2B

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
241635,whitety01,238,71,White,Tyler,2B,2017,1990,,HOU,AL,,,,,,,,,,,,,,107.954896,1.8034,33.193895,59.86187
232566,phegljo01,225,70,Phegley,Josh,2B,2013,1988,,CHA,AL,,,,,,,,,,,,,,102.0582,1.778,32.283781,57.400562
230370,solando01,210,68,Solano,Donovan,2B,2014,1987,,MIA,NL,,,,,,,,,,,,,,95.25432,1.7272,31.930037,55.14956


In [663]:
huskies_SS = (
    tryouts_SS.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest SS
huskies_SS

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
184377,tejadmi01,220,69,Tejada,Miguel,SS,1997,1974,,OAK,AL,,,,,,,,,,-0.23,-0.59,N,,99.79024,1.7526,32.487961,56.9384


In [664]:
huskies_3B = (
    tryouts_3B.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest 3B
huskies_3B

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
208969,callaal01,210,68,Callaspo,Alberto,3B,2010,1983,,KCA,AL,,,,,,,,,,,,,,95.25432,1.7272,31.930037,55.14956


In [665]:
huskies_OF = (
    tryouts_OF.sort_values("BMI", ascending=False)
    .nlargest(5, "BMI")
)  # 5 heaviest OF
huskies_OF

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
247295,naylojo01,250,71,Naylor,Josh,OF,2020,1997,,SDN,NL,,,,,,,,,,,,,,113.398,1.8034,34.867537,62.880115
223992,vicieda01,240,71,Viciedo,Dayán,OF,2012,1989,,CHA,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
247419,ramirha02,232,70,Ramirez,Harold,OF,2021,1994,,CLE,AL,,,,,,,,,,,,,,105.233344,1.778,33.288165,59.186358
244258,astudwi01,225,69,Astudillo,Willians,OF,2021,1991,,MIN,AL,,,,,,,,,,,,,,102.0582,1.7526,33.226324,58.232455
51268,fothebo01,230,70,Fothergill,Bob,OF,1923,1897,1938.0,DET,AL,,,,,,,,,,,,,,104.32616,1.778,33.001198,58.67613


In [666]:
huskies = pd.concat([
    huskies_P,
    huskies_C,
    huskies_1B,
    huskies_2B,
    huskies_3B,
    huskies_SS,
    huskies_OF
])

In [667]:
huskies.describe()

Unnamed: 0,weight,height,yearID,birthYear,deathYear,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,ERA_plus,KG,meters,BMI,ratio
count,26.0,26.0,26.0,26.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,26.0,26.0,26.0,26.0
mean,236.5,69.576923,2005.153846,1978.076923,1914.5,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.125,-0.305,,107.274508,1.767254,34.299658,60.596378
std,24.081943,3.1518,32.024606,31.757737,33.234019,,,,,,,,,,0.148492,0.403051,,10.923377,0.080056,2.208648,4.580691
min,155.0,55.0,1878.0,1853.0,1891.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.23,-0.59,,70.30676,1.397,31.930037,50.326958
25%,226.25,69.25,2010.0,1982.0,1902.75,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.1775,-0.4475,,102.62519,1.75895,32.775484,58.343374
50%,238.5,70.5,2013.5,1986.0,1914.5,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.125,-0.305,,108.181692,1.7907,33.4031,59.98763
75%,243.75,71.0,2019.25,1992.5,1926.25,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.0725,-0.1625,,110.56305,1.8034,35.743224,62.688741
max,285.0,71.0,2021.0,1998.0,1938.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,,129.27372,1.8034,39.748992,71.683331


In [668]:
huskies

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
182432,colonba01,285,71,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,,129.27372,1.8034,39.748992,71.683331
243208,moronre01,265,70,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,,120.20188,1.778,38.02312,67.605107
216625,mijarjo01,265,71,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,,120.20188,1.8034,36.959589,66.652922
3994,healeto01,155,55,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,,70.30676,1.397,36.025023,50.326958
229336,machije01,257,71,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,,116.573144,1.8034,35.843828,64.640759
199068,rodnefe01,240,71,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
236741,castran01,240,71,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
248116,colined01,240,71,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
245959,valdefr01,239,71,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,,108.408488,1.8034,33.333365,60.11339
224531,carigan01,235,71,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,,106.59412,1.8034,32.775484,59.107308


# Let's run this through Baseball Reference offline
They've got handy tool to produce an expected season for a given player. These are the CSVs that it spits out.

In [669]:
batters = pd.read_csv("../data/huskies_batters.csv")
pitchers = pd.read_csv("../data/huskies_pitchers.csv")

Time to take a look at our team

In [670]:
# Average Husky WAR
(pitchers.WAR.sum() + batters.WAR.sum()) / 26

1.2153846153846155

In [671]:
# Total Husky WAR
husky_war_sum = (pitchers.WAR.sum() + batters.WAR.sum())
# Average Husky height
husky_height_mean = (pitchers.height.sum() + batters.height.sum()) / 26
# Average Husky weight
husky_weight_mean = (pitchers.weight.sum() + batters.weight.sum()) / 26

In [672]:
# Average weight of all players
everyone_weight_mean = everyone.weight.mean()
# Average height of all players
everyone_height_mean = everyone.height.mean()

In [673]:
pitchers.WAR.sum()

13.899999999999999

In [674]:
batters.describe(
)

Unnamed: 0,height,weight,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,12.0,13.0,13.0,12.0,12.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,69.538462,233.307692,162.0,556.307692,509.076923,60.461538,137.307692,27.384615,1.923077,14.153846,66.769231,2.461538,1.923077,35.692308,79.923077,0.269308,0.320692,0.413231,0.733923,98.846154,211.0,15.0,5.153846,1.923077,4.583333,2.083333,0.615385,-1.461538,-0.615385,-3.615385,0.0,-5.230769,-0.507692,19.384615,14.384615,1.361538,0.496692,0.498923,1.723077,-0.315385,18.0
std,1.126601,16.188394,0.0,53.658464,49.358318,13.684635,20.093053,4.407162,1.977437,6.375031,14.359398,1.941451,2.059998,13.780328,34.731018,0.025227,0.028946,0.039368,0.062551,16.118551,33.662541,4.0,2.703274,3.593085,0.792961,1.831955,10.882237,1.898042,1.192928,3.01492,6.244998,13.766831,1.412717,2.063107,14.505967,1.476222,0.008873,0.005155,1.314758,0.838497,12.529964
min,68.0,210.0,162.0,495.0,459.0,35.0,105.0,19.0,0.0,5.0,42.0,0.0,0.0,10.0,24.0,0.225,0.268,0.351,0.649,76.0,161.0,9.0,1.0,0.0,3.0,0.0,-14.0,-6.0,-2.0,-8.0,-9.0,-19.0,-1.8,17.0,-2.0,-0.2,0.488,0.491,0.3,-1.5,5.0
25%,69.0,225.0,162.0,521.0,478.0,55.0,126.0,24.0,1.0,8.0,62.0,1.0,1.0,27.0,54.0,0.257,0.299,0.381,0.697,91.0,192.0,12.5,3.0,0.0,4.0,1.0,-4.0,-3.0,-2.0,-5.0,-6.0,-15.0,-1.6,18.0,5.0,0.4,0.49,0.496,0.9,-1.1,10.0
50%,70.0,232.0,162.0,544.0,481.0,59.0,139.0,28.0,2.0,15.0,68.0,2.0,1.0,32.0,89.0,0.27,0.32,0.41,0.725,97.0,206.0,14.5,5.0,0.0,5.0,2.0,-1.0,-1.0,-1.0,-3.0,1.0,-10.0,-0.8,19.0,8.0,0.9,0.494,0.497,1.5,-0.2,16.0
75%,70.0,240.0,162.0,586.0,541.0,66.0,150.0,30.0,2.0,18.0,71.0,3.0,2.0,42.0,98.0,0.28,0.33,0.443,0.773,108.0,229.0,17.5,7.0,2.0,5.0,2.25,8.0,0.0,0.0,-2.0,5.0,0.0,0.0,20.0,20.0,2.1,0.5,0.502,2.3,0.4,23.0
max,71.0,268.0,162.0,687.0,629.0,92.0,180.0,35.0,8.0,23.0,97.0,6.0,8.0,58.0,132.0,0.325,0.374,0.48,0.854,136.0,287.0,21.0,9.0,13.0,6.0,6.0,27.0,1.0,1.0,3.0,9.0,29.0,3.1,24.0,48.0,4.9,0.519,0.508,4.6,1.0,44.0


In [675]:
batters['total_average'] = ((batters['TB'] + batters['HBP'] + batters['BB'] + batters['SB'] - batters['CS']) / (batters['AB'] - batters['H'] + batters['CS'] + batters['GDP']))

In [676]:
batters.WAR.sum()

17.700000000000003

In [677]:
17.7 + 13.9

31.6

In [678]:
pitchers.median()

  pitchers.median()


height      71.0000
weight     240.0000
W            4.0000
L            5.0000
W-L%         0.4665
Dec          3.6800
ERA         68.0000
G            0.0000
GS          13.0000
CG           0.0000
SHO          0.0000
SV           0.0000
IP          67.0000
H          109.0000
R           42.0000
ER          40.0000
HR          14.0000
BB          48.0000
IBB          2.0000
SO          67.0000
HBP          3.0000
BK           0.0000
WP           4.0000
BF         413.0000
ERA+▼      109.0000
FIP          4.1500
WHIP         1.3560
H9           8.9000
HR9          0.8000
BB9          3.8000
SO9          7.3000
SO/BB        2.2000
IP.1        67.0000
G.1         68.0000
GS.1         0.0000
R.1         42.0000
RA9          4.1800
RA9opp       4.6800
RA9def       0.0600
RA9role     -0.3100
PPFp        95.8000
RA9avg       4.6700
RAA          4.0000
WAA          0.1000
gmLI         1.0000
WAAadj       0.0000
WAR          0.5000
RAR          6.0000
waaWL%       0.5060
162WL%       0.5000


In [679]:
pitchers.WAR.sum()

13.899999999999999

In [680]:
people.shape

(19729, 24)

In [681]:
people.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981,12,27,USA,CO,Denver,,,,,,,David,Aardsma,David Allan,215,75,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934,2,5,USA,AL,Mobile,2021.0,1.0,22.0,USA,GA,Atlanta,Hank,Aaron,Henry Louis,180,72,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939,8,5,USA,AL,Mobile,1984.0,8.0,16.0,USA,GA,Atlanta,Tommie,Aaron,Tommie Lee,190,75,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954,9,8,USA,CA,Orange,,,,,,,Don,Aase,Donald William,190,75,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972,8,25,USA,FL,Palm Beach,,,,,,,Andy,Abad,Fausto Andres,184,73,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [682]:
people.weight.describe()

count    19729.000000
mean       188.133712
std         22.495432
min         65.000000
25%        172.000000
50%        185.000000
75%        200.000000
max        320.000000
Name: weight, dtype: float64

In [693]:
comparison = pd.DataFrame()

In [694]:
comparison['everyone_weight'] = people.weight.describe()
comparison['huskies_weight'] = huskies.weight.describe()
comparison['delta_weight'] = comparison['huskies_weight'] - comparison['everyone_weight']
comparison['everyone_height'] = people.height.describe()
comparison['huskies_height'] = huskies.height.describe()
comparison['delta_height'] = comparison['huskies_height'] - comparison['everyone_height']


In [695]:
comparison = comparison.convert_dtypes()
comparison = comparison.drop(labels='count')

In [697]:
comparison.to_csv('body_comparison.csv')

In [698]:
huskies.playerID

AttributeError: 'DataFrame' object has no attribute 'playerID'

In [701]:
brefids = huskies.key_bbref.to_list()

In [702]:
brefids

['colonba01',
 'moronre01',
 'mijarjo01',
 'healeto01',
 'machije01',
 'rodnefe01',
 'castran01',
 'colined01',
 'valdefr01',
 'carigan01',
 'akinke01',
 'mateoju01',
 'gonzaen01',
 'kirkal01',
 'penabr01',
 'sandopa01',
 'whitety01',
 'phegljo01',
 'solando01',
 'callaal01',
 'tejadmi01',
 'naylojo01',
 'vicieda01',
 'ramirha02',
 'astudwi01',
 'fothebo01']