In [186]:
import numpy as np
import pandas as pd
import pybaseball
import seaborn as sns
import matplotlib.pyplot as plot
from deepdiff import DeepDiff
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register
from pybaseball import pitching_stats
from pybaseball import batting_stats
from time import sleep

cache.enable()
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 10)

# These are the people

In [187]:
biofile = pd.read_csv('/Users/bean/Dropbox/code/_projects/baseball/_data/retrosheet/biofile.txt')

In [188]:
biofile = biofile.convert_dtypes()

In [189]:
biofile[['FEET', 'INCHES']] = biofile['HEIGHT'].str.split('-', 1, expand=True)

In [190]:
biofile['FEET'] = biofile['FEET'].astype(float)
biofile['INCHES'] = biofile['FEET'].astype(float)

In [191]:
biofile.INCHES.dtype

dtype('float64')

In [192]:
biofile['BMI'] = ((biofile['WEIGHT'] * KG_TO_LB) / (((biofile['FEET'] * 12) + biofile['INCHES']) * INCH_TO_METER) ** 2)

In [193]:
biofile = biofile.convert_dtypes()
biofile_small = biofile[['PLAYERID', 'FIRST', 'LAST', 'INCHES', 'WEIGHT', 'BMI']]

In [194]:
biofile_small.describe().to_csv('biofile_small.csv')

In [195]:
KG_TO_LB = 0.4536
INCH_TO_METER = .0254

In [196]:
biofile_small.BMI.describe()

count        19840
mean           NaN
std           <NA>
min            NaN
25%       21.49422
50%      23.689866
75%      27.457133
max            NaN
Name: BMI, dtype: object

In [197]:
topweight = biofile.loc[biofile['WEIGHT'] == biofile['WEIGHT'].max()]
botweight = biofile.loc[biofile['WEIGHT'] == biofile['WEIGHT'].min()]

In [198]:
botweight

Unnamed: 0,PLAYERID,LAST,FIRST,NICKNAME,BIRTHDATE,BIRTH CITY,BIRTH STATE,BIRTH COUNTRY,PLAY DEBUT,PLAY LASTGAME,MGR DEBUT,MGR LASTGAME,COACH DEBUT,COACH LASTGAME,UMP DEBUT,UMP LASTGAME,DEATHDATE,DEATH CITY,DEATH STATE,DEATH COUNTRY,BATS,THROWS,HEIGHT,WEIGHT,CEMETERY,CEME CITY,CEME STATE,CEME COUNTRY,CEME NOTE,BIRTH NAME,NAME CHG,BAT CHG,HOF,FEET,INCHES,BMI
6155,gaede101,Gaedel,Edward Carl,Eddie,06/08/1925,Chicago,Illinois,USA,08/19/1951,08/19/1951,,,,,,,06/18/1961,Chicago,Illinois,USA,R,L,3-07,65,St. Mary Cemetery,Evergreen Park,Illinois,USA,,Edward Carl Gaedele,,,NOT,3,3,30.045684


In [199]:
topweight

Unnamed: 0,PLAYERID,LAST,FIRST,NICKNAME,BIRTHDATE,BIRTH CITY,BIRTH STATE,BIRTH COUNTRY,PLAY DEBUT,PLAY LASTGAME,MGR DEBUT,MGR LASTGAME,COACH DEBUT,COACH LASTGAME,UMP DEBUT,UMP LASTGAME,DEATHDATE,DEATH CITY,DEATH STATE,DEATH COUNTRY,BATS,THROWS,HEIGHT,WEIGHT,CEMETERY,CEME CITY,CEME STATE,CEME COUNTRY,CEME NOTE,BIRTH NAME,NAME CHG,BAT CHG,HOF,FEET,INCHES,BMI
21177,mcshj901,McSherry,John Patrick,John,09/11/1944,Bronx,New York,USA,,,,,,,06/01/1971,10/01/1995,04/01/1996,Cincinnati,Ohio,USA,,,6-03,351,Cemetery of the Gate of Heaven,Hawthorne,New York,USA,"Section 44, Plot 480, Grave 3; N41.05.184 W73....",,,,NOT,6,6,40.561673


In [200]:
botweight

Unnamed: 0,PLAYERID,LAST,FIRST,NICKNAME,BIRTHDATE,BIRTH CITY,BIRTH STATE,BIRTH COUNTRY,PLAY DEBUT,PLAY LASTGAME,MGR DEBUT,MGR LASTGAME,COACH DEBUT,COACH LASTGAME,UMP DEBUT,UMP LASTGAME,DEATHDATE,DEATH CITY,DEATH STATE,DEATH COUNTRY,BATS,THROWS,HEIGHT,WEIGHT,CEMETERY,CEME CITY,CEME STATE,CEME COUNTRY,CEME NOTE,BIRTH NAME,NAME CHG,BAT CHG,HOF,FEET,INCHES,BMI
6155,gaede101,Gaedel,Edward Carl,Eddie,06/08/1925,Chicago,Illinois,USA,08/19/1951,08/19/1951,,,,,,,06/18/1961,Chicago,Illinois,USA,R,L,3-07,65,St. Mary Cemetery,Evergreen Park,Illinois,USA,,Edward Carl Gaedele,,,NOT,3,3,30.045684


In [201]:
biofile.loc[biofile['PLAYERID'] == 'troum001']

Unnamed: 0,PLAYERID,LAST,FIRST,NICKNAME,BIRTHDATE,BIRTH CITY,BIRTH STATE,BIRTH COUNTRY,PLAY DEBUT,PLAY LASTGAME,MGR DEBUT,MGR LASTGAME,COACH DEBUT,COACH LASTGAME,UMP DEBUT,UMP LASTGAME,DEATHDATE,DEATH CITY,DEATH STATE,DEATH COUNTRY,BATS,THROWS,HEIGHT,WEIGHT,CEMETERY,CEME CITY,CEME STATE,CEME COUNTRY,CEME NOTE,BIRTH NAME,NAME CHG,BAT CHG,HOF,FEET,INCHES,BMI
18405,troum001,Trout,Michael Nelson,Mike,08/07/1991,Vineland,New Jersey,USA,07/08/2011,05/17/2021,,,,,,,,,,,R,R,6-01,200,,,,,,,,,NOT,6,6,23.112065


In [202]:
biofile.groupby('HEIGHT').describe()

Unnamed: 0_level_0,WEIGHT,WEIGHT,WEIGHT,WEIGHT,WEIGHT,WEIGHT,WEIGHT,WEIGHT,NAME CHG,NAME CHG,NAME CHG,NAME CHG,NAME CHG,NAME CHG,NAME CHG,NAME CHG,BAT CHG,BAT CHG,BAT CHG,BAT CHG,BAT CHG,BAT CHG,BAT CHG,BAT CHG,FEET,FEET,FEET,FEET,FEET,FEET,FEET,FEET,INCHES,INCHES,INCHES,INCHES,INCHES,INCHES,INCHES,INCHES,BMI,BMI,BMI,BMI,BMI,BMI,BMI,BMI
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
HEIGHT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2
3-07,1,65.0,,65,65,65,65,65,0,,,,,,,,0,,,,,,,,1,3.0,,3,3,3,3,3,1,3.0,,3,3,3,3,3,1,30.045684,,30.045684,30.045684,30.045684,30.045684,30.045684
36678,1,200.0,,200,200,200,200,200,0,,,,,,,,0,,,,,,,,1,36678.0,,36678,36678,36678,36678,36678,1,36678.0,,36678,36678,36678,36678,36678,1,0.000001,,0.000001,0.000001,0.000001,0.000001,0.000001
44325,1,140.0,,140,140,140,140,140,0,,,,,,,,0,,,,,,,,1,44325.0,,44325,44325,44325,44325,44325,1,44325.0,,44325,44325,44325,44325,44325,1,0.0,,0.0,0.0,0.0,0.0,0.0
44327,1,194.0,,194,194,194,194,194,0,,,,,,,,0,,,,,,,,1,44327.0,,44327,44327,44327,44327,44327,1,44327.0,,44327,44327,44327,44327,44327,1,0.0,,0.0,0.0,0.0,0.0,0.0
5-03,10.0,144.5,14.222439,125.0,135.75,142.5,155.0,170.0,0,,,,,,,,0,,,,,,,,10.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,10.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,10.0,24.045792,2.366712,20.800858,22.589732,23.712978,25.793064,28.289167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6-07.5,1,235.0,,235,235,235,235,235,0,,,,,,,,0,,,,,,,,1,6.0,,6,6,6,6,6,1,6.0,,6,6,6,6,6,1,27.156676,,27.156676,27.156676,27.156676,27.156676,27.156676
6-08,37.0,226.810811,18.133514,185.0,220.0,225.0,240.0,265.0,0,,,,,,,,0,,,,,,,,37.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,37.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,37.0,26.210331,2.095515,21.37866,25.423271,26.001073,27.734478,30.623486
6-09,12.0,237.833333,27.185669,180.0,225.0,242.0,255.0,280.0,0,,,,,,,,0,,,,,,,,12.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,12.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,12.0,27.484097,3.141585,20.800858,26.001073,27.965598,29.467882,32.35689
6-10,6.0,244.166667,17.724747,225.0,231.25,240.0,256.25,270.0,0,,,,,,,,0,,,,,,,,6.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0,6.0,28.215979,2.048278,26.001073,26.723325,27.734478,29.612333,31.201287


In [203]:
people = pd.DataFrame(people())

sleep(1)  
# sleep calls to prevent the pybaseball scraper 
# from throwing errors when importing tons of stuff

people = people.convert_dtypes() # cleanup; thank you NumPy

people = people[people.weight.notna()]

print(people.shape)
print("---")
people.sample(3)

(19729, 24)
---


Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
3432,cocopa01,1977,9,8,D.R.,Distrito Nacional,Santo Domingo,,,,,,,Pasqual,Coco,Pasqual,185,73,R,R,2000-07-17,2002-04-14,cocop001,cocopa01
9084,jewettr99,1964,3,3,USA,TX,Dallas,,,,,,,Trent,Jewett,Phillip Trent,195,74,R,R,,,jewet801,
8337,hittbr01,1897,3,14,USA,TX,Comanche,1973.0,11.0,10.0,USA,OR,Portland,Bruce,Hitt,Bruce Smith,190,73,R,R,1917-09-23,1917-09-30,hittb101,hittbr01


We need to add more ID info about them so that we don't drown in merge errors.

In [204]:
rosetta = pd.DataFrame(chadwick_register()) 

sleep(1)

rosetta = rosetta.convert_dtypes()

rosetta = rosetta.dropna(how='any')

print(rosetta.shape)
print('---')
rosetta.sample(3)

(20506, 8)
---


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
5729,Donnelly,Jim,113448,donnj101,donneji01,1003385,1884,1898
23524,Williams,Rick,124334,willr101,williri03,1014035,1978,1979
15861,Ni,Fu-Te,547820,ni--f001,nifu01,4199,2009,2010


## Merge people into rosetta

In [205]:
everyone = pd.merge(rosetta, people, left_on='key_bbref', right_on='playerID', how='left')

print(everyone.shape)
print('---')
everyone.sample(3)

(20506, 32)
---


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
10353,Lambert,Gene,117433,lambg101,lambege01,1007274,1941,1942,lambege01,1921,4,26,USA,MS,Crenshaw,2000.0,2.0,10.0,USA,TN,Germantown,Gene,Lambert,Eugene Marion,175,71,R,R,1941-09-14,1942-04-26,lambg101,lambege01
4896,Dobb,John,113403,dobbj102,dobbjo01,1003339,1924,1924,dobbjo01,1901,11,15,USA,MI,Muskegon,1991.0,7.0,31.0,USA,MI,Muskegon,John,Dobb,John Kenneth,180,74,R,L,1924-08-13,1924-08-30,dobbj102,dobbjo01
18602,Toms,Tommy,123393,tomst101,tomsto01,1013115,1975,1977,tomsto01,1951,10,15,USA,VA,Charlottesville,,,,,,,Tommy,Toms,Thomas Howard,195,76,R,R,1975-05-04,1977-07-02,tomst101,tomsto01


In [206]:
# Constants
KG_TO_LB = 0.453592
M_TO_IN = 0.0254

# BMI Calculations
everyone['KG'] = everyone['weight'] * KG_TO_LB
everyone['meters'] = everyone['height'] * M_TO_IN
everyone['BMI'] = everyone['KG'] / everyone['meters'] ** 2
everyone['ratio'] = everyone['meters'] * everyone['BMI']

## Init Fielding Data

In [207]:
# fielding stats by year 
fielding = pd.DataFrame(fielding()) # this is fine

fielding = fielding.convert_dtypes()

In [208]:
fielding = fielding.rename(columns={"playerID": "key_bbref"})

In [209]:
everyone = everyone[
    [
        "name_last",
        "name_first",
        "key_mlbam",
        "key_retro",
        "key_bbref",
        "key_fangraphs",
        "birthYear",
        "deathYear",
        "weight",
        "height",
        "BMI",
        "bats",
        "throws",
    ]
]

In [210]:
biocomp = everyone[
    [
        "weight",
        "height",
        "BMI",
    ]
]

In [211]:
biocomp.describe().to_csv('biocomp.csv')

In [212]:
everyone.describe().to_csv('everyone_describe.csv')

In [213]:
df = everyone
df = fielding.merge(df, on='key_bbref', how='outer', validate='many_to_one')

## Batting data

In [214]:
# batting stats by year 
batting = pd.DataFrame(batting()) # this is fine

batting = batting.convert_dtypes()
batting.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)


Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
12629,aitchra01,1911,1,BRO,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12700,camniha01,1911,1,SLN,NL,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12738,cottren01,1911,1,PIT,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12830,griffha01,1911,1,CHN,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13045,pucketr01,1911,1,PHI,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110490,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
110491,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110492,zimmery01,2021,1,WAS,NL,110,255,27,62,16,0,14,46,0,0,16,77,0,0,0,2,9
110493,zuberty01,2021,1,KCA,AL,31,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [215]:
batting.sample(10)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
85789,sizemgr01,2004,1,CLE,AL,43,138,15,34,6,2,4,24,2,0.0,14,34,0.0,5,0,2.0,0.0
98649,chacijh01,2014,1,COL,NL,12,15,2,5,1,0,0,0,0,0.0,1,4,0.0,0,1,0.0,0.0
103974,rodrise01,2017,1,ATL,NL,15,37,6,6,1,0,2,3,1,0.0,8,19,1.0,1,1,0.0,1.0
67762,gerenbo01,1990,1,NYA,AL,110,277,21,59,7,0,8,31,0,0.0,13,73,1.0,5,6,2.0,7.0
96988,wilsobr01,2012,1,SFN,NL,2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0.0
91248,seaybo01,2008,1,DET,AL,60,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0.0
80137,myettaa01,2000,1,CHA,AL,2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0.0
23014,cochrmi01,1929,1,PHA,AL,135,514,113,170,37,8,7,95,7,6.0,69,8,,2,21,,
64128,stewada01,1986,2,OAK,AL,29,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0.0
22213,orsater01,1927,1,SLN,NL,27,92,15,29,7,3,0,12,2,,11,12,,0,2,,


In [216]:
batting.describe().to_csv('mlb_batting.csv')

In [217]:
batting = batting.rename(columns={"playerID": "key_bbref"})

### Merge Batting Data

In [218]:
a = np.intersect1d(df.columns, batting.columns)
print(list(a))

['CS', 'G', 'SB', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID']


In [219]:
bats = df.merge(batting, on=['CS', 'G', 'SB', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator=True)


In [220]:
bats

Unnamed: 0,key_bbref,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,name_last,name_first,key_mlbam,key_retro,key_fangraphs,birthYear,deathYear,weight,height,BMI,bats,throws,AB,R,H,2B,3B,HR,RBI,BB,SO,IBB,HBP,SH,SF,GIDP,_merge
0,abercda01,1871,1,TRO,,SS,1,1,24,1,3,2,0,,,,,,Abercrombie,Frank,110018,aberd101,1000017,,,,,,,,,,,,,,,,,,,,,,left_only
1,addybo01,1871,1,RC1,,2B,22,22,606,67,72,42,5,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,left_only
2,addybo01,1871,1,RC1,,SS,3,3,96,8,14,7,0,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,left_only
3,addybo01,1873,1,PH2,,2B,10,,249,24,23,8,2,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,left_only
4,addybo01,1873,2,BS1,,OF,31,,843,30,3,14,0,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256727,zimmebr01,2021,1,CLE,AL,,99,,,,,,,,,15,3,,,,,,,,,,,,,,299,44,68,9,1,8,35,30,122,0,15,0,4,3,right_only
256728,zimmebr02,2021,1,BAL,AL,,14,,,,,,,,,0,0,,,,,,,,,,,,,,4,0,0,0,0,0,0,0,3,0,0,0,0,0,right_only
256729,zimmeky01,2021,1,KCA,AL,,52,,,,,,,,,0,0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,right_only
256730,zimmery01,2021,1,WAS,NL,,110,,,,,,,,,0,0,,,,,,,,,,,,,,255,27,62,16,0,14,46,16,77,0,0,0,2,9,right_only


## Init Pitching data

In [221]:
# pitching stats by year 
pitching = pd.DataFrame(pitching()) # this is fine

sleep(1)

pitching = pitching.convert_dtypes()

In [222]:
pitching = pitching.rename(columns={"playerID": "key_bbref"})

In [223]:
a = np.intersect1d(df.columns, pitching.columns)
print(list(a))

['G', 'GS', 'WP', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID']


In [224]:
df = df.merge(pitching, on=[ 'G', 'GS', 'WP', 'key_bbref', 'lgID', 'stint', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="second_merge")



## FIP, wRC+

In [225]:
# bwar_bat stats by year 
bwar_bat = pd.DataFrame(bwar_bat()) # this is fine

bwar_bat = bwar_bat.convert_dtypes()


In [226]:
bwar_bat = bwar_bat.rename(columns={
    "year_ID": "yearID",
    "player_ID": "key_bbref",
    "team_ID": "teamID",
    "lg_ID": "lgID",
})

In [227]:
a = np.intersect1d(df.columns, bwar_bat.columns)
print(list(a))

['G', 'key_bbref', 'lgID', 'teamID', 'yearID']


In [228]:
df = df.merge(bwar_bat, on=['G', 'key_bbref', 'lgID', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="third_merge")

In [229]:
# bwar_pit stats by year 
bwar_pitch = pd.DataFrame(bwar_pitch()) # this is fine

bwar_pitch = bwar_pitch.convert_dtypes()


In [230]:
bwar_pitch = bwar_pitch.rename(columns={
    "year_ID": "yearID",
    "player_ID": "key_bbref",
    "team_ID": "teamID",
    "lg_ID": "lgID",
})

In [231]:
a = np.intersect1d(df.columns, bwar_pitch.columns)
print(list(a))

['G', 'GS', 'WAA', 'WAR', 'WAR_rep', 'key_bbref', 'lgID', 'mlb_ID', 'name_common', 'salary', 'stint_ID', 'teamID', 'yearID']


In [232]:
df = df.merge(bwar_pitch, on=['G', 'GS', 'WAA', 'WAR', 'WAR_rep', 'key_bbref', 'lgID', 'mlb_ID', 'name_common', 'salary', 'stint_ID', 'teamID', 'yearID'], how='outer', validate='many_to_many', indicator="fourth_merge")
df

Unnamed: 0,key_bbref,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,name_last,name_first,key_mlbam,key_retro,key_fangraphs,birthYear,deathYear,weight,height,BMI,bats,throws,W,L,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BAOpp,ERA,IBB,HBP,BK,BFP,GF,R,SH,SF,GIDP,second_merge,name_common,mlb_ID,stint_ID,pitcher,PA,salary,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAR_rep,WAA,WAR,third_merge,RA,xRA,BIP,BIP_perc,ERA_plus,WAA_adj,fourth_merge
0,abercda01,1871,1,TRO,,SS,1,1,24,1,3,2,0,,,,,,Abercrombie,Frank,110018,aberd101,1000017,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,Frank Abercrombie,110018,1,N,4,,-1.3,-1.3,0.1,0.01,-0.08,-0.07,both,,,,,,,left_only
1,addybo01,1871,1,RC1,,2B,22,22,606,67,72,42,5,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
2,addybo01,1871,1,RC1,,SS,3,3,96,8,14,7,0,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
3,addybo01,1873,1,PH2,,2B,10,,249,24,23,8,2,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
4,addybo01,1873,2,BS1,,OF,31,,843,30,3,14,0,,,,,,Addy,Bob,110074,addyb101,1000070,1842,1910,160,68,24.327647,L,L,,,,,,,,,,,,,,,,,,,,,,,left_only,,,,,,,,,,,,,left_only,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351906,strasst01,2022,,WSN,NL,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stephen Strasburg,544931,1,,,35000000,,,,0.044,-0.3143,-0.28,,7,2.205,15,0.0064,31.571429,-0.0051,right_only
351907,tetreja01,2022,,WSN,NL,,4,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Jackson Tetreault,676194,1,,,,,,,0.1967,-0.3812,-0.21,,15,10.559,77,0.0327,77.933333,-0.023,right_only
351908,thompma02,2022,,WSN,NL,,6,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mason Thompson,666168,1,,,,,,,0.0485,0.2664,0.2,,0,2.712,12,0.0051,,-0.1145,right_only
351909,vothau01,2022,,WSN,NL,,19,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Austin Voth,608723,1,,,875000,,,,0.1801,-1.2578,-0.81,,22,9.389,70,0.0297,39.67619,0.2655,right_only


### Clean-up

In [233]:
all_players = df.drop_duplicates(subset=['key_bbref'])

In [235]:
everyone = df[
    [
        "key_bbref",
        "name_last",
        "name_first",
        "weight",
        "height",
        "BMI",
        "yearID",
        "POS",
        "WAR",
        "WAA",
        "pitcher",
        "ERA_plus",
        "birthYear",
        "deathYear",
        "teamID",
        "lgID",
        "runs_above_avg",
        "runs_above_avg_def",
        "WAA_adj",
        "G",
        "GS",
        "InnOuts",
        "PO",
        "A",
        "E",
        "DP",
        "PB",
        "WP",
        "SB",
        "CS",
        "ZR",
        "bats",
        "throws",
        "R",
        "H",
        "HR",
        "BB",
        "SO",
        "IBB",
        "HBP",
        "SH",
        "SF",
        "GIDP",
        "W",
        "L",
        "CG",
        "SHO",
        "SV",
        "IPouts",
        "ER",
        "BAOpp",
        "ERA",
        "BK",
        "BFP",
        "GF",
        "name_common",
        "mlb_ID",
        "stint_ID",
        "PA",
        "salary",
        "runs_above_avg_off",
        "WAR_rep",
        "RA",
        "xRA",
        "BIP",
        "BIP_perc",
        "second_merge",
        "third_merge",
        "fourth_merge",
        "key_mlbam",
        "key_retro",
        "key_fangraphs",
    ]
]

In [237]:

df = everyone[
    [
        "key_bbref",
        "weight",
        "height",
        "BMI",
        "name_last",
        "name_first",
        "POS",
        "yearID",
        "birthYear",
        "deathYear",
        "teamID",
        "lgID",
        "R",
        "H",
        "HR",
        "BB",
        "SO",
        "WAR",
        "WAA",
        "pitcher",
        "ERA_plus",
    ]
]

# Here it is:

In [238]:
df = df.sort_values('weight', ascending=False)

In [239]:
df

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,R,H,HR,BB,SO,WAR,WAA,pitcher,ERA_plus
164878,youngwa01,320,77,37.946042,Young,Walter,1B,2005,1980,2015,BAL,AL,,,,,,,,,
184576,diazju03,315,76,38.342579,Diaz,Jumbo,P,2015,1984,,CIN,NL,,,,,,,,,
184578,diazju03,315,76,38.342579,Diaz,Jumbo,P,2016,1984,,CIN,NL,,,,,,-0.01,-0.01,Y,
184580,diazju03,315,76,38.342579,Diaz,Jumbo,P,2017,1984,,TBA,AL,,,,,,,,,
184574,diazju03,315,76,38.342579,Diaz,Jumbo,P,2014,1984,,CIN,NL,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351906,strasst01,,,,,,,2022,,,WSN,NL,,,,,,-0.28,-0.3143,,31.571429
351907,tetreja01,,,,,,,2022,,,WSN,NL,,,,,,-0.21,-0.3812,,77.933333
351908,thompma02,,,,,,,2022,,,WSN,NL,,,,,,0.2,0.2664,,
351909,vothau01,,,,,,,2022,,,WSN,NL,,,,,,-0.81,-1.2578,,39.67619


In [240]:
df = df.drop_duplicates(subset=['key_bbref'])

In [241]:
df = df.convert_dtypes()

In [242]:
df

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,R,H,HR,BB,SO,WAR,WAA,pitcher,ERA_plus
164878,youngwa01,320,77,37.946042,Young,Walter,1B,2005,1980,2015,BAL,AL,,,,,,,,,
184576,diazju03,315,76,38.342579,Diaz,Jumbo,P,2015,1984,,CIN,NL,,,,,,,,,
44051,brownju01,295,76,35.90813,Brown,Jumbo,P,1933,1907,1966,NYA,AL,,,,,,,,,
144186,youngdm01,295,74,37.875339,Young,Dmitri,1B,1997,1973,,SLN,NL,,,,,,,,,
157446,rauchjo01,290,83,29.59646,Rauch,Jon,P,2002,1978,,CHA,AL,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298214,zomphch01,,,,,,,1926,,,CEL,NNL,,,,,,-0.05,-0.35,N,
351180,bellobr01,,,,,,,2022,,,BOS,AL,,,,,,-0.08,-0.1094,,50.875
351261,hillga02,,,,,,,2022,,,DET,AL,,,,,,-0.08,-0.1696,,69.642857
351467,ogandcr01,,,,,,,2022,,,TBR,AL,,,,,,0.01,-0.0033,,95.7


In [243]:
df.describe().to_csv('mlb_all_stats.csv')

After some noodling, this is the best combo of BMI:height that I could find such that we have enough players at each position to fill out a roster. 2B and 1B ended up being a bottleneck, which isn't surprising. Second-basemen are typically the smallest player on the team, and First-basemen are almost universally the tallest fielder on the roster.

In [244]:
tryouts = df[ # Let's separate the meat from the chaff
    (df.BMI >= df.BMI.quantile(0.99))
    & (df.height <= df.height.quantile(0.23))]

In [246]:
tryouts.to_csv('tryouts.csv')

In [965]:
tryouts.POS.value_counts()

P     17
C     15
1B     8
OF     7
2B     3
SS     1
3B     1
Name: POS, dtype: Int64

In [966]:
# Probably easiest if we make ourselves a dataframe for every position, since we'll need to fill up a 26-man roster, which is parsed into several discrete roles that each require a particular number of players. 

# These role counts aren't codified, but teams have nearly always carried the same distribution of player-roles.

# The standard breakdown is:
# 13 Pitchers (5x SP, 7x RP)
# 5 OF
# 2 C
# 4 IF
# 2 Utility

# For Utility players, we'll likely take two extra middle infielders, because middle infielders can usually play any field position without looking like they have no idea what they're doing. E.g., you can put a shortstop at first base, and he'll be a pretty mediocre-to-bad first baseman — but if you put a first baseman at shortstop, he's likely to spend the entire game crying.

tryouts_P = tryouts[tryouts['POS'] == "P"]
tryouts_C = tryouts[tryouts['POS'] == "C"]
tryouts_1B = tryouts[tryouts['POS'] == "1B"]
tryouts_2B = tryouts[tryouts['POS'] == "2B"]
tryouts_3B = tryouts[tryouts['POS'] == "3B"]
tryouts_SS = tryouts[tryouts['POS'] == "SS"]
tryouts_OF = tryouts[tryouts['POS'] == "OF"]

In [967]:
huskies_P = (
    tryouts_P.sort_values("BMI", ascending=False)
    .nlargest(13, "BMI")
)  # 13 heaviest P
huskies_P

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
182432,colonba01,285,71,39.748992,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,
243208,moronre01,265,70,38.02312,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,
216625,mijarjo01,265,71,36.959589,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,
3994,healeto01,155,55,36.025023,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,
229336,machije01,257,71,35.843828,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,
199068,rodnefe01,240,71,33.472835,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,
236741,castran01,240,71,33.472835,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,
248116,colined01,240,71,33.472835,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,
245959,valdefr01,239,71,33.333365,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,
224531,carigan01,235,71,32.775484,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,


In [968]:
huskies_P = (
    tryouts_P.sort_values("BMI", ascending=False)
    .nlargest(13, "BMI")
)  # 13 heaviest P
huskies_P

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
182432,colonba01,285,71,39.748992,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,
243208,moronre01,265,70,38.02312,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,
216625,mijarjo01,265,71,36.959589,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,
3994,healeto01,155,55,36.025023,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,
229336,machije01,257,71,35.843828,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,
199068,rodnefe01,240,71,33.472835,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,
236741,castran01,240,71,33.472835,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,
248116,colined01,240,71,33.472835,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,
245959,valdefr01,239,71,33.333365,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,
224531,carigan01,235,71,32.775484,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,


In [969]:
huskies_C = (
    tryouts_C.sort_values("BMI", ascending=False)
    .nlargest(2, "BMI")
)  # 2 heaviest C
huskies_C

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
248415,kirkal01,245,68,37.25171,Kirk,Alejandro,C,2021,1998,,TOR,AL,,,,,,,,,,,,,
207632,penabr01,240,69,35.441412,Pena,Brayan,C,2016,1982,,SLN,NL,,,,,,,,,,,,,


In [970]:
huskies_1B = (
    tryouts_1B.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest 1B
huskies_1B

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
217568,sandopa01,268,70,38.45357,Sandoval,Pablo,1B,2010,1986,,SFN,NL,,,,,,,,,,,,,


In [971]:
huskies_2B = (
    tryouts_2B.sort_values("BMI", ascending=False)
    .nlargest(3, "BMI")
)  # 3 heaviest 2B (1x starter, 2x utility)
huskies_2B

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
241635,whitety01,238,71,33.193895,White,Tyler,2B,2017,1990,,HOU,AL,,,,,,,,,,,,,
232566,phegljo01,225,70,32.283781,Phegley,Josh,2B,2013,1988,,CHA,AL,,,,,,,,,,,,,
230370,solando01,210,68,31.930037,Solano,Donovan,2B,2014,1987,,MIA,NL,,,,,,,,,,,,,


In [972]:
huskies_SS = (
    tryouts_SS.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest SS
huskies_SS

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
184377,tejadmi01,220,69,32.487961,Tejada,Miguel,SS,1997,1974,,OAK,AL,,,,,,,,,,-0.23,-0.59,N,


In [973]:
huskies_3B = (
    tryouts_3B.sort_values("BMI", ascending=False)
    .nlargest(1, "BMI")
)  # 1 heaviest 3B
huskies_3B

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
208969,callaal01,210,68,31.930037,Callaspo,Alberto,3B,2010,1983,,KCA,AL,,,,,,,,,,,,,


In [974]:
huskies_OF = (
    tryouts_OF.sort_values("BMI", ascending=False)
    .nlargest(5, "BMI")
)  # 5 heaviest OF
huskies_OF

Unnamed: 0,key_bbref,weight,height,BMI,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus
247295,naylojo01,250,71,34.867537,Naylor,Josh,OF,2020,1997,,SDN,NL,,,,,,,,,,,,,
223992,vicieda01,240,71,33.472835,Viciedo,Dayán,OF,2012,1989,,CHA,AL,,,,,,,,,,,,,
247419,ramirha02,232,70,33.288165,Ramirez,Harold,OF,2021,1994,,CLE,AL,,,,,,,,,,,,,
244258,astudwi01,225,69,33.226324,Astudillo,Willians,OF,2021,1991,,MIN,AL,,,,,,,,,,,,,
51268,fothebo01,230,70,33.001198,Fothergill,Bob,OF,1923,1897,1938.0,DET,AL,,,,,,,,,,,,,


In [975]:
huskies = pd.concat([
    huskies_P,
    huskies_C,
    huskies_1B,
    huskies_2B,
    huskies_3B,
    huskies_SS,
    huskies_OF
])

In [976]:
huskies.describe()

Unnamed: 0,weight,height,BMI,yearID,birthYear,deathYear,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,ERA_plus
count,26.0,26.0,26.0,26.0,26.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0
mean,236.5,69.576923,34.299658,2005.153846,1978.076923,1914.5,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.125,-0.305,
std,24.081943,3.1518,2.208648,32.024606,31.757737,33.234019,,,,,,,,,,0.148492,0.403051,
min,155.0,55.0,31.930037,1878.0,1853.0,1891.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.23,-0.59,
25%,226.25,69.25,32.775484,2010.0,1982.0,1902.75,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.1775,-0.4475,
50%,238.5,70.5,33.4031,2013.5,1986.0,1914.5,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.125,-0.305,
75%,243.75,71.0,35.743224,2019.25,1992.5,1926.25,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.0725,-0.1625,
max,285.0,71.0,39.748992,2021.0,1998.0,1938.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,


In [None]:
huskies

Unnamed: 0,key_bbref,weight,height,name_last,name_first,POS,yearID,birthYear,deathYear,teamID,lgID,AB,R,H,2B,3B,HR,RBI,BB,SO,WAR,WAA,pitcher,ERA_plus,KG,meters,BMI,ratio
182432,colonba01,285,71,Colon,Bartolo,P,2011,1973,,NYA,AL,,,,,,,,,,,,,,129.27372,1.8034,39.748992,71.683331
243208,moronre01,265,70,Moronta,Reyes,P,2017,1993,,SFN,NL,,,,,,,,,,,,,,120.20188,1.778,38.02312,67.605107
216625,mijarjo01,265,71,Mijares,Jose,P,2011,1984,,MIN,AL,,,,,,,,,,,,,,120.20188,1.8034,36.959589,66.652922
3994,healeto01,155,55,Healey,Tom,P,1878,1853,1891.0,PRO,NL,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,4.0,-0.02,-0.02,Y,,70.30676,1.397,36.025023,50.326958
229336,machije01,257,71,Machi,Jean,P,2014,1982,,SFN,NL,,,,,,,,,,,,,,116.573144,1.8034,35.843828,64.640759
199068,rodnefe01,240,71,Rodney,Fernando,P,2002,1977,,DET,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
236741,castran01,240,71,Castro,Angel,P,2015,1982,,OAK,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
248116,colined01,240,71,Colina,Edwar,P,2020,1997,,MIN,AL,,,,,,,,,,,,,,108.86208,1.8034,33.472835,60.364911
245959,valdefr01,239,71,Valdez,Framber,P,2021,1993,,HOU,AL,,,,,,,,,,,,,,108.408488,1.8034,33.333365,60.11339
224531,carigan01,235,71,Carignan,Andrew,P,2012,1986,,OAK,AL,,,,,,,,,,,,,,106.59412,1.8034,32.775484,59.107308


# Let's run this through Baseball Reference offline
They've got handy tool to produce an expected season for a given player. These are the CSVs that it spits out.

In [None]:
batters = pd.read_csv("../data/huskies_batters.csv")
pitchers = pd.read_csv("../data/huskies_pitchers.csv")

Time to take a look at our team

In [None]:
# Average Husky WAR
(pitchers.WAR.sum() + batters.WAR.sum()) / 26

1.2153846153846155

In [None]:
# Total Husky WAR
husky_war_sum = (pitchers.WAR.sum() + batters.WAR.sum())
# Average Husky height
husky_height_mean = (pitchers.height.sum() + batters.height.sum()) / 26
# Average Husky weight
husky_weight_mean = (pitchers.weight.sum() + batters.weight.sum()) / 26

In [None]:
# Average weight of all players
everyone_weight_mean = everyone.weight.mean()
# Average height of all players
everyone_height_mean = everyone.height.mean()

In [None]:
pitchers.WAR.sum()

13.899999999999999

In [None]:
batters.describe(
)

Unnamed: 0,height,weight,G,PA,AB,R,H,double,triple,homer,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,outs,walks,1B,2B,3B,HR,OUT,WALK,sum
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,12.0,13.0,13.0,12.0,12.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,69.538462,233.307692,162.0,556.307692,509.076923,60.461538,137.307692,27.384615,1.923077,14.153846,66.769231,2.461538,1.923077,35.692308,79.923077,0.269308,0.320692,0.413231,0.733923,98.846154,211.0,15.0,5.153846,1.923077,4.583333,2.083333,0.615385,-1.461538,-0.615385,-3.615385,0.0,-5.230769,-0.507692,19.384615,14.384615,1.361538,0.496692,0.498923,1.723077,-0.315385,18.0,329.0,37.615385,0.262531,0.049272,0.003466,0.025108,0.592548,0.067074,0.737469
std,1.126601,16.188394,0.0,53.658464,49.358318,13.684635,20.093053,4.407162,1.977437,6.375031,14.359398,1.941451,2.059998,13.780328,34.731018,0.025227,0.028946,0.039368,0.062551,16.118551,33.662541,4.0,2.703274,3.593085,0.792961,1.831955,10.882237,1.898042,1.192928,3.01492,6.244998,13.766831,1.412717,2.063107,14.505967,1.476222,0.008873,0.005155,1.314758,0.838497,12.529964,34.285565,14.534662,0.028998,0.006912,0.003764,0.010373,0.045736,0.024663,0.028998
min,68.0,210.0,162.0,495.0,459.0,35.0,105.0,19.0,0.0,5.0,42.0,0.0,0.0,10.0,24.0,0.225,0.268,0.351,0.649,76.0,161.0,9.0,1.0,0.0,3.0,0.0,-14.0,-6.0,-2.0,-8.0,-9.0,-19.0,-1.8,17.0,-2.0,-0.2,0.488,0.491,0.3,-1.5,5.0,272.0,10.0,0.208333,0.037773,0.0,0.009524,0.495446,0.019881,0.688525
25%,69.0,225.0,162.0,521.0,478.0,55.0,126.0,24.0,1.0,8.0,62.0,1.0,1.0,27.0,54.0,0.257,0.299,0.381,0.697,91.0,192.0,12.5,3.0,0.0,4.0,1.0,-4.0,-3.0,-2.0,-5.0,-6.0,-15.0,-1.6,18.0,5.0,0.4,0.49,0.496,0.9,-1.1,10.0,309.0,29.0,0.240809,0.045537,0.001988,0.013652,0.568015,0.054726,0.720137
50%,70.0,232.0,162.0,544.0,481.0,59.0,139.0,28.0,2.0,15.0,68.0,2.0,1.0,32.0,89.0,0.27,0.32,0.41,0.725,97.0,206.0,14.5,5.0,0.0,5.0,2.0,-1.0,-1.0,-1.0,-3.0,1.0,-10.0,-0.8,19.0,8.0,0.9,0.494,0.497,1.5,-0.2,16.0,331.0,33.0,0.261383,0.050946,0.003317,0.029412,0.587332,0.06142,0.738617
75%,70.0,240.0,162.0,586.0,541.0,66.0,150.0,30.0,2.0,18.0,71.0,3.0,2.0,42.0,98.0,0.28,0.33,0.443,0.773,108.0,229.0,17.5,7.0,2.0,5.0,2.25,8.0,0.0,0.0,-2.0,5.0,0.0,0.0,20.0,20.0,2.1,0.5,0.502,2.3,0.4,23.0,339.0,48.0,0.279863,0.052277,0.003413,0.031716,0.624242,0.080944,0.759191
max,71.0,268.0,162.0,687.0,629.0,92.0,180.0,35.0,8.0,23.0,97.0,6.0,8.0,58.0,132.0,0.325,0.374,0.48,0.854,136.0,287.0,21.0,9.0,13.0,6.0,6.0,27.0,1.0,1.0,3.0,9.0,29.0,3.1,24.0,48.0,4.9,0.519,0.508,4.6,1.0,44.0,394.0,59.0,0.311475,0.062857,0.015238,0.040073,0.660714,0.107468,0.791667


In [None]:
batters['total_average'] = ((batters['TB'] + batters['HBP'] + batters['BB'] + batters['SB'] - batters['CS']) / (batters['AB'] - batters['H'] + batters['CS'] + batters['GDP']))

In [None]:
batters.WAR.sum()

17.700000000000003

In [None]:
17.7 + 13.9

31.6

In [None]:
pitchers.median()

  pitchers.median()


height      71.0000
weight     240.0000
W            4.0000
L            5.0000
W-L%         0.4665
Dec          3.6800
ERA         68.0000
G            0.0000
GS          13.0000
CG           0.0000
SHO          0.0000
SV           0.0000
IP          67.0000
H          109.0000
R           42.0000
ER          40.0000
HR          14.0000
BB          48.0000
IBB          2.0000
SO          67.0000
HBP          3.0000
BK           0.0000
WP           4.0000
BF         413.0000
ERA+▼      109.0000
FIP          4.1500
WHIP         1.3560
H9           8.9000
HR9          0.8000
BB9          3.8000
SO9          7.3000
SO/BB        2.2000
IP.1        67.0000
G.1         68.0000
GS.1         0.0000
R.1         42.0000
RA9          4.1800
RA9opp       4.6800
RA9def       0.0600
RA9role     -0.3100
PPFp        95.8000
RA9avg       4.6700
RAA          4.0000
WAA          0.1000
gmLI         1.0000
WAAadj       0.0000
WAR          0.5000
RAR          6.0000
waaWL%       0.5060
162WL%       0.5000


In [None]:
pitchers.WAR.sum()

13.899999999999999

In [None]:
people.shape

(19729, 24)

In [None]:
people.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981,12,27,USA,CO,Denver,,,,,,,David,Aardsma,David Allan,215,75,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934,2,5,USA,AL,Mobile,2021.0,1.0,22.0,USA,GA,Atlanta,Hank,Aaron,Henry Louis,180,72,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939,8,5,USA,AL,Mobile,1984.0,8.0,16.0,USA,GA,Atlanta,Tommie,Aaron,Tommie Lee,190,75,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954,9,8,USA,CA,Orange,,,,,,,Don,Aase,Donald William,190,75,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972,8,25,USA,FL,Palm Beach,,,,,,,Andy,Abad,Fausto Andres,184,73,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [None]:
people.weight.describe()

count    19729.000000
mean       188.133712
std         22.495432
min         65.000000
25%        172.000000
50%        185.000000
75%        200.000000
max        320.000000
Name: weight, dtype: float64

In [None]:
comparison = pd.DataFrame()

In [None]:
comparison['everyone_weight'] = people.weight.describe()
comparison['huskies_weight'] = huskies.weight.describe()
comparison['delta_weight'] = comparison['huskies_weight'] - comparison['everyone_weight']
comparison['everyone_height'] = people.height.describe()
comparison['huskies_height'] = huskies.height.describe()
comparison['delta_height'] = comparison['huskies_height'] - comparison['everyone_height']


In [None]:
comparison = comparison.convert_dtypes()
comparison = comparison.drop(labels='count')

In [None]:
statcompare = pd.DataFrame()
statcompare['huskies_weight'] = 

In [None]:
comparison.to_csv('body_comparison.csv')

In [None]:
stats = pd.DataFrame()

In [None]:
batters.columns

Index(['player', 'height', 'weight', 'G', 'PA', 'AB', 'R', 'H', 'double',
       'triple', 'homer', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG',
       'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Pos', 'Rbat',
       'Rbaser', 'Rdp', 'Rfield', 'Rpos', 'RAA', 'WAA', 'Rrep', 'RAR', 'WAR',
       'waaWL%', '162WL%', 'oWAR', 'dWAR', 'oRAR', 'outs', 'walks', '1B', '2B',
       '3B', 'HR', 'OUT', 'WALK', 'sum', 'total_average'],
      dtype='object')

In [900]:
everyone.convert_dtypes()
everyone_bio = everyone[['height', 'weight', 'BMI', 'WAR']]
everyone.describe().to_csv('everyone_describe.csv')
huskies.convert_dtypes()
huskies.describe().to_csv('huskies_describe.csv')


KeyError: "['BMI'] not in index"

In [701]:
brefids = huskies.key_bbref.to_list()

In [702]:
brefids

['colonba01',
 'moronre01',
 'mijarjo01',
 'healeto01',
 'machije01',
 'rodnefe01',
 'castran01',
 'colined01',
 'valdefr01',
 'carigan01',
 'akinke01',
 'mateoju01',
 'gonzaen01',
 'kirkal01',
 'penabr01',
 'sandopa01',
 'whitety01',
 'phegljo01',
 'solando01',
 'callaal01',
 'tejadmi01',
 'naylojo01',
 'vicieda01',
 'ramirha02',
 'astudwi01',
 'fothebo01']

In [801]:
all_players.columns

Index(['key_bbref', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR',
       'name_last', 'name_first', 'key_mlbam', 'key_retro', 'key_fangraphs',
       'birthYear', 'deathYear', 'weight', 'height', 'bats', 'throws', 'AB',
       'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF',
       'GIDP', '_merge', 'W', 'L', 'CG', 'SHO', 'SV', 'IPouts', 'ER', 'BAOpp',
       'ERA', 'BK', 'BFP', 'GF', 'second_merge', 'name_common', 'mlb_ID',
       'stint_ID', 'pitcher', 'PA', 'salary', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAR_rep', 'WAA', 'WAR',
       'third_merge', 'RA', 'xRA', 'BIP', 'BIP_perc', 'ERA_plus', 'WAA_adj',
       'fourth_merge'],
      dtype='object')

In [None]:
all_players['KG'] = all_players['weight'] * KG_TO_LB
all_players['meters'] = all_players['height'] * height_

In [819]:
all_players['BMI'] = all_players['KG'] / all_players['meters'] ** 2

KeyError: 'KG'

In [821]:
all_players_describe = all_players.describe()
huskies_describe = huskies.describe()

In [822]:
all_players_describe = all_players_describe.join(huskies_describe, how='left', lsuffix='_mlb', rsuffix='_huskies')

In [824]:
all_players_describe = all_players_describe.convert_dtypes()
huskies_describe = huskies_describe.convert_dtypes()

In [825]:
all_players_describe.to_csv('stat_comparison.csv')
huskies_describe.to_csv('huskie_bodies.csv')
