In [319]:
## Let's try to figure out who the shortest, heaviest baseball players are.

## Here's our overall plan of attack

# Setting Things Up ✅
## Import CSVs ✅
### Separate CSVs --> DataFrames for People, Pitching Data, Batting Data ✅
## Squish everything into one mondo DF ✅
## Add Physical Data ✅
### Height ✅
### Weight ✅
## Calculate BMI ✅
### Convert Imperial to Metric ✅
### BMI-ify ✅
### Throw BMI back into df ✅
## Calculate mean fWAR/bWAR (mWAR)
## Assemble per-position lists sorted by BMI, then mWAR

# Knocking Things Down
## Find worst team that made playoffs in 2021
### Describe team fWAR/bWAR
### Describe individual fWAR/bWAR
## Pull from BMI lists per position until high BMI roster is full
### mWAR shall be higher on a team basis.
### mWAR shall be higher per position.
# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

## Setting Things Up

### Import the necessaries

In [320]:
import pandas as pd
import pybaseball
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)


In [321]:
# Constants
KG_TO_LB = 0.453592
M_TO_IN = 0.0254

In [322]:
chadwick = chadwick_register(save=True)
chadwick = pd.DataFrame(chadwick)

In [323]:
download_lahman()

## Read infinity baseball data to DataFrames

In [324]:
# a table of all player biographical info and ids
people = pd.DataFrame(people())

# park id, name, alias, city, state, and country
parks = pd.DataFrame(parks())

# all star roster data: player, year, team, league, position
allstar = pd.DataFrame(all_star_full())

# each player's games played per position for each season
appearances = pd.DataFrame(appearances())

# batting stats by year, regular season
batting = pd.DataFrame(batting())

# batting stats by year, post season
batting_post = pd.DataFrame(batting_post())

# fielding stats by year 
fielding = pd.DataFrame(fielding())

# games played in left, center, right field 
fielding_of = pd.DataFrame(fielding_of())

# LF/CF/RF splits
fielding_of_split = pd.DataFrame(fielding_of_split())

# postseason fielding 
fielding_post = pd.DataFrame(fielding_post())

# home game attendance by park by year 
home_games = pd.DataFrame(home_games())

# historical player pitching stats
pitching = pd.DataFrame(pitching())

# postseason pitching stats
pitching_post = pd.DataFrame(pitching_post())

# playoff series winners and losers 
series_post = pd.DataFrame(series_post())

# data on teams by year: record, division, stadium, attendance, etc
teams = pd.DataFrame(teams())

# current and historical franchises, whether they're still active, and their ids
teams_franchises = pd.DataFrame(teams_franchises())

# split season data for teams
teams_half = pd.DataFrame(teams_half()) 

# fangraphs batting since 2008
fangraphs_batting = pd.DataFrame(pybaseball.batting_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs pitching since 2008
fangraphs_pitching = pd.DataFrame(pybaseball.pitching_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs team pitching since 2008
fangraphs_team_pitching = pd.DataFrame(
    pybaseball.team_pitching(start_season="2008", end_season="2021"))

# fangraphs team batting since 2008
fangraphs_team_batting = pd.DataFrame(
    pybaseball.team_batting(start_season="2008", end_season="2021"))

# fangraphs team fielding since 2008
fangraphs_team_fielding = pd.DataFrame(
    pybaseball.team_fielding(start_season="2008", end_season="2021"))

# bref pitching WAR
bwar_pitch = pd.DataFrame(bwar_pitch(return_all=True))

# bref batting WAR
bwar_bat = pd.DataFrame(bwar_bat(return_all=True))


  table = table.drop('', 1)


Here's where we add the good stuff.

In [325]:
people['KG'] = people['weight'] * KG_TO_LB
people['meters'] = people['height'] * M_TO_IN
people['BMI'] = people['KG'] / people['meters'] ** 2
people['ratio'] = people['meters'] * people['BMI']


In [326]:
# Let's see what we've got

In [327]:
people.shape

(20543, 28)

In [328]:
people.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio'],
      dtype='object')

In [329]:
chadwick.shape

(24258, 8)

In [330]:
# Let's change some of these column names to save ourselves some merging hassle

In [331]:
chadwick.columns

Index(['name_last', 'name_first', 'key_mlbam', 'key_retro', 'key_bbref',
       'key_fangraphs', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [332]:
chadwick = chadwick.rename(columns={
    "name_last": "nameLast",
    "name_first": "nameFirst",
    "key_fangraphs": "fangraphsID",
    "key_bbref": "playerID",
    "key_retro": "retroID",
    "key_mlbam": "mlbID"
})

In [333]:
chadwick.columns

Index(['nameLast', 'nameFirst', 'mlbID', 'retroID', 'playerID', 'fangraphsID',
       'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [334]:
people.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio'],
      dtype='object')

In [335]:
df = people.merge(
    chadwick,
    left_on=["playerID", "retroID", "nameLast", "nameFirst"],
    right_on=["playerID", "retroID", "nameLast", "nameFirst"], how='outer'
)

Merge Chadwick, check.

In [336]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [337]:
df.shape

(24587, 32)

prep merge bwar_bat

In [338]:
bwar_bat.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

In [339]:
bwar_bat = bwar_bat.rename(columns={
    'player_ID': 'playerID',
    'mlb_ID': 'mlbID',
    'stint_ID': 'stint'
})

In [340]:
bwar_bat.columns

Index(['name_common', 'age', 'mlbID', 'playerID', 'year_ID', 'team_ID',
       'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

Merge bwar_bat, check

In [341]:
df = df.merge(
    bwar_bat, left_on=["playerID", "mlbID"], right_on=["playerID", "mlbID"], how="outer"
)

In [342]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common',
       'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn',
       'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield',
       'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense',
       'runs_position', 'runs_position_p', 'runs_replacement',
       'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off',
       'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def',
       'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG',
       'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_

In [343]:
bwar_pitch.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'G', 'GS', 'IPouts', 'IPouts_start',
       'IPouts_relief', 'RA', 'xRA', 'xRA_sprp_adj', 'xRA_extras_adj',
       'xRA_def_pitcher', 'PPF', 'PPF_custom', 'xRA_final', 'BIP', 'BIP_perc',
       'RS_def_total', 'runs_above_avg', 'runs_above_avg_adj',
       'runs_above_rep', 'RpO_replacement', 'GR_leverage_index_avg', 'WAR',
       'salary', 'teamRpG', 'oppRpG', 'pyth_exponent', 'waa_win_perc', 'WAA',
       'WAA_adj', 'oppRpG_rep', 'pyth_exponent_rep', 'waa_win_perc_rep',
       'WAR_rep', 'ERA_plus', 'ER_lg'],
      dtype='object')

In [344]:
bwar_pitch = bwar_pitch.rename(columns={
    "stint_ID": "stint",
    "mlb_ID": "mlbID",
    "player_ID": "playerID"
})

In [345]:
bwar_pitch.stint

0        1
1        1
2        1
3        1
4        1
        ..
53650    1
53651    1
53652    1
53653    1
53654    1
Name: stint, Length: 53655, dtype: int64

In [346]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_p

In [347]:
df = df.merge(
    bwar_pitch,
    left_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ],
    right_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ], how='outer'
)

In [348]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common_x', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS

In [349]:
fielding_of.columns

Index(['playerID', 'yearID', 'stint', 'Glf', 'Gcf', 'Grf'], dtype='object')

In [350]:
fielding_of = fielding_of.rename(columns={
    "yearID": "year_ID"
})

In [351]:
df = df.merge(
    fielding_of,
    left_on=["playerID", "year_ID", "stint"],
    right_on=["playerID", "year_ID", "stint"], how='left'
)


In [352]:
df.shape

(177727, 104)

In [353]:
fielding.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

In [354]:
fielding = fielding.rename(columns={
    "yearID": "year_ID"
})

In [355]:
df = df.merge(
    fielding,
    left_on=["playerID", "year_ID", "G", "stint"],
    right_on=["playerID", "year_ID", "G", "stint"], how='left'
)


In [356]:
df.sample(25) ### works to here

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID,KG,meters,BMI,ratio,mlbID,fangraphsID,mlb_played_first,mlb_played_last,name_common_x,age,year_ID,team_ID,stint,lg_ID,PA,G,Inn,runs_bat,runs_br,runs_dp,runs_field,runs_infield,runs_outfield,runs_catcher,runs_good_plays,runs_defense,runs_position,runs_position_p,runs_replacement,runs_above_rep,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAA,WAA_off,WAA_def,WAR,WAR_def,WAR_off,WAR_rep,salary,pitcher,teamRpG,oppRpG,oppRpPA_rep,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg,name_common_y,GS_x,IPouts,IPouts_start,IPouts_relief,RA,xRA,xRA_sprp_adj,xRA_extras_adj,xRA_def_pitcher,PPF,PPF_custom,xRA_final,BIP,BIP_perc,RS_def_total,runs_above_avg_adj,RpO_replacement,GR_leverage_index_avg,WAA_adj,ERA_plus,ER_lg,Glf,Gcf,Grf,teamID,lgID,POS,GS_y,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
36977,goodedw01,1964.0,11.0,16.0,USA,FL,Tampa,,,,,,,Dwight,Gooden,Dwight Eugene,190.0,74.0,R,R,1984-04-07,2000-09-29,goodd001,goodedw01,86.18248,1.8796,24.394286,45.8515,114947.0,1004852.0,1984.0,2000.0,Dwight Gooden,27.0,1992.0,NYM,1.0,NL,77.0,33.0,206.0,-1.28,0.25,0.26,0.0,0.0,0.0,0.0,,0.0,0.16,8.66,0.14,8.2,8.1,8.1,0.2,0.91,0.89,0.02,0.93,0.02,0.91,0.02,5166667.0,Y,4.10624,3.8623,0.06979,3.79191,1.807,1.786,0.5276,0.5276,0.5006,0.4918,83.505798,23.601,27.338,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140682,garvene01,,,,,,,,,,,,,,,,,,,,,,,,,,,,114640.0,,,,,23.0,1949.0,SLB,1.0,AL,,41.0,,,,,,,,,,,,,,40.236,21.816,,,2.3001,,,3.99,,,1.8014,,,4.73359,4.19467,,5.17596,1.866,1.923,0.5561,,,0.4572,,,,Ned Garver,32.0,671.0,637.0,34.0,126.0,121.072,0.0,,-17.751,108.0,106.478,147.816,823.0,0.1659,-107.0,22.096,0.203,1.4344,-0.1114,113.975758,112.836,,,,SLA,AL,P,,,15.0,42.0,5.0,3.0,,,,,
17131,chappla01,1890.0,2.0,19.0,USA,IL,McClusky,1918.0,11.0,8.0,USA,CA,San Francisco,Larry,Chappell,LaVerne Ashford,186.0,72.0,L,R,1913-07-18,1917-04-25,chapl101,chappla01,84.368112,1.8288,25.225856,46.133045,112208.0,1002155.0,1913.0,1917.0,Larry Chappell,24.0,1914.0,CHW,1.0,AL,43.0,21.0,,-1.81,0.1,0.0,0.0,,,,,0.0,-0.47,0.0,1.62,-0.6,-2.2,-2.2,-0.5,-0.27,-0.26,-0.08,-0.09,-0.08,-0.08,0.18,,N,3.56992,3.67373,0.05821,3.5966,1.758,1.76,0.4874,0.4874,0.4973,0.4907,61.100257,14.126,13.03,,,,,,,,,,,,,,,,,,,,,,,7.0,0.0,2.0,,,,,,,,,,,,,,
49791,johnsji04,1983.0,6.0,27.0,USA,NY,Johnson City,,,,,,,Jim,Johnson,James Robert,250.0,78.0,R,R,2006-07-29,2018-09-29,johnj010,johnsji04,113.398,1.9812,28.890081,57.237028,462382.0,3656.0,2006.0,2018.0,Jim Johnson,34.0,2017.0,ATL,1.0,NL,0.0,56.0,56.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02,0.0,-0.02,0.0,0.0,5000000.0,Y,4.66123,4.66123,0.08583,4.66123,1.889,1.889,0.5,0.5,0.5,0.5,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
31991,fordwh01,1928.0,10.0,21.0,USA,NY,New York,2020.0,10.0,8.0,USA,NY,Lake Success,Whitey,Ford,Edward Charles,178.0,70.0,L,L,1950-07-01,1967-05-21,fordw101,fordwh01,80.739376,1.778,25.540058,45.410223,114299.0,1004227.0,1950.0,1967.0,Whitey Ford,30.0,1959.0,NYY,1.0,AL,80.0,35.0,204.0,0.03,-0.61,0.34,0.0,0.0,0.0,0.0,,0.0,0.02,9.16,0.0,8.9,8.9,8.9,0.0,0.9,0.92,-0.03,0.9,-0.03,0.92,0.0,31000.0,Y,4.62883,4.3734,0.08105,4.3033,1.871,1.851,0.5265,0.5265,0.5001,0.4925,94.12473,25.117,25.396,,,,,,,,,,,,,,,,,,,,,,,,,,NYA,AL,P,29.0,612.0,15.0,49.0,1.0,5.0,,,,,
65424,mckeere01,1890.0,7.0,20.0,USA,OH,Shawnee,1972.0,8.0,5.0,USA,MI,Saginaw,Red,McKee,Raymond Ellis,180.0,71.0,L,R,1913-04-19,1916-09-19,mcker103,mckeere01,81.64656,1.8034,25.104626,45.273683,118769.0,1008586.0,1913.0,1916.0,Red McKee,23.0,1914.0,DET,1.0,AL,80.0,34.0,,-1.82,-0.27,0.0,-5.0,,,,,-5.0,1.35,0.0,3.01,-2.7,-5.7,-0.7,-3.7,-0.7,-0.09,-0.45,-0.37,-0.45,0.24,0.33,,N,3.65197,3.67373,0.05821,3.5851,1.764,1.759,0.48,0.4974,0.4873,0.4893,71.334971,26.362,21.766,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24134,delabst01,1983.0,7.0,17.0,USA,KY,Fort Knox,,,,,,,Steve,Delabar,Steven Edward,215.0,77.0,R,R,2011-09-11,2016-05-21,delas001,delabst01,97.52228,1.9558,25.494997,49.863115,447755.0,11827.0,2011.0,2016.0,Steve Delabar,32.0,2016.0,CIN,1.0,NL,0.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Y,4.5001,4.5001,0.08241,4.5001,1.871,1.871,0.5,0.5,0.5,0.5,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
65392,mckayre01,1881.0,11.0,16.0,USA,TX,Morgan,1946.0,1.0,18.0,USA,TX,Dallas,Reeve,McKay,Reeve Stewart,168.0,73.0,,R,1915-10-02,1915-10-02,mckar101,mckayre01,76.203456,1.8542,22.164682,41.097754,118764.0,1008580.0,1915.0,1915.0,Reeve McKay,33.0,1915.0,SLB,1.0,AL,0.0,1.0,,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Y,3.95995,3.95995,0.06479,3.95995,1.804,1.804,0.5,0.5,0.5,0.5,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,SLA,AL,P,,,0.0,1.0,0.0,0.0,,,,,
29749,falcope01,1953.0,10.0,1.0,USA,NY,Brooklyn,,,,,,,Pete,Falcone,Peter,185.0,74.0,L,L,1975-04-13,1984-09-17,falcp001,falcope01,83.91452,1.8796,23.752331,44.644882,113987.0,1003910.0,1975.0,1984.0,Pete Falcone,22.0,1976.0,STL,1.0,NL,76.0,32.0,212.0,-10.79,-0.58,-0.22,0.0,0.0,0.0,0.0,,0.0,0.14,8.21,0.0,-3.2,-3.2,-3.2,0.1,-0.38,-0.39,0.02,-0.38,0.02,-0.39,0.0,,Y,3.87376,3.97501,0.0697,3.89783,1.799,1.801,0.4884,0.4884,0.5005,0.4912,-14.093067,21.146,23.306,,,,,,,,,,,,,,,,,,,,,,,,,,SLN,NL,P,32.0,636.0,3.0,14.0,0.0,0.0,,,,,
76881,penabe01,1959.0,7.0,11.0,P.R.,,Santurce,,,,,,,Bert,Pena,Adalberto,165.0,71.0,R,R,1981-09-14,1987-07-19,penab001,penabe01,74.84268,1.8034,23.012574,41.500876,120368.0,1010142.0,1981.0,1987.0,Bert Pena,25.0,1985.0,HOU,1.0,NL,32.0,20.0,66.9,-0.38,-0.14,0.13,1.2,0.0,0.0,0.0,,1.2,0.51,0.0,1.09,2.4,1.3,0.1,1.7,0.15,0.0,0.19,0.26,0.19,0.11,0.11,,N,4.05956,4.05356,0.07022,3.99889,1.816,1.812,0.5074,0.5007,0.5096,0.4938,79.828231,10.041,11.087,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [101]:
teams.sample(10)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
1197,1950,NL,SLN,STL,,5,153,76.0,78,75,,,N,N,693,5215,1353,255,50,102,606.0,604.0,23.0,,,,670,598,3.97,57,10,14,4068,1398,119,535,603,130,172,0.978,St. Louis Cardinals,Sportsman's Park IV,1093411.0,104,103,STL,SLN,SLN
1016,1939,AL,NYA,NYY,,1,152,77.0,106,45,,,Y,Y,967,5300,1521,259,55,166,701.0,543.0,72.0,37.0,,,556,496,3.31,87,15,26,4044,1208,85,567,565,126,159,0.978,New York Yankees,Yankee Stadium I,859785.0,99,95,NYY,NYA,NYA
2325,2000,AL,ANA,ANA,W,3,162,81.0,82,80,N,N,N,N,864,5628,1574,309,34,236,608.0,1024.0,93.0,52.0,47.0,43.0,869,805,5.0,5,3,46,4344,1534,228,662,846,134,182,0.978,Anaheim Angels,Edison International Field,2066982.0,102,103,ANA,ANA,ANA
271,1891,AA,LS2,LOU,,8,138,,54,83,,,N,,698,4764,1229,127,68,17,438.0,465.0,227.0,,68.0,,873,567,4.22,126,9,1,3630,1334,32,451,481,454,112,0.922,Louisville Colonels,Eclipse Park I,,97,97,LOU,LS2,LS2
1728,1977,NL,PIT,PIT,E,2,162,81.0,96,66,N,,N,N,734,5662,1550,278,57,133,474.0,878.0,260.0,120.0,34.0,43.0,665,594,3.61,25,15,39,4445,1406,149,485,890,145,137,0.977,Pittsburgh Pirates,Three Rivers Stadium,1237349.0,102,101,PIT,PIT,PIT
2190,1995,AL,DET,DET,E,4,144,72.0,60,84,N,N,N,N,654,4865,1204,228,29,159,551.0,987.0,73.0,36.0,41.0,43.0,844,778,5.49,5,3,38,3825,1509,170,536,729,106,143,0.981,Detroit Tigers,Tiger Stadium,1180979.0,100,101,DET,DET,DET
427,1903,NL,PIT,PIT,,1,141,70.0,91,49,,,Y,N,793,4988,1429,208,110,34,364.0,400.0,172.0,,50.0,,613,405,2.91,117,16,5,3754,1215,9,384,454,295,100,0.951,Pittsburgh Pirates,Exposition Park,326855.0,103,99,PIT,PIT,PIT
2005,1988,AL,KCA,KCR,W,3,161,80.0,84,77,N,,N,N,704,5469,1419,275,40,121,486.0,944.0,137.0,54.0,33.0,51.0,648,580,3.65,29,12,32,4285,1415,102,465,886,124,147,0.98,Kansas City Royals,Royals Stadium,2350181.0,101,101,KCR,KCA,KCA
718,1920,AL,WS1,MIN,,6,153,76.0,68,84,,,N,N,723,5251,1526,233,81,36,433.0,543.0,161.0,114.0,,,802,633,4.17,81,10,10,4101,1521,51,520,418,232,95,0.963,Washington Senators,Griffith Stadium I,359260.0,97,98,WSH,WS1,WS1
468,1906,NL,CIN,CIN,,6,155,78.0,64,87,,,N,N,533,5025,1198,140,71,16,395.0,532.0,170.0,,58.0,,582,409,2.69,126,12,5,4109,1248,14,470,567,262,97,0.959,Cincinnati Reds,Palace of the Fans,330056.0,104,104,CIN,CIN,CIN


Now let's repeat the process with oodles of performance data from df.bwar_bat and df.bwar_pitch.

### Set constants we'll need

### Squish everything into one mondo DF

In [132]:
df_huskiesBatters = df_batters.loc[df_batters.BMI >= 34.55]

In [133]:
df_huskiesBatters.sort_values('BMI').describe()

Unnamed: 0,birthYear,birthMonth,birthDay,deathYear,deathMonth,deathDay,weight,height,yearID,stint,...,BB,SO,IBB,HBP,SH,SF,GIDP,KG,meters,BMI
count,26.0,25.0,25.0,5.0,5.0,5.0,26.0,26.0,26.0,26.0,...,26.0,22.0,0.0,24.0,17.0,0.0,1.0,26.0,26.0,26.0
mean,1969.153846,5.8,17.52,1952.4,7.6,8.6,269.961538,72.115385,1901.730769,1.115385,...,9.884615,14.909091,,0.875,6.0,,0.0,122.452394,1.831731,36.408324
std,40.878055,3.316625,8.529947,49.45503,3.646917,7.602631,30.1801,4.348121,14.17761,0.325813,...,16.310308,13.606403,,1.650099,8.951257,,,13.689452,0.110442,1.728873
min,1853.0,1.0,1.0,1891.0,2.0,2.0,155.0,55.0,1872.0,1.0,...,0.0,0.0,,0.0,0.0,,0.0,70.30676,1.397,34.622243
25%,1977.0,4.0,11.0,1915.0,6.0,2.0,261.25,71.0,1890.25,1.0,...,1.0,3.25,,0.0,0.0,,0.0,118.50091,1.8034,34.891189
50%,1983.5,6.0,18.0,1966.0,9.0,6.0,270.0,72.0,1903.5,1.0,...,3.5,11.0,,0.0,2.0,,0.0,122.46984,1.8288,35.875979
75%,1989.0,8.0,24.0,1975.0,10.0,14.0,283.75,75.0,1912.0,1.0,...,9.25,25.25,,1.0,7.0,,0.0,128.70673,1.905,37.928366
max,1998.0,12.0,30.0,2015.0,11.0,19.0,320.0,78.0,1924.0,2.0,...,68.0,43.0,,6.0,33.0,,0.0,145.14944,1.9812,40.292666


In [134]:
df_huskiesPitchers = df[df_pitchers.BMI > 34.55]

NameError: name 'df' is not defined

In [None]:
df_huskiesPitchers.head()

In [None]:
df_huskies = pd.merge(df_huskiesBatters, df_huskiesPitchers, how='right', on='playerID')

In [None]:
df_huskies.columns

In [None]:
sns.jointplot(data=df_simple, x="height", y="weight", kind = "reg", truncate = False)

In [None]:
sns.choose_diverging_palette()

In [None]:

# Compute the correlation matrix
corr = df_huskiesBatters.corr(method="spearman")

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(290, 10, n=40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.25,
    cbar_kws={"shrink": .5},
)



In [None]:
corr_mat = df.corr().stack().reset_index(name="correlation")