In [99]:
## Let's try to figure out who the shortest, heaviest baseball players are.

## Here's our overall plan of attack

# Setting Things Up ✅
## Import CSVs ✅
### Separate CSVs --> DataFrames for People, Pitching Data, Batting Data ✅
## Squish everything into one mondo DF ✅
## Add Physical Data ✅
### Height ✅
### Weight ✅
## Calculate BMI ✅
### Convert Imperial to Metric ✅
### BMI-ify ✅
### Throw BMI back into df ✅
## Assemble per-position lists sorted by BMI, then mWAR

# Knocking Things Down
## Find worst team that made playoffs in 2021
### Describe team fWAR/bWAR
### Describe individual fWAR/bWAR
## Pull from BMI lists per position until high BMI roster is full
### mWAR shall be higher on a team basis.
### mWAR shall be higher per position.
# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

## Setting Things Up

### Import the necessaries

In [100]:
import pandas as pd
import pybaseball
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)


In [101]:
# Constants
KG_TO_LB = 0.453592
M_TO_IN = 0.0254

In [102]:
chadwick = chadwick_register(save=True)
chadwick = pd.DataFrame(chadwick)

In [103]:
download_lahman()

## Read infinity baseball data to DataFrames

In [104]:
# a table of all player biographical info and ids
people = pd.DataFrame(people())

# park id, name, alias, city, state, and country
parks = pd.DataFrame(parks())

# all star roster data: player, year, team, league, position
allstar = pd.DataFrame(all_star_full())

# each player's games played per position for each season
appearances = pd.DataFrame(appearances())

# batting stats by year, regular season
batting = pd.DataFrame(batting())

# batting stats by year, post season
batting_post = pd.DataFrame(batting_post())

# fielding stats by year 
fielding = pd.DataFrame(fielding())

# games played in left, center, right field 
fielding_of = pd.DataFrame(fielding_of())

# LF/CF/RF splits
fielding_of_split = pd.DataFrame(fielding_of_split())

# postseason fielding 
fielding_post = pd.DataFrame(fielding_post())

# home game attendance by park by year 
home_games = pd.DataFrame(home_games())

# historical player pitching stats
pitching = pd.DataFrame(pitching())

# postseason pitching stats
pitching_post = pd.DataFrame(pitching_post())

# playoff series winners and losers 
series_post = pd.DataFrame(series_post())

# data on teams by year: record, division, stadium, attendance, etc
teams = pd.DataFrame(teams())

# current and historical franchises, whether they're still active, and their ids
teams_franchises = pd.DataFrame(teams_franchises())

# split season data for teams
teams_half = pd.DataFrame(teams_half()) 

# fangraphs batting since 2008
fangraphs_batting = pd.DataFrame(pybaseball.batting_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs pitching since 2008
fangraphs_pitching = pd.DataFrame(pybaseball.pitching_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs team pitching since 2008
fangraphs_team_pitching = pd.DataFrame(
    pybaseball.team_pitching(start_season="2008", end_season="2021"))

# fangraphs team batting since 2008
fangraphs_team_batting = pd.DataFrame(
    pybaseball.team_batting(start_season="2008", end_season="2021"))

# fangraphs team fielding since 2008
fangraphs_team_fielding = pd.DataFrame(
    pybaseball.team_fielding(start_season="2008", end_season="2021"))

# bref pitching WAR
bwar_pitch = pd.DataFrame(bwar_pitch(return_all=True))

# bref batting WAR
bwar_bat = pd.DataFrame(bwar_bat(return_all=True))


ReadTimeout: HTTPSConnectionPool(host='www.baseball-reference.com', port=443): Read timed out. (read timeout=None)

----

Here's where we add the good stuff.

In [None]:
# BMI Calculations
people['KG'] = people['weight'] * KG_TO_LB
people['meters'] = people['height'] * M_TO_IN
people['BMI'] = people['KG'] / people['meters'] ** 2
people['ratio'] = people['meters'] * people['BMI']


----

Let's see what we've got

In [None]:
people.shape

(20543, 28)

In [None]:
people.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio'],
      dtype='object')

In [None]:
chadwick.shape

(24258, 8)

In [None]:
chadwick.columns

Index(['name_last', 'name_first', 'key_mlbam', 'key_retro', 'key_bbref',
       'key_fangraphs', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [None]:
# Let's change some of these column names to save ourselves some merging hassle

In [None]:
chadwick = chadwick.rename(columns={
    "name_last": "nameLast",
    "name_first": "nameFirst",
    "key_fangraphs": "fangraphsID",
    "key_bbref": "playerID",
    "key_retro": "retroID",
    "key_mlbam": "mlbID"
})

In [None]:
chadwick.columns

Index(['nameLast', 'nameFirst', 'mlbID', 'retroID', 'playerID', 'fangraphsID',
       'mlb_played_first', 'mlb_played_last'],
      dtype='object')

Merge Chadwick, check.

In [None]:
# Merge chadwick into people
df = people.merge(
    chadwick,
    left_on=["playerID", "retroID", "nameLast", "nameFirst"],
    right_on=["playerID", "retroID", "nameLast", "nameFirst"], how='outer'
)

In [None]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [None]:
df.shape

(24587, 32)

----

prep merge bwar_bat

In [None]:
bwar_bat.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

In [None]:
bwar_bat = bwar_bat.rename(columns={
    'player_ID': 'playerID',
    'mlb_ID': 'mlbID',
    'stint_ID': 'stint'
})

In [None]:
bwar_bat.columns

Index(['name_common', 'age', 'mlbID', 'playerID', 'year_ID', 'team_ID',
       'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

In [None]:
bwar_bat.columns

Index(['name_common', 'age', 'mlbID', 'playerID', 'year_ID', 'team_ID',
       'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

----

Merge bwar_bat, check

In [None]:
df = df.merge(
    bwar_bat, left_on=["playerID", "mlbID"], right_on=["playerID", "mlbID"], how="outer"
)

In [None]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common',
       'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn',
       'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield',
       'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense',
       'runs_position', 'runs_position_p', 'runs_replacement',
       'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off',
       'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def',
       'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG',
       'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_

In [None]:
bwar_pitch.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'G', 'GS', 'IPouts', 'IPouts_start',
       'IPouts_relief', 'RA', 'xRA', 'xRA_sprp_adj', 'xRA_extras_adj',
       'xRA_def_pitcher', 'PPF', 'PPF_custom', 'xRA_final', 'BIP', 'BIP_perc',
       'RS_def_total', 'runs_above_avg', 'runs_above_avg_adj',
       'runs_above_rep', 'RpO_replacement', 'GR_leverage_index_avg', 'WAR',
       'salary', 'teamRpG', 'oppRpG', 'pyth_exponent', 'waa_win_perc', 'WAA',
       'WAA_adj', 'oppRpG_rep', 'pyth_exponent_rep', 'waa_win_perc_rep',
       'WAR_rep', 'ERA_plus', 'ER_lg'],
      dtype='object')

----

merge bwar_pitch, check

In [None]:
bwar_pitch = bwar_pitch.rename(columns={
    "stint_ID": "stint",
    "mlb_ID": "mlbID",
    "player_ID": "playerID"
})

In [None]:
bwar_pitch.stint

0        1
1        1
2        1
3        1
4        1
        ..
53652    1
53653    1
53654    1
53655    1
53656    1
Name: stint, Length: 53657, dtype: int64

In [None]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_p

In [None]:
df = df.merge(
    bwar_pitch,
    left_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ],
    right_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ], how='outer'
)

In [None]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common_x', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS

----

Merge fielding_of, check

In [None]:
fielding_of.columns

Index(['playerID', 'yearID', 'stint', 'Glf', 'Gcf', 'Grf'], dtype='object')

In [None]:
fielding_of = fielding_of.rename(columns={
    "yearID": "year_ID"
})

In [None]:
df = df.merge(
    fielding_of,
    left_on=["playerID", "year_ID", "stint"],
    right_on=["playerID", "year_ID", "stint"], how='left'
)


In [None]:
df.shape

(177729, 104)

----

In [None]:
fielding.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

In [None]:
fielding = fielding.rename(columns={
    "yearID": "year_ID"
})

In [None]:
df = df.merge(
    fielding,
    left_on=["playerID", "year_ID", "G", "stint"],
    right_on=["playerID", "year_ID", "G", "stint"], how='left'
)


----

Here's our Frankenstein's Monster

In [None]:
df.sample(25) ### works to here

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID,KG,meters,BMI,ratio,mlbID,fangraphsID,mlb_played_first,mlb_played_last,name_common_x,age,year_ID,team_ID,stint,lg_ID,PA,G,Inn,runs_bat,runs_br,runs_dp,runs_field,runs_infield,runs_outfield,runs_catcher,runs_good_plays,runs_defense,runs_position,runs_position_p,runs_replacement,runs_above_rep,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAA,WAA_off,WAA_def,WAR,WAR_def,WAR_off,WAR_rep,salary,pitcher,teamRpG,oppRpG,oppRpPA_rep,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg,name_common_y,GS_x,IPouts,IPouts_start,IPouts_relief,RA,xRA,xRA_sprp_adj,xRA_extras_adj,xRA_def_pitcher,PPF,PPF_custom,xRA_final,BIP,BIP_perc,RS_def_total,runs_above_avg_adj,RpO_replacement,GR_leverage_index_avg,WAA_adj,ERA_plus,ER_lg,Glf,Gcf,Grf,teamID,lgID,POS,GS_y,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
12720,buffich01,1861.0,6.0,14.0,USA,MA,Fall River,1907.0,9.0,23.0,USA,MA,Fall River,Charlie,Buffinton,Charles G.,180.0,73.0,R,R,1882-05-17,1892-06-28,buffc101,buffich01,81.64656,1.8542,23.747874,44.033308,111672.0,1001634.0,1882.0,1892.0,Charlie Buffinton,21.0,1882.0,BSN,1.0,NL,52.0,15.0,,-0.66,0.0,0.0,-1.0,,,,,-1.0,-0.45,0.0,1.78,-0.3,-2.1,-1.1,-1.5,-0.21,-0.1,-0.15,-0.08,-0.15,0.03,0.13,1000.0,N,5.16182,5.23582,0.10604,5.11736,1.949,1.947,0.4869,0.4931,0.491,0.4889,83.514984,14.638,17.275,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,7.0,,,,,,,,,,,,,,
47471,huntebi03,1928.0,6.0,4.0,USA,PA,Punxsutawney,,,,,,,Billy,Hunter,Gordon William,180.0,72.0,R,R,1953-04-14,1958-09-27,huntb105,huntebi03,81.64656,1.8288,24.412118,44.644882,116331.0,1006196.0,1953.0,1958.0,Billy Hunter,28.0,1956.0,NYY,1.0,AL,79.0,39.0,196.0,-2.05,-0.75,-0.71,0.3,1.0,0.0,0.0,,1.3,1.35,0.0,2.42,1.6,-0.9,-2.2,2.7,-0.09,-0.23,0.29,0.15,0.29,0.01,0.24,11000.0,N,4.62306,4.67845,0.08601,4.61632,1.888,1.888,0.4978,0.4944,0.5069,0.4937,92.612618,26.573,30.172,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10248,boyleja01,1866.0,3.0,22.0,USA,OH,Cincinnati,1913.0,1.0,6.0,USA,OH,Cincinnati,Jack,Boyle,John Anthony,190.0,76.0,R,R,1886-10-08,1898-08-16,boylj102,boyleja01,86.18248,1.9304,23.12727,44.644882,111326.0,1001289.0,1886.0,1898.0,Jack Boyle,21.0,1887.0,STL,1.0,AA,372.0,88.0,,-44.56,-2.18,0.0,7.0,,,,,7.0,5.56,0.0,8.0,-26.2,-34.2,-41.2,12.6,-2.62,-3.21,1.06,-1.78,1.06,-2.37,0.84,,N,6.38966,6.85762,0.14738,6.76673,2.088,2.105,0.4693,0.4632,0.511,0.493,22.167333,132.841,137.69,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,2.0,,,,,,,,,,,,,,
135755,deshaji01,,,,,,,,,,,,,,,,,,,,,,,,,,,,113290.0,,,,,31.0,1991.0,HOU,1.0,NL,,28.0,,,,,,,,,,,,,,-4.927,-17.814,,,-1.9768,,,-0.65,,,1.4172,2100000.0,,4.10117,4.77775,,4.55899,1.863,1.85,0.4294,,,0.4512,,,,Jim Deshaies,28.0,483.0,483.0,0.0,90.0,72.856,2.55,,0.851,94.0,96.822,72.186,515.0,0.1134,7.5,-18.944,0.179,1.0,-0.0894,69.860674,62.176,,,,HOU,NL,P,28.0,483.0,4.0,18.0,3.0,1.0,,,,,
88284,sauveri01,1963.0,11.0,23.0,USA,VA,Arlington,,,,,,,Rich,Sauveur,Richard Daniel,163.0,76.0,L,L,1986-07-01,2000-06-18,sauvr001,sauveri01,73.935496,1.9304,19.840763,38.300609,121716.0,1011465.0,1986.0,2000.0,Rich Sauveur,24.0,1988.0,MON,1.0,NL,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Y,3.85934,3.85934,0.06631,3.85934,1.79,1.79,0.5,0.5,0.5,0.5,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,MON,NL,P,0.0,9.0,0.0,1.0,0.0,0.0,,,,,
132030,cecilbr01,,,,,,,,,,,,,,,,,,,,,,,,,,,,446399.0,,,,,27.0,2014.0,TOR,1.0,AL,,66.0,,,,,,,,,,,,,,12.691,7.849,,,0.8052,,,1.32,,,0.5427,1300000.0,,4.13265,4.0236,,4.20627,1.819,1.83,0.5122,,,0.4919,,,,Brett Cecil,0.0,160.0,0.0,160.0,16.0,24.886,-1.854,,-1.033,99.0,99.103,23.849,130.0,0.0295,-35.0,7.197,0.184,1.2717,-0.0265,141.10625,22.577,,,,TOR,AL,P,0.0,160.0,0.0,5.0,0.0,0.0,,,,,
149557,klawial01,,,,,,,,,,,,,,,,,,,,,,,,,,,,117138.0,,,,,20.0,1909.0,NYG,1.0,NL,,6.0,,,,,,,,,,,,,,2.46,0.521,,,0.0444,,,0.26,,,0.2409,,,3.63823,3.57773,,3.95885,1.756,1.782,0.5074,,,0.4624,,,,Al Klawitter,3.0,81.0,63.0,18.0,11.0,11.584,0.0,,-0.219,99.0,97.614,11.521,90.0,0.0199,-11.0,0.363,0.159,1.0,-0.0292,130.933333,7.856,,,,NY1,NL,P,3.0,81.0,3.0,12.0,0.0,0.0,,,,,
50214,jonesad01,1985.0,8.0,1.0,USA,CA,San Diego,,,,,,,Adam,Jones,Adam LaMarque,215.0,74.0,R,R,2006-07-14,2019-09-28,jonea003,jonesad01,97.52228,1.8796,27.604061,51.884592,430945.0,6368.0,2006.0,2019.0,Adam Jones,20.0,2006.0,SEA,1.0,AL,76.0,32.0,202.0,-4.99,-0.82,-0.24,0.0,0.0,3.0,0.0,0.0,3.0,0.4,0.0,2.85,0.2,-2.7,-5.7,3.4,-0.25,-0.55,0.35,0.02,0.35,-0.28,0.27,,N,4.74372,4.92028,0.08573,4.83122,1.909,1.914,0.4918,0.4826,0.5104,0.4913,42.435669,25.536,31.968,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
157955,munozmi01,,,,,,,,,,,,,,,,,,,,,,,,,,,,119525.0,,,,,29.0,1995.0,COL,1.0,NL,,64.0,,,,,,,,,,,,,,-3.905,-7.673,,,-0.8064,,,-0.64,,,0.3816,300000.0,,4.63299,4.75836,,4.69188,1.893,1.89,0.4874,,,0.494,,,,Mike Munoz,0.0,131.0,0.0,131.0,38.0,23.756,-1.784,,-1.119,128.0,131.337,30.327,143.0,0.0336,-33.3,-8.024,0.201,1.3036,-0.2123,72.65,26.154,,,,COL,NL,P,0.0,131.0,5.0,7.0,0.0,0.0,,,,,
4449,bannifl01,1955.0,6.0,10.0,USA,SD,Pierre,,,,,,,Floyd,Bannister,Floyd Franklin,190.0,73.0,L,L,1977-04-19,1992-08-10,bannf001,bannifl01,86.18248,1.8542,25.0672,46.479603,110540.0,1000516.0,1977.0,1992.0,Floyd Bannister,26.0,1981.0,SEA,1.0,AL,0.0,0.0,121.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,Y,,4.07185,0.07042,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


----

Let's get our team data together so we can figure out:
1. The worst team each year
2. The best team each year
3. The best team that didn't make the playoffs each year

In [None]:
teams.sample(10)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
1197,1950,NL,SLN,STL,,5,153,76.0,78,75,,,N,N,693,5215,1353,255,50,102,606.0,604.0,23.0,,,,670,598,3.97,57,10,14,4068,1398,119,535,603,130,172,0.978,St. Louis Cardinals,Sportsman's Park IV,1093411.0,104,103,STL,SLN,SLN
1016,1939,AL,NYA,NYY,,1,152,77.0,106,45,,,Y,Y,967,5300,1521,259,55,166,701.0,543.0,72.0,37.0,,,556,496,3.31,87,15,26,4044,1208,85,567,565,126,159,0.978,New York Yankees,Yankee Stadium I,859785.0,99,95,NYY,NYA,NYA
2325,2000,AL,ANA,ANA,W,3,162,81.0,82,80,N,N,N,N,864,5628,1574,309,34,236,608.0,1024.0,93.0,52.0,47.0,43.0,869,805,5.0,5,3,46,4344,1534,228,662,846,134,182,0.978,Anaheim Angels,Edison International Field,2066982.0,102,103,ANA,ANA,ANA
271,1891,AA,LS2,LOU,,8,138,,54,83,,,N,,698,4764,1229,127,68,17,438.0,465.0,227.0,,68.0,,873,567,4.22,126,9,1,3630,1334,32,451,481,454,112,0.922,Louisville Colonels,Eclipse Park I,,97,97,LOU,LS2,LS2
1728,1977,NL,PIT,PIT,E,2,162,81.0,96,66,N,,N,N,734,5662,1550,278,57,133,474.0,878.0,260.0,120.0,34.0,43.0,665,594,3.61,25,15,39,4445,1406,149,485,890,145,137,0.977,Pittsburgh Pirates,Three Rivers Stadium,1237349.0,102,101,PIT,PIT,PIT
2190,1995,AL,DET,DET,E,4,144,72.0,60,84,N,N,N,N,654,4865,1204,228,29,159,551.0,987.0,73.0,36.0,41.0,43.0,844,778,5.49,5,3,38,3825,1509,170,536,729,106,143,0.981,Detroit Tigers,Tiger Stadium,1180979.0,100,101,DET,DET,DET
427,1903,NL,PIT,PIT,,1,141,70.0,91,49,,,Y,N,793,4988,1429,208,110,34,364.0,400.0,172.0,,50.0,,613,405,2.91,117,16,5,3754,1215,9,384,454,295,100,0.951,Pittsburgh Pirates,Exposition Park,326855.0,103,99,PIT,PIT,PIT
2005,1988,AL,KCA,KCR,W,3,161,80.0,84,77,N,,N,N,704,5469,1419,275,40,121,486.0,944.0,137.0,54.0,33.0,51.0,648,580,3.65,29,12,32,4285,1415,102,465,886,124,147,0.98,Kansas City Royals,Royals Stadium,2350181.0,101,101,KCR,KCA,KCA
718,1920,AL,WS1,MIN,,6,153,76.0,68,84,,,N,N,723,5251,1526,233,81,36,433.0,543.0,161.0,114.0,,,802,633,4.17,81,10,10,4101,1521,51,520,418,232,95,0.963,Washington Senators,Griffith Stadium I,359260.0,97,98,WSH,WS1,WS1
468,1906,NL,CIN,CIN,,6,155,78.0,64,87,,,N,N,533,5025,1198,140,71,16,395.0,532.0,170.0,,58.0,,582,409,2.69,126,12,5,4109,1248,14,470,567,262,97,0.959,Cincinnati Reds,Palace of the Fans,330056.0,104,104,CIN,CIN,CIN


Let's work off of copies and leave our source DataFrames in one place so that we have a home to which we may tearfully return.

In [None]:
series_post_sorted = series_post

In [None]:
series_post_sorted.columns

Index(['yearID', 'round', 'teamIDwinner', 'lgIDwinner', 'teamIDloser',
       'lgIDloser', 'wins', 'losses', 'ties'],
      dtype='object')

In [None]:
df_teams = teams

Let's add some differential metrics that might be useful. In Pythonglish, this is what we're doing:
```python
for Runs, Strikeouts, Walks, Home Runs, Hits in team_stats:
    df_teams['StatDiff'] = df_teams['Team_Stat'] - df_teams['Opponent_stat']
```

In [None]:
df_teams['RDiff'] = df_teams['R'] - df_teams['RA']
df_teams['SODiff'] = df_teams['SO'] - df_teams['SOA']
df_teams['BBDiff'] = df_teams['BB'] - df_teams['BBA']
df_teams['HRDiff'] = df_teams['HR'] - df_teams['HRA']
df_teams['HDiff'] = df_teams['H'] - df_teams['HA']

Let's also give ourselves a winning percentage column, because baseball seasons haven't always been the same length.

In [None]:
df_teams['WP'] = df_teams['W'] / (df_teams['L'] + df_teams['W'])

Now let's find the best team in each year that didn't make the playoffs and add it to a 'first_losers' DataFrame

In [None]:
df_teams.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro'],
      dtype='object')

In [None]:
df_teams.head(3)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871,,BS1,BNA,,3,31,,20,10,,,N,,401,1372,426,70,37,3,60.0,19.0,73.0,16.0,,,303,109,3.55,22,1,3,828,367,2,42,23,243,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,,,N,,302,1196,323,52,21,10,60.0,22.0,69.0,21.0,,,241,77,2.76,25,0,1,753,308,6,28,22,229,16,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,,,N,,249,1186,328,35,40,7,26.0,25.0,18.0,8.0,,,341,116,4.11,23,0,0,762,346,13,53,34,234,15,0.818,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1


In [105]:
series_post_sorted[series_post_sorted['yearID'] == 2021]

Unnamed: 0,yearID,round,teamIDwinner,lgIDwinner,teamIDloser,lgIDloser,wins,losses,ties
358,2021,ALWC,BOS,AL,NYA,AL,1,0,0
359,2021,ALDS1,HOU,AL,CHA,AL,3,1,0
360,2021,ALDS2,BOS,AL,TBA,AL,3,1,0
361,2021,ALCS,HOU,AL,BOS,AL,4,2,0
362,2021,NLWC,LAN,NL,SLN,NL,1,0,0
363,2021,NLDS1,ATL,NL,MIL,NL,3,1,0
364,2021,NLDS2,LAN,NL,SFN,NL,3,2,0
365,2021,NLCS,ATL,NL,LAN,NL,4,2,0
366,2021,WS,ATL,NL,HOU,AL,4,2,0


Let's split our teams into temporal eras. There are four major eras in baseball history. Or rather, two gigantic ones — the latter of which has three distinct sub-areas. 

In [124]:
deadball_era = df_teams.mask(df_teams['yearID'] <= 1920)
liveball_era = df_teams.mask(df_teams['yearID'] > 1920)

In [125]:
liveball = deadball_era.dropna(axis=0, how='all')
liveball

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
719,1921.0,AL,BOS,BOS,,5.0,154.0,77.0,75.0,79.0,,,N,N,668.0,5206.0,1440.0,248.0,69.0,17.0,428.0,344.0,83.0,65.0,,,696.0,603.0,3.98,88.0,9.0,5.0,4092.0,1521.0,53.0,452.0,446.0,157.0,151.0,0.975,Boston Red Sox,Fenway Park I,279273.0,97.0,99.0,BOS,BOS,BOS
720,1921.0,NL,BRO,LAD,,5.0,152.0,78.0,77.0,75.0,,,N,N,667.0,5263.0,1476.0,209.0,85.0,59.0,325.0,400.0,91.0,73.0,,,681.0,560.0,3.70,82.0,8.0,12.0,4089.0,1556.0,46.0,361.0,471.0,232.0,142.0,0.964,Brooklyn Robins,Ebbets Field,613245.0,105.0,104.0,BRO,BRO,BRO
721,1921.0,NL,BSN,ATL,,4.0,153.0,74.0,79.0,74.0,,,N,N,721.0,5385.0,1561.0,209.0,100.0,61.0,377.0,470.0,94.0,100.0,,,697.0,600.0,3.90,74.0,11.0,12.0,4155.0,1488.0,54.0,420.0,382.0,199.0,122.0,0.969,Boston Braves,Braves Field,318627.0,94.0,96.0,BSN,BSN,BSN
722,1921.0,AL,CHA,CHW,,7.0,154.0,77.0,62.0,92.0,,,N,N,683.0,5329.0,1509.0,242.0,82.0,35.0,445.0,474.0,97.0,93.0,,,858.0,749.0,4.94,84.0,7.0,9.0,4095.0,1603.0,52.0,549.0,392.0,199.0,155.0,0.969,Chicago White Sox,Comiskey Park,543650.0,98.0,98.0,CHW,CHA,CHA
723,1921.0,NL,CHN,CHC,,7.0,153.0,76.0,64.0,89.0,,,N,N,668.0,5321.0,1553.0,234.0,56.0,37.0,343.0,374.0,70.0,97.0,,,773.0,665.0,4.39,73.0,7.0,7.0,4089.0,1605.0,67.0,409.0,441.0,166.0,129.0,0.974,Chicago Cubs,Wrigley Field,410107.0,100.0,101.0,CHC,CHN,CHN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,2021.0,NL,SLN,STL,C,2.0,162.0,81.0,90.0,72.0,N,Y,N,N,706.0,5351.0,1303.0,261.0,22.0,198.0,478.0,1341.0,89.0,22.0,86.0,44.0,672.0,626.0,3.98,3.0,15.0,50.0,4251.0,1234.0,152.0,608.0,1225.0,84.0,137.0,0.986,St. Louis Cardinals,Busch Stadium III,2102530.0,92.0,92.0,STL,SLN,SLN
2981,2021.0,AL,TBA,TBD,E,1.0,162.0,81.0,100.0,62.0,Y,N,N,N,857.0,5507.0,1336.0,288.0,36.0,222.0,585.0,1542.0,88.0,42.0,72.0,41.0,651.0,593.0,3.67,1.0,13.0,42.0,4367.0,1264.0,184.0,436.0,1478.0,80.0,130.0,0.986,Tampa Bay Rays,Tropicana Field,761072.0,92.0,91.0,TBR,TBA,TBA
2982,2021.0,AL,TEX,TEX,W,5.0,162.0,81.0,60.0,102.0,N,N,N,N,625.0,5405.0,1254.0,225.0,24.0,167.0,433.0,1381.0,106.0,29.0,58.0,31.0,815.0,758.0,4.79,0.0,3.0,31.0,4273.0,1402.0,232.0,513.0,1239.0,83.0,146.0,0.986,Texas Rangers,Globe Life Field,2110258.0,99.0,101.0,TEX,TEX,TEX
2983,2021.0,AL,TOR,TOR,E,4.0,162.0,80.0,91.0,71.0,N,N,N,N,846.0,5476.0,1455.0,285.0,13.0,262.0,496.0,1218.0,81.0,20.0,51.0,35.0,663.0,610.0,3.91,1.0,14.0,34.0,4216.0,1257.0,209.0,473.0,1468.0,90.0,122.0,0.984,Toronto Blue Jays,Sahlen Field,805901.0,102.0,101.0,TOR,TOR,TOR


In [126]:
deadball = liveball_era.dropna(axis=0, how='all')
deadball

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871.0,,BS1,BNA,,3.0,31.0,,20.0,10.0,,,N,,401.0,1372.0,426.0,70.0,37.0,3.0,60.0,19.0,73.0,16.0,,,303.0,109.0,3.55,22.0,1.0,3.0,828.0,367.0,2.0,42.0,23.0,243.0,24.0,0.834,Boston Red Stockings,South End Grounds I,,103.0,98.0,BOS,BS1,BS1
1,1871.0,,CH1,CNA,,2.0,28.0,,19.0,9.0,,,N,,302.0,1196.0,323.0,52.0,21.0,10.0,60.0,22.0,69.0,21.0,,,241.0,77.0,2.76,25.0,0.0,1.0,753.0,308.0,6.0,28.0,22.0,229.0,16.0,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104.0,102.0,CHI,CH1,CH1
2,1871.0,,CL1,CFC,,8.0,29.0,,10.0,19.0,,,N,,249.0,1186.0,328.0,35.0,40.0,7.0,26.0,25.0,18.0,8.0,,,341.0,116.0,4.11,23.0,0.0,0.0,762.0,346.0,13.0,53.0,34.0,234.0,15.0,0.818,Cleveland Forest Citys,National Association Grounds,,96.0,100.0,CLE,CL1,CL1
3,1871.0,,FW1,KEK,,7.0,19.0,,7.0,12.0,,,N,,137.0,746.0,178.0,19.0,8.0,2.0,33.0,9.0,16.0,4.0,,,243.0,97.0,5.17,19.0,1.0,0.0,507.0,261.0,5.0,21.0,17.0,163.0,8.0,0.803,Fort Wayne Kekiongas,Hamilton Field,,101.0,107.0,KEK,FW1,FW1
4,1871.0,,NY2,NNA,,5.0,33.0,,16.0,17.0,,,N,,302.0,1404.0,403.0,43.0,21.0,1.0,33.0,15.0,46.0,15.0,,,313.0,121.0,3.72,32.0,1.0,0.0,879.0,373.0,7.0,42.0,22.0,235.0,14.0,0.840,New York Mutuals,Union Grounds (Brooklyn),,90.0,88.0,NYU,NY2,NY2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,1920.0,NL,PHI,PHI,,8.0,153.0,77.0,62.0,91.0,,,N,N,565.0,5264.0,1385.0,229.0,54.0,64.0,283.0,531.0,100.0,83.0,,,714.0,557.0,3.63,77.0,8.0,11.0,4140.0,1480.0,35.0,444.0,419.0,232.0,135.0,0.964,Philadelphia Phillies,Baker Bowl,330998.0,104.0,109.0,PHI,PHI,PHI
715,1920.0,NL,PIT,PIT,,4.0,155.0,78.0,79.0,75.0,,,N,N,530.0,5219.0,1342.0,162.0,90.0,16.0,374.0,405.0,181.0,117.0,,,552.0,454.0,2.89,92.0,17.0,10.0,4245.0,1389.0,25.0,280.0,444.0,186.0,119.0,0.971,Pittsburgh Pirates,Forbes Field,429037.0,103.0,103.0,PIT,PIT,PIT
716,1920.0,AL,SLA,BAL,,4.0,154.0,78.0,76.0,77.0,,,N,N,797.0,5358.0,1651.0,279.0,83.0,50.0,427.0,339.0,118.0,79.0,,,766.0,617.0,4.03,84.0,9.0,14.0,4134.0,1481.0,53.0,578.0,444.0,231.0,119.0,0.963,St. Louis Browns,Sportsman's Park IV,419311.0,103.0,103.0,SLB,SLA,SLA
717,1920.0,NL,SLN,STL,,5.0,155.0,76.0,75.0,79.0,,,N,N,675.0,5495.0,1589.0,238.0,96.0,32.0,373.0,484.0,126.0,114.0,,,682.0,543.0,3.43,72.0,9.0,12.0,4278.0,1488.0,30.0,479.0,529.0,256.0,136.0,0.961,St. Louis Cardinals,Robison Field/Sportsman's Park IV,326836.0,98.0,98.0,STL,SLN,SLN


In [97]:
pre_division = mask[(mask['LgWin'] == 'N') & (mask['DivWin'].isna())]
pre_WC = mask[(mask['LgWin'] == 'N') & (mask['WCWin'].notna())]
pre_WC.head()


Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
2182,1995,AL,BAL,BAL,E,3,144,72.0,71,73,N,N,N,N,704,4837,1267,229,27,173,574.0,803.0,92.0,45.0,39.0,41.0,640,607,4.31,19,10,29,3801,1165,149,523,930,72,141,0.986,Baltimore Orioles,Oriole Park at Camden Yards,3098475.0,102,101,BAL,BAL,BAL
2183,1995,AL,BOS,BOS,E,1,144,72.0,86,58,Y,N,N,N,791,4997,1399,286,31,175,560.0,923.0,99.0,44.0,65.0,49.0,698,631,4.39,7,9,39,3878,1338,127,476,888,120,151,0.978,Boston Red Sox,Fenway Park II,2164410.0,103,103,BOS,BOS,BOS
2184,1995,AL,CAL,ANA,W,2,145,72.0,78,67,N,N,N,N,801,5019,1390,252,25,186,564.0,889.0,58.0,39.0,36.0,38.0,697,645,4.52,8,9,42,3853,1310,163,486,901,95,120,0.982,California Angels,Anaheim Stadium,1748680.0,99,99,CAL,CAL,CAL
2185,1995,AL,CHA,CHW,C,3,145,72.0,68,76,N,N,N,N,755,5060,1417,252,37,146,576.0,767.0,110.0,39.0,32.0,56.0,758,693,4.85,12,4,36,3854,1374,164,617,892,108,131,0.98,Chicago White Sox,Comiskey Park II,1609773.0,96,95,CHW,CHA,CHA
2186,1995,NL,CHN,CHC,C,3,144,72.0,73,71,N,N,N,N,693,4963,1315,267,39,158,440.0,953.0,105.0,37.0,34.0,35.0,671,597,4.13,6,12,45,3903,1313,162,518,926,115,115,0.979,Chicago Cubs,Wrigley Field,1918265.0,98,98,CHC,CHN,CHN


In [405]:
first_losers = df_teams[
    (df_teams['DivWin'] == 'N') &
    (df_teams['WCWin'] == 'N')
].sort_values('WP', ascending=False)

In [406]:
first_losers

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,RDiff,WP
2302,1999,NL,CIN,CIN,C,2,163,82.0,96,67,N,N,N,N,865,5649,1536,312,37,209,569.0,1125.0,164.0,54.0,45.0,44.0,711,647,3.98,6,11,55,4386,1309,190,636,1081,105,139,0.983,Cincinnati Reds,Cinergy Field,2061222.0,103,103,CIN,CIN,CIN,154,0.588957
2409,2002,AL,SEA,SEA,W,3,162,81.0,93,69,N,N,N,N,814,5569,1531,285,31,152,629.0,1003.0,137.0,58.0,51.0,72.0,699,654,4.07,8,12,43,4336,1422,178,441,1063,88,134,0.985,Seattle Mariners,Safeco Field,3542938.0,97,95,SEA,SEA,SEA,115,0.574074
2389,2002,AL,BOS,BOS,E,2,162,81.0,93,69,N,N,N,N,859,5640,1560,348,33,177,545.0,944.0,80.0,28.0,72.0,53.0,665,603,3.75,5,17,51,4338,1339,146,430,1157,104,140,0.983,Boston Red Sox,Fenway Park II,2650862.0,103,102,BOS,BOS,BOS,194,0.574074
2902,2019,AL,CLE,CLE,C,2,162,81.0,93,69,N,N,N,N,769,5425,1354,286,18,223,563.0,1332.0,103.0,35.0,50.0,46.0,657,601,3.76,6,16,42,4313,1308,207,450,1508,83,110,0.985,Cleveland Indians,Progressive Field,1738642.0,104,102,CLE,CLE,CLE,112,0.574074
2482,2005,AL,CLE,CLE,C,2,162,81.0,93,69,N,N,N,N,790,5609,1522,337,30,207,503.0,1093.0,62.0,36.0,54.0,50.0,642,582,3.61,6,10,51,4358,1363,157,413,1050,106,156,0.983,Cleveland Indians,Jacobs Field,2013763.0,96,96,CLE,CLE,CLE,148,0.574074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2725,2013,AL,HOU,HOU,W,5,162,81.0,51,111,N,N,N,N,610,5457,1307,266,16,148,426.0,1535.0,110.0,61.0,52.0,38.0,848,766,4.79,2,5,32,4320,1530,191,616,1084,125,168,0.979,Houston Astros,Minute Maid Park,1651883.0,99,101,HOU,HOU,HOU,-238,0.314815
2446,2004,NL,ARI,ARI,W,5,162,81.0,51,111,N,N,N,N,615,5544,1401,295,38,135,441.0,1022.0,53.0,32.0,35.0,37.0,899,794,4.98,5,6,33,4308,1480,197,668,1153,139,144,0.977,Arizona Diamondbacks,Bank One Ballpark,2519560.0,105,107,ARI,ARI,ARI,-284,0.314815
2904,2019,AL,DET,DET,C,5,161,81.0,47,114,N,N,N,N,582,5549,1333,292,41,149,391.0,1595.0,57.0,20.0,48.0,42.0,915,835,5.24,0,3,31,4299,1555,250,536,1368,110,127,0.981,Detroit Tigers,Comerica Park,1501430.0,102,104,DET,DET,DET,-333,0.291925
2867,2018,AL,BAL,BAL,E,5,162,81.0,47,115,N,N,N,N,622,5507,1317,242,15,188,422.0,1412.0,81.0,22.0,57.0,35.0,892,824,5.18,2,7,28,4293,1552,234,589,1203,104,159,0.982,Baltimore Orioles,Oriole Park at Camden Yards,1564192.0,96,97,BAL,BAL,BAL,-270,0.290123


In [395]:
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,RDiff
2026,1989,NL,CHN,CHC,E,1,162,81.0,93,69,Y,,N,N,702,5513,1438,235,45,124,472.0,921.0,136.0,57.0,26.0,50.0,623,556,3.43,18,10,55,4381,1369,106,532,918,124,130,0.98,Chicago Cubs,Wrigley Field,2491942.0,108,108,CHC,CHN,CHN,79
2038,1989,AL,OAK,OAK,W,1,162,81.0,99,63,Y,,Y,Y,712,5416,1414,220,25,127,562.0,855.0,157.0,55.0,34.0,62.0,576,497,3.09,17,20,57,4345,1287,103,510,930,129,159,0.979,Oakland Athletics,Oakland Coliseum,2667225.0,97,95,OAK,OAK,OAK,136
2043,1989,NL,SFN,SFG,W,1,162,81.0,92,70,Y,,Y,N,699,5469,1365,241,52,141,508.0,1071.0,87.0,54.0,40.0,39.0,600,535,3.3,12,16,47,4371,1320,120,471,802,114,135,0.982,San Francisco Giants,Candlestick Park,2059701.0,97,96,SFG,SFN,SFN,99
2046,1989,AL,TOR,TOR,E,1,162,81.0,89,73,Y,,N,N,731,5581,1449,265,40,142,521.0,923.0,144.0,58.0,31.0,53.0,651,584,3.58,12,12,38,4401,1408,99,478,849,127,164,0.98,Toronto Blue Jays,Exhibition Stadium /Skydome,3375883.0,94,94,TOR,TOR,TOR,80


KeyError: "['NL', 'SDN', 'SDP', 6, 162, 81.0, 65, 97, 'N', nan, 668, 5456, 1419, 209, 48, 113, 577.0, 992.0, 198.0, 91.0, 27.0, 36.0, 763, 680, 4.27, 14, 10, 33, 4300, 1402, 175, 602, 897, 147, 135, 0.976, 'San Diego Padres', 'Jack Murphy Stadium', 1454061.0, 96] not in index"

In [376]:
df_teams = df_teams.set_index(['yearID', 'lgID', 'divID'])

KeyError: 'PIT'

### Meet the Wonkaville Huskies

In [132]:
df_huskiesBatters = df_batters.loc[df_batters.BMI >= 34.55]

In [133]:
df_huskiesBatters.sort_values('BMI').describe()

Unnamed: 0,birthYear,birthMonth,birthDay,deathYear,deathMonth,deathDay,weight,height,yearID,stint,...,BB,SO,IBB,HBP,SH,SF,GIDP,KG,meters,BMI
count,26.0,25.0,25.0,5.0,5.0,5.0,26.0,26.0,26.0,26.0,...,26.0,22.0,0.0,24.0,17.0,0.0,1.0,26.0,26.0,26.0
mean,1969.153846,5.8,17.52,1952.4,7.6,8.6,269.961538,72.115385,1901.730769,1.115385,...,9.884615,14.909091,,0.875,6.0,,0.0,122.452394,1.831731,36.408324
std,40.878055,3.316625,8.529947,49.45503,3.646917,7.602631,30.1801,4.348121,14.17761,0.325813,...,16.310308,13.606403,,1.650099,8.951257,,,13.689452,0.110442,1.728873
min,1853.0,1.0,1.0,1891.0,2.0,2.0,155.0,55.0,1872.0,1.0,...,0.0,0.0,,0.0,0.0,,0.0,70.30676,1.397,34.622243
25%,1977.0,4.0,11.0,1915.0,6.0,2.0,261.25,71.0,1890.25,1.0,...,1.0,3.25,,0.0,0.0,,0.0,118.50091,1.8034,34.891189
50%,1983.5,6.0,18.0,1966.0,9.0,6.0,270.0,72.0,1903.5,1.0,...,3.5,11.0,,0.0,2.0,,0.0,122.46984,1.8288,35.875979
75%,1989.0,8.0,24.0,1975.0,10.0,14.0,283.75,75.0,1912.0,1.0,...,9.25,25.25,,1.0,7.0,,0.0,128.70673,1.905,37.928366
max,1998.0,12.0,30.0,2015.0,11.0,19.0,320.0,78.0,1924.0,2.0,...,68.0,43.0,,6.0,33.0,,0.0,145.14944,1.9812,40.292666


In [134]:
df_huskiesPitchers = df[df_pitchers.BMI > 34.55]

NameError: name 'df' is not defined

In [None]:
df_huskiesPitchers.head()

In [None]:
df_huskies = pd.merge(df_huskiesBatters, df_huskiesPitchers, how='right', on='playerID')

In [None]:
df_huskies.columns

In [None]:
sns.jointplot(data=df_simple, x="height", y="weight", kind = "reg", truncate = False)

In [None]:
sns.choose_diverging_palette()

In [None]:

# Compute the correlation matrix
corr = df_huskiesBatters.corr(method="spearman")

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(290, 10, n=40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.25,
    cbar_kws={"shrink": .5},
)



In [None]:
corr_mat = df.corr().stack().reset_index(name="correlation")