In [246]:
## Let's try to figure out who the shortest, heaviest baseball players are.

## Here's our overall plan of attack

# Setting Things Up ✅
## Import CSVs ✅
### Separate CSVs --> DataFrames for People, Pitching Data, Batting Data ✅
## Squish everything into one mondo DF ✅
## Add Physical Data ✅
### Height ✅
### Weight ✅
## Calculate BMI ✅
### Convert Imperial to Metric ✅
### BMI-ify ✅
### Throw BMI back into df ✅
## Assemble per-position lists sorted by BMI, then mWAR

# Knocking Things Down
## Find worst team that made playoffs in 2021
### Describe team fWAR/bWAR
### Describe individual fWAR/bWAR
## Pull from BMI lists per position until high BMI roster is full
### mWAR shall be higher on a team basis.
### mWAR shall be higher per position.
# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

## Setting Things Up

### Import the necessaries

In [247]:
import numpy as np
import pandas as pd
import pybaseball
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)


In [248]:
# Constants
KG_TO_LB = 0.453592
M_TO_IN = 0.0254

In [249]:
chadwick = chadwick_register(save=True)
chadwick = pd.DataFrame(chadwick)

In [250]:
download_lahman()

## Read infinity baseball data to DataFrames

In [251]:
# a table of all player biographical info and ids
people = pd.DataFrame(people())

# park id, name, alias, city, state, and country
parks = pd.DataFrame(parks())

# all star roster data: player, year, team, league, position
allstar = pd.DataFrame(all_star_full())

# each player's games played per position for each season
appearances = pd.DataFrame(appearances())

# batting stats by year, regular season
batting = pd.DataFrame(batting())

# batting stats by year, post season
batting_post = pd.DataFrame(batting_post())

# fielding stats by year 
fielding = pd.DataFrame(fielding())

# games played in left, center, right field 
fielding_of = pd.DataFrame(fielding_of())

# LF/CF/RF splits
fielding_of_split = pd.DataFrame(fielding_of_split())

# postseason fielding 
fielding_post = pd.DataFrame(fielding_post())

# home game attendance by park by year 
home_games = pd.DataFrame(home_games())

# historical player pitching stats
pitching = pd.DataFrame(pitching())

# postseason pitching stats
pitching_post = pd.DataFrame(pitching_post())

# playoff series winners and losers 
series_post = pd.DataFrame(series_post())

# data on teams by year: record, division, stadium, attendance, etc
teams = pd.DataFrame(teams())

# current and historical franchises, whether they're still active, and their ids
teams_franchises = pd.DataFrame(teams_franchises())

# split season data for teams
teams_half = pd.DataFrame(teams_half()) 

# fangraphs batting since 2008
fangraphs_batting = pd.DataFrame(pybaseball.batting_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs pitching since 2008
fangraphs_pitching = pd.DataFrame(pybaseball.pitching_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs team pitching since 2008
fangraphs_team_pitching = pd.DataFrame(
    pybaseball.team_pitching(start_season="2008", end_season="2021"))

# fangraphs team batting since 2008
fangraphs_team_batting = pd.DataFrame(
    pybaseball.team_batting(start_season="2008", end_season="2021"))

# fangraphs team fielding since 2008
fangraphs_team_fielding = pd.DataFrame(
    pybaseball.team_fielding(start_season="2008", end_season="2021"))

# bref pitching WAR
bwar_pitch = pd.DataFrame(bwar_pitch(return_all=True))

# bref batting WAR
bwar_bat = pd.DataFrame(bwar_bat(return_all=True))


  table = table.drop('', 1)


----

Here's where we add the good stuff.

In [252]:
# BMI Calculations
people['KG'] = people['weight'] * KG_TO_LB
people['meters'] = people['height'] * M_TO_IN
people['BMI'] = people['KG'] / people['meters'] ** 2
people['ratio'] = people['meters'] * people['BMI']


----

Let's see what we've got

In [253]:
people.shape

(20543, 28)

In [254]:
people.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio'],
      dtype='object')

In [255]:
chadwick.shape

(24258, 8)

In [256]:
chadwick.columns

Index(['name_last', 'name_first', 'key_mlbam', 'key_retro', 'key_bbref',
       'key_fangraphs', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [257]:
# Let's change some of these column names to save ourselves some merging hassle

In [258]:
chadwick = chadwick.rename(columns={
    "name_last": "nameLast",
    "name_first": "nameFirst",
    "key_fangraphs": "fangraphsID",
    "key_bbref": "playerID",
    "key_retro": "retroID",
    "key_mlbam": "mlbID"
})

In [259]:
chadwick.columns

Index(['nameLast', 'nameFirst', 'mlbID', 'retroID', 'playerID', 'fangraphsID',
       'mlb_played_first', 'mlb_played_last'],
      dtype='object')

Merge Chadwick, check.

In [260]:
# Merge chadwick into people
df = people.merge(
    chadwick,
    left_on=["playerID", "retroID", "nameLast", "nameFirst"],
    right_on=["playerID", "retroID", "nameLast", "nameFirst"], how='outer'
)

In [261]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last'],
      dtype='object')

In [262]:
df.shape

(24587, 32)

----

prep merge bwar_bat

In [263]:
bwar_bat.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

In [264]:
bwar_bat = bwar_bat.rename(columns={
    'player_ID': 'playerID',
    'mlb_ID': 'mlbID',
    'stint_ID': 'stint'
})

In [265]:
bwar_bat.columns

Index(['name_common', 'age', 'mlbID', 'playerID', 'year_ID', 'team_ID',
       'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

In [266]:
bwar_bat.columns

Index(['name_common', 'age', 'mlbID', 'playerID', 'year_ID', 'team_ID',
       'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp',
       'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher',
       'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p',
       'runs_replacement', 'runs_above_rep', 'runs_above_avg',
       'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def',
       'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG',
       'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent',
       'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off',
       'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_plus', 'TOB_lg', 'TB_lg'],
      dtype='object')

----

Merge bwar_bat, check

In [267]:
df = df.merge(
    bwar_bat, left_on=["playerID", "mlbID"], right_on=["playerID", "mlbID"], how="outer"
)

In [268]:
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID',
       'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common',
       'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn',
       'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield',
       'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense',
       'runs_position', 'runs_position_p', 'runs_replacement',
       'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off',
       'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def',
       'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG',
       'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_

In [269]:
bwar_pitch.columns

Index(['name_common', 'age', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID',
       'stint_ID', 'lg_ID', 'G', 'GS', 'IPouts', 'IPouts_start',
       'IPouts_relief', 'RA', 'xRA', 'xRA_sprp_adj', 'xRA_extras_adj',
       'xRA_def_pitcher', 'PPF', 'PPF_custom', 'xRA_final', 'BIP', 'BIP_perc',
       'RS_def_total', 'runs_above_avg', 'runs_above_avg_adj',
       'runs_above_rep', 'RpO_replacement', 'GR_leverage_index_avg', 'WAR',
       'salary', 'teamRpG', 'oppRpG', 'pyth_exponent', 'waa_win_perc', 'WAA',
       'WAA_adj', 'oppRpG_rep', 'pyth_exponent_rep', 'waa_win_perc_rep',
       'WAR_rep', 'ERA_plus', 'ER_lg'],
      dtype='object')

----

merge bwar_pitch, check

In [270]:
bwar_pitch = bwar_pitch.rename(columns={
    "stint_ID": "stint",
    "mlb_ID": "mlbID",
    "player_ID": "playerID"
})

In [271]:
bwar_pitch.stint

0        1
1        1
2        1
3        1
4        1
        ..
53652    1
53653    1
53654    1
53655    1
53656    1
Name: stint, Length: 53657, dtype: int64

In [272]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS_p

In [273]:
df = df.merge(
    bwar_pitch,
    left_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ],
    right_on=[
        'G',
        'WAA',
        'WAR',
        'WAR_rep',
        'age',
        'lg_ID',
        'mlbID',
        'oppRpG',
        'oppRpG_rep',
        'playerID',
        'pyth_exponent',
        'pyth_exponent_rep',
        'runs_above_avg',
        'runs_above_rep',
        'salary',
        'stint',
        'teamRpG',
        'team_ID',
        'waa_win_perc',
        'waa_win_perc_rep',
        'year_ID'
    ], how='outer'
)

In [274]:
print(df.columns.tolist())

['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'KG', 'meters', 'BMI', 'ratio', 'mlbID', 'fangraphsID', 'mlb_played_first', 'mlb_played_last', 'name_common_x', 'age', 'year_ID', 'team_ID', 'stint', 'lg_ID', 'PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field', 'runs_infield', 'runs_outfield', 'runs_catcher', 'runs_good_plays', 'runs_defense', 'runs_position', 'runs_position_p', 'runs_replacement', 'runs_above_rep', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def', 'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep', 'salary', 'pitcher', 'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'pyth_exponent', 'pyth_exponent_rep', 'waa_win_perc', 'waa_win_perc_off', 'waa_win_perc_def', 'waa_win_perc_rep', 'OPS

----

Merge fielding_of, check

In [275]:
fielding_of.columns

Index(['playerID', 'yearID', 'stint', 'Glf', 'Gcf', 'Grf'], dtype='object')

In [276]:
fielding_of = fielding_of.rename(columns={
    "yearID": "year_ID"
})

In [277]:
df = df.merge(
    fielding_of,
    left_on=["playerID", "year_ID", "stint"],
    right_on=["playerID", "year_ID", "stint"], how='left'
)


In [278]:
df.shape

(177729, 104)

----

In [279]:
fielding.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

In [280]:
fielding = fielding.rename(columns={
    "yearID": "year_ID"
})

In [281]:
df = df.merge(
    fielding,
    left_on=["playerID", "year_ID", "G", "stint"],
    right_on=["playerID", "year_ID", "G", "stint"], how='left'
)


----

Here's our Frankenstein's Monster

In [282]:
df.sample(25) ### works to here

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID,KG,meters,BMI,ratio,mlbID,fangraphsID,mlb_played_first,mlb_played_last,name_common_x,age,year_ID,team_ID,stint,lg_ID,PA,G,Inn,runs_bat,runs_br,runs_dp,runs_field,runs_infield,runs_outfield,runs_catcher,runs_good_plays,runs_defense,runs_position,runs_position_p,runs_replacement,runs_above_rep,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAA,WAA_off,WAA_def,WAR,WAR_def,WAR_off,WAR_rep,salary,pitcher,teamRpG,oppRpG,oppRpPA_rep,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg,name_common_y,GS_x,IPouts,IPouts_start,IPouts_relief,RA,xRA,xRA_sprp_adj,xRA_extras_adj,xRA_def_pitcher,PPF,PPF_custom,xRA_final,BIP,BIP_perc,RS_def_total,runs_above_avg_adj,RpO_replacement,GR_leverage_index_avg,WAA_adj,ERA_plus,ER_lg,Glf,Gcf,Grf,teamID,lgID,POS,GS_y,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
177038,youngad01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1925.0,ABC,0.0,NNL,,1.0,,,,,,,,,,,,,,0.98,0.825,,,0.0799,,,0.09,,,0.0129,,,5.55055,4.70155,,5.70519,1.941,1.994,0.5799,,,0.4863,,,,A.D. Young,0.0,4.0,,,0.0,0.825,0.0,,,100.0,,0.825,,,,0.849,0.245,1.0,-0.0019,,0.947,,,,,,,,,,,,,,,,,
94829,stantmi02,1967.0,6.0,2.0,USA,TX,Houston,,,,,,,Mike,Stanton,William Michael,190.0,73.0,L,L,1989-08-24,2007-09-30,stanm003,stantmi02,86.18248,1.8542,25.0672,46.479603,122681.0,849.0,1989.0,2007.0,Mike Stanton,26.0,1993.0,ATL,1.0,NL,0.0,63.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,850000.0,Y,4.50252,4.50252,0.08368,4.50252,1.871,1.871,0.5,0.5,0.5,0.5,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,ATL,NL,P,0.0,156.0,1.0,9.0,1.0,1.0,,,,,
100252,townsha01,1879.0,4.0,9.0,USA,DE,Townsend,1963.0,12.0,21.0,USA,DE,Wilmington,Happy,Townsend,John,190.0,72.0,R,R,1901-04-19,1906-09-21,townh101,townsha01,86.18248,1.8288,25.768347,47.125153,123425.0,1013145.0,1901.0,1906.0,Happy Townsend,27.0,1906.0,CLE,1.0,AL,33.0,17.0,,-3.9,0.04,0.0,0.0,,,,,0.0,0.0,2.18,0.0,-1.7,-1.7,-1.7,0.0,-0.2,-0.2,0.0,-0.2,0.0,-0.2,0.0,,Y,3.61284,3.71166,0.06233,3.64211,1.764,1.766,0.4881,0.4881,0.5,0.4916,-7.161032,9.622,9.786,,,,,,,,,,,,,,,,,,,,,,,,,,CLE,AL,P,12.0,279.0,2.0,31.0,3.0,0.0,,,,,
11842,brownel01,1883.0,8.0,25.0,USA,IN,Southport,1955.0,1.0,23.0,USA,IN,Indianapolis,Elmer,Brown,Elmer Young,172.0,71.0,L,R,1911-09-16,1915-04-14,browe101,brownel01,78.017824,1.8034,23.988865,43.261519,111547.0,1001515.0,1911.0,1915.0,Elmer Brown,28.0,1912.0,SLB,1.0,AL,38.0,23.0,,-4.13,0.12,0.0,0.0,,,,,0.0,0.0,3.09,0.0,-0.9,-0.9,-0.9,0.0,-0.1,-0.1,0.0,-0.1,0.0,-0.1,0.0,,Y,4.46736,4.50736,0.07734,4.44513,1.869,1.868,0.4958,0.4958,0.5,0.4935,27.356487,12.421,12.676,,,,,,,,,,,,,,,,,,,,,,,,,,SLA,AL,P,,,2.0,31.0,2.0,1.0,,,,,
5676,baynebi01,1899.0,4.0,18.0,USA,PA,Pittsburgh,1981.0,5.0,22.0,USA,MO,St. Louis,Bill,Bayne,William Lear,160.0,69.0,L,L,1919-09-20,1930-04-26,baynb101,baynebi01,72.57472,1.7526,23.627608,41.409746,110720.0,1000701.0,1919.0,1930.0,Bill Bayne,30.0,1929.0,BOS,1.0,AL,27.0,27.0,,0.04,0.05,0.0,0.0,,,,,0.0,0.0,3.03,0.0,3.1,3.1,3.1,0.0,0.3,0.3,0.0,0.3,0.0,0.3,0.0,4200.0,Y,5.15984,5.04429,0.08788,5.00679,1.939,1.93,0.511,0.511,0.5,0.4964,103.498404,9.212,10.397,,,,,,,,,,,,,,,,,,,,,,,,,,BOS,AL,P,,,4.0,22.0,0.0,3.0,,,,,
169868,stearbi01,,,,,,,,,,,,,,,,,,,,,,,,,,,,122703.0,,,,,21.0,1874.0,HAR,1.0,,,22.0,,,,,,,,,,,,,,-18.541,-42.756,,,-2.7676,,,-1.41,,,1.5106,,,7.31788,9.22019,,8.42047,2.225,2.193,0.3742,,,0.4237,,,,Bill Stearns,18.0,476.0,,,194.0,132.928,0.0,,-12.499,104.0,,151.244,784.0,0.3472,-36.0,-41.851,0.323,1.0,-0.1558,77.573077,40.338,0.0,4.0,15.0,HR1,,P,18.0,477.0,14.0,20.0,20.0,1.0,,,,,
62251,martica04,1991.0,9.0,21.0,D.R.,Puerto Plata,Puerto Plata,,,,,,,Carlos,Martinez,Carlos Ernesto,200.0,72.0,R,R,2013-05-03,2021-07-04,martc006,martica04,90.7184,1.8288,27.124576,49.605424,593372.0,11682.0,2013.0,2021.0,Carlos Martinez,21.0,2013.0,STL,1.0,NL,2.0,21.0,28.3,-0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,-0.3,-0.3,-0.3,0.0,-0.03,-0.03,-0.01,-0.03,-0.01,-0.03,0.0,,Y,4.0161,4.02801,0.07154,4.02491,1.812,1.812,0.4987,0.4987,0.5,0.4997,-100.0,0.648,0.805,,,,,,,,,,,,,,,,,,,,,,,,,,SLN,NL,P,1.0,85.0,4.0,3.0,0.0,1.0,,,,,
41583,harribr01,1980.0,8.0,26.0,USA,NY,Queensbury,,,,,,,Brendan,Harris,Brendan Michael,200.0,73.0,R,R,2004-07-06,2013-07-10,harrb001,harribr01,90.7184,1.8542,26.386527,48.925898,430593.0,2178.0,2004.0,2013.0,Brendan Harris,32.0,2013.0,LAA,1.0,AL,117.0,44.0,265.7,-3.09,-0.37,0.57,-6.0,1.0,0.0,0.0,-1.0,-6.0,0.89,0.0,4.19,-3.8,-8.0,-2.0,-5.1,-0.8,-0.2,-0.47,-0.41,-0.47,0.19,0.39,,N,4.24698,4.29243,0.07418,4.19716,1.843,1.84,0.4806,0.4951,0.4877,0.4897,70.58854,35.995,42.212,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5776,beardte01,1921.0,1.0,7.0,USA,MD,Woodsboro,2011.0,12.0,30.0,USA,IN,Fishers,Ted,Beard,Cramer Theodore,165.0,68.0,L,L,1948-09-05,1958-05-13,beart101,beardte01,74.84268,1.7272,25.087886,43.331797,110736.0,1000721.0,1948.0,1958.0,Ted Beard,37.0,1958.0,CHW,1.0,AL,28.0,19.0,55.7,-0.97,0.62,0.0,0.1,0.0,0.0,0.0,,0.1,-0.39,0.0,0.86,0.2,-0.6,-0.7,-0.3,-0.08,-0.08,-0.04,0.01,-0.04,0.01,0.09,,N,4.15745,4.1964,0.07686,4.1512,1.831,1.831,0.4963,0.4957,0.4983,0.495,44.281502,9.212,8.705,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
177390,zuverge01,,,,,,,,,,,,,,,,,,,,,,,,,,,,124790.0,,,,,29.0,1954.0,DET,2.0,AL,,35.0,,,,,,,,,,,,,,18.398,2.609,,,0.308,,,1.91,,,1.7038,,,4.20124,4.12156,,4.65051,1.829,1.862,0.5088,,,0.4528,,,,George Zuverink,25.0,609.0,578.0,31.0,93.0,97.653,0.0,,2.222,100.0,100.186,95.609,715.0,0.1522,14.6,2.789,0.182,2.575,-0.1039,103.887654,84.149,,,,DET,AL,P,25.0,609.0,18.0,46.0,3.0,3.0,,,,,


----

Let's get our team data together so we can figure out:
1. The worst team each year
2. The best team each year
3. The best team that didn't make the playoffs each year

In [283]:
teams.sample(10)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
1315,1958,NL,CIN,CIN,,4,154,77.0,76,78,,,N,N,695,5273,1359,242,40,123,572.0,765.0,61.0,38.0,,,621,574,3.73,50,7,20,4155,1422,148,419,705,99,148,0.983,Cincinnati Redlegs,Crosley Field,788582.0,106,105,CIN,CIN,CIN
940,1934,AL,SLA,BAL,,6,154,76.0,67,85,,,N,N,674,5288,1417,252,59,62,514.0,631.0,43.0,31.0,,,800,674,4.49,50,6,20,4050,1499,94,632,499,187,160,0.969,St. Louis Browns,Sportsman's Park IV,115305.0,107,110,SLB,SLA,SLA
887,1931,NL,NY1,SFG,,2,153,78.0,87,65,,,N,N,768,5372,1554,251,64,101,383.0,395.0,83.0,,,,599,499,3.3,90,17,12,4080,1341,71,422,570,159,126,0.974,New York Giants,Polo Grounds IV,812163.0,97,95,NYG,NY1,NY1
2289,1998,AL,SEA,SEA,W,3,161,81.0,76,85,N,N,N,N,859,5628,1553,321,28,234,558.0,1081.0,115.0,39.0,57.0,48.0,855,781,4.93,17,7,31,4273,1530,196,528,1156,125,139,0.979,Seattle Mariners,Kingdome,2651511.0,99,99,SEA,SEA,SEA
1422,1964,AL,CLE,CLE,,6,164,82.0,79,83,,,N,N,689,5603,1386,208,22,164,500.0,1063.0,79.0,51.0,,,693,620,3.75,37,16,37,4461,1443,154,565,1162,118,149,0.981,Cleveland Indians,Cleveland Stadium,653293.0,99,99,CLE,CLE,CLE
1921,1985,AL,CHA,CHW,W,3,163,81.0,85,77,N,,N,N,736,5470,1386,247,37,146,471.0,843.0,108.0,56.0,43.0,45.0,720,656,4.07,20,8,39,4355,1411,161,569,1023,111,152,0.982,Chicago White Sox,Comiskey Park,1669888.0,104,104,CHW,CHA,CHA
1619,1973,NL,CIN,CIN,W,1,162,81.0,99,63,Y,,N,N,741,5505,1398,232,34,137,639.0,947.0,148.0,55.0,31.0,51.0,621,557,3.4,39,17,43,4419,1389,135,518,801,115,162,0.982,Cincinnati Reds,Riverfront Stadium,2017601.0,95,93,CIN,CIN,CIN
2555,2007,NL,PHI,PHI,E,1,162,81.0,89,73,Y,N,N,N,892,5688,1558,326,41,213,641.0,1205.0,138.0,19.0,90.0,52.0,821,767,4.73,5,5,42,4375,1555,198,558,1050,89,162,0.986,Philadelphia Phillies,Citizens Bank Park,3108325.0,104,103,PHI,PHI,PHI
2105,1992,NL,CIN,CIN,W,2,162,81.0,90,72,N,,N,N,660,5460,1418,281,44,99,563.0,888.0,125.0,65.0,21.0,52.0,609,558,3.46,9,11,55,4349,1362,109,470,1060,96,128,0.984,Cincinnati Reds,Riverfront Stadium,2315946.0,103,103,CIN,CIN,CIN
644,1916,NL,CIN,CIN,,7,155,76.0,60,93,,,N,N,505,5254,1336,187,88,14,362.0,573.0,157.0,,,,617,485,3.1,86,7,6,4224,1356,35,461,569,228,126,0.965,Cincinnati Reds,Crosley Field,255846.0,98,99,CIN,CIN,CIN


Let's work off of copies and leave our source DataFrames in one place so that we have a home to which we may tearfully return.

In [284]:
series_post_sorted = series_post

In [285]:
series_post_sorted.columns

Index(['yearID', 'round', 'teamIDwinner', 'lgIDwinner', 'teamIDloser',
       'lgIDloser', 'wins', 'losses', 'ties'],
      dtype='object')

In [286]:
df_teams = teams

Let's add some differential metrics that might be useful. In Pythonglish, this is what we're doing:
```python
for Runs, Strikeouts, Walks, Home Runs, Hits in team_stats:
    df_teams['StatDiff'] = df_teams['Team_Stat'] - df_teams['Opponent_stat']
```

In [287]:
df_teams['RDiff'] = df_teams['R'] - df_teams['RA']
df_teams['SODiff'] = df_teams['SO'] - df_teams['SOA']
df_teams['BBDiff'] = df_teams['BB'] - df_teams['BBA']
df_teams['HRDiff'] = df_teams['HR'] - df_teams['HRA']
df_teams['HDiff'] = df_teams['H'] - df_teams['HA']

Let's also give ourselves a winning percentage column, because baseball seasons haven't always been the same length.

In [288]:
df_teams['WP'] = df_teams['W'] / (df_teams['L'] + df_teams['W'])

Now let's find the best team in each year that didn't make the playoffs and add it to a 'first_losers' DataFrame

In [289]:
df_teams.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro', 'RDiff', 'SODiff', 'BBDiff', 'HRDiff',
       'HDiff', 'WP'],
      dtype='object')

In [290]:
df_teams.head(3)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,RDiff,SODiff,BBDiff,HRDiff,HDiff,WP
0,1871,,BS1,BNA,,3,31,,20,10,,,N,,401,1372,426,70,37,3,60.0,19.0,73.0,16.0,,,303,109,3.55,22,1,3,828,367,2,42,23,243,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1,98,-4.0,18.0,1,59,0.666667
1,1871,,CH1,CNA,,2,28,,19,9,,,N,,302,1196,323,52,21,10,60.0,22.0,69.0,21.0,,,241,77,2.76,25,0,1,753,308,6,28,22,229,16,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1,61,0.0,32.0,4,15,0.678571
2,1871,,CL1,CFC,,8,29,,10,19,,,N,,249,1186,328,35,40,7,26.0,25.0,18.0,8.0,,,341,116,4.11,23,0,0,762,346,13,53,34,234,15,0.818,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1,-92,-9.0,-27.0,-6,-18,0.344828


In [291]:
series_post_sorted[series_post_sorted['yearID'] == 2021]

Unnamed: 0,yearID,round,teamIDwinner,lgIDwinner,teamIDloser,lgIDloser,wins,losses,ties
358,2021,ALWC,BOS,AL,NYA,AL,1,0,0
359,2021,ALDS1,HOU,AL,CHA,AL,3,1,0
360,2021,ALDS2,BOS,AL,TBA,AL,3,1,0
361,2021,ALCS,HOU,AL,BOS,AL,4,2,0
362,2021,NLWC,LAN,NL,SLN,NL,1,0,0
363,2021,NLDS1,ATL,NL,MIL,NL,3,1,0
364,2021,NLDS2,LAN,NL,SFN,NL,3,2,0
365,2021,NLCS,ATL,NL,LAN,NL,4,2,0
366,2021,WS,ATL,NL,HOU,AL,4,2,0


----

First, let's make a dict:
```python
year : ['playoff teams']
```

In [340]:
playoff_winners = {k: list(v) for k, v in series_post_sorted.groupby('yearID')['teamIDwinner']}
playoff_losers = {k: list(v) for k, v in series_post_sorted.groupby('yearID')['teamIDloser']}


In [342]:
playoff_winners.keys()

dict_keys([1884, 1885, 1886, 1887, 1888, 1889, 1890, 1892, 1903, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [344]:
playoff_losers[2012]

['TEX', 'NYA', 'BAL', 'OAK', 'ATL', 'SLN', 'WAS', 'CIN', 'DET']

In [343]:
playoff_winners[2012]

['BAL', 'DET', 'NYA', 'DET', 'SLN', 'SFN', 'SLN', 'SFN', 'SFN']

In [345]:
playoff_teams = playoff_losers[2012] + playoff_winners[2012]

In [346]:
def unique(playoff_teams):
    x = np.array(playoff_teams)
    print(np.unique(x))


In [348]:
unique(playoff_teams)  ## THIS WORKS

['ATL' 'BAL' 'CIN' 'DET' 'NYA' 'OAK' 'SFN' 'SLN' 'TEX' 'WAS']


----

In [319]:
history = {}
year_count = 1884
for year in playoff_winners:
    try:
        playoff_teams = playoff_winners[year_count] + playoff_losers[year_count]
        playoff_teams = unique(playoff_teams)
        season_playoffs = {
            'year': year_count,
            'playoff_teams': playoff_teams
        }
        history.update(
            season_playoffs
        )
    except:
        print(year_count)
    year_count += 1

['NY4']
['SL4']
['CHN']
['SL4']
['SL4']
['BR3']
['LS2']
1891
['CL4']
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
['PIT']
1904
['PHA']
['CHN']
['DET']
['DET']
['DET']
['CHN']
['NY1']
['NY1']
['NY1']
['PHA']
['PHI']
['BRO']
['NY1']
['CHN']
['CHA']
['BRO']
['NYA']
['NYA']
['NY1']
['NY1']
['WS1']
['NYA']
['PIT']
['SLN']
['CHN']
['SLN']
['PHA']
['CHN']
['WS1']
['DET']
['CHN']
['NY1']
['NY1']
['CHN']
['CIN']
['DET']
['BRO']
['NYA']
['SLN']
['SLA']
['CHN']
['BOS']
['BRO']
['BSN']
['BRO']
['PHI']
['NY1']
['BRO']
['BRO']
['CLE']
['NYA']
['BRO']
['NYA']
['ML1']
['CHA']
['NYA']
['CIN']
['SFN']
['NYA']
['NYA']
['MIN']
['LAN']
['BOS']
['SLN']
['ATL' 'BAL' 'MIN']
['CIN' 'MIN' 'PIT']
['BAL' 'OAK' 'SFN']
['CIN' 'DET' 'PIT']
['BAL' 'CIN' 'NYN']
['BAL' 'LAN' 'PIT']
['BOS' 'OAK' 'PIT']
['KCA' 'NYA' 'PHI']
['KCA' 'LAN' 'PHI']
['KCA' 'LAN' 'PHI']
['BAL' 'CAL' 'CIN']
['HOU' 'KCA' 'NYA']
['HOU' 'KCA' 'ML4' 'MON' 'NYA' 'OAK' 'PHI']
['ATL' 'CAL' 'ML4']
['CHA' 'LAN' 'PHI']
['CHN' 'KCA' 'SDN']
['LAN' 'SLN'

In [320]:
history.items()

dict_items([('year', 2008), ('playoff_teams', None)])

Let's use a teeny algo to render a list of every team that made the playoffs in a given year.

In [303]:
playoff_winners[0]

KeyError: 0

TypeError: 'NoneType' object is not subscriptable

In [227]:
playoff_winners[1884]

['PRO']

In [233]:
winners = []
winner_count = 1884
for winner_count in playoff_winners:
    winners.append(playoff_winners[winner_count])
    winner_count += 1

In [236]:
losers = []
loser_count = 1884
for loser_count in playoff_losers:
    losers.append(playoff_losers[loser_count])
    loser_count += 1

In [242]:
playoff_teams = winners + losers

In [243]:
playoff_teams

[['PRO'],
 ['CHN'],
 ['SL4'],
 ['DTN'],
 ['NY1'],
 ['NY1'],
 ['BRO'],
 ['BSN'],
 ['BOS'],
 ['NY1'],
 ['CHA'],
 ['CHN'],
 ['CHN'],
 ['PIT'],
 ['PHA'],
 ['PHA'],
 ['BOS'],
 ['PHA'],
 ['BSN'],
 ['BOS'],
 ['BOS'],
 ['CHA'],
 ['BOS'],
 ['CIN'],
 ['CLE'],
 ['NY1'],
 ['NY1'],
 ['NYA'],
 ['WS1'],
 ['PIT'],
 ['SLN'],
 ['NYA'],
 ['NYA'],
 ['PHA'],
 ['PHA'],
 ['SLN'],
 ['NYA'],
 ['NY1'],
 ['SLN'],
 ['DET'],
 ['NYA'],
 ['NYA'],
 ['NYA'],
 ['NYA'],
 ['CIN'],
 ['NYA'],
 ['SLN'],
 ['NYA'],
 ['SLN'],
 ['DET'],
 ['SLN'],
 ['NYA'],
 ['CLE'],
 ['NYA'],
 ['NYA'],
 ['NYA'],
 ['NYA'],
 ['NYA'],
 ['NY1'],
 ['BRO'],
 ['NYA'],
 ['ML1'],
 ['NYA'],
 ['LAN'],
 ['PIT'],
 ['NYA'],
 ['NYA'],
 ['LAN'],
 ['SLN'],
 ['LAN'],
 ['BAL'],
 ['SLN'],
 ['DET'],
 ['BAL', 'NYN', 'NYN'],
 ['BAL', 'CIN', 'BAL'],
 ['BAL', 'PIT', 'PIT'],
 ['OAK', 'CIN', 'OAK'],
 ['OAK', 'NYN', 'OAK'],
 ['OAK', 'LAN', 'OAK'],
 ['BOS', 'CIN', 'CIN'],
 ['NYA', 'CIN', 'CIN'],
 ['NYA', 'LAN', 'NYA'],
 ['NYA', 'LAN', 'NYA'],
 ['BAL', 'PIT', 'PIT'],
 ['KCA

In [148]:
series_test

['yearID',
 'round',
 'teamIDwinner',
 'lgIDwinner',
 'teamIDloser',
 'lgIDloser',
 'wins',
 'losses',
 'ties']

In [None]:
playoff_teams ={}
for year in series_post_sorted['yearID']:
    teams = []
    playoff_teams.append(
        'year': 'yearID'
        'teams': 
    )

Let's split our teams into temporal eras. There are four major eras in baseball history. Or rather, two gigantic ones — the latter of which has three distinct sub-areas. 

In [124]:
deadball_era = df_teams.mask(df_teams['yearID'] <= 1920)
liveball_era = df_teams.mask(df_teams['yearID'] > 1920)

In [125]:
liveball = deadball_era.dropna(axis=0, how='all')
liveball

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
719,1921.0,AL,BOS,BOS,,5.0,154.0,77.0,75.0,79.0,,,N,N,668.0,5206.0,1440.0,248.0,69.0,17.0,428.0,344.0,83.0,65.0,,,696.0,603.0,3.98,88.0,9.0,5.0,4092.0,1521.0,53.0,452.0,446.0,157.0,151.0,0.975,Boston Red Sox,Fenway Park I,279273.0,97.0,99.0,BOS,BOS,BOS
720,1921.0,NL,BRO,LAD,,5.0,152.0,78.0,77.0,75.0,,,N,N,667.0,5263.0,1476.0,209.0,85.0,59.0,325.0,400.0,91.0,73.0,,,681.0,560.0,3.70,82.0,8.0,12.0,4089.0,1556.0,46.0,361.0,471.0,232.0,142.0,0.964,Brooklyn Robins,Ebbets Field,613245.0,105.0,104.0,BRO,BRO,BRO
721,1921.0,NL,BSN,ATL,,4.0,153.0,74.0,79.0,74.0,,,N,N,721.0,5385.0,1561.0,209.0,100.0,61.0,377.0,470.0,94.0,100.0,,,697.0,600.0,3.90,74.0,11.0,12.0,4155.0,1488.0,54.0,420.0,382.0,199.0,122.0,0.969,Boston Braves,Braves Field,318627.0,94.0,96.0,BSN,BSN,BSN
722,1921.0,AL,CHA,CHW,,7.0,154.0,77.0,62.0,92.0,,,N,N,683.0,5329.0,1509.0,242.0,82.0,35.0,445.0,474.0,97.0,93.0,,,858.0,749.0,4.94,84.0,7.0,9.0,4095.0,1603.0,52.0,549.0,392.0,199.0,155.0,0.969,Chicago White Sox,Comiskey Park,543650.0,98.0,98.0,CHW,CHA,CHA
723,1921.0,NL,CHN,CHC,,7.0,153.0,76.0,64.0,89.0,,,N,N,668.0,5321.0,1553.0,234.0,56.0,37.0,343.0,374.0,70.0,97.0,,,773.0,665.0,4.39,73.0,7.0,7.0,4089.0,1605.0,67.0,409.0,441.0,166.0,129.0,0.974,Chicago Cubs,Wrigley Field,410107.0,100.0,101.0,CHC,CHN,CHN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,2021.0,NL,SLN,STL,C,2.0,162.0,81.0,90.0,72.0,N,Y,N,N,706.0,5351.0,1303.0,261.0,22.0,198.0,478.0,1341.0,89.0,22.0,86.0,44.0,672.0,626.0,3.98,3.0,15.0,50.0,4251.0,1234.0,152.0,608.0,1225.0,84.0,137.0,0.986,St. Louis Cardinals,Busch Stadium III,2102530.0,92.0,92.0,STL,SLN,SLN
2981,2021.0,AL,TBA,TBD,E,1.0,162.0,81.0,100.0,62.0,Y,N,N,N,857.0,5507.0,1336.0,288.0,36.0,222.0,585.0,1542.0,88.0,42.0,72.0,41.0,651.0,593.0,3.67,1.0,13.0,42.0,4367.0,1264.0,184.0,436.0,1478.0,80.0,130.0,0.986,Tampa Bay Rays,Tropicana Field,761072.0,92.0,91.0,TBR,TBA,TBA
2982,2021.0,AL,TEX,TEX,W,5.0,162.0,81.0,60.0,102.0,N,N,N,N,625.0,5405.0,1254.0,225.0,24.0,167.0,433.0,1381.0,106.0,29.0,58.0,31.0,815.0,758.0,4.79,0.0,3.0,31.0,4273.0,1402.0,232.0,513.0,1239.0,83.0,146.0,0.986,Texas Rangers,Globe Life Field,2110258.0,99.0,101.0,TEX,TEX,TEX
2983,2021.0,AL,TOR,TOR,E,4.0,162.0,80.0,91.0,71.0,N,N,N,N,846.0,5476.0,1455.0,285.0,13.0,262.0,496.0,1218.0,81.0,20.0,51.0,35.0,663.0,610.0,3.91,1.0,14.0,34.0,4216.0,1257.0,209.0,473.0,1468.0,90.0,122.0,0.984,Toronto Blue Jays,Sahlen Field,805901.0,102.0,101.0,TOR,TOR,TOR


In [126]:
deadball = liveball_era.dropna(axis=0, how='all')
deadball

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871.0,,BS1,BNA,,3.0,31.0,,20.0,10.0,,,N,,401.0,1372.0,426.0,70.0,37.0,3.0,60.0,19.0,73.0,16.0,,,303.0,109.0,3.55,22.0,1.0,3.0,828.0,367.0,2.0,42.0,23.0,243.0,24.0,0.834,Boston Red Stockings,South End Grounds I,,103.0,98.0,BOS,BS1,BS1
1,1871.0,,CH1,CNA,,2.0,28.0,,19.0,9.0,,,N,,302.0,1196.0,323.0,52.0,21.0,10.0,60.0,22.0,69.0,21.0,,,241.0,77.0,2.76,25.0,0.0,1.0,753.0,308.0,6.0,28.0,22.0,229.0,16.0,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104.0,102.0,CHI,CH1,CH1
2,1871.0,,CL1,CFC,,8.0,29.0,,10.0,19.0,,,N,,249.0,1186.0,328.0,35.0,40.0,7.0,26.0,25.0,18.0,8.0,,,341.0,116.0,4.11,23.0,0.0,0.0,762.0,346.0,13.0,53.0,34.0,234.0,15.0,0.818,Cleveland Forest Citys,National Association Grounds,,96.0,100.0,CLE,CL1,CL1
3,1871.0,,FW1,KEK,,7.0,19.0,,7.0,12.0,,,N,,137.0,746.0,178.0,19.0,8.0,2.0,33.0,9.0,16.0,4.0,,,243.0,97.0,5.17,19.0,1.0,0.0,507.0,261.0,5.0,21.0,17.0,163.0,8.0,0.803,Fort Wayne Kekiongas,Hamilton Field,,101.0,107.0,KEK,FW1,FW1
4,1871.0,,NY2,NNA,,5.0,33.0,,16.0,17.0,,,N,,302.0,1404.0,403.0,43.0,21.0,1.0,33.0,15.0,46.0,15.0,,,313.0,121.0,3.72,32.0,1.0,0.0,879.0,373.0,7.0,42.0,22.0,235.0,14.0,0.840,New York Mutuals,Union Grounds (Brooklyn),,90.0,88.0,NYU,NY2,NY2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,1920.0,NL,PHI,PHI,,8.0,153.0,77.0,62.0,91.0,,,N,N,565.0,5264.0,1385.0,229.0,54.0,64.0,283.0,531.0,100.0,83.0,,,714.0,557.0,3.63,77.0,8.0,11.0,4140.0,1480.0,35.0,444.0,419.0,232.0,135.0,0.964,Philadelphia Phillies,Baker Bowl,330998.0,104.0,109.0,PHI,PHI,PHI
715,1920.0,NL,PIT,PIT,,4.0,155.0,78.0,79.0,75.0,,,N,N,530.0,5219.0,1342.0,162.0,90.0,16.0,374.0,405.0,181.0,117.0,,,552.0,454.0,2.89,92.0,17.0,10.0,4245.0,1389.0,25.0,280.0,444.0,186.0,119.0,0.971,Pittsburgh Pirates,Forbes Field,429037.0,103.0,103.0,PIT,PIT,PIT
716,1920.0,AL,SLA,BAL,,4.0,154.0,78.0,76.0,77.0,,,N,N,797.0,5358.0,1651.0,279.0,83.0,50.0,427.0,339.0,118.0,79.0,,,766.0,617.0,4.03,84.0,9.0,14.0,4134.0,1481.0,53.0,578.0,444.0,231.0,119.0,0.963,St. Louis Browns,Sportsman's Park IV,419311.0,103.0,103.0,SLB,SLA,SLA
717,1920.0,NL,SLN,STL,,5.0,155.0,76.0,75.0,79.0,,,N,N,675.0,5495.0,1589.0,238.0,96.0,32.0,373.0,484.0,126.0,114.0,,,682.0,543.0,3.43,72.0,9.0,12.0,4278.0,1488.0,30.0,479.0,529.0,256.0,136.0,0.961,St. Louis Cardinals,Robison Field/Sportsman's Park IV,326836.0,98.0,98.0,STL,SLN,SLN


Now let's split the live-ball era into groups of its three main playoff structures: No divisions, Divisions, and Divisions with a Wildcard round.

In [134]:
league_era = liveball[(liveball['LgWin'].notna()) & (liveball['DivWin'].isna())]
division_era = liveball[(liveball['LgWin'].notna()) & (liveball['DivWin'].notna()) & (liveball['WCWin'].isna())]
wildcard_era = liveball[(liveball['LgWin'].notna()) & (liveball['DivWin'].notna()) & (liveball['WCWin'].notna())]


----

Now let's make a dataframe comprising each season's best non-playoff team.

In [138]:
wildcard_era_first_losers = wildcard_era.merge(series_post_sorted, left_on='teamIDBR', right_on='teamIDloser')
wildcard_era_first_losers = wildcard_era_first_losers.merge(series_post_sorted, left_on='teamIDBR', right_on='teamIDwinner')

In [139]:
wildcard_era_first_losers

Unnamed: 0,yearID_x,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,yearID_y,round_x,teamIDwinner_x,lgIDwinner_x,teamIDloser_x,lgIDloser_x,wins_x,losses_x,ties_x,yearID,round_y,teamIDwinner_y,lgIDwinner_y,teamIDloser_y,lgIDloser_y,wins_y,losses_y,ties_y
0,1995.0,NL,ATL,ATL,E,1.0,144.0,72.0,90.0,54.0,Y,N,Y,Y,645.0,4814.0,1202.0,210.0,27.0,168.0,520.0,933.0,73.0,43.0,40.0,34.0,540.0,494.0,3.44,18.0,11.0,34.0,3875.0,1184.0,107.0,436.0,1087.0,100.0,113.0,0.982,Atlanta Braves,Atlanta-Fulton County Stadium,2561831.0,103.0,102.0,ATL,ATL,ATL,1969,NLCS,NYN,NL,ATL,NL,3,0,0,1991,NLCS,ATL,NL,PIT,NL,4,3,0
1,1995.0,NL,ATL,ATL,E,1.0,144.0,72.0,90.0,54.0,Y,N,Y,Y,645.0,4814.0,1202.0,210.0,27.0,168.0,520.0,933.0,73.0,43.0,40.0,34.0,540.0,494.0,3.44,18.0,11.0,34.0,3875.0,1184.0,107.0,436.0,1087.0,100.0,113.0,0.982,Atlanta Braves,Atlanta-Fulton County Stadium,2561831.0,103.0,102.0,ATL,ATL,ATL,1969,NLCS,NYN,NL,ATL,NL,3,0,0,1992,NLCS,ATL,NL,PIT,NL,4,3,0
2,1995.0,NL,ATL,ATL,E,1.0,144.0,72.0,90.0,54.0,Y,N,Y,Y,645.0,4814.0,1202.0,210.0,27.0,168.0,520.0,933.0,73.0,43.0,40.0,34.0,540.0,494.0,3.44,18.0,11.0,34.0,3875.0,1184.0,107.0,436.0,1087.0,100.0,113.0,0.982,Atlanta Braves,Atlanta-Fulton County Stadium,2561831.0,103.0,102.0,ATL,ATL,ATL,1969,NLCS,NYN,NL,ATL,NL,3,0,0,1995,NLCS,ATL,NL,CIN,NL,4,0,0
3,1995.0,NL,ATL,ATL,E,1.0,144.0,72.0,90.0,54.0,Y,N,Y,Y,645.0,4814.0,1202.0,210.0,27.0,168.0,520.0,933.0,73.0,43.0,40.0,34.0,540.0,494.0,3.44,18.0,11.0,34.0,3875.0,1184.0,107.0,436.0,1087.0,100.0,113.0,0.982,Atlanta Braves,Atlanta-Fulton County Stadium,2561831.0,103.0,102.0,ATL,ATL,ATL,1969,NLCS,NYN,NL,ATL,NL,3,0,0,1995,NLDS1,ATL,NL,COL,NL,3,1,0
4,1995.0,NL,ATL,ATL,E,1.0,144.0,72.0,90.0,54.0,Y,N,Y,Y,645.0,4814.0,1202.0,210.0,27.0,168.0,520.0,933.0,73.0,43.0,40.0,34.0,540.0,494.0,3.44,18.0,11.0,34.0,3875.0,1184.0,107.0,436.0,1087.0,100.0,113.0,0.982,Atlanta Braves,Atlanta-Fulton County Stadium,2561831.0,103.0,102.0,ATL,ATL,ATL,1969,NLCS,NYN,NL,ATL,NL,3,0,0,1995,WS,ATL,NL,CLE,AL,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56159,2017.0,NL,MIA,FLA,E,2.0,162.0,78.0,77.0,85.0,N,N,N,N,778.0,5602.0,1497.0,271.0,31.0,194.0,486.0,1282.0,91.0,30.0,67.0,41.0,822.0,772.0,4.82,1.0,7.0,34.0,4328.0,1450.0,193.0,627.0,1202.0,73.0,156.0,0.988,Miami Marlins,Marlins Park,1583014.0,93.0,93.0,MIA,FLO,MIA,2020,NLDS2,ATL,NL,MIA,NL,3,0,0,2020,NLWC3,MIA,NL,CHN,NL,2,0,0
56160,2018.0,NL,MIA,FLA,E,5.0,161.0,81.0,63.0,98.0,N,N,N,N,589.0,5488.0,1303.0,222.0,24.0,128.0,455.0,1384.0,45.0,31.0,73.0,31.0,809.0,762.0,4.76,1.0,12.0,30.0,4326.0,1388.0,192.0,605.0,1249.0,83.0,133.0,0.986,Miami Marlins,Marlins Park,811104.0,89.0,90.0,MIA,FLO,MIA,2020,NLDS2,ATL,NL,MIA,NL,3,0,0,2020,NLWC3,MIA,NL,CHN,NL,2,0,0
56161,2019.0,NL,MIA,FLA,E,5.0,162.0,81.0,57.0,105.0,N,N,N,N,615.0,5512.0,1326.0,265.0,18.0,146.0,395.0,1469.0,55.0,30.0,73.0,33.0,808.0,760.0,4.74,2.0,8.0,27.0,4333.0,1340.0,236.0,615.0,1378.0,94.0,135.0,0.984,Miami Marlins,Marlins Park,811302.0,94.0,96.0,MIA,FLO,MIA,2020,NLDS2,ATL,NL,MIA,NL,3,0,0,2020,NLWC3,MIA,NL,CHN,NL,2,0,0
56162,2020.0,NL,MIA,FLA,E,2.0,60.0,26.0,31.0,29.0,N,Y,N,N,263.0,1935.0,472.0,82.0,5.0,60.0,191.0,537.0,51.0,14.0,25.0,9.0,304.0,272.0,4.86,1.0,0.0,18.0,1512.0,506.0,82.0,226.0,451.0,39.0,60.0,0.981,Miami Marlins,Marlins Park,0.0,97.0,99.0,MIA,FLO,MIA,2020,NLDS2,ATL,NL,MIA,NL,3,0,0,2020,NLWC3,MIA,NL,CHN,NL,2,0,0


In [406]:
first_losers

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,RDiff,WP
2302,1999,NL,CIN,CIN,C,2,163,82.0,96,67,N,N,N,N,865,5649,1536,312,37,209,569.0,1125.0,164.0,54.0,45.0,44.0,711,647,3.98,6,11,55,4386,1309,190,636,1081,105,139,0.983,Cincinnati Reds,Cinergy Field,2061222.0,103,103,CIN,CIN,CIN,154,0.588957
2409,2002,AL,SEA,SEA,W,3,162,81.0,93,69,N,N,N,N,814,5569,1531,285,31,152,629.0,1003.0,137.0,58.0,51.0,72.0,699,654,4.07,8,12,43,4336,1422,178,441,1063,88,134,0.985,Seattle Mariners,Safeco Field,3542938.0,97,95,SEA,SEA,SEA,115,0.574074
2389,2002,AL,BOS,BOS,E,2,162,81.0,93,69,N,N,N,N,859,5640,1560,348,33,177,545.0,944.0,80.0,28.0,72.0,53.0,665,603,3.75,5,17,51,4338,1339,146,430,1157,104,140,0.983,Boston Red Sox,Fenway Park II,2650862.0,103,102,BOS,BOS,BOS,194,0.574074
2902,2019,AL,CLE,CLE,C,2,162,81.0,93,69,N,N,N,N,769,5425,1354,286,18,223,563.0,1332.0,103.0,35.0,50.0,46.0,657,601,3.76,6,16,42,4313,1308,207,450,1508,83,110,0.985,Cleveland Indians,Progressive Field,1738642.0,104,102,CLE,CLE,CLE,112,0.574074
2482,2005,AL,CLE,CLE,C,2,162,81.0,93,69,N,N,N,N,790,5609,1522,337,30,207,503.0,1093.0,62.0,36.0,54.0,50.0,642,582,3.61,6,10,51,4358,1363,157,413,1050,106,156,0.983,Cleveland Indians,Jacobs Field,2013763.0,96,96,CLE,CLE,CLE,148,0.574074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2725,2013,AL,HOU,HOU,W,5,162,81.0,51,111,N,N,N,N,610,5457,1307,266,16,148,426.0,1535.0,110.0,61.0,52.0,38.0,848,766,4.79,2,5,32,4320,1530,191,616,1084,125,168,0.979,Houston Astros,Minute Maid Park,1651883.0,99,101,HOU,HOU,HOU,-238,0.314815
2446,2004,NL,ARI,ARI,W,5,162,81.0,51,111,N,N,N,N,615,5544,1401,295,38,135,441.0,1022.0,53.0,32.0,35.0,37.0,899,794,4.98,5,6,33,4308,1480,197,668,1153,139,144,0.977,Arizona Diamondbacks,Bank One Ballpark,2519560.0,105,107,ARI,ARI,ARI,-284,0.314815
2904,2019,AL,DET,DET,C,5,161,81.0,47,114,N,N,N,N,582,5549,1333,292,41,149,391.0,1595.0,57.0,20.0,48.0,42.0,915,835,5.24,0,3,31,4299,1555,250,536,1368,110,127,0.981,Detroit Tigers,Comerica Park,1501430.0,102,104,DET,DET,DET,-333,0.291925
2867,2018,AL,BAL,BAL,E,5,162,81.0,47,115,N,N,N,N,622,5507,1317,242,15,188,422.0,1412.0,81.0,22.0,57.0,35.0,892,824,5.18,2,7,28,4293,1552,234,589,1203,104,159,0.982,Baltimore Orioles,Oriole Park at Camden Yards,1564192.0,96,97,BAL,BAL,BAL,-270,0.290123


In [395]:
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]
df_teams[(df_teams['yearID'] == 1989) & (df_teams['DivWin'] == 'Y')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,DivWin,WCWin,LgWin,WSWin,R,AB,H,2B,3B,HR,BB,SO,SB,CS,HBP,SF,RA,ER,ERA,CG,SHO,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,RDiff
2026,1989,NL,CHN,CHC,E,1,162,81.0,93,69,Y,,N,N,702,5513,1438,235,45,124,472.0,921.0,136.0,57.0,26.0,50.0,623,556,3.43,18,10,55,4381,1369,106,532,918,124,130,0.98,Chicago Cubs,Wrigley Field,2491942.0,108,108,CHC,CHN,CHN,79
2038,1989,AL,OAK,OAK,W,1,162,81.0,99,63,Y,,Y,Y,712,5416,1414,220,25,127,562.0,855.0,157.0,55.0,34.0,62.0,576,497,3.09,17,20,57,4345,1287,103,510,930,129,159,0.979,Oakland Athletics,Oakland Coliseum,2667225.0,97,95,OAK,OAK,OAK,136
2043,1989,NL,SFN,SFG,W,1,162,81.0,92,70,Y,,Y,N,699,5469,1365,241,52,141,508.0,1071.0,87.0,54.0,40.0,39.0,600,535,3.3,12,16,47,4371,1320,120,471,802,114,135,0.982,San Francisco Giants,Candlestick Park,2059701.0,97,96,SFG,SFN,SFN,99
2046,1989,AL,TOR,TOR,E,1,162,81.0,89,73,Y,,N,N,731,5581,1449,265,40,142,521.0,923.0,144.0,58.0,31.0,53.0,651,584,3.58,12,12,38,4401,1408,99,478,849,127,164,0.98,Toronto Blue Jays,Exhibition Stadium /Skydome,3375883.0,94,94,TOR,TOR,TOR,80


KeyError: "['NL', 'SDN', 'SDP', 6, 162, 81.0, 65, 97, 'N', nan, 668, 5456, 1419, 209, 48, 113, 577.0, 992.0, 198.0, 91.0, 27.0, 36.0, 763, 680, 4.27, 14, 10, 33, 4300, 1402, 175, 602, 897, 147, 135, 0.976, 'San Diego Padres', 'Jack Murphy Stadium', 1454061.0, 96] not in index"

In [376]:
df_teams = df_teams.set_index(['yearID', 'lgID', 'divID'])

KeyError: 'PIT'

### Meet the Wonkaville Huskies

In [132]:
df_huskiesBatters = df_batters.loc[df_batters.BMI >= 34.55]

In [133]:
df_huskiesBatters.sort_values('BMI').describe()

Unnamed: 0,birthYear,birthMonth,birthDay,deathYear,deathMonth,deathDay,weight,height,yearID,stint,...,BB,SO,IBB,HBP,SH,SF,GIDP,KG,meters,BMI
count,26.0,25.0,25.0,5.0,5.0,5.0,26.0,26.0,26.0,26.0,...,26.0,22.0,0.0,24.0,17.0,0.0,1.0,26.0,26.0,26.0
mean,1969.153846,5.8,17.52,1952.4,7.6,8.6,269.961538,72.115385,1901.730769,1.115385,...,9.884615,14.909091,,0.875,6.0,,0.0,122.452394,1.831731,36.408324
std,40.878055,3.316625,8.529947,49.45503,3.646917,7.602631,30.1801,4.348121,14.17761,0.325813,...,16.310308,13.606403,,1.650099,8.951257,,,13.689452,0.110442,1.728873
min,1853.0,1.0,1.0,1891.0,2.0,2.0,155.0,55.0,1872.0,1.0,...,0.0,0.0,,0.0,0.0,,0.0,70.30676,1.397,34.622243
25%,1977.0,4.0,11.0,1915.0,6.0,2.0,261.25,71.0,1890.25,1.0,...,1.0,3.25,,0.0,0.0,,0.0,118.50091,1.8034,34.891189
50%,1983.5,6.0,18.0,1966.0,9.0,6.0,270.0,72.0,1903.5,1.0,...,3.5,11.0,,0.0,2.0,,0.0,122.46984,1.8288,35.875979
75%,1989.0,8.0,24.0,1975.0,10.0,14.0,283.75,75.0,1912.0,1.0,...,9.25,25.25,,1.0,7.0,,0.0,128.70673,1.905,37.928366
max,1998.0,12.0,30.0,2015.0,11.0,19.0,320.0,78.0,1924.0,2.0,...,68.0,43.0,,6.0,33.0,,0.0,145.14944,1.9812,40.292666


In [134]:
df_huskiesPitchers = df[df_pitchers.BMI > 34.55]

NameError: name 'df' is not defined

In [None]:
df_huskiesPitchers.head()

In [None]:
df_huskies = pd.merge(df_huskiesBatters, df_huskiesPitchers, how='right', on='playerID')

In [None]:
df_huskies.columns

In [None]:
sns.jointplot(data=df_simple, x="height", y="weight", kind = "reg", truncate = False)

In [None]:
sns.choose_diverging_palette()

In [None]:

# Compute the correlation matrix
corr = df_huskiesBatters.corr(method="spearman")

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(290, 10, n=40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.25,
    cbar_kws={"shrink": .5},
)



In [None]:
corr_mat = df.corr().stack().reset_index(name="correlation")