In [122]:
## Let's try to figure out who the shortest, heaviest baseball players are.

## Here's our overall plan of attack

# Setting Things Up ✅
## Import CSVs ✅
### Separate CSVs --> DataFrames for People, Pitching Data, Batting Data ✅
## Squish everything into one mondo DF ✅
## Add Physical Data ✅
### Height ✅
### Weight ✅
## Calculate BMI ✅
### Convert Imperial to Metric ✅
### BMI-ify ✅
### Throw BMI back into df ✅
## Assemble per-position lists sorted by BMI, then mWAR

# Knocking Things Down
## Find worst team that made playoffs in 2021
### Describe team fWAR/bWAR
### Describe individual fWAR/bWAR
## Pull from BMI lists per position until high BMI roster is full
### mWAR shall be higher on a team basis.
### mWAR shall be higher per position.
# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

# If there's enough time:
## Repeat for:
### Tallness
### Shortness
### Heavy
### Light

## Setting Things Up

### Import the necessaries

In [123]:
import numpy as np
import pandas as pd
import pybaseball
import seaborn as sns
import matplotlib.pyplot as plot
from deepdiff import DeepDiff
from pybaseball import bwar_pitch
from pybaseball import bwar_bat
from pybaseball import cache
from pybaseball.lahman import *
from pybaseball import chadwick_register

pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 100)


In [124]:
# Constants
# Imperial -> metric multipliers: pounds * 0.453592 = kilograms, inches * 0.0254 = meters.
LB_TO_KG = 0.453592
IN_TO_M = 0.0254

# Legacy names, kept because other cells reference them. Despite what the names
# suggest, they hold the same imperial -> metric factors as the constants above.
KG_TO_LB = LB_TO_KG
M_TO_IN = IN_TO_M

In [125]:
# Load the player register returned by pybaseball's chadwick_register (called with
# save=True) and hold it as a DataFrame.
chadwick = pd.DataFrame(chadwick_register(save=True))

In [126]:
# Fetch the Lahman database through pybaseball.
# NOTE(review): presumably this must run before the Lahman table loaders in the
# next cell can read anything — confirm against the pybaseball docs.
download_lahman()

### Read infinity baseball data to DataFrames

In [127]:
# Load every source table into its own DataFrame.
#
# Each loader is called through its module (`lahman.people()`, `pybaseball.bwar_bat()`)
# rather than through the bare name. The results are stored under the same names as
# the loaders, so calling the bare name works only once: on a re-run `people` is
# already a DataFrame and `people()` raises "'DataFrame' object is not callable".
lahman = pybaseball.lahman

# a table of all player biographical info and ids
people = pd.DataFrame(lahman.people())

# park id, name, alias, city, state, and country
parks = pd.DataFrame(lahman.parks())

# all star roster data: player, year, team, league, position
allstar = pd.DataFrame(lahman.all_star_full())

# each player's games played per position for each season
appearances = pd.DataFrame(lahman.appearances())

# batting stats by year, regular season
batting = pd.DataFrame(lahman.batting())

# batting stats by year, post season
batting_post = pd.DataFrame(lahman.batting_post())

# fielding stats by year
fielding = pd.DataFrame(lahman.fielding())

# games played in left, center, right field
fielding_of = pd.DataFrame(lahman.fielding_of())

# LF/CF/RF splits
fielding_of_split = pd.DataFrame(lahman.fielding_of_split())

# postseason fielding
fielding_post = pd.DataFrame(lahman.fielding_post())

# home game attendance by park by year
home_games = pd.DataFrame(lahman.home_games())

# historical player pitching stats
pitching = pd.DataFrame(lahman.pitching())

# postseason pitching stats
pitching_post = pd.DataFrame(lahman.pitching_post())

# playoff series winners and losers
series_post = pd.DataFrame(lahman.series_post())

# data on teams by year: record, division, stadium, attendance, etc
teams = pd.DataFrame(lahman.teams())

# current and historical franchises, whether they're still active, and their ids
teams_franchises = pd.DataFrame(lahman.teams_franchises())

# split season data for teams
teams_half = pd.DataFrame(lahman.teams_half())

# batting since 2008
# NOTE(review): the variable says "fangraphs", but pybaseball's *_stats_range helpers
# appear to pull from Baseball Reference rather than FanGraphs — confirm the source.
fangraphs_batting = pd.DataFrame(pybaseball.batting_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# pitching since 2008 (same caveat about the data source as the batting table)
fangraphs_pitching = pd.DataFrame(pybaseball.pitching_stats_range(
    start_dt="2008-01-01", end_dt="2021-12-31"))

# fangraphs team pitching since 2008
fangraphs_team_pitching = pd.DataFrame(
    pybaseball.team_pitching(start_season="2008", end_season="2021"))

# fangraphs team batting since 2008
fangraphs_team_batting = pd.DataFrame(
    pybaseball.team_batting(start_season="2008", end_season="2021"))

# fangraphs team fielding since 2008
fangraphs_team_fielding = pd.DataFrame(
    pybaseball.team_fielding(start_season="2008", end_season="2021"))

# bref pitching WAR
bwar_pitch = pd.DataFrame(pybaseball.bwar_pitch(return_all=True))

# bref batting WAR
bwar_bat = pd.DataFrame(pybaseball.bwar_bat(return_all=True))


  table = table.drop('', 1)


----

### Add BMI Calcs

In [128]:
# BMI Calculations
# Scale weight and height by the notebook's conversion constants, then
# BMI = KG / meters^2. `ratio` is meters * BMI.
people['KG'] = people['weight'].mul(KG_TO_LB)
people['meters'] = people['height'].mul(M_TO_IN)
people['BMI'] = people['KG'].div(people['meters'].pow(2))
people['ratio'] = people['meters'].mul(people['BMI'])


----

# Historical Team Histories

Let's get our team data together so we can figure out:
1. The worst team each year
2. The best team each year
3. The best team that didn't make the playoffs each year

### Team Prep

In [None]:
teams.sample(10)

Let's work off of copies and leave our source DataFrames in one place so that we have a home to which we may tearfully return.

In [None]:
# Take a real copy so later edits to series_post_sorted cannot alter series_post.
# (Plain assignment would only bind a second name to the same DataFrame.)
series_post_sorted = series_post.copy()

In [None]:
series_post_sorted.columns

In [None]:
# NOTE(review): this binds a second name to the same object as `teams`; it is not a
# copy, so columns added to `df_teams` in later cells are added to `teams` as well.
df_teams = teams

#### Add Differentials, Which Should've Been Here in the First Place

Let's add some differential metrics that might be useful. In Pythonglish, this is what we're doing:
```python
for Runs, Strikeouts, Walks, Home Runs, Hits in team_stats:
    df_teams['StatDiff'] = df_teams['Team_Stat'] - df_teams['Opponent_stat']
```

In [None]:
# Differential = the team's own total minus the matching "A"-suffixed column
# (R - RA, SO - SOA, BB - BBA, HR - HRA, H - HA).
for stat in ('R', 'SO', 'BB', 'HR', 'H'):
    df_teams[f'{stat}Diff'] = df_teams[stat] - df_teams[f'{stat}A']

#### Add Winning Percentage

Let's also give ourselves a winning percentage column, because baseball seasons haven't always been the same length.

In [None]:
# Winning percentage: wins divided by decisions (wins + losses).
decisions = df_teams['W'] + df_teams['L']
df_teams['WP'] = df_teams['W'] / decisions

## Locating the First Losers

Now let's find the best team in each year that didn't make the playoffs and add it to a 'first_losers' DataFrame

First, let's make two dicts — one for playoff winners and one for playoff losers — which serves to identify any team that made the playoffs. After some cleaning, the inverse of our by-year dict will comprise the teams that didn't make the playoffs.

We want to end up with something that looks like this:

```python
history = {
    1871: ['playoff teams'],
    1872: ['playoff teams'],
    [...]
    2021: ['playoff teams']
}
```

In [None]:
def _ids_by_year(frame, column):
    """Map each yearID in `frame` to the list of `column` values recorded for that year."""
    grouped = {}
    for year_id, values in frame.groupby('yearID')[column]:
        grouped[year_id] = list(values)
    return grouped


all_teams = _ids_by_year(teams, 'teamID')
playoff_winners = _ids_by_year(series_post_sorted, 'teamIDwinner')
playoff_losers = _ids_by_year(series_post_sorted, 'teamIDloser')


Let's make a little algo to jury-rig ourselves a little .unique() function

In [None]:
def unique(playoff_teams):
    """Return the distinct entries of `playoff_teams` as a sorted NumPy array.

    Parameters
    ----------
    playoff_teams : sequence
        Values to de-duplicate (here: team ids, possibly repeated).

    Returns
    -------
    numpy.ndarray
        Each distinct value once, in sorted order.
    """
    return np.unique(np.asarray(playoff_teams))


### It's Loopin' Time
Now let's make a loop that zooms over our entire table to generate each year's list of playoff teams

In [None]:
# Build {year: [every team that appears in a postseason series that year]}.
history = {}
for year in range(1871, 2022):
    print('---')
    try:
        season_entrants = playoff_winners[year] + playoff_losers[year]
    except KeyError:
        # Only a missing year means "no postseason"; any other error should surface
        # instead of being swallowed.
        print(year)
        print("There weren't any playoffs this year.")
        continue
    # Use a separate name for the one-entry dict so the loop variable stays an int.
    season_entry = {year: list(unique(season_entrants))}
    print(season_entry)
    history.update(season_entry)

## Next we're going to add a bool column to our monster team dataframe where TRUE = made playoffs and FALSE = missed playoffs.

In [None]:
# Re-binds `df_teams` to `teams` (no copy is made).
# NOTE(review): if the earlier `df_teams = teams` cell and the column-adding cells ran
# first, both names already point at the same object and this changes nothing — confirm.
df_teams = teams

In [None]:
df_teams.groupby(['yearID']).teamID.unique()

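I want to get the teamID from each row in df_teams and check whether it's in that year's list of playoff teams (`history[yearID]`). Checking against df_teams.groupby(['yearID']).teamID.unique() isn't enough, because that lists every team that played that year, so the check would always pass.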

In [None]:
# NOTE(review): abandoned attempt. `team` is never used, and `season` is the whole
# yearID column rather than a single year, so `df_teams["yearID"] == season` compares
# the column with itself. The `history` dict of playoff teams is not consulted.
for team in df_teams.groupby(['yearID']):
    season = df_teams.yearID
    df_teams['playoff_teams'] = dict(df_teams[df_teams["yearID"] == season].teamID)


In [None]:
# NOTE(review): abandoned attempt. As in the previous cell, `season` is the full yearID
# column, so the filter compares the column with itself, and `history` is not used.
# The trailing True is insert()'s fourth positional argument (allow_duplicates in the
# pandas signature — confirm), so every iteration would add another "playoff_teams" column.
for team in df_teams.groupby(['yearID']):
    season = df_teams.yearID
    df_teams.insert(5, "playoff_teams", list(
        df_teams[df_teams["yearID"] == season].teamID), True)


In [None]:
# NOTE(review): abandoned attempt.
# - `team_season` is the whole yearID column, not the year of row `team`.
# - `df_teams['playoffs'] == 1` / `== 0` are comparisons whose result is discarded;
#   nothing is assigned to a 'playoffs' column.
# - The membership test looks at df_teams itself, not at the `history` playoff lists.
# - The bare `except:` reports every failure only as "Exception".
for team in range(len(df_teams)):
    team_season = df_teams.yearID
    try:
        if df_teams.loc[team]['teamID'] in list(df_teams[df_teams["yearID"] == team_season].teamID):
            df_teams['playoffs'] == 1
            print('- - -')
            print(f'{df_teams.iloc[team].yearID} : {df_teams.iloc[team].teamID}')
            print('Pass')
        else:
            df_teams['playoffs'] == 0
            print(f'{df_teams.iloc[team].yearID} : {df_teams.iloc[team].teamID}')
            print("- - -")
            print("Pass")
    except:
        print("- - -")
        print(f'{df_teams.iloc[team].yearID} : {df_teams.iloc[team].teamID}')
        print("Exception")


In [None]:
df_teams.sample(1)

In [None]:
history.items()

In [None]:
conditions = [
    df_teams.loc[0].teamID
    in list(df_teams[df_teams["yearID"] == 1871].teamID)  # ! DING DING DING
]
values = [True]

df_teams['made_playoffs']

In [None]:
# NOTE(review): `df_teams['teamID'] in list(...)` tests a whole Series with `in`, which
# presumably raises pandas' "truth value of a Series is ambiguous" error — confirm.
# The trailing `== True` is a comparison, not an assignment, so nothing would be stored.
df_teams.loc[df_teams['teamID'] in list(df_teams[df_teams['yearID'] == 1871].teamID), 'made_playoffs'] == True

In [None]:
df_teams.loc[0].teamID in list(df_teams[df_teams['yearID'] == 1871].teamID) # ! DING DING DING

In [None]:
# Spot check: is the team in row 2980 among the 2021 postseason teams?
# Written as a bare expression so the cell displays the result; the previous
# `if ...:` with no body was a SyntaxError.
df_teams.loc[2980].teamID in list(history[2021])  # ! DING DING DING

In [None]:
df_teams

In [None]:
# NOTE(review): abandoned attempt.
# - `season` advances once per row, not once per year, so it stops matching the
#   row's own yearID.
# - The membership test looks at df_teams itself, not at the `history` playoff lists.
# - `df_teams['playoffs'] == 1` / `== 0` compare and discard; nothing is assigned.
# - The f-string prints yearID twice.
# - The bare `except:` hides the cause of any failure.
season = 1871
for team in range(len(df_teams)):
    try:
        if df_teams.iloc[team].teamID in list(df_teams[df_teams['yearID'] == season].teamID):
            print('- - -')
            print(f'{df_teams.iloc[team].yearID} + {df_teams.iloc[team].yearID}')
            print('Pass')
            df_teams['playoffs'] == 1
        else:
            df_teams['playoffs'] == 0
            print("- - -")
            print("Pass")
    except:
        print("- - -")
        print("Exception")
    season += 1


In [None]:
# NOTE(review): abandoned attempt.
# - `df_teams[season]` looks up a column named by the year, not the rows for that year.
# - `season += 1` sits outside the loop, so `season` stays 1871 on every iteration.
# - `df_teams['playoffs'] == True` / `== False` compare and discard; nothing is assigned.
# - The bare `except:` hides the cause of any failure.
season = 1871
for team in range(1871, 2022):
    try:
        if df_teams[df_teams[season]].teamID in df_teams.groupby(['yearID']).teamID.unique():
            df_teams['playoffs'] == True
        else:
            df_teams['playoffs'] == False
            print("- - -")
            print(season)
            print("Pass")
    except:
        print("- - -")
        print(season)
        print("Exception")
season += 1


In [None]:
# Add a boolean 'playoffs' column: True when the team appears in that season's
# postseason series, False otherwise.
#
# `history` maps yearID -> teams that played in any postseason series. Years with no
# postseason are missing from it, so `.get(year_id, ())` makes those rows False.
# This replaces a loop that iterated over column names, tested a whole Series with
# `in`, and used `==` (comparison) where assignment was intended.
df_teams['playoffs'] = [
    team_id in history.get(year_id, ())
    for year_id, team_id in zip(df_teams['yearID'], df_teams['teamID'])
]

In [None]:
df_teams['playoffs']

----

### Erafying Things. For Posterity.

Let's split our teams into temporal eras. There are four major eras in baseball history. Or rather, two gigantic ones — the latter of which has three distinct sub-eras. 

In [None]:
# NOTE(review): the names are inverted relative to the contents. `mask` blanks the rows
# where the condition is True, so `deadball_era` has the <= 1920 rows blanked (leaving
# the later seasons) and `liveball_era` has the > 1920 rows blanked. The next two
# cells swap the names back when they drop the blanked rows.
deadball_era = df_teams.mask(df_teams['yearID'] <= 1920)
liveball_era = df_teams.mask(df_teams['yearID'] > 1920)

In [None]:
# Drop the fully blanked rows; what remains of `deadball_era` is the post-1920 seasons.
liveball = deadball_era.dropna(how='all')
liveball

In [None]:
# Drop the fully blanked rows; what remains of `liveball_era` is the seasons through 1920.
deadball = liveball_era.dropna(how='all')
deadball

Now let's split the live-ball era into groups of its three main playoff structures: No divisions, Divisions, and Divisions with a Wildcard round.

In [None]:
# Classify live-ball seasons by which postseason columns are populated.
has_league = liveball['LgWin'].notna()
has_division = liveball['DivWin'].notna()
has_wildcard = liveball['WCWin'].notna()

league_era = liveball[has_league & ~has_division]
division_era = liveball[has_league & has_division & ~has_wildcard]
wildcard_era = liveball[has_league & has_division & has_wildcard]


----

### Meet the First Losers. And Their Friends.

In [None]:
league_era['yearID']

In [None]:
history[2012]  #EVERYTHING WORKS TO HERE

In [None]:
teams.sample(20)

In [None]:
# For each league-era season, collect the teams that did NOT reach the postseason.
#
# Group by the season's own yearID instead of counting up from a fixed start year
# once per row, restrict the filter to that season's teams, and store the result.
losers = {}
for season, season_teams in league_era.groupby('yearID'):
    # yearID may have been upcast to float by the era masking; use a plain int key.
    season = int(season)
    playoff_teams = history.get(season)
    if playoff_teams is None:
        # No postseason series recorded for this year.
        continue
    losers[season] = season_teams[~season_teams['teamID'].isin(playoff_teams)]

In [None]:
losers.keys()

In [None]:
# One row per (year, playoff team), then keep the league-era team-seasons that match.
# The team id was previously stored under the key 'year', so the merge joined on
# yearID alone and paired every team with every playoff team of that season.
playoff_entries = pd.DataFrame(
    [
        {'yearID': year_id, 'teamID': team_id}
        for year_id, season_entrants in history.items()
        for team_id in season_entrants
    ],
    columns=['yearID', 'teamID'],
)
league_era.merge(playoff_entries, on=['yearID', 'teamID'])

In [None]:
print(list(df.columns))

In [None]:
# Join wildcard-era team rows to the series they lost, then to the series they won.
# NOTE(review): both joins are inner (the merge default), so only teams that appear as
# both a series loser and a series winner survive. Neither join includes yearID, so
# rows pair up across different seasons.
# NOTE(review): the left key is `teamIDBR`; verify that the ids in
# teamIDloser / teamIDwinner use that same coding and not `teamID`.
wildcard_era_first_losers = wildcard_era.merge(series_post_sorted, left_on='teamIDBR', right_on='teamIDloser')
wildcard_era_first_losers = wildcard_era_first_losers.merge(series_post_sorted, left_on='teamIDBR', right_on='teamIDwinner')

In [None]:
# Move yearID / lgID / divID from columns into a three-level index.
# NOTE(review): not re-runnable as written — after the first run those columns are
# gone from `df_teams`, so running this cell again cannot find them.
df_teams = df_teams.set_index(['yearID', 'lgID', 'divID'])

### Meet the Wonkaville Huskies

In [None]:
# Right join on playerID: keep every pitcher row and attach batter columns where they match.
df_huskies = df_huskiesBatters.merge(df_huskiesPitchers, how='right', on='playerID')

In [None]:
df_huskies.columns

In [None]:
sns.jointplot(data=df_simple, x="height", y="weight", kind = "reg", truncate = False)

In [None]:
sns.choose_diverging_palette()

In [None]:

# Compute the correlation matrix
corr = df_huskiesBatters.corr(method="spearman")

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure.
# pyplot is imported as `plot` in this notebook; `plt` is not defined.
f, ax = plot.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(290, 10, n=40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.25,
    cbar_kws={"shrink": .5},
    ax=ax,
)



In [None]:
# Reshape the correlation matrix to long form: one row per column pair,
# with the coefficient stored in a "correlation" column.
stacked_corr = df.corr().stack()
corr_mat = stacked_corr.reset_index(name="correlation")

##### Old