# Assemble the Features

We want to assemble our data in to a data frame of features; for now I'm going to try to make something including:

* Position player performance data (~3 numbers)
* Position player position
* Team salary data
* Team performance for position (previous year)
* Team value lost for position (from previous year, using FAs)

We'll try doing it in stages

In [438]:
# Bring in packages and connect to database
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
from scipy.stats import zscore

# Set postgres username/password, and connection specifics
username = 'postgres'
password = 'S@ndw1ches'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'mlb_fa_db'

engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )

In [403]:
# Make a quick querying function
def pullFullTable(table, engine):
    '''Quick little function for pulling a full table'''
    
    query = 'select * from {}'.format(table)
    
    # Execute the query with context manager
    with engine.connect() as con:
        results = con.execute(query)
        fetched_data = pd.DataFrame(results.fetchall())
        fetched_data.columns = results.keys()
        
    return fetched_data

## Task 1: Grab Batting data and filter it by only free agents

We'll do it in 5 stages:

1. Pull batting data and shorten its columns to just the ones I want
2. Pull the "people" data to get the first/last names for batting data
3. Join batting and people to get all the data JUST for our desired years
3. Pull the "free_agents" data
5. Join "batting" and new free_agents/people to filter batting by only free agents

In [404]:
print(engine.table_names())

['batting', 'pitching', 'salary', 'people', 'appearances', 'teams', 'position_team_war', 'pitcher_team_war', 'payrolls', 'free_agents']


In [452]:
# Create our query
batting_data = pullFullTable('batting', engine)
    
batting_data.head()

Unnamed: 0,index,playerID,yearID,stint,teamID,lgID,G,AB,R,H,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,76643,abbotje01,1998,1,CHA,AL,89,244,33,68,...,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0
1,76644,abbotji01,1998,1,CHA,AL,5,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,76645,abbotku01,1998,1,OAK,AL,35,123,17,33,...,9.0,2.0,1.0,10,34.0,0.0,1.0,1.0,1.0,3.0
3,76646,abbotku01,1998,2,COL,NL,42,71,9,18,...,15.0,0.0,0.0,2,19.0,0.0,1.0,0.0,2.0,2.0
4,76647,abbotpa01,1998,1,SEA,AL,4,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [453]:
# Drop non-numeric team/league columns
batting_data.drop(['teamID','lgID'], axis = 1)

# Add data from players who had multiple stints and 
batting_totals = batting_data.groupby(['playerID','yearID'], as_index= False).sum()

In [458]:
# Create new variables and select only them (fill NaN with 0)
batting_totals['OBP'] = (batting_totals['H'] + 
                       batting_totals['BB'] + 
                       batting_totals['HBP']).divide(batting_totals['AB'] + 
                                                   batting_totals['BB'] + 
                                                   batting_totals['HBP'] + 
                                                   batting_totals['SF']).fillna(0)

batting_totals['SLG'] = (batting_totals['H'] + 
                         batting_totals['2B'] + 
                         2 * batting_totals['3B'] + 
                         3 * batting_totals['HR']).divide(batting_totals['AB']).fillna(0)

batting_trimmed = batting_totals[['playerID', 'yearID', 'G', 'OBP', 'SLG', 'HR', 'RBI']]

# Fill "NaN" values for OBP/SLG with 0
print(batting_trimmed.shape)
print(batting_trimmed.head())
print(batting_trimmed[batting_trimmed['playerID'] == 'pujolal01'])

(25365, 7)
    playerID  yearID   G  OBP  SLG  HR  RBI
0  aardsda01    2004  11  0.0  0.0   0  0.0
1  aardsda01    2006  45  0.0  0.0   0  0.0
2  aardsda01    2007  25  0.0  0.0   0  0.0
3  aardsda01    2008  47  0.0  0.0   0  0.0
4  aardsda01    2009  73  0.0  0.0   0  0.0
        playerID  yearID    G       OBP       SLG  HR    RBI
18417  pujolal01    2001  161  0.402963  0.610169  37  130.0
18418  pujolal01    2002  157  0.394074  0.561017  34  127.0
18419  pujolal01    2003  157  0.439416  0.666667  43  124.0
18420  pujolal01    2004  154  0.414740  0.657095  46  123.0
18421  pujolal01    2005  161  0.430000  0.609137  41  117.0
18422  pujolal01    2006  143  0.430599  0.671028  49  137.0
18423  pujolal01    2007  158  0.428571  0.568142  32  103.0
18424  pujolal01    2008  148  0.461778  0.652672  37  116.0
18425  pujolal01    2009  160  0.442857  0.658451  47  135.0
18426  pujolal01    2010  159  0.414286  0.596252  42  118.0
18427  pujolal01    2011  147  0.365591  0.540587  37 

In [459]:
# Standardize the numerical columns by year
numerical = ['G', 'OBP', 'SLG', 'HR', 'RBI']

batting_trimmed[numerical] = batting_trimmed.groupby('yearID')[numerical].transform(zscore)

# Take a look at the output
print(batting_trimmed.shape)
print(batting_trimmed.head())
print(batting_trimmed[batting_trimmed['playerID'] == 'pujolal01'])

(25365, 7)
    playerID  yearID         G       OBP       SLG        HR       RBI
0  aardsda01    2004 -0.926802 -1.128818 -1.021286 -0.520478 -0.613383
1  aardsda01    2006 -0.223818 -1.096651 -1.018416 -0.510545 -0.606120
2  aardsda01    2007 -0.623055 -1.074030 -0.948662 -0.505253 -0.600185
3  aardsda01    2008 -0.146466 -1.085320 -0.912276 -0.496440 -0.599853
4  aardsda01    2009  0.388188 -1.070321 -1.006028 -0.506939 -0.601370
        playerID  yearID         G       OBP       SLG        HR       RBI
18417  pujolal01    2001  2.193543  1.141557  1.615230  3.637957  3.709145
18418  pujolal01    2002  2.089353  1.156760  1.559438  3.652412  3.852653
18419  pujolal01    2003  2.102354  1.271069  1.857574  4.707794  3.641002
18420  pujolal01    2004  2.069087  1.336778  1.943523  4.956622  3.615371
18421  pujolal01    2005  2.189724  1.362913  1.529887  4.658596  3.554547
18422  pujolal01    2006  1.802831  1.322061  1.937764  5.258251  3.979439
18423  pujolal01    2007  2.149787  1.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


### Now we'll pull the People and Free Agents and join People to Batting, and 

Note: I tried to do the join directly with SQL and it got mad, so I'm going to do it here instead

In [460]:
# Bring in people and free agents
people = pullFullTable('people', engine)
free_agents = pullFullTable('free_agents', engine)

print(people.shape, free_agents.shape)
print(people.columns, free_agents.columns)

(19370, 25) (7687, 9)
Index(['index', 'playerID', 'birthYear', 'birthMonth', 'birthDay',
       'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth',
       'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst',
       'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut',
       'finalGame', 'retroID', 'bbrefID'],
      dtype='object') Index(['index', 'Age', 'Destination', 'Full_Name', 'Origin', 'WAR_3',
       'nameFirst', 'nameLast', 'Year'],
      dtype='object')


In [461]:
# Join people to batting, adding ONLY nameLast/nameFirst
people_trimmed = people[['playerID', 'nameFirst', 'nameLast']]

batting_w_people = pd.merge(batting_trimmed, people_trimmed, on = 'playerID', how = 'inner')
print(batting_w_people.shape)
print(batting_w_people.columns)

(25365, 9)
Index(['playerID', 'yearID', 'G', 'OBP', 'SLG', 'HR', 'RBI', 'nameFirst',
       'nameLast'],
      dtype='object')


In [462]:
# Make the yearID into a datetime object
batting_w_people['yearID'] = pd.to_datetime(batting_w_people['yearID'], format = '%Y')

In [463]:
free_agents.head(10)

Unnamed: 0,index,Age,Destination,Full_Name,Origin,WAR_3,nameFirst,nameLast,Year
0,0,28,Seattle Mariners,Allen Watson,LAA,1.9,Allen,Watson,1998-01-01
1,1,30,Pittsburgh Pirates,Frank Castillo,DET,-2.5,Frank,Castillo,1998-01-01
2,2,27,Los Angeles Dodgers,Robinson Checo,BOS,0.1,Robinson,Checo,1998-01-01
3,3,31,Toronto Blue Jays,Pat Kelly,STL,-0.2,Pat,Kelly,1998-01-01
4,4,27,Detroit Tigers,Walt McKeel,BOS,-0.1,Walt,McKeel,1998-01-01
5,5,29,Miami Marlins,Reid Cornelius,MIA,0.0,Reid,Cornelius,1998-01-01
6,6,28,Pittsburgh Pirates,Brad Clontz,NYM,-1.1,Brad,Clontz,1998-01-01
7,7,28,Pittsburgh Pirates,Greg Hansell,KCR,-0.2,Greg,Hansell,1998-01-01
8,8,30,Cleveland Indians,Chris Haney,CHC,1.5,Chris,Haney,1998-01-01
9,9,30,Washington Nationals,James Mouton,SDP,-1.6,James,Mouton,1998-01-01


In [464]:
# Join based on nameFirst/nameLast
free_agents_batting = pd.merge(free_agents, batting_w_people, 
                               left_on = ['nameFirst', 'nameLast', 'Year'],
                               right_on = ['nameFirst', 'nameLast', 'yearID'])
print(free_agents_batting.shape)
print(free_agents_batting.columns)
print(free_agents_batting.head(10))

(5240, 16)
Index(['index', 'Age', 'Destination', 'Full_Name', 'Origin', 'WAR_3',
       'nameFirst', 'nameLast', 'Year', 'playerID', 'yearID', 'G', 'OBP',
       'SLG', 'HR', 'RBI'],
      dtype='object')
   index  Age           Destination          Full_Name Origin  WAR_3  \
0      0   28      Seattle Mariners       Allen Watson    LAA    1.9   
1      1   30    Pittsburgh Pirates     Frank Castillo    DET   -2.5   
2      2   27   Los Angeles Dodgers     Robinson Checo    BOS    0.1   
3      3   31     Toronto Blue Jays          Pat Kelly    STL   -0.2   
4      6   28    Pittsburgh Pirates        Brad Clontz    NYM   -1.1   
5      8   30     Cleveland Indians        Chris Haney    CHC    1.5   
6      9   30  Washington Nationals       James Mouton    SDP   -1.6   
7     10   37      New York Yankees  Darryl Strawberry    NYY    1.4   
8     11   38     Cleveland Indians      Mark Langston    SDP    1.4   
9     12   41      New York Yankees        Tony Fossas    TEX   -0.4   

  

In [465]:
# Pull out only the desired columns
free_agents_batting = free_agents_batting.drop(['index', 'Full_Name', 'Year'], axis = 1)
print(free_agents_batting.head(10))

   Age           Destination Origin  WAR_3 nameFirst    nameLast   playerID  \
0   28      Seattle Mariners    LAA    1.9     Allen      Watson  watsoal01   
1   30    Pittsburgh Pirates    DET   -2.5     Frank    Castillo  castifr01   
2   27   Los Angeles Dodgers    BOS    0.1  Robinson       Checo  checoro01   
3   31     Toronto Blue Jays    STL   -0.2       Pat       Kelly  kellypa03   
4   28    Pittsburgh Pirates    NYM   -1.1      Brad      Clontz  clontbr01   
5   30     Cleveland Indians    CHC    1.5     Chris       Haney  haneych01   
6   30  Washington Nationals    SDP   -1.6     James      Mouton  moutoja01   
7   37      New York Yankees    NYY    1.4    Darryl  Strawberry  strawda01   
8   38     Cleveland Indians    SDP    1.4      Mark    Langston  langsma01   
9   41      New York Yankees    TEX   -0.4      Tony      Fossas  fossato01   

      yearID         G       OBP       SLG        HR       RBI  
0 1998-01-01 -0.598315 -1.183600 -1.028652 -0.491245 -0.610022  


## Task 2: Add positions

This will require data from our new "free_agents_batting" and "appearances". Basically:

* Pull appearances data
* Collapse "appearances" data into positions
* Join it with free_agents_batting data

In [466]:
# Bring in Appearances data to add positions
appearances = pullFullTable('appearances', engine)
    
print(appearances.head())

   index  yearID teamID lgID   playerID  G_all    GS  G_batting  G_defense  \
0  76591    1998    CHA   AL  abbotje01     89  61.0         89       76.0   
1  76592    1998    CHA   AL  abbotji01      5   5.0          0        5.0   
2  76593    1998    COL   NL  abbotku01     42  15.0         42       25.0   
3  76594    1998    OAK   AL  abbotku01     35  32.0         35       32.0   
4  76595    1998    SEA   AL  abbotpa01      4   4.0          0        4.0   

   G_p  ...   G_2b  G_3b  G_ss  G_lf  G_cf  G_rf  G_of  G_dh  G_ph  G_pr  
0    0  ...      0     0     0    20    38    27    76   2.0  15.0   0.0  
1    5  ...      0     0     0     0     0     0     0   0.0   0.0   0.0  
2    0  ...      7     3     7     4     0     5     9   1.0  19.0   1.0  
3    0  ...      0     1    28     5     0     1     5   3.0   1.0   1.0  
4    4  ...      0     0     0     0     0     0     0   0.0   0.0   0.0  

[5 rows x 22 columns]


In [467]:
# Subset to only positional data and group by playerID/yearID
appearances_compact = appearances.drop(['index', 'teamID','lgID', 'G_batting', 
                                        'G_defense','G_all','GS', 'G_ph', 'G_pr'], 
                                       axis = 1).groupby(['playerID','yearID'], 
                                                         as_index = False).sum()

# Check data
print(appearances_compact.head())

    playerID  yearID  G_p  G_c  G_1b  G_2b  G_3b  G_ss  G_lf  G_cf  G_rf  \
0  aardsda01    2004   11    0     0     0     0     0     0     0     0   
1  aardsda01    2006   45    0     0     0     0     0     0     0     0   
2  aardsda01    2007   25    0     0     0     0     0     0     0     0   
3  aardsda01    2008   47    0     0     0     0     0     0     0     0   
4  aardsda01    2009   73    0     0     0     0     0     0     0     0   

   G_of  G_dh  
0     0   0.0  
1     0   0.0  
2     0   0.0  
3     0   0.0  
4     0   0.0  


In [468]:
# Figure out primary position by melting, then grouping and finding the max value
appearances_melt = pd.melt(appearances_compact, id_vars= ['playerID', 'yearID'],
                           value_name = 'Games', var_name = 'Position')
print(appearances_melt[appearances_melt['playerID'] == 'clontbr01'])

         playerID  yearID Position  Games
4422    clontbr01    1998      G_p   20.0
4423    clontbr01    1999      G_p   56.0
4424    clontbr01    2000      G_p    5.0
29787   clontbr01    1998      G_c    0.0
29788   clontbr01    1999      G_c    0.0
29789   clontbr01    2000      G_c    0.0
55152   clontbr01    1998     G_1b    0.0
55153   clontbr01    1999     G_1b    0.0
55154   clontbr01    2000     G_1b    0.0
80517   clontbr01    1998     G_2b    0.0
80518   clontbr01    1999     G_2b    0.0
80519   clontbr01    2000     G_2b    0.0
105882  clontbr01    1998     G_3b    0.0
105883  clontbr01    1999     G_3b    0.0
105884  clontbr01    2000     G_3b    0.0
131247  clontbr01    1998     G_ss    0.0
131248  clontbr01    1999     G_ss    0.0
131249  clontbr01    2000     G_ss    0.0
156612  clontbr01    1998     G_lf    0.0
156613  clontbr01    1999     G_lf    0.0
156614  clontbr01    2000     G_lf    0.0
181977  clontbr01    1998     G_cf    0.0
181978  clontbr01    1999     G_cf

In [469]:
# Grab the index for the maximum games
primary_idx = appearances_melt.groupby(['playerID','yearID'])['Games'].idxmax()

# Use it to screen out the proper rows
primary_position = appearances_melt.loc[primary_idx]

# Turn the "Position" Column into the right contents by pulling just the position and capitalizing
primary_position['Position'] = primary_position.Position.str.split("_").str.get(1).str.upper()
print(primary_position[primary_position['playerID'] == 'clontbr01'])

       playerID  yearID Position  Games
4422  clontbr01    1998        P   20.0
4423  clontbr01    1999        P   56.0
4424  clontbr01    2000        P    5.0


In [470]:
# Finally, turn the yearID into a Datetime
primary_position['yearID'] = pd.to_datetime(primary_position.yearID, format = '%Y')

In [471]:
# Do the join on the 6202 x 13 free_agents_batting
# Join based on nameFirst/nameLast
fa_bat_pos = pd.merge(free_agents_batting, primary_position, 
                      on = ['playerID', 'yearID']).drop(['Games'], axis = 1)
print(fa_bat_pos.head(10))
print(fa_bat_pos.shape)

   Age           Destination Origin  WAR_3 nameFirst    nameLast   playerID  \
0   28      Seattle Mariners    LAA    1.9     Allen      Watson  watsoal01   
1   30    Pittsburgh Pirates    DET   -2.5     Frank    Castillo  castifr01   
2   27   Los Angeles Dodgers    BOS    0.1  Robinson       Checo  checoro01   
3   31     Toronto Blue Jays    STL   -0.2       Pat       Kelly  kellypa03   
4   28    Pittsburgh Pirates    NYM   -1.1      Brad      Clontz  clontbr01   
5   30     Cleveland Indians    CHC    1.5     Chris       Haney  haneych01   
6   30  Washington Nationals    SDP   -1.6     James      Mouton  moutoja01   
7   37      New York Yankees    NYY    1.4    Darryl  Strawberry  strawda01   
8   38     Cleveland Indians    SDP    1.4      Mark    Langston  langsma01   
9   41      New York Yankees    TEX   -0.4      Tony      Fossas  fossato01   

      yearID         G       OBP       SLG        HR       RBI Position  
0 1998-01-01 -0.598315 -1.183600 -1.028652 -0.491245 -0.

## Task 3 Add Team WAR for position

Basically I see this as:

1. Load the Team WAR data
2. Change column names to be more concise
3. Join it to the existing data frame using yearID + position. This should necessarily remove pitchers

In [472]:
# Pull the data but drop the index
position_war = pullFullTable('position_team_war', engine).drop(['index'], axis = 1)
print(position_war.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 32 columns):
Angels          180 non-null float64
Astros          165 non-null float64
Athletics       180 non-null float64
Blue Jays       180 non-null float64
Braves          160 non-null float64
Brewers         163 non-null float64
Cardinals       161 non-null float64
Cubs            160 non-null float64
Diamondbacks    167 non-null float64
Dodgers         163 non-null float64
Giants          162 non-null float64
Indians         180 non-null float64
Mariners        180 non-null float64
Marlins         162 non-null float64
Mets            162 non-null float64
Nationals       162 non-null float64
Orioles         180 non-null float64
Padres          164 non-null float64
Phillies        162 non-null float64
Pirates         162 non-null float64
Position        180 non-null object
Rangers         180 non-null float64
Rays            180 non-null float64
Red Sox         180 non-null float64
Reds    

In [473]:
# Change the Year to a date-time and call it "yearID"
position_war['yearID'] = pd.to_datetime(position_war.Year, format = '%Y')
position_war = position_war.drop(['Year'], axis = 1)
print(position_war.columns)

Index(['Angels', 'Astros', 'Athletics', 'Blue Jays', 'Braves', 'Brewers',
       'Cardinals', 'Cubs', 'Diamondbacks', 'Dodgers', 'Giants', 'Indians',
       'Mariners', 'Marlins', 'Mets', 'Nationals', 'Orioles', 'Padres',
       'Phillies', 'Pirates', 'Position', 'Rangers', 'Rays', 'Red Sox', 'Reds',
       'Rockies', 'Royals', 'Tigers', 'Twins', 'White Sox', 'Yankees',
       'yearID'],
      dtype='object')


In [474]:
# Create a dictionary for converting these to abbreviations
team_dict = {'Angels' : 'LAA', 'Astros' : 'HOU', 'Athletics' : 'OAK', 'Blue Jays' : 'TOR', 
             'Braves' : 'ATL', 'Brewers': 'MIL', 'Cardinals' : 'STL', 'Cubs' : 'CHN',
             'Diamondbacks' : 'ARI', 'Dodgers' : 'LAN', 'Giants' : 'SFN', 'Indians' : 'CLE',
             'Mariners' : 'SEA', 'Marlins' : 'MIA', 'Mets' : 'NYN', 'Nationals' : 'WAS',
             'Orioles' : 'BAL', 'Padres' : 'SDN', 'Phillies' : 'PHI', 'Pirates' : 'PIT', 
             'Rangers' : 'TEX', 'Rays' : 'TBR', 'Red Sox' : 'BOS', 'Reds' : 'CIN', 
             'Rockies' : 'COL', 'Royals' : 'KCR', 'Tigers' : 'DET', 'Twins' : 'MIN', 
             'White Sox' : 'CHA', 'Yankees' : 'NYA'}

# Alter it to include WAR
team_dict = {key : value + "_WAR" for key, value in team_dict.items()}
print(team_dict)

{'Angels': 'LAA_WAR', 'Astros': 'HOU_WAR', 'Athletics': 'OAK_WAR', 'Blue Jays': 'TOR_WAR', 'Braves': 'ATL_WAR', 'Brewers': 'MIL_WAR', 'Cardinals': 'STL_WAR', 'Cubs': 'CHN_WAR', 'Diamondbacks': 'ARI_WAR', 'Dodgers': 'LAN_WAR', 'Giants': 'SFN_WAR', 'Indians': 'CLE_WAR', 'Mariners': 'SEA_WAR', 'Marlins': 'MIA_WAR', 'Mets': 'NYN_WAR', 'Nationals': 'WAS_WAR', 'Orioles': 'BAL_WAR', 'Padres': 'SDN_WAR', 'Phillies': 'PHI_WAR', 'Pirates': 'PIT_WAR', 'Rangers': 'TEX_WAR', 'Rays': 'TBR_WAR', 'Red Sox': 'BOS_WAR', 'Reds': 'CIN_WAR', 'Rockies': 'COL_WAR', 'Royals': 'KCR_WAR', 'Tigers': 'DET_WAR', 'Twins': 'MIN_WAR', 'White Sox': 'CHA_WAR', 'Yankees': 'NYA_WAR'}


In [475]:
position_war = position_war.rename(columns = team_dict)
print(position_war.head())

   LAA_WAR  HOU_WAR  OAK_WAR  TOR_WAR  ATL_WAR  MIL_WAR  STL_WAR  CHN_WAR  \
0     -0.2      2.4      0.8      1.9      5.7     -0.2      1.3      0.6   
1     -0.8      1.7      0.4      2.0      2.4      1.2      0.4      0.9   
2      2.2      3.0      1.5      2.6      2.2      2.6      3.3      2.6   
3      2.3      1.4      2.8      0.3      2.4      1.0      1.1      0.1   
4     -1.4      0.3      1.8     -0.1     -0.3     -1.4      3.8      0.4   

   ARI_WAR  LAN_WAR    ...      TBR_WAR  BOS_WAR  CIN_WAR  COL_WAR  KCR_WAR  \
0      2.8      2.3    ...         -0.3      2.3      1.7      1.1      1.1   
1      2.1      0.4    ...          3.3      2.7      1.9     -0.1      0.7   
2      2.4      5.6    ...          0.2      1.6     -0.5      0.0      1.0   
3      0.7      6.4    ...          0.3      1.8      2.3      0.1      0.0   
4      2.0      2.9    ...          1.0      2.4      2.3     -0.7      1.0   

   DET_WAR  MIN_WAR  CHA_WAR  NYA_WAR     yearID  
0      0.8 

In [476]:
position_war.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 32 columns):
LAA_WAR     180 non-null float64
HOU_WAR     165 non-null float64
OAK_WAR     180 non-null float64
TOR_WAR     180 non-null float64
ATL_WAR     160 non-null float64
MIL_WAR     163 non-null float64
STL_WAR     161 non-null float64
CHN_WAR     160 non-null float64
ARI_WAR     167 non-null float64
LAN_WAR     163 non-null float64
SFN_WAR     162 non-null float64
CLE_WAR     180 non-null float64
SEA_WAR     180 non-null float64
MIA_WAR     162 non-null float64
NYN_WAR     162 non-null float64
WAS_WAR     162 non-null float64
BAL_WAR     180 non-null float64
SDN_WAR     164 non-null float64
PHI_WAR     162 non-null float64
PIT_WAR     162 non-null float64
Position    180 non-null object
TEX_WAR     180 non-null float64
TBR_WAR     180 non-null float64
BOS_WAR     180 non-null float64
CIN_WAR     162 non-null float64
COL_WAR     166 non-null float64
KCR_WAR     180 non-null float64
DET_W

### Now add these WAR data to the batting data, by position/year


In [477]:
fa_bat_pos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5240 entries, 0 to 5239
Data columns (total 14 columns):
Age            5240 non-null int64
Destination    5240 non-null object
Origin         5240 non-null object
WAR_3          5240 non-null float64
nameFirst      5240 non-null object
nameLast       5240 non-null object
playerID       5240 non-null object
yearID         5240 non-null datetime64[ns]
G              5240 non-null float64
OBP            5240 non-null float64
SLG            5240 non-null float64
HR             5240 non-null float64
RBI            5240 non-null float64
Position       5240 non-null object
dtypes: datetime64[ns](1), float64(6), int64(1), object(6)
memory usage: 614.1+ KB


In [478]:
fa_bat_pos['Position'].value_counts()

P     2539
OF     670
C      546
3B     301
2B     298
1B     297
SS     235
DH     119
LF      94
RF      73
CF      68
Name: Position, dtype: int64

# ALERT ALERT: Grab Pitcher info and add in to fix this!!

In [479]:
# Merge the 2 data frames
# This is only the ~2350 position players for now; do a left join
fa_bat_team_war = pd.merge(fa_bat_pos, position_war, how = 'left',
                           on = ['Position', 'yearID'], )

print(fa_bat_team_war.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5240 entries, 0 to 5239
Data columns (total 44 columns):
Age            5240 non-null int64
Destination    5240 non-null object
Origin         5240 non-null object
WAR_3          5240 non-null float64
nameFirst      5240 non-null object
nameLast       5240 non-null object
playerID       5240 non-null object
yearID         5240 non-null datetime64[ns]
G              5240 non-null float64
OBP            5240 non-null float64
SLG            5240 non-null float64
HR             5240 non-null float64
RBI            5240 non-null float64
Position       5240 non-null object
LAA_WAR        2031 non-null float64
HOU_WAR        1932 non-null float64
OAK_WAR        2031 non-null float64
TOR_WAR        2031 non-null float64
ATL_WAR        1912 non-null float64
MIL_WAR        1935 non-null float64
STL_WAR        1914 non-null float64
CHN_WAR        1912 non-null float64
ARI_WAR        1955 non-null float64
LAN_WAR        1931 non-null float64
SFN_WA

## Task ???: Pull Team data

Pull this to help with team -> teamID

In [480]:
# Change column names to team abbreviations using Team Data
teams = pullFullTable('teams', engine)
print(teams.head())
print(list(teams.columns))

   index  yearID lgID teamID franchID divID  Rank    G  Ghome    W  \
0   2265    1998   AL    ANA      ANA     W     2  162   81.0   85   
1   2266    1998   NL    ARI      ARI     W     5  162   81.0   65   
2   2267    1998   NL    ATL      ATL     E     1  162   81.0  106   
3   2268    1998   AL    BAL      BAL     E     4  162   81.0   79   
4   2269    1998   AL    BOS      BOS     E     2  162   81.0   92   

      ...        DP     FP                  name                         park  \
0     ...       146  0.983        Anaheim Angels   Edison International Field   
1     ...       125  0.984  Arizona Diamondbacks            Bank One Ballpark   
2     ...       139  0.985        Atlanta Braves                 Turner Field   
3     ...       144  0.987     Baltimore Orioles  Oriole Park at Camden Yards   
4     ...       128  0.983        Boston Red Sox               Fenway Park II   

  attendance  BPF  PPF  teamIDBR  teamIDlahman45  teamIDretro  
0  2519280.0  102  102      

In [481]:
# Pull just a handful of these columns (W, G, teamID, name, yearID)
teams_short = teams[['yearID', 'teamID', 'name', 'W', 'G']]
teams_short['teamID'].value_counts()

PIT    20
OAK    20
CHA    20
NYN    20
DET    20
TOR    20
PHI    20
MIL    20
BOS    20
COL    20
CLE    20
ARI    20
BAL    20
TEX    20
KCA    20
CHN    20
LAN    20
TBA    20
CIN    20
NYA    20
HOU    20
ATL    20
SEA    20
SDN    20
SLN    20
SFN    20
MIN    20
FLO    14
WAS    13
LAA    13
MON     7
ANA     7
MIA     6
Name: teamID, dtype: int64

In [482]:
# Convert altered names/teamID 
name_change = {'Anaheim Angels': 'Los Angeles Angels of Anaheim', 
                   'Tampa Bay Devil Rays' : 'Tampa Bay Rays',
                   'Montreal Expos' : 'Washington Nationals', 
                   'Florida Marlins' : 'Miami Marlins'
                  }
    
origin_change = {'ANA': 'LAA', 'TBD':'TBR', 'MON':'WAS', 'FLO':'MIA'}

teams_short['name'] = teams_short['name'].replace(name_change)
teams_short['teamID'] = teams_short['teamID'].replace(origin_change)

# Change W/G to W_Pct
    
teams_short['W_Pct'] = teams_short['W'].divide(teams_short.G)

teams_short = teams_short.drop(['W','G'], axis = 1)

teams_short.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,yearID,teamID,name,W_Pct
0,1998,LAA,Los Angeles Angels of Anaheim,0.524691
1,1998,ARI,Arizona Diamondbacks,0.401235
2,1998,ATL,Atlanta Braves,0.654321
3,1998,BAL,Baltimore Orioles,0.487654
4,1998,BOS,Boston Red Sox,0.567901


In [483]:
teams_short.name.value_counts()

Boston Red Sox                   20
Seattle Mariners                 20
New York Yankees                 20
Colorado Rockies                 20
Cleveland Indians                20
Milwaukee Brewers                20
San Diego Padres                 20
Cincinnati Reds                  20
Oakland Athletics                20
Chicago Cubs                     20
Washington Nationals             20
Toronto Blue Jays                20
Detroit Tigers                   20
Miami Marlins                    20
San Francisco Giants             20
Pittsburgh Pirates               20
St. Louis Cardinals              20
Kansas City Royals               20
Arizona Diamondbacks             20
Houston Astros                   20
Minnesota Twins                  20
Baltimore Orioles                20
Los Angeles Angels of Anaheim    20
Tampa Bay Rays                   20
New York Mets                    20
Atlanta Braves                   20
Los Angeles Dodgers              20
Philadelphia Phillies       

### Next task: Use these data to change the free_agents data a bit

We need to convert the Destination data from free_agents...this is where to do it!

*This will also remove the FAs without teams*

In [484]:
# Check destination data
fa_bat_team_war.Destination.value_counts()

Los Angeles Dodgers              228
Chicago Cubs                     214
Boston Red Sox                   210
New York Mets                    209
Philadelphia Phillies            205
New York Yankees                 201
Washington Nationals             200
Texas Rangers                    199
Colorado Rockies                 199
Cleveland Indians                199
Baltimore Orioles                194
Pittsburgh Pirates               189
Cincinnati Reds                  185
San Diego Padres                 182
Tampa Bay Rays                   179
Kansas City Royals               178
San Francisco Giants             177
Toronto Blue Jays                174
Atlanta Braves                   161
Seattle Mariners                 157
Milwaukee Brewers                157
Houston Astros                   152
St. Louis Cardinals              148
Miami Marlins                    144
Detroit Tigers                   144
Arizona Diamondbacks             141
Chicago White Sox                136
O

In [485]:
#Fix the weird angels data
fa_bat_team_war['Destination'] = fa_bat_team_war['Destination'].replace({'Los Angeles Angels' :
                                                                         'Los Angeles Angels of Anaheim'})
fa_bat_team_war.Destination.value_counts()

Los Angeles Dodgers              228
Chicago Cubs                     214
Boston Red Sox                   210
New York Mets                    209
Philadelphia Phillies            205
New York Yankees                 201
Washington Nationals             200
Texas Rangers                    199
Colorado Rockies                 199
Cleveland Indians                199
Baltimore Orioles                194
Pittsburgh Pirates               189
Cincinnati Reds                  185
San Diego Padres                 182
Tampa Bay Rays                   179
Kansas City Royals               178
San Francisco Giants             177
Toronto Blue Jays                174
Atlanta Braves                   161
Seattle Mariners                 157
Milwaukee Brewers                157
Houston Astros                   152
St. Louis Cardinals              148
Miami Marlins                    144
Detroit Tigers                   144
Arizona Diamondbacks             141
Chicago White Sox                136
O

In [486]:
# Do a join to the sub-team DF
team_translate = teams_short[['teamID', 'name']].drop_duplicates()


fa_bat_team_war_teamID = pd.merge(fa_bat_team_war, team_translate, how = 'left',
                          left_on = ['Destination'], right_on = ['name'])

# Substitute the Destination Column with the info from teamID and drop teamID
fa_bat_team_war_teamID['Destination'] = fa_bat_team_war_teamID['teamID']
fa_bat_team_war_teamID = fa_bat_team_war_teamID.drop(['teamID'], axis = 1)
fa_bat_team_war_teamID.drop_duplicates().head(10)

Unnamed: 0,Age,Destination,Origin,WAR_3,nameFirst,nameLast,playerID,yearID,G,OBP,...,TBR_WAR,BOS_WAR,CIN_WAR,COL_WAR,KCR_WAR,DET_WAR,MIN_WAR,CHA_WAR,NYA_WAR,name
0,28,SEA,LAA,1.9,Allen,Watson,watsoal01,1998-01-01,-0.598315,-1.1836,...,,,,,,,,,,Seattle Mariners
1,30,PIT,DET,-2.5,Frank,Castillo,castifr01,1998-01-01,-0.618614,-1.1836,...,,,,,,,,,,Pittsburgh Pirates
2,27,LAN,BOS,0.1,Robinson,Checo,checoro01,1998-01-01,-1.126081,-1.1836,...,,,,,,,,,,Los Angeles Dodgers
3,31,TOR,STL,-0.2,Pat,Kelly,kellypa03,1998-01-01,-0.090848,0.424072,...,2.0,2.2,1.5,-1.4,4.9,5.1,0.1,4.4,3.3,Toronto Blue Jays
4,28,PIT,NYM,-1.1,Brad,Clontz,clontbr01,1998-01-01,-0.760705,-1.1836,...,,,,,,,,,,Pittsburgh Pirates
5,30,CLE,CHC,1.5,Chris,Haney,haneych01,1998-01-01,-0.395328,-1.1836,...,,,,,,,,,,Cleveland Indians
6,30,WAS,SDP,-1.6,James,Mouton,moutoja01,1998-01-01,-0.05025,0.33114,...,,,,,,,,,,Washington Nationals
7,37,NYA,NYY,1.4,Darryl,Strawberry,strawda01,1998-01-01,0.88349,0.818029,...,1.7,2.4,,-0.3,-0.6,-0.3,-0.8,2.8,2.5,New York Yankees
8,38,CLE,SDP,1.4,Mark,Langston,langsma01,1998-01-01,-0.699809,-0.312778,...,,,,,,,,,,Cleveland Indians
9,41,NYA,TEX,-0.4,Tony,Fossas,fossato01,1998-01-01,-0.334432,-1.1836,...,,,,,,,,,,New York Yankees


## Task 5: Use payroll data to cluster teams

Now I need to use payroll data to create clusters of teams. So I'll:

1. Load the payroll data
2. Standardize it for each year
3. Run it through clustering
4. Use cluster labels to create a translation

In [496]:
# Load payroll data
payrolls = pullFullTable('payrolls', engine)
payrolls.set_index('Year', inplace=True)
payrolls.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 1998-01-01 00:00:00-08:00 to 2017-01-01 00:00:00-08:00
Data columns (total 30 columns):
Arizona Diamondbacks     20 non-null float64
Atlanta Braves           20 non-null float64
Baltimore Orioles        20 non-null float64
Boston Red Sox           20 non-null float64
Chicago Cubs             20 non-null float64
Chicago White Sox        20 non-null float64
Cincinnati Reds          20 non-null float64
Cleveland Indians        20 non-null float64
Colorado Rockies         20 non-null float64
Detroit Tigers           20 non-null float64
Houston Astros           20 non-null float64
Kansas City Royals       20 non-null float64
Los Angeles Angels       20 non-null float64
Los Angeles Dodgers      20 non-null float64
Miami Marlins            20 non-null float64
Milwaukee Brewers        20 non-null float64
Minnesota Twins          20 non-null float64
New York Mets            20 non-null float64
New York Yankees         20 non-null 

In [501]:
# Transpose the data and standardize it
payrolls_transposed = payrolls.transpose()
payrolls_transposed_standard = payrolls_transposed.transform(zscore)
payrolls_transposed_standard

Year,1998-01-01 00:00:00-08:00,1999-01-01 00:00:00-08:00,2000-01-01 00:00:00-08:00,2001-01-01 00:00:00-08:00,2002-01-01 00:00:00-08:00,2003-01-01 00:00:00-08:00,2004-01-01 00:00:00-08:00,2005-01-01 00:00:00-08:00,2006-01-01 00:00:00-08:00,2007-01-01 00:00:00-08:00,2008-01-01 00:00:00-08:00,2009-01-01 00:00:00-08:00,2010-01-01 00:00:00-08:00,2011-01-01 00:00:00-08:00,2012-01-01 00:00:00-08:00,2013-01-01 00:00:00-08:00,2014-01-01 00:00:00-08:00,2015-01-01 00:00:00-08:00,2016-01-01 00:00:00-08:00,2017-01-01 00:00:00-08:00
Arizona Diamondbacks,-0.753345,1.052155,1.011562,0.812363,1.452963,0.352061,0.022849,-0.31888,-0.563431,-0.914489,-0.626029,-0.449994,-0.80553,-0.977706,-0.655632,-0.357012,-0.057106,-1.408065,-0.696131,-0.922471
Atlanta Braves,1.291746,1.274919,1.237954,1.086543,1.068447,1.28019,0.655011,0.398006,0.397228,0.141994,0.343782,0.246606,-0.175462,-0.146344,-0.406266,-0.376312,-0.098971,-0.81022,-0.991658,0.192995
Baltimore Orioles,2.023482,1.073484,1.257092,0.362773,-0.287846,0.106977,-0.539899,0.025154,-0.156538,0.318375,-0.599216,-0.642677,-0.250166,-0.18871,-0.458183,-0.320851,-0.180595,-0.077264,0.356148,0.775497
Boston Red Sox,0.760615,1.116141,1.167002,1.813591,1.681205,1.052145,1.805297,1.498736,1.340867,1.81401,1.175508,0.997529,1.906952,1.716745,2.075792,1.169489,1.115339,1.171818,1.464799,1.077278
Chicago Cubs,0.607805,0.341208,0.27637,-0.037227,0.33725,0.324144,0.666786,0.41494,0.531493,0.513353,0.772247,1.389498,1.484514,0.801898,-0.271228,-0.046654,-0.610936,-0.119878,0.772532,0.976764
Chicago White Sox,-0.23635,-1.119553,-1.169274,0.008265,-0.429315,-0.722176,-0.118768,0.062885,0.794035,0.783324,0.848395,0.226798,0.458594,0.870181,-0.030423,0.395257,-0.560651,-0.281559,-0.325869,-0.471115
Cincinnati Reds,-1.235334,-0.285848,-0.559648,-0.682308,-0.922811,-0.419447,-0.694841,-0.331952,-0.524664,-0.421044,-0.413673,-0.448793,-0.495281,-0.42172,-0.436919,0.09577,-0.064122,-0.10559,-0.813359,-1.122302
Cleveland Indians,1.257414,1.217095,0.947612,1.116051,0.469672,-0.919041,-1.075997,-0.937725,-0.67847,-0.626521,-0.283632,-0.20809,-0.792769,-1.088605,-0.541029,-0.5265,-0.762491,-0.857095,-0.710011,-0.317951
Colorado Rockies,0.476537,0.294759,0.369728,0.250478,-0.437539,-0.135932,-0.111331,-0.739861,-1.144932,-0.855396,-0.56007,-0.399572,-0.180513,-0.117685,-0.55097,-0.683342,-0.451429,-0.593642,-0.384804,-0.664485
Detroit Tigers,-1.192925,-0.626155,0.258165,-0.658538,-0.511564,-0.788886,-0.688333,-0.118045,0.159269,0.36727,1.290802,0.797643,0.846471,0.319677,0.946604,0.949421,1.10154,1.274592,1.457752,1.219912


In [None]:
# Cluster them

# Perform the necessary imports
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Grab

# Final Task: Save the data

For now, save a test set

In [487]:
import pickle

test_set = fa_bat_team_war_teamID.dropna()
final_data = test_set

final_data.to_pickle('final_data.pickle')

# Experiments with Standardizing

