# Assemble the Features

We want to assemble our data in to a data frame of features; for now I'm going to try to make something including:

* Position player performance data (~3 numbers)
* Position player position
* Team salary data
* Team performance for position (previous year)
* Team value lost for position (from previous year, using FAs)

We'll try doing it in stages

In [317]:
# Bring in packages and connect to database
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

# Set postgres username/password, and connection specifics
username = 'postgres'
password = 'S@ndw1ches'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'mlb_fa_db'

engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )

In [318]:
# Make a quick querying function
def pullFullTable(table, engine):
    '''Quick little function for pulling a full table'''
    
    query = 'select * from {}'.format(table)
    
    # Execute the query with context manager
    with engine.connect() as con:
        results = con.execute(query)
        fetched_data = pd.DataFrame(results.fetchall())
        fetched_data.columns = results.keys()
        
    return fetched_data

## Task 1: Grab Batting data and filter it by only free agents

We'll do it in 5 stages:

1. Pull batting data and shorten its columns to just the ones I want
2. Pull the "people" data to get the first/last names for batting data
3. Join batting and people to get all the data JUST for our desired years
3. Pull the "free_agents" data
5. Join "batting" and new free_agents/people to filter batting by only free agents

In [319]:
print(engine.table_names())

['batting', 'pitching', 'salary', 'people', 'appearances', 'teams', 'position_team_war', 'pitcher_team_war', 'payrolls', 'free_agents']


In [320]:
# Create our query
batting_data = pullFullTable('batting', engine)
    
batting_data.head()

Unnamed: 0,index,playerID,yearID,stint,teamID,lgID,G,AB,R,H,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,76643,abbotje01,1998,1,CHA,AL,89,244,33,68,...,41.0,3.0,3.0,9,28.0,1.0,0.0,2.0,5.0,2.0
1,76644,abbotji01,1998,1,CHA,AL,5,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,76645,abbotku01,1998,1,OAK,AL,35,123,17,33,...,9.0,2.0,1.0,10,34.0,0.0,1.0,1.0,1.0,3.0
3,76646,abbotku01,1998,2,COL,NL,42,71,9,18,...,15.0,0.0,0.0,2,19.0,0.0,1.0,0.0,2.0,2.0
4,76647,abbotpa01,1998,1,SEA,AL,4,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [321]:
# Drop non-numeric team/league columns
batting_data.drop(['teamID','lgID'], axis = 1)

# Add data from players who had multiple stints
batting_data.groupby(['playerID','yearID']).sum().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,yearID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
aardsda01,2004,84653,1,11,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
aardsda01,2006,87329,1,45,2,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
aardsda01,2007,88706,1,25,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
aardsda01,2008,90091,1,47,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
aardsda01,2009,91476,1,73,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [322]:
# Create new variables and select only them (fill NaN with 0)
batting_data['OBP'] = (batting_data['H'] + 
                       batting_data['BB'] + 
                       batting_data['HBP']).divide(batting_data['AB'] + 
                                                   batting_data['BB'] + 
                                                   batting_data['HBP'] + 
                                                   batting_data['SF']).fillna(0)

batting_data['SLG'] = (batting_data['H'] + 
                       batting_data['2B'] + 
                       2 * batting_data['3B'] + 
                       3 * batting_data['HR']).divide(batting_data['AB']).fillna(0)

batting_trimmed = batting_data[['playerID', 'yearID', 'G', 'OBP', 'SLG', 'HR', 'RBI']]

# Fill "NaN" values for OBP/SLG with 0
print(batting_trimmed.shape)
print(batting_trimmed.head())

(27681, 7)
    playerID  yearID   G       OBP       SLG  HR   RBI
0  abbotje01    1998  89  0.298450  0.491803  12  41.0
1  abbotji01    1998   5  0.000000  0.000000   0   0.0
2  abbotku01    1998  35  0.325926  0.390244   2   9.0
3  abbotku01    1998  42  0.276316  0.464789   3  15.0
4  abbotpa01    1998   4  0.000000  0.000000   0   0.0


### Now we'll pull the People and Free Agents and join People to Batting, and 

Note: I tried to do the join directly with SQL and it got mad, so I'm going to do it here instead

In [323]:
# Bring in people and free agents
people = pullFullTable('people', engine)
free_agents = pullFullTable('free_agents', engine)

print(people.shape, free_agents.shape)
print(people.columns, free_agents.columns)

(19370, 25) (7687, 9)
Index(['index', 'playerID', 'birthYear', 'birthMonth', 'birthDay',
       'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth',
       'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst',
       'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut',
       'finalGame', 'retroID', 'bbrefID'],
      dtype='object') Index(['index', 'Age', 'Destination', 'Full_Name', 'Origin', 'WAR_3',
       'nameFirst', 'nameLast', 'Year'],
      dtype='object')


In [324]:
# Join people to batting, adding ONLY nameLast/nameFirst
people_trimmed = people[['playerID', 'nameFirst', 'nameLast']]

batting_w_people = pd.merge(batting_trimmed, people_trimmed, on = 'playerID', how = 'inner')
print(batting_w_people.shape)
print(batting_w_people.columns)

(27681, 9)
Index(['playerID', 'yearID', 'G', 'OBP', 'SLG', 'HR', 'RBI', 'nameFirst',
       'nameLast'],
      dtype='object')


In [325]:
# Make the yearID into a datetime object
batting_w_people['yearID'] = pd.to_datetime(batting_w_people['yearID'], format = '%Y')

In [326]:
# Join based on nameFirst/nameLast
free_agents_batting = pd.merge(free_agents, batting_w_people, 
                               left_on = ['nameFirst', 'nameLast', 'Year'],
                               right_on = ['nameFirst', 'nameLast', 'yearID'])
print(free_agents_batting.shape)
print(free_agents_batting.columns)
print(free_agents_batting.head())

(6202, 16)
Index(['index', 'Age', 'Destination', 'Full_Name', 'Origin', 'WAR_3',
       'nameFirst', 'nameLast', 'Year', 'playerID', 'yearID', 'G', 'OBP',
       'SLG', 'HR', 'RBI'],
      dtype='object')
   index  Age          Destination       Full_Name Origin  WAR_3 nameFirst  \
0      0   28     Seattle Mariners    Allen Watson    LAA    1.9     Allen   
1      1   30   Pittsburgh Pirates  Frank Castillo    DET   -2.5     Frank   
2      2   27  Los Angeles Dodgers  Robinson Checo    BOS    0.1  Robinson   
3      3   31    Toronto Blue Jays       Pat Kelly    STL   -0.2       Pat   
4      6   28   Pittsburgh Pirates     Brad Clontz    NYM   -1.1      Brad   

   nameLast       Year   playerID     yearID   G       OBP       SLG  HR   RBI  
0    Watson 1998-01-01  watsoal01 1998-01-01  28  0.000000  0.000000   0   0.0  
1  Castillo 1998-01-01  castifr01 1998-01-01  27  0.000000  0.000000   0   0.0  
2     Checo 1998-01-01  checoro01 1998-01-01   2  0.000000  0.000000   0   0.0  
3 

In [327]:
# Pull out only the desired columns
free_agents_batting = free_agents_batting.drop(['index', 'Full_Name', 'Year'], axis = 1)
print(free_agents_batting.head())

   Age          Destination Origin  WAR_3 nameFirst  nameLast   playerID  \
0   28     Seattle Mariners    LAA    1.9     Allen    Watson  watsoal01   
1   30   Pittsburgh Pirates    DET   -2.5     Frank  Castillo  castifr01   
2   27  Los Angeles Dodgers    BOS    0.1  Robinson     Checo  checoro01   
3   31    Toronto Blue Jays    STL   -0.2       Pat     Kelly  kellypa03   
4   28   Pittsburgh Pirates    NYM   -1.1      Brad    Clontz  clontbr01   

      yearID   G       OBP       SLG  HR   RBI  
0 1998-01-01  28  0.000000  0.000000   0   0.0  
1 1998-01-01  27  0.000000  0.000000   0   0.0  
2 1998-01-01   2  0.000000  0.000000   0   0.0  
3 1998-01-01  53  0.284024  0.326797   4  14.0  
4 1998-01-01  18  0.000000  0.000000   0   0.0  


## Task 2: Add positions

This will require data from our new "free_agents_batting" and "appearances". Basically:

* Pull appearances data
* Collapse "appearances" data into positions
* Join it with free_agents_batting data

In [328]:
# Bring in Appearances data to add positions
appearances = pullFullTable('appearances', engine)
    
print(appearances.head())

   index  yearID teamID lgID   playerID  G_all    GS  G_batting  G_defense  \
0  76591    1998    CHA   AL  abbotje01     89  61.0         89       76.0   
1  76592    1998    CHA   AL  abbotji01      5   5.0          0        5.0   
2  76593    1998    COL   NL  abbotku01     42  15.0         42       25.0   
3  76594    1998    OAK   AL  abbotku01     35  32.0         35       32.0   
4  76595    1998    SEA   AL  abbotpa01      4   4.0          0        4.0   

   G_p  ...   G_2b  G_3b  G_ss  G_lf  G_cf  G_rf  G_of  G_dh  G_ph  G_pr  
0    0  ...      0     0     0    20    38    27    76   2.0  15.0   0.0  
1    5  ...      0     0     0     0     0     0     0   0.0   0.0   0.0  
2    0  ...      7     3     7     4     0     5     9   1.0  19.0   1.0  
3    0  ...      0     1    28     5     0     1     5   3.0   1.0   1.0  
4    4  ...      0     0     0     0     0     0     0   0.0   0.0   0.0  

[5 rows x 22 columns]


In [329]:
# Subset to only positional data and group by playerID/yearID
appearances_compact = appearances.drop(['index', 'teamID','lgID', 'G_batting', 
                                        'G_defense','G_all','GS', 'G_ph', 'G_pr'], 
                                       axis = 1).groupby(['playerID','yearID']).sum()

# Check data
print(appearances_compact.head())

                  G_p  G_c  G_1b  G_2b  G_3b  G_ss  G_lf  G_cf  G_rf  G_of  \
playerID  yearID                                                             
aardsda01 2004     11    0     0     0     0     0     0     0     0     0   
          2006     45    0     0     0     0     0     0     0     0     0   
          2007     25    0     0     0     0     0     0     0     0     0   
          2008     47    0     0     0     0     0     0     0     0     0   
          2009     73    0     0     0     0     0     0     0     0     0   

                  G_dh  
playerID  yearID        
aardsda01 2004     0.0  
          2006     0.0  
          2007     0.0  
          2008     0.0  
          2009     0.0  


In [330]:
# Figure out primary position by melting, then grouping and finding the max value
appearances_melt = pd.melt(appearances_compact.reset_index(), id_vars= ['playerID', 'yearID'],
                           value_name = 'Games', var_name = 'Position')
print(appearances_melt.tail())

         playerID  yearID Position  Games
279010  zuninmi01    2016     G_dh    2.0
279011  zuninmi01    2017     G_dh    2.0
279012   zychto01    2015     G_dh    0.0
279013   zychto01    2016     G_dh    0.0
279014   zychto01    2017     G_dh    0.0


In [331]:
# Grab the index for the maximum games
primary_idx = appearances_melt.groupby(['playerID','yearID'])['Games'].idxmax()

# Use it to screen out the proper rows
primary_position = appearances_melt.loc[primary_idx]

# Turn the "Position" Column into the right contents by pulling just the position and capitalizing
primary_position['Position'] = primary_position.Position.str.split("_").str.get(1).str.upper()
print(primary_position.tail())

        playerID  yearID Position  Games
50725  zuninmi01    2016        C   52.0
50726  zuninmi01    2017        C  120.0
25362   zychto01    2015        P   13.0
25363   zychto01    2016        P   12.0
25364   zychto01    2017        P   45.0


In [332]:
# Finally, turn the yearID into a Datetime
primary_position['yearID'] = pd.to_datetime(primary_position.yearID, format = '%Y')

In [333]:
# Do the join on the 6202 x 13 free_agents_batting
# Join based on nameFirst/nameLast
fa_bat_pos = pd.merge(free_agents_batting, primary_position, 
                      on = ['playerID', 'yearID']).drop(['Games'], axis = 1)
print(fa_bat_pos.head())
print(fa_bat_pos.shape)

   Age          Destination Origin  WAR_3 nameFirst  nameLast   playerID  \
0   28     Seattle Mariners    LAA    1.9     Allen    Watson  watsoal01   
1   30   Pittsburgh Pirates    DET   -2.5     Frank  Castillo  castifr01   
2   27  Los Angeles Dodgers    BOS    0.1  Robinson     Checo  checoro01   
3   31    Toronto Blue Jays    STL   -0.2       Pat     Kelly  kellypa03   
4   28   Pittsburgh Pirates    NYM   -1.1      Brad    Clontz  clontbr01   

      yearID   G       OBP       SLG  HR   RBI Position  
0 1998-01-01  28  0.000000  0.000000   0   0.0        P  
1 1998-01-01  27  0.000000  0.000000   0   0.0        P  
2 1998-01-01   2  0.000000  0.000000   0   0.0        P  
3 1998-01-01  53  0.284024  0.326797   4  14.0       2B  
4 1998-01-01  18  0.000000  0.000000   0   0.0        P  
(6202, 14)


## Task 3 Add Team WAR for position

Basically I see this as:

1. Load the Team WAR data
2. Change column names to be more concise
3. Join it to the existing data frame using yearID + position. This should necessarily remove pitchers

In [334]:
# Pull the data but drop the index
position_war = pullFullTable('position_team_war', engine).drop(['index'], axis = 1)
print(position_war.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 32 columns):
Angels          180 non-null float64
Astros          165 non-null float64
Athletics       180 non-null float64
Blue Jays       180 non-null float64
Braves          160 non-null float64
Brewers         163 non-null float64
Cardinals       161 non-null float64
Cubs            160 non-null float64
Diamondbacks    167 non-null float64
Dodgers         163 non-null float64
Giants          162 non-null float64
Indians         180 non-null float64
Mariners        180 non-null float64
Marlins         162 non-null float64
Mets            162 non-null float64
Nationals       162 non-null float64
Orioles         180 non-null float64
Padres          164 non-null float64
Phillies        162 non-null float64
Pirates         162 non-null float64
Position        180 non-null object
Rangers         180 non-null float64
Rays            180 non-null float64
Red Sox         180 non-null float64
Reds    

In [335]:
# Change the Year to a date-time and call it "yearID"
position_war['yearID'] = pd.to_datetime(position_war.Year, format = '%Y')
position_war = position_war.drop(['Year'], axis = 1)
print(position_war.columns)

Index(['Angels', 'Astros', 'Athletics', 'Blue Jays', 'Braves', 'Brewers',
       'Cardinals', 'Cubs', 'Diamondbacks', 'Dodgers', 'Giants', 'Indians',
       'Mariners', 'Marlins', 'Mets', 'Nationals', 'Orioles', 'Padres',
       'Phillies', 'Pirates', 'Position', 'Rangers', 'Rays', 'Red Sox', 'Reds',
       'Rockies', 'Royals', 'Tigers', 'Twins', 'White Sox', 'Yankees',
       'yearID'],
      dtype='object')


In [336]:
# Create a dictionary for converting these to abbreviations
team_dict = {'Angels' : 'LAA', 'Astros' : 'HOU', 'Athletics' : 'OAK', 'Blue Jays' : 'TOR', 
             'Braves' : 'ATL', 'Brewers': 'MIL', 'Cardinals' : 'STL', 'Cubs' : 'CHN',
             'Diamondbacks' : 'ARI', 'Dodgers' : 'LAN', 'Giants' : 'SFN', 'Indians' : 'CLE',
             'Mariners' : 'SEA', 'Marlins' : 'MIA', 'Mets' : 'NYN', 'Nationals' : 'WAS',
             'Orioles' : 'BAL', 'Padres' : 'SDN', 'Phillies' : 'PHI', 'Pirates' : 'PIT', 
             'Rangers' : 'TEX', 'Rays' : 'TBR', 'Red Sox' : 'BOS', 'Reds' : 'CIN', 
             'Rockies' : 'COL', 'Royals' : 'KCR', 'Tigers' : 'DET', 'Twins' : 'MIN', 
             'White Sox' : 'CHA', 'Yankees' : 'NYA'}

# Alter it to include WAR
team_dict = {key : value + "_WAR" for key, value in team_dict.items()}
print(team_dict)

{'Angels': 'LAA_WAR', 'Astros': 'HOU_WAR', 'Athletics': 'OAK_WAR', 'Blue Jays': 'TOR_WAR', 'Braves': 'ATL_WAR', 'Brewers': 'MIL_WAR', 'Cardinals': 'STL_WAR', 'Cubs': 'CHN_WAR', 'Diamondbacks': 'ARI_WAR', 'Dodgers': 'LAN_WAR', 'Giants': 'SFN_WAR', 'Indians': 'CLE_WAR', 'Mariners': 'SEA_WAR', 'Marlins': 'MIA_WAR', 'Mets': 'NYN_WAR', 'Nationals': 'WAS_WAR', 'Orioles': 'BAL_WAR', 'Padres': 'SDN_WAR', 'Phillies': 'PHI_WAR', 'Pirates': 'PIT_WAR', 'Rangers': 'TEX_WAR', 'Rays': 'TBR_WAR', 'Red Sox': 'BOS_WAR', 'Reds': 'CIN_WAR', 'Rockies': 'COL_WAR', 'Royals': 'KCR_WAR', 'Tigers': 'DET_WAR', 'Twins': 'MIN_WAR', 'White Sox': 'CHA_WAR', 'Yankees': 'NYA_WAR'}


In [337]:
position_war = position_war.rename(columns = team_dict)
print(position_war.head())

   LAA_WAR  HOU_WAR  OAK_WAR  TOR_WAR  ATL_WAR  MIL_WAR  STL_WAR  CHN_WAR  \
0     -0.2      2.4      0.8      1.9      5.7     -0.2      1.3      0.6   
1     -0.8      1.7      0.4      2.0      2.4      1.2      0.4      0.9   
2      2.2      3.0      1.5      2.6      2.2      2.6      3.3      2.6   
3      2.3      1.4      2.8      0.3      2.4      1.0      1.1      0.1   
4     -1.4      0.3      1.8     -0.1     -0.3     -1.4      3.8      0.4   

   ARI_WAR  LAN_WAR    ...      TBR_WAR  BOS_WAR  CIN_WAR  COL_WAR  KCR_WAR  \
0      2.8      2.3    ...         -0.3      2.3      1.7      1.1      1.1   
1      2.1      0.4    ...          3.3      2.7      1.9     -0.1      0.7   
2      2.4      5.6    ...          0.2      1.6     -0.5      0.0      1.0   
3      0.7      6.4    ...          0.3      1.8      2.3      0.1      0.0   
4      2.0      2.9    ...          1.0      2.4      2.3     -0.7      1.0   

   DET_WAR  MIN_WAR  CHA_WAR  NYA_WAR     yearID  
0      0.8 

In [338]:
position_war.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 32 columns):
LAA_WAR     180 non-null float64
HOU_WAR     165 non-null float64
OAK_WAR     180 non-null float64
TOR_WAR     180 non-null float64
ATL_WAR     160 non-null float64
MIL_WAR     163 non-null float64
STL_WAR     161 non-null float64
CHN_WAR     160 non-null float64
ARI_WAR     167 non-null float64
LAN_WAR     163 non-null float64
SFN_WAR     162 non-null float64
CLE_WAR     180 non-null float64
SEA_WAR     180 non-null float64
MIA_WAR     162 non-null float64
NYN_WAR     162 non-null float64
WAS_WAR     162 non-null float64
BAL_WAR     180 non-null float64
SDN_WAR     164 non-null float64
PHI_WAR     162 non-null float64
PIT_WAR     162 non-null float64
Position    180 non-null object
TEX_WAR     180 non-null float64
TBR_WAR     180 non-null float64
BOS_WAR     180 non-null float64
CIN_WAR     162 non-null float64
COL_WAR     166 non-null float64
KCR_WAR     180 non-null float64
DET_W

### Now add these WAR data to the batting data, by position/year


In [339]:
fa_bat_pos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6202 entries, 0 to 6201
Data columns (total 14 columns):
Age            6202 non-null int64
Destination    6202 non-null object
Origin         6202 non-null object
WAR_3          6202 non-null float64
nameFirst      6202 non-null object
nameLast       6202 non-null object
playerID       6202 non-null object
yearID         6202 non-null datetime64[ns]
G              6202 non-null int64
OBP            6202 non-null float64
SLG            6202 non-null float64
HR             6202 non-null int64
RBI            6202 non-null float64
Position       6202 non-null object
dtypes: datetime64[ns](1), float64(4), int64(3), object(6)
memory usage: 726.8+ KB


In [340]:
fa_bat_pos['Position'].value_counts()

P     3012
OF     837
C      623
3B     365
2B     347
1B     346
SS     279
DH     135
LF     106
RF      77
CF      75
Name: Position, dtype: int64

# ALERT ALERT: Grab Pitcher info and add in to fix this!!

In [341]:
# Merge the 2 data frames
# This is only the ~2350 position players for now; do a left join
fa_bat_team_war = pd.merge(fa_bat_pos, position_war, how = 'left',
                           on = ['Position', 'yearID'], )

print(fa_bat_team_war.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6202 entries, 0 to 6201
Data columns (total 44 columns):
Age            6202 non-null int64
Destination    6202 non-null object
Origin         6202 non-null object
WAR_3          6202 non-null float64
nameFirst      6202 non-null object
nameLast       6202 non-null object
playerID       6202 non-null object
yearID         6202 non-null datetime64[ns]
G              6202 non-null int64
OBP            6202 non-null float64
SLG            6202 non-null float64
HR             6202 non-null int64
RBI            6202 non-null float64
Position       6202 non-null object
LAA_WAR        2353 non-null float64
HOU_WAR        2240 non-null float64
OAK_WAR        2353 non-null float64
TOR_WAR        2353 non-null float64
ATL_WAR        2218 non-null float64
MIL_WAR        2246 non-null float64
STL_WAR        2220 non-null float64
CHN_WAR        2218 non-null float64
ARI_WAR        2269 non-null float64
LAN_WAR        2238 non-null float64
SFN_WAR   

## Task ???: Pull Team data

Pull this to help with team -> teamID

In [342]:
# Change column names to team abbreviations using Team Data
teams = pullFullTable('teams', engine)
print(teams.head())
print(list(teams.columns))

   index  yearID lgID teamID franchID divID  Rank    G  Ghome    W  \
0   2265    1998   AL    ANA      ANA     W     2  162   81.0   85   
1   2266    1998   NL    ARI      ARI     W     5  162   81.0   65   
2   2267    1998   NL    ATL      ATL     E     1  162   81.0  106   
3   2268    1998   AL    BAL      BAL     E     4  162   81.0   79   
4   2269    1998   AL    BOS      BOS     E     2  162   81.0   92   

      ...        DP     FP                  name                         park  \
0     ...       146  0.983        Anaheim Angels   Edison International Field   
1     ...       125  0.984  Arizona Diamondbacks            Bank One Ballpark   
2     ...       139  0.985        Atlanta Braves                 Turner Field   
3     ...       144  0.987     Baltimore Orioles  Oriole Park at Camden Yards   
4     ...       128  0.983        Boston Red Sox               Fenway Park II   

  attendance  BPF  PPF  teamIDBR  teamIDlahman45  teamIDretro  
0  2519280.0  102  102      

In [343]:
# Pull just a handful of these columns (W, G, teamID, name, yearID)
teams_short = teams[['yearID', 'teamID', 'name', 'W', 'G']]
teams_short['teamID'].value_counts()

PIT    20
OAK    20
CHA    20
NYN    20
DET    20
TOR    20
PHI    20
MIL    20
BOS    20
COL    20
CLE    20
ARI    20
BAL    20
TEX    20
KCA    20
CHN    20
LAN    20
TBA    20
CIN    20
NYA    20
HOU    20
ATL    20
SEA    20
SDN    20
SLN    20
SFN    20
MIN    20
FLO    14
WAS    13
LAA    13
MON     7
ANA     7
MIA     6
Name: teamID, dtype: int64

In [344]:
# Convert altered names/teamID 
name_change = {'Anaheim Angels': 'Los Angeles Angels of Anaheim', 
                   'Tampa Bay Devil Rays' : 'Tampa Bay Rays',
                   'Montreal Expos' : 'Washington Nationals', 
                   'Florida Marlins' : 'Miami Marlins'
                  }
    
origin_change = {'ANA': 'LAA', 'TBD':'TBR', 'MON':'WAS', 'FLO':'MIA'}

teams_short['name'] = teams_short['name'].replace(name_change)
teams_short['teamID'] = teams_short['teamID'].replace(origin_change)

# Change W/G to W_Pct
    
teams_short['W_Pct'] = teams_short['W'].divide(teams_short.G)

teams_short = teams_short.drop(['W','G'], axis = 1)

teams_short.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,yearID,teamID,name,W_Pct
0,1998,LAA,Los Angeles Angels of Anaheim,0.524691
1,1998,ARI,Arizona Diamondbacks,0.401235
2,1998,ATL,Atlanta Braves,0.654321
3,1998,BAL,Baltimore Orioles,0.487654
4,1998,BOS,Boston Red Sox,0.567901


In [345]:
teams_short.name.value_counts()

Boston Red Sox                   20
Seattle Mariners                 20
New York Yankees                 20
Colorado Rockies                 20
Cleveland Indians                20
Milwaukee Brewers                20
San Diego Padres                 20
Cincinnati Reds                  20
Oakland Athletics                20
Chicago Cubs                     20
Washington Nationals             20
Toronto Blue Jays                20
Detroit Tigers                   20
Miami Marlins                    20
San Francisco Giants             20
Pittsburgh Pirates               20
St. Louis Cardinals              20
Kansas City Royals               20
Arizona Diamondbacks             20
Houston Astros                   20
Minnesota Twins                  20
Baltimore Orioles                20
Los Angeles Angels of Anaheim    20
Tampa Bay Rays                   20
New York Mets                    20
Atlanta Braves                   20
Los Angeles Dodgers              20
Philadelphia Phillies       

### Next task: Use these data to change the free_agents data a bit

We need to convert the Destination data from free_agents...this is where to do it!

*This will also remove the FAs without teams*

In [346]:
# Check destination data
fa_bat_team_war.Destination.value_counts()

Los Angeles Dodgers              269
Chicago Cubs                     248
New York Mets                    247
New York Yankees                 243
Washington Nationals             239
Texas Rangers                    237
Philadelphia Phillies            236
Cleveland Indians                235
Boston Red Sox                   235
Colorado Rockies                 231
San Diego Padres                 223
Pittsburgh Pirates               223
Baltimore Orioles                221
Cincinnati Reds                  217
Kansas City Royals               215
Toronto Blue Jays                213
Tampa Bay Rays                   210
San Francisco Giants             208
Atlanta Braves                   194
Seattle Mariners                 188
Milwaukee Brewers                188
St. Louis Cardinals              181
Houston Astros                   178
Arizona Diamondbacks             176
Chicago White Sox                171
Miami Marlins                    167
Detroit Tigers                   166
O

In [347]:
#Fix the weird angels data
fa_bat_team_war['Destination'] = fa_bat_team_war['Destination'].replace({'Los Angeles Angels' :
                                                                         'Los Angeles Angels of Anaheim'})
fa_bat_team_war.Destination.value_counts()

Los Angeles Dodgers              269
Chicago Cubs                     248
New York Mets                    247
New York Yankees                 243
Washington Nationals             239
Texas Rangers                    237
Philadelphia Phillies            236
Boston Red Sox                   235
Cleveland Indians                235
Colorado Rockies                 231
Pittsburgh Pirates               223
San Diego Padres                 223
Baltimore Orioles                221
Cincinnati Reds                  217
Kansas City Royals               215
Toronto Blue Jays                213
Tampa Bay Rays                   210
San Francisco Giants             208
Atlanta Braves                   194
Seattle Mariners                 188
Milwaukee Brewers                188
St. Louis Cardinals              181
Houston Astros                   178
Arizona Diamondbacks             176
Chicago White Sox                171
Miami Marlins                    167
Detroit Tigers                   166
L

In [353]:
# Do a join to the sub-team DF
team_translate = teams_short[['teamID', 'name']].drop_duplicates()


fa_bat_team_war_teamID = pd.merge(fa_bat_team_war, team_translate, how = 'left',
                          left_on = ['Destination'], right_on = ['name'])
fa_bat_team_war_teamID.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6202 entries, 0 to 6201
Data columns (total 46 columns):
Age            6202 non-null int64
Destination    6202 non-null object
Origin         6202 non-null object
WAR_3          6202 non-null float64
nameFirst      6202 non-null object
nameLast       6202 non-null object
playerID       6202 non-null object
yearID         6202 non-null datetime64[ns]
G              6202 non-null int64
OBP            6202 non-null float64
SLG            6202 non-null float64
HR             6202 non-null int64
RBI            6202 non-null float64
Position       6202 non-null object
LAA_WAR        2353 non-null float64
HOU_WAR        2240 non-null float64
OAK_WAR        2353 non-null float64
TOR_WAR        2353 non-null float64
ATL_WAR        2218 non-null float64
MIL_WAR        2246 non-null float64
STL_WAR        2220 non-null float64
CHN_WAR        2218 non-null float64
ARI_WAR        2269 non-null float64
LAN_WAR        2238 non-null float64
SFN_WAR   

# Final Task: Save the data

For now, save a test set

In [354]:
import pickle

test_set = fa_bat_team_war_teamID.dropna()
final_data = test_set

final_data.to_pickle('final_data.pickle')

