# Merging and Aggregating Projections



In [1]:
import pandas as pd
import numpy as np
import os

from CBS_fantasy_baseball.stats import StatCalculator

In [2]:
steam = pd.read_csv('data/steamer_hitters_2020.csv')
steam.drop(['-1','-1.1','-1.2','-1.3'], axis =1, inplace=True)
zips = pd.read_csv('data/zips_hitters_2020.csv')

In [3]:
score_cols = ['Name','playerid','HR','R','RBI','SB','AVG']
pit_score_cols = ['Name','playerid','W','K','ERA','WHIP','S']
league_positions = ['C','1B','2B','3B','SS','MI','CI','OF','RF','CF','LF','U']

In [4]:
both = pd.concat([zips,steam]).sort_index()
names = both[['playerid','Name']].groupby('playerid').agg('max')
both = both.groupby('playerid').agg('mean')
both['Name'] = names
both.reset_index(inplace=True)

In [5]:
stats = StatCalculator()
stats.replacement_level.values[:] = stats.hitter_replacement_level(both)
both['FWAR'] = both.apply(stats.hitterFWAR, axis = 1)

In [6]:
both.sort_values(by='FWAR', ascending=False).head()

Unnamed: 0,playerid,G,PA,AB,H,2B,3B,HR,R,RBI,...,wOBA,Fld,BsR,WAR,ADP,wRC+,Off,Def,Name,FWAR
545,18401,55.5,252.5,221.0,63.0,10.5,1.0,14.0,41.5,36.0,...,0.3725,0.7,1.1,1.65,1.6,128.0,10.2,-1.5,Ronald Acuna Jr.,7.476666
91,11477,53.5,236.5,202.0,61.0,12.5,1.0,13.0,37.0,38.5,...,0.3975,0.6,1.2,2.0,1.8,145.0,15.6,-2.4,Christian Yelich,6.495792
8,10155,51.5,227.5,179.0,53.0,10.0,1.0,15.0,41.5,38.0,...,0.4265,-0.3,0.85,2.95,6.8,173.0,22.9,0.2,Mike Trout,6.090397
180,12916,56.0,253.5,227.0,66.0,14.5,1.0,13.0,39.5,36.0,...,0.364,3.4,0.35,2.25,7.4,126.0,8.2,5.5,Francisco Lindor,6.018901
420,15998,56.0,236.0,201.0,58.0,11.5,1.0,15.0,38.0,43.0,...,0.3945,1.15,0.5,2.15,3.9,148.0,15.2,0.3,Cody Bellinger,5.961833


In [7]:
with open('data/cbs_hitter_projections_3aug2020.csv') as cbs:
    print(cbs.readline())
    print(cbs.readline())
cbs = pd.read_csv('data/cbs_elig_3aug2020.csv',skiprows=1,skipfooter=1,engine='python')
cbs.drop('Unnamed: 18',axis=1,inplace=True)

All Players  Season   Projections Standard Categories

Avail,Player,AB,R,H,1B,2B,3B,HR,RBI,BB,K,SB,CS,AVG,OBP,SLG,Rank,



In [8]:
positions = [x+'_elig' for x in league_positions]
for pos in positions:
    cbs[pos] = False
def set_eligibility(row): #this is slow. 
    elig = row['Eligible'].split(',')
    for e in elig:
        pos = e+'_elig'
        row[pos] = True
    return row
elig = cbs.apply(set_eligibility, axis=1)
elig = elig[list(['Player','Avail','Eligible',]+positions)]

In [9]:
from CBS_fantasy_baseball.file_handlers.name_handler import NameToFangraphsID
namer = NameToFangraphsID(both,elig)

In [10]:
namer.transform_suffix('Jr.')
cbs_to_fg_names = { #identified problem cases, among hitters
    'Nick Castellanos':'Nicholas Castellanos',
    'Gio Urshela':'Giovanny Urshella',
    'DJ Stewart': 'D.J. Stewart',
    'Abraham Toro-Hernandez':'Abraham Toro',
    'Michael Taylor': 'Michael A. Talyor',
    'JT Riddle': 'J.T. Riddle',
    #'Bobby Witt': 'Robert Witt', #can't find it, probably not in fangraphs....
    #'Mark Payton',
    #'Taylor Trammell',
    'Nate Lowe':'Nathaniel Lowe',
    'Yu Chang':'Yu-Cheng Chang',
    #'Andrew Vaughn',
    #'Elehuris Montero',
    #'Jose Garcia',
    'Cedric Mullins': 'Cedric Mullins II',
    #'Jordan Weems',
    #"Brian O'Keefe",
    #'Cal Raleigh',
    'Stevie Wilkerson': 'Steve Wilkerson'
}
player_to_fg_id = { #manually...
    'Austin Adams RP | SEA': '13801',
    'David Peterson SP | NYM' : '20302',
    'Tyler Alexander SP | DET': '17735',
    'Javy Guerra RP | WAS': '7407',
    'Cody Reed RP | CIN'  : '15232',
    'Javy Guerra RP | SD' : '17292',
    'Wander Franco SS | TB' : 'sa3007033',
    'Jose Martinez RF | TB' : '7996'


}

#namer.fg_data.reset_index(inplace=True)
namer.add_ids_from_fg_data()
namer.add_ids_from_dict(player_to_fg_id, name_data_key='Player')
namer.add_ids_from_dict(cbs_to_fg_names, name_data_key='Player')


In [11]:
df = both.merge(namer.name_data, on='playerid').drop('Name_y',axis=1)
name = df['Name_x'].copy()
df.drop('Name_x',inplace=True,axis=1)
df.insert(1,'Name',name)
df.head()


Unnamed: 0,playerid,Name,G,PA,AB,H,2B,3B,HR,R,...,2B_elig,3B_elig,SS_elig,MI_elig,CI_elig,OF_elig,RF_elig,CF_elig,LF_elig,U_elig
0,10030,Chris Owings,25.5,92.5,85.5,21.5,4.5,0.5,2.5,10.5,...,True,False,False,True,False,False,False,False,False,True
1,10047,Wil Myers,49.0,194.0,171.5,40.5,9.0,1.0,8.0,22.5,...,False,False,False,False,False,True,False,True,True,True
2,10059,Max Stassi,23.5,82.0,73.5,15.0,3.5,0.0,2.5,8.5,...,False,False,False,False,False,False,False,False,False,True
3,10067,Tomas Telis,18.0,66.0,62.0,17.0,3.0,0.5,1.0,7.0,...,False,False,False,False,False,False,False,False,False,True
4,10071,Jonathan Villar,54.0,234.5,209.5,54.0,9.5,1.0,6.0,29.0,...,True,False,True,True,False,False,False,False,False,True


In [12]:
cbs.head()
#cbs.columns

Unnamed: 0,Avail,Player,Eligible,BPA,AB,2B,3B,HR,BB,K,...,2B_elig,3B_elig,SS_elig,MI_elig,CI_elig,OF_elig,RF_elig,CF_elig,LF_elig,U_elig
0,You Drink Bitch's Wine,Aaron Judge RF | NYY,"OF,RF,U",36,31,1,0,6,2,11,...,False,False,False,False,False,False,False,False,False,False
1,You Drink Bitch's Wine,Dansby Swanson SS | ATL,"MI,SS,U",40,38,3,0,2,1,14,...,False,False,False,False,False,False,False,False,False,False
2,12oz Darlings,Nick Castellanos RF | CIN,"OF,RF,U",36,31,3,0,4,4,8,...,False,False,False,False,False,False,False,False,False,False
3,12oz Darlings,Kyle Lewis RF | SEA,"CF,OF,RF,U",44,40,0,0,3,4,17,...,False,False,False,False,False,False,False,False,False,False
4,Cackleberry Czars,Trevor Story SS | COL,"MI,SS,U",36,29,0,0,4,7,4,...,False,False,False,False,False,False,False,False,False,False


In [13]:
df['Avail'].unique()

array(['FA', 'Omak Goat Rodeo', 'Bus 3', 'Droitwich Murdercocks',
       'DJ Dootchy Dootch and 30-50 Feral Berm Angels',
       'Cackleberry Czars', "You Drink Bitch's Wine", 'W ( 8/4)',
       'Springer International', 'Screaming Prairie Camels',
       'Johnny and the Rockers', 'SweepTheLegJohnny',
       'Chicken Inn of Utica', 'The Midnight Sillies', '12oz Darlings',
       'Union State Connectors', 'W ( 8/5)', 'Suburban Lifestyle Dream'],
      dtype=object)

In [31]:
import re

class LeagueViewer:
    def __init__(self,hitters,pitchers):
        self.hitters = hitters
        self.pitchers = pitchers
        self.hitter_cols = ['Name','Eligible','PA','AVG','R','HR','RBI','SB','FWAR']
        self.pitcher_cols = ['Name','IP','W','K','S','ERA','WHIP','FWAR']
        
        self.waivers_reg_ex = r'W \( (\d){1,2}/(\d){1,2}\)'
        self.fa_regex = r'(FA)|W \( (\d){1,2}/(\d){1,2}\)'
        
    def list_teams(self):
        out = self.hitters['Avail'].unique()
        out = [x for x in out if not re.match(self.fa_regex,x)]
        return out
    
    def best_free_hitters(self,num=10,pos='U'):
        out = self.hitters[self.hitters['Avail'].str.match(self.fa_regex)]
        out = out[out[pos+'_elig']]
        return out[self.hitter_cols].sort_values(by='FWAR',ascending=False)
    
    def best_free_pitchers(self,num=10):
        out = self.pitchers[self.pitchers['Avail'].str.match(self.fa_regex)]
        return out[self.pitcher_cols].sort_values(by='FWAR',ascending=False)
    
    def view_team(self,team):
        out = self.hitters[self.hitters['Avail'].str.match(team,case=False)]
        return out[self.hitter_cols].sort_values(by='FWAR',ascending=False)

In [15]:
#test that reg_ex
test = LeagueViewer(df,False)
reg_ex = 'W \( (\d){1,2}'
string = "FA"
print(re.match(test.fa_regex, string))

<_sre.SRE_Match object; span=(0, 2), match='FA'>


In [16]:
test.best_free_hitters(pos='1B')
test.list_teams()
test.view_team('bus')

Unnamed: 0,Name,Eligible,PA,AVG,R,HR,RBI,SB,FWAR
439,Alex Bregman,"3B,CI,MI,SS,U",243.5,0.289,39.5,12.5,40.0,3.0,4.846472
4,Jonathan Villar,"2B,MI,SS,U",234.5,0.258,29.0,6.0,21.0,13.5,3.64327
495,Pete Alonso,"1B,CI,U",235.5,0.254,33.5,16.0,38.5,0.5,3.393745
131,Eugenio Suarez,"3B,CI,U",234.0,0.2575,31.0,13.5,37.5,1.0,2.834886
344,Austin Meadows,"LF,OF,RF,U",205.5,0.271,28.0,9.0,29.0,5.0,2.59636
422,Ramon Laureano,"CF,OF,U",198.5,0.259,27.0,8.0,24.5,5.5,1.930973
645,Avisail Garcia,"OF,RF,U",190.0,0.275,23.0,8.0,27.0,3.0,1.405923
55,Jonathan Schoop,"2B,MI,U",203.0,0.264,26.0,9.5,28.0,0.5,1.056399
215,David Dahl,"CF,LF,OF,RF,U",176.5,0.276,24.5,7.5,23.5,2.5,0.985355
35,Corey Dickerson,"LF,OF,U",184.0,0.272,21.0,6.5,23.0,1.0,0.020773


In [17]:
os.listdir('data')

['cbs_elig_3aug2020.csv',
 'cbs_hitter_projections_3aug2020.csv',
 'cbs_pitcher_projections_4aug2020.csv',
 'projection_data_files',
 'steamer_hitters_2020.csv',
 'steamer_pitchers_2020.csv',
 'TestFiles',
 'zips_hitters_2020.csv',
 'zips_pitchers_2020.csv']

In [18]:
steam = pd.read_csv('data/steamer_pitchers_2020.csv')
zips = pd.read_csv('data/zips_pitchers_2020.csv')
cbs = pd.read_csv('data/cbs_pitcher_projections_4aug2020.csv', skiprows=1, skipfooter=1,engine='python')

steam.columns

Index(['Name', 'Team', 'W', 'L', 'ERA', 'GS', 'G', 'SV', 'IP', 'H', 'ER', 'HR',
       'SO', 'BB', 'WHIP', 'K/9', 'BB/9', 'FIP', 'WAR', 'RA9-WAR', 'ADP',
       'playerid'],
      dtype='object')

In [19]:
zips.columns
zips.rename(columns={'SO':'K'}, inplace=True)

steam.rename(columns={'SO':'K','SV':'S'}, inplace=True)

In [20]:
print(cbs.columns)
cbs.drop('Unnamed: 17',axis = 1, inplace=True)
cbs.rename(columns={'INNs':'IP','APP':'G'}, inplace=True)

Index(['Avail', 'Player', 'INNs', 'APP', 'GS', 'QS', 'CG', 'W', 'L', 'S', 'BS',
       'K', 'BB', 'H', 'ERA', 'WHIP', 'Rank', 'Unnamed: 17'],
      dtype='object')


In [21]:
pitchers = pd.concat([zips,steam])
names = pitchers[['Name','playerid']].groupby('playerid').agg('max')
pitchers = pitchers.groupby('playerid').agg('mean')
pitchers.insert(0,'Name',names)
pitchers.reset_index(inplace=True)

In [22]:
pitchers.head()

Unnamed: 0,playerid,Name,W,L,ERA,GS,G,IP,H,ER,...,K,BB,WHIP,K/9,BB/9,FIP,WAR,ADP,S,RA9-WAR
0,10021,Mike Minor,4.0,3.5,4.47,11.0,11.0,69.0,67.5,34.5,...,65.0,21.0,1.28,8.46,2.765,4.475,1.15,168.6,0.0,1.4
1,10029,Patrick Schuster,0.0,0.0,4.95,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.52,7.12,4.16,5.16,0.0,999.0,0.0,0.0
2,10039,Keyvius Sampson,1.0,1.5,4.98,2.5,4.5,18.5,15.0,10.0,...,19.5,10.5,1.45,8.93,4.925,4.83,0.2,999.0,0.0,0.0
3,10044,Ashur Tolliver,0.0,0.0,5.28,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.61,7.38,5.17,5.57,0.0,999.0,0.0,0.0
4,10058,Tyler Matzek,0.0,0.0,6.43,0.0,7.0,7.0,6.0,5.0,...,9.0,10.0,2.29,11.57,12.86,7.21,-0.2,999.0,,


In [23]:
namer = NameToFangraphsID(pitchers,cbs)
namer.transform_suffix('Jr.')
cbs_pit_to_fg_names = { ## we are going to comment out unhandled cases so we have a record later.
    #'Lance McCullers',# a junior, will fix elsewise
    'Kwang Hyun Kim': 'Kwang-hyun Kim',
    'Jake Junis': 'Jakob Junis',
    #'Carl Edwards', #jr.
    'J.T. Brubaker':'Johnathan Brubaker',
    #'Nick Lodolo',##not forcast in either set!
    #'Brooks Raley',#not found
    'Cam Hill':'Cameron Hill',
    #'Stephen Woods':'', #jr.
    #'Duane Underwood',#jr.
    'Mike Shawaryn':'Michael Shawaryn'
}
player_to_fg_id = { #manually...
    'Austin Adams RP | SEA': '13801',
    'David Peterson SP | NYM' : '20302',
    'Tyler Alexander SP | DET': '17735',
    'Javy Guerra RP | WAS': '7407',
    'Cody Reed RP | CIN'  : '15232',
    'Javy Guerra RP | SD' : '17292',
    'Wander Franco SS | TB' : 'sa3007033',
    'Jose Martinez RF | TB' : '7996'


}

#namer.fg_data.reset_index(inplace=True)
namer.add_ids_from_fg_data()
namer.add_ids_from_dict(player_to_fg_id, name_data_key='Player')
namer.add_ids_from_dict(cbs_to_fg_names, name_data_key='Player')

In [24]:
namer.name_data.head()

Unnamed: 0,Avail,Player,IP,G,GS,QS,CG,W,L,S,BS,K,BB,H,ERA,WHIP,Rank,playerid,Name
0,Suburban Lifestyle Dream,Gerrit Cole SP | NYY,80,13,13,11,0,7,3,0,0,99,27,66,3.25,1.16,7,13125,Gerrit Cole
1,Union State Connectors,Jacob deGrom SP | NYM,73,12,12,10,0,5,2,0,0,82,18,60,2.59,1.07,13,10954,Jacob deGrom
2,Screaming Prairie Camels,Max Scherzer SP | WAS,74,12,12,9,0,6,3,0,0,89,21,64,3.27,1.14,14,3137,Max Scherzer
3,Johnny and the Rockers,Justin Verlander SP | HOU,78,13,13,10,1,7,3,0,0,98,21,63,3.32,1.07,17,8700,Justin Verlander
4,Screaming Prairie Camels,Shane Bieber SP | CLE,66,11,11,8,1,5,3,0,0,76,14,62,3.53,1.15,19,19427,Shane Bieber


In [25]:
pit_df = pd.merge(pitchers, namer.name_data[['Name','playerid','Avail']], on='playerid')
pit_df.drop('Name_y',inplace = True,axis=1)
pit_df.rename(columns={'Name_x':'Name'}, inplace = True)

In [26]:
pit_df.head()

Unnamed: 0,playerid,Name,W,L,ERA,GS,G,IP,H,ER,...,BB,WHIP,K/9,BB/9,FIP,WAR,ADP,S,RA9-WAR,Avail
0,10021,Mike Minor,4.0,3.5,4.47,11.0,11.0,69.0,67.5,34.5,...,21.0,1.28,8.46,2.765,4.475,1.15,168.6,0.0,1.4,Chicken Inn of Utica
1,10058,Tyler Matzek,0.0,0.0,6.43,0.0,7.0,7.0,6.0,5.0,...,10.0,2.29,11.57,12.86,7.21,-0.2,999.0,,,FA
2,10061,Brooks Raley,3.0,3.0,5.24,10.0,10.0,55.0,60.0,32.0,...,24.0,1.53,6.87,3.93,5.29,0.3,999.0,,,FA
3,10066,Ian Krol,0.5,0.5,4.615,0.0,10.0,10.0,9.5,5.0,...,4.5,1.425,9.155,4.185,4.705,0.05,999.0,0.0,0.0,FA
4,10078,Caleb Thielbar,0.5,0.5,4.725,0.0,8.5,11.5,12.0,6.5,...,3.0,1.32,8.055,2.595,4.715,0.0,999.0,0.0,0.0,FA


In [27]:
stats.replacement_level['P'] = stats.pitcher_replacement_level(pit_df)
pit_df['FWAR'] = pit_df.apply(stats.pitcherFWAR, axis = 1)

In [28]:
pit_df.sort_values(by='FWAR',ascending=False)[20:30]

Unnamed: 0,playerid,Name,W,L,ERA,GS,G,IP,H,ER,...,WHIP,K/9,BB/9,FIP,WAR,ADP,S,RA9-WAR,Avail,FWAR
166,13074,Yu Darvish,3.5,3.0,3.855,10.5,10.5,63.0,54.0,27.0,...,1.185,10.635,2.955,3.72,1.25,52.4,0.0,1.3,Suburban Lifestyle Dream,3.846521
710,20099,Chris Paddack,4.0,3.5,3.825,10.5,10.5,61.5,56.0,26.5,...,1.155,9.46,2.235,3.965,1.15,50.0,0.0,1.0,Cackleberry Czars,3.675436
765,3548,Liam Hendriks,2.0,1.0,3.13,0.0,25.5,26.0,20.0,9.0,...,1.055,12.13,2.565,2.925,0.55,75.8,11.0,0.4,Screaming Prairie Camels,3.584777
258,14168,Jose Berrios,5.0,4.0,4.375,12.0,12.0,72.0,69.5,35.0,...,1.275,8.875,2.78,4.32,1.2,67.3,0.0,1.1,Johnny and the Rockers,3.555354
64,11486,Robbie Ray,4.0,3.5,4.14,11.0,11.0,63.0,53.5,29.0,...,1.32,11.31,4.225,4.18,1.0,152.6,0.0,1.0,Union State Connectors,3.477003
300,14710,Edwin Diaz,1.5,1.0,2.96,0.0,25.0,24.5,17.0,8.0,...,1.035,13.565,3.07,2.865,0.45,113.7,11.0,0.4,Springer International,3.460779
373,15440,Matthew Boyd,4.0,4.0,4.36,11.0,11.0,66.0,61.5,32.0,...,1.23,9.925,2.665,4.24,1.2,152.0,0.0,1.1,Suburban Lifestyle Dream,3.388703
337,15038,German Marquez,4.0,3.5,4.145,11.0,11.0,68.0,66.0,31.5,...,1.225,9.17,2.3,3.905,1.4,197.0,0.0,1.5,You Drink Bitch's Wine,3.371957
851,6632,Carlos Carrasco,4.0,3.0,3.87,9.0,10.0,54.5,51.5,23.5,...,1.17,9.98,2.11,3.95,1.05,124.0,0.0,1.3,Springer International,3.32317
271,14374,Tyler Glasnow,3.5,2.5,3.58,9.0,9.0,49.0,39.5,19.5,...,1.21,11.74,3.65,3.6,1.05,63.6,0.0,1.1,Omak Goat Rodeo,3.256246


In [32]:
test = LeagueViewer(df,pit_df)
test.best_free_pitchers()

Unnamed: 0,Name,IP,W,K,S,ERA,WHIP,FWAR
736,Corey Kluber,61.0,4.0,62.0,0.0,4.005,1.200,3.238097
283,Jose Leclerc,24.5,1.0,35.0,10.0,3.560,1.295,2.070665
378,Marco Gonzales,67.5,4.5,52.0,0.0,4.405,1.330,1.931365
742,Rick Porcello,64.5,3.5,56.0,0.0,4.590,1.300,1.609364
253,Kevin Gausman,55.0,3.0,51.0,0.0,4.075,1.280,1.606104
...,...,...,...,...,...,...,...,...
786,Rafael Dolis,18.0,2.0,18.0,,4.000,1.330,
813,Anthony Gose,9.0,0.0,9.0,,8.000,2.330,
1246,Stetson Allie,14.0,0.0,16.0,,7.070,2.000,
1459,Carlos Belen,25.0,1.0,20.0,,3.960,1.400,


In [34]:
test.view_team('omak')

Unnamed: 0,Name,Eligible,PA,AVG,R,HR,RBI,SB,FWAR
538,Juan Soto,"LF,OF,U",243.5,0.2935,39.0,13.0,42.5,3.5,5.310864
608,Giancarlo Stanton,"LF,OF,U",204.5,0.2625,32.0,16.0,41.0,1.0,3.75917
71,Manny Machado,"3B,CI,MI,SS,U",245.0,0.2735,31.5,12.5,37.5,2.5,3.554445
517,Vladimir Guerrero Jr.,"3B,CI,U",214.5,0.2915,27.5,9.0,33.5,1.0,2.224069
15,Danny Santana,"1B,CF,CI,MI,OF,U",194.0,0.255,25.0,7.5,25.0,6.5,1.873642
1,Wil Myers,"CF,LF,OF,U",194.0,0.2355,22.5,8.0,23.5,6.0,1.175735
127,Kolten Wong,"2B,MI,U",197.5,0.2695,23.5,4.5,19.0,6.0,0.928869
473,Brian Anderson,"3B,CI,OF,RF,U",225.5,0.26,27.0,7.0,25.5,2.0,0.781626
502,Nick Senzel,"CF,OF,U",179.0,0.2615,22.0,5.5,20.5,5.0,0.704187
542,David Peralta,"LF,OF,U",192.0,0.2765,23.0,6.5,25.0,1.0,0.433398
