# Sandbox

This is a top level notebook for developing and playing with code in the local directory.

The expected use is not generally to share code, but to have a space for notebook activity of exploratory nature.

In [18]:
import os
import sys
import warnings
import re

import pandas as pd

sys.path.append(r'/home/lenhart/Repos/phi-utils')

from philosofool.data_sources.utils import read_yml
from fantasy_baseball_draft.utils import StatSynonyms, load_cbs_data, DataLoader

data_path = read_yml('local/config.yml')['paths']['local_data']


In [19]:
class AssociatePlayers:
    """Associate entities from different datasets.

    Given two dataframes that share entities but not a common single
    join key, this can construct a merge of the data based on columns
    that are assumed to find unique one.

    Example: Given dataframes of pitchers in 2019 that use different
    naming conventions ('Mike', 'Michael', etc.) we can align these by 
    assuming that two pitchers are the same if they started the same
    number of games, had the same number of walks, hits and strikeouts.
    """
    syns = StatSynonyms()

    def associate(self, df1, df2, index_cols):
        if df1.duplicated().sum():
            warnings.warn("Found duplicated in df1.")
        if df2.duplicated().sum():
            warnings.warn("Found duplicated in df2.")
        df1 = self.syns.normalize_df(df1)
        df2 = self.syns.normalize_df(df2)
        df = df1.set_index(index_cols).merge(df2.set_index(index_cols), left_index=True, right_index=True)
        return df


In [20]:
associate = AssociatePlayers()
#

index_cols = ['H', 'BB', 'RBI', 'K']
index_cols = ['IP', 'W', 'G', 'K']

In [21]:
# syns = AssociatePlayers.syns

loader = DataLoader(os.path.join(data_path, 'historical'))
fg22 = loader.load_csv('fg_hitters_2022.csv')
cbs22 = loader.load_cbs_csv('cbs_hitters_2022.csv')
fg22[fg22.Name.str.contains('France')]
#cbs22[cbs22.Player.str.contains('France')]

Unnamed: 0,Name,Team,G,AB,PA,H,1B,2B,3B,HR,...,IBB,K,HBP,SF,SH,GDP,SB,CS,BA,playerid
96,Ty France,SEA,140,551,613,151,103,27,1,20,...,3,94,21,5,0,18,0,0,0.274,17982


In [22]:
def build_id_map(cbs_path, fg_path, index_cols, duplicated=False):
    data_path = read_yml('local/config.yml')['paths']['local_data']
    hist_path = os.path.join(data_path, 'historical')
    loader = DataLoader(hist_path)
    cbs = loader.load_cbs_csv(cbs_path)
    fg = loader.load_csv(fg_path)
    
    cbs = cbs[cbs[index_cols].sum(axis=1) > 0]
    org = len(cbs)
    if duplicated:
        return cbs[cbs.duplicated(subset=index_cols, keep=False)]
    cbs = cbs.drop_duplicates(subset=index_cols, keep=False)
    print(f"Dropped {org - len(cbs)} records.")

    cbs_to_fg = associate.associate(cbs, fg, index_cols)
    cbs_to_fg = dict(zip(cbs_to_fg.Player.str.strip(), cbs_to_fg.playerid))
    return cbs_to_fg

# import json
# with open(data_path + '/hitter_ids.json', 'w') as j:
#     j.write(json.dumps(cbs_to_fg_h))

pitcher_ids = build_id_map('cbs_pitchers_2022.csv', 'fg_pitchers_2022.csv', ['IP', 'W', 'G', 'K'])
hitter_ids = build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'])
#build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'], True)
{k: v for k, v in hitter_ids.items() if 'Franc' in k}# [hitter_ids..str.contains('Franc')]

Dropped 34 records.
Dropped 33 records.


{'Francisco Alvarez C | NYM': 26121,
 'Franchy Cordero 1B | BAL': 14567,
 'Francisco Mejia C | TB': 16403,
 'Maikel Franco 3B | WAS': 12179,
 'Wander Franco SS | TB': 23667,
 'Francisco Lindor SS | NYM': 12916}

In [23]:
projections = DataLoader(os.path.join(data_path, 'projections'))
cbs_hitters = projections.load_cbs_csv('cbs_hitters.csv')
cbs_pitchers = projections.load_cbs_csv('cbs_pitchers.csv')
print(data_path)
fg_hitters = projections.load_csv('fg-depth-chart_hitters.csv')
fg_pitcher = projections.load_csv('fg-depth-chart_pitchers.csv')

cbs_hitters['playerid'] = cbs_hitters.Player.map(hitter_ids).fillna(-1).astype(int)
cbs_pitchers['playerid'] = cbs_pitchers.Player.map(pitcher_ids).fillna(-1).astype(int)
cbs_hitters['PA'] = cbs_hitters.AB + cbs_hitters.BB

/home/lenhart/Dropbox/baseball/fantasy_data


In [24]:
fg_hitters[fg_hitters.Name.str.contains("Tatis")]

Unnamed: 0,Name,Team,G,PA,AB,H,1B,2B,3B,HR,...,BsR,Fld,Off,Def,WAR,ADP,InterSD,InterSK,IntraSD,playerid
16,Fernando Tatis Jr.,SDP,123,532,466,128,65,24,2,36,...,2.77516,-2.38732,38.830564,-8.437318,5.07054,18.290001,,,,19709


In [25]:
hitter_ids['Ty France 1B | SEA'] = 17982
hitter_ids["Fernando Tatis SS | SD"] = 19709

In [26]:
cbs_pitchers['ER'] = cbs_pitchers.ERA * cbs_pitchers.IP / 9

In [27]:
def merge_fg(cbs, base_path, cols, name_filter=lambda x: x):
    cbs = cbs.copy()
    ids = cbs.playerid.to_numpy()
    loader = DataLoader(base_path)
    for path in os.listdir(base_path):
        if name_filter(path) and 'cbs' not in path:
            #print(path)
            suffixes = ('', '_' + path[:5].strip('_'))
            df = loader.load_csv(path)
            if len(df) == 0:
                #print(f'{path} data is empty...')
                continue
            #print(df.columns, df.playerid[:10])
            df = df[~df.playerid.astype(str).str.startswith('sa')]

            df.playerid = df.playerid.astype(int)
            df = df[df.playerid.isin(list(ids))]
            assert len(df) > 0, 'df empty, which is sort of like wdf?'
            cbs = cbs.merge(df[cols], on='playerid', suffixes=suffixes, how='left')
    return cbs

def merge_fg_hitters(cbs, base_path):
    cols = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
    return merge_fg(cbs, base_path, cols, lambda x: 'hitter' in x)

def merge_fg_pitchers(cbs, base_path):
    cols = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
    return merge_fg(cbs, base_path, cols, lambda x: 'pitch' in x)

merge_fg_hitters(cbs_hitters, data_path + '/projections')
merge_fg_pitchers(cbs_pitchers, data_path + '/projections')
#cbs_pitchers

Unnamed: 0,Avail,Player,IP,G,GS,QS,CG,W,L,S,...,WHIP_zips,K_zips,S_zips,W_zips,IP_steam,ER_steam,WHIP_steam,K_steam,S_steam,W_steam
0,SweepTheLegJohnny,Shohei Ohtani DH | LAA,159,24,24,17,0,13,6,0,...,1.070510,193.0,0.0,13.0,173.0,62.0,1.10357,217.0,0.0,12.0
1,SweepTheLegJohnny,Gerrit Cole SP | NYY,191,31,31,22,1,15,8,0,...,1.035450,224.0,0.0,14.0,197.0,71.0,1.06028,248.0,0.0,14.0
2,Droitwich Murdercocks,Corbin Burnes SP | MIL,177,30,30,21,0,10,7,0,...,1.049910,215.0,0.0,12.0,197.0,69.0,1.09431,240.0,0.0,13.0
3,Strike Four,Max Scherzer SP | NYM,185,30,30,21,0,14,7,0,...,1.019110,172.0,0.0,10.0,191.0,71.0,1.04863,228.0,0.0,13.0
4,Marjorie Taylor Greene's Only Fans Page,Justin Verlander SP | NYM,182,30,30,23,0,17,5,0,...,0.963636,177.0,0.0,14.0,180.0,67.0,1.06439,202.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,W,Jack Labosky RP | TB,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2974,W,Dominic Hamel SP | NYM,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2975,W,Braden Shipley RP | SEA,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2976,W,Derek West RP | HOU,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [28]:
def add_weighted_stats(df, stats, playtime_stat):
    df = df.copy()
    pt = playtime_stat
    suffixes = set(['_'.join(col.split('_')[1:]) for col in df.columns if len(col.split('_')) > 1])
    df[f'{pt}_adj'] = (df[f'{pt}_fg-de'] + df[pt]) / 2

    for suf in suffixes:
        for col in stats:
            df[f'{col}_{suf}'] = df[f'{col}_{suf}'] / df[f'{pt}_{suf}'] * df[f'{pt}_adj']
    for col in stats:
        adj_cols = [col] + [c for c in df.columns if re.match(f'{col}_', c)]
        df[f'{col}_adj'] = (df[adj_cols].sum(axis=1) / df[adj_cols].notna().sum(axis=1)).round()
    return df

#add_weighted_stats(cbs_hitters, cols[2:], 'PA')
hitter_stats = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
pitcher_stats = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
full_hitters = (cbs_hitters
    .pipe(merge_fg_hitters, data_path + '/projections')
    .pipe(add_weighted_stats, hitter_stats[2:], 'PA'))
full_pitchers = (cbs_pitchers
    .pipe(merge_fg_pitchers, data_path + '/projections')
    .pipe(add_weighted_stats, pitcher_stats[2:], 'IP')
)

In [29]:
from fantasy_baseball_draft.spg import FantasyValuator, spgs_from_standings_html

valuator = FantasyValuator(spgs_from_standings_html(os.path.join(data_path, 'standings/cbs_2021_standings.html')))


In [30]:
elig = DataLoader(os.path.join(data_path, 'eligibility')).load_cbs_csv('eligibility.csv')

def extract_projection(df):
    extract = ['Avail', 'Player', 'playerid'] + [col for col in df.columns if '_adj' in col]
    df = df[extract].rename(columns={k: re.sub('_adj', '', k) for k in extract})
    return df

hitter_proj = extract_projection(full_hitters)
hitter_proj = hitter_proj.merge(elig[['Player', 'Eligible']],  how='left', on='Player')
hitter_proj['fwar'] = valuator.valuate_hitters(hitter_proj, 16*12)

pitcher_proj = extract_projection(full_pitchers)
pitcher_proj['ERA'] = pitcher_proj.ER / pitcher_proj.IP * 9
pitcher_proj['fwar'] = valuator.valuate_pitchers(pitcher_proj, 16*12)

hitter_proj.sort_values('fwar', ascending=False).head(20)

6.161625074475289 -0.08089606460063217


Unnamed: 0,Avail,Player,playerid,PA,AB,H,HR,R,RBI,SB,Eligible,fwar
12,Walk-In Clostes,Fernando Tatis SS | SD,-1,,523.0,153.0,37.0,100.0,96.0,25.0,"MI,SS,U",11.925616
0,You Drink Bitches' Wine,Aaron Judge CF | NYY,15640,656.5,552.0,156.0,45.0,109.0,112.0,11.0,"CF,RF,U",11.438076
2,Walk-In Clostes,Trea Turner SS | PHI,16252,653.0,597.0,176.0,22.0,95.0,80.0,28.0,"MI,SS,U",10.228253
1,Walk-In Clostes,Jose Ramirez 3B | CLE,13510,657.0,578.0,157.0,31.0,94.0,105.0,23.0,"3B,CI,U",10.162302
4,Walk-In Clostes,Ronald Acuna RF | ATL,18401,622.0,535.0,144.0,28.0,99.0,77.0,33.0,"RF,U",9.940224
3,Walk-In Clostes,Julio Rodriguez CF | SEA,23697,641.0,580.0,158.0,29.0,93.0,87.0,26.0,"CF,U",9.467323
10,Freddie &amp;amp; the Mets,Freddie Freeman 1B | LAD,5361,663.0,573.0,172.0,25.0,102.0,95.0,10.0,"1B,CI,U",9.221877
7,Freddie &amp;amp; the Mets,Kyle Tucker RF | HOU,18345,620.0,548.0,149.0,30.0,85.0,99.0,21.0,"RF,U",8.990278
11,Omak Wrong Players,Vladimir Guerrero 1B | TOR,19611,639.5,567.0,164.0,35.0,94.0,102.0,6.0,"1B,CI,U",8.895796
8,Omak Wrong Players,Juan Soto RF | SD,20123,687.0,539.0,149.0,32.0,111.0,90.0,10.0,"RF,U",8.609521


In [31]:
hitter_proj.to_csv('local/hitter_proj.csv')
pitcher_proj.to_csv('local/pitcher_proj.csv')