# Sandbox

This is a top level notebook for developing and playing with code in the local directory.

The expected use is not generally to share code, but to have a space for notebook activity of exploratory nature.

In [25]:
import os
import sys
import warnings
import re

import pandas as pd

sys.path.append(r'C:\Users\lenha\Repos\phi-utils')

from philosofool.data_sources.utils import read_yml
from fantasy_baseball_draft.utils import StatSynonyms, load_cbs_data, DataLoader

data_path = read_yml('local/config.yml')['paths']['local_data']


In [None]:
class AssociatePlayers:
    """Associate entities from different datasets.

    Given two dataframes that share entities but not a common single
    join key, this can construct a merge of the data based on columns
    that are assumed to find unique one.

    Example: Given dataframes of pitchers in 2019 that use different
    naming conventions ('Mike', 'Michael', etc.) we can align these by 
    assuming that two pitchers are the same if they started the same
    number of games, had the same number of walks, hits and strikeouts.
    """
    syns = StatSynonyms()

    def associate(self, df1, df2, index_cols):
        if df1.duplicated().sum():
            warnings.warn("Found duplicated in df1.")
        if df2.duplicated().sum():
            warnings.warn("Found duplicated in df2.")
        df1 = self.syns.normalize_df(df1)
        df2 = self.syns.normalize_df(df2)
        df = df1.set_index(index_cols).merge(df2.set_index(index_cols), left_index=True, right_index=True)
        return df


In [None]:
associate = AssociatePlayers()
#

index_cols = ['H', 'BB', 'RBI', 'K']
index_cols = ['IP', 'W', 'G', 'K']

In [90]:
# syns = AssociatePlayers.syns

loader = DataLoader(os.path.join(data_path, 'historical'))
fg22 = loader.load_csv('fg_hitters_2022.csv')
cbs22 = loader.load_cbs_csv('cbs_hitters_2022.csv')
fg22[fg22.Name.str.contains('France')]
#cbs22[cbs22.Player.str.contains('France')]

Unnamed: 0,Name,Team,G,AB,PA,H,1B,2B,3B,HR,...,IBB,K,HBP,SF,SH,GDP,SB,CS,BA,playerid
96,Ty France,SEA,140,551,613,151,103,27,1,20,...,3,94,21,5,0,18,0,0,0.274,17982


In [101]:
france = cbs22[cbs22.Player.map(hitter_ids).isna()].sort_values('H').loc[66, 'Player']
#cbs22.Player.map(hitter_ids)
hitter_ids[france] = 17982

'Ty France 1B | SEA'

In [80]:
def build_id_map(cbs_path, fg_path, index_cols, duplicated=False):
    data_path = read_yml('local/config.yml')['paths']['local_data']
    hist_path = os.path.join(data_path, 'historical')
    loader = DataLoader(hist_path)
    cbs = loader.load_cbs_csv(cbs_path)
    fg = loader.load_csv(fg_path)
    
    cbs = cbs[cbs[index_cols].sum(axis=1) > 0]
    org = len(cbs)
    if duplicated:
        return cbs[cbs.duplicated(subset=index_cols, keep=False)]
    cbs = cbs.drop_duplicates(subset=index_cols, keep=False)
    print(f"Dropped {org - len(cbs)} records.")

    cbs_to_fg = associate.associate(cbs, fg, index_cols)
    cbs_to_fg = dict(zip(cbs_to_fg.Player.str.strip(), cbs_to_fg.playerid))
    return cbs_to_fg

# import json
# with open(data_path + '/hitter_ids.json', 'w') as j:
#     j.write(json.dumps(cbs_to_fg_h))

pitcher_ids = build_id_map('cbs_pitchers_2022.csv', 'fg_pitchers_2022.csv', ['IP', 'W', 'G', 'K'])
hitter_ids = build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'])
#build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'], True)
{k: v for k, v in hitter_ids.items() if 'Franc' in k}# [hitter_ids..str.contains('Franc')]

Dropped 34 records.
Dropped 33 records.


{'Francisco Alvarez C | NYM': 26121,
 'Franchy Cordero 1B | BAL': 14567,
 'Francisco Mejia C | TB': 16403,
 'Maikel Franco 3B | WAS': 12179,
 'Wander Franco SS | TB': 23667,
 'Francisco Lindor SS | NYM': 12916}

In [97]:
pd.concat([fg22[fg22.Name.str.contains('France')][['H', 'BB', 'RBI', 'K']],
cbs22[cbs22.Player.str.contains('France')][['H', 'BB', 'RBI', 'K']]])

Unnamed: 0,H,BB,RBI,K
96,151,35,83,94
66,152,35,84,94


In [85]:
fg_hitters[fg_hitters.Name.str.contains('France')]

Unnamed: 0,Name,Team,G,PA,AB,H,1B,2B,3B,HR,...,BsR,Fld,Off,Def,WAR,ADP,InterSD,InterSK,IntraSD,playerid
78,Ty France,SEA,141,609,543,147,98,29,1,19,...,-1.43511,3.18909,17.127338,-7.735913,3.02265,178.660004,,,,17982


In [83]:
cbs_hitters[cbs_hitters.Player.str.contains('Franc')]

Unnamed: 0,Avail,Player,AB,R,H,1B,2B,3B,HR,RBI,BB,K,SB,CS,BA,OBP,SLG,Rank,playerid,PA
23,Freddie &amp;amp; the Mets,Francisco Lindor SS | NYM,589,89,159,106,25,4,24,95,56,119,15,5,0.27,0.339,0.448,28,12916,645
53,permanent recess,Wander Franco SS | TB,582,86,164,108,38,5,13,68,43,77,11,2,0.282,0.332,0.431,86,23667,625
100,W,Ty France 1B | SEA,571,72,167,111,34,2,20,81,35,104,0,0,0.292,0.354,0.464,161,-1,606
171,W,Francisco Alvarez C | NYM,191,24,45,29,9,1,6,24,22,46,1,0,0.236,0.326,0.387,282,26121,213
326,W,Franchy Cordero 1B | BAL,310,42,73,41,18,2,12,41,32,99,5,2,0.235,0.308,0.423,493,14567,342
341,W,Francisco Mejia C | TB,398,43,100,60,27,2,11,44,18,85,0,0,0.251,0.292,0.412,514,16403,416
592,W,Francisco Diaz C | NYY,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,9999,-1,0
898,W,Francisco Hernandez 3B | STL,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,9999,-1,0
1037,W,Junior Franco LF | ARI,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,9999,-1,0
1131,W,Maikel Franco 3B | WAS,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,9999,12179,0


In [None]:
projections = DataLoader(os.path.join(data_path, 'projections'))
cbs_hitters = projections.load_cbs_csv('cbs_hitters.csv')
cbs_pitchers = projections.load_cbs_csv('cbs_pitchers.csv')
print(data_path)
fg_hitters = projections.load_csv('fg-depth-chart_hitters.csv')
fg_pitcher = projections.load_csv('fg-depth-chart_pitchers.csv')

cbs_hitters['playerid'] = cbs_hitters.Player.map(hitter_ids).fillna(-1).astype(int)
cbs_pitchers['playerid'] = cbs_pitchers.Player.map(pitcher_ids).fillna(-1).astype(int)
cbs_hitters['PA'] = cbs_hitters.AB + cbs_hitters.BB

In [None]:
def merge_fg(cbs, base_path, cols, name_filter=lambda x: x):
    cbs = cbs.copy()
    ids = cbs.playerid.to_numpy()
    loader = DataLoader(base_path)
    for path in os.listdir(base_path):
        if name_filter(path) and 'cbs' not in path:
            print(path)
            suffixes = ('', '_' + path[:5].strip('_'))
            df = loader.load_csv(path)
            if len(df) == 0:
                print(f'{path} data is empty...')
                continue
            #print(df.columns, df.playerid[:10])
            df = df[~df.playerid.astype(str).str.startswith('sa')]

            df.playerid = df.playerid.astype(int)
            df = df[df.playerid.isin(list(ids))]
            assert len(df) > 0, 'df empty, which is sort of like wdf?'
            cbs = cbs.merge(df[cols], on='playerid', suffixes=suffixes, how='left')
    return cbs

def merge_fg_hitters(cbs, base_path):
    cols = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
    return merge_fg(cbs, base_path, cols, lambda x: 'hitter' in x)

def merge_fg_pitchers(cbs, base_path):
    cols = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
    return merge_fg(cbs, base_path, cols, lambda x: 'pitch' in x)

merge_fg_hitters(cbs_hitters, data_path + '/projections')
merge_fg_pitchers(cbs_pitchers, data_path + '/projections')
#cbs_pitchers

In [None]:
cbs_hitters['PA_adj'] = (cbs_hitters['PA_fg-de'] * 3 + cbs_hitters.PA) / 4
suffixes.remove('adj')
suffixes


In [35]:
cbs_pitchers['ER'] = cbs_pitchers.ERA * cbs_pitchers.IP / 9

In [37]:
def add_weighted_stats(df, stats, playtime_stat):
    df = df.copy()
    pt = playtime_stat
    suffixes = set(['_'.join(col.split('_')[1:]) for col in df.columns if len(col.split('_')) > 1])
    df[f'{pt}_adj'] = (df[f'{pt}_fg-de'] + df[pt]) / 2

    for suf in suffixes:
        for col in stats:
            df[f'{col}_{suf}'] = df[f'{col}_{suf}'] / df[f'{pt}_{suf}'] * df[f'{pt}_adj']
    for col in stats:
        adj_cols = [col] + [c for c in df.columns if re.match(f'{col}_', c)]
        df[f'{col}_adj'] = (df[adj_cols].sum(axis=1) / df[adj_cols].notna().sum(axis=1)).round()
    return df

#add_weighted_stats(cbs_hitters, cols[2:], 'PA')
hitter_stats = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
pitcher_stats = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
(cbs_hitters
    .pipe(merge_fg_hitters, data_path + '/projections')
    .pipe(add_weighted_stats, hitter_stats[2:], 'PA'))
(cbs_pitchers
    .pipe(merge_fg_pitchers, data_path + '/projections')
    .pipe(add_weighted_stats, pitcher_stats[2:], 'IP')
)

[15640 13510 16252 23697 18401 13611 19556 18345 20123 19755]
bat_x_proj_hitters.csv
fg-depth-chart_hitters.csv
steamer_hitters.csv
zips_hitters.csv
[19755 13125 19361  3137  8700 18684 16162 16137 21483 14710]
bat_x_pitchers.csv
bat_x_pitchers.csv data is empty...
fg-depth-chart_pitchers.csv
steamer_pitchers.csv
zips_pitchers.csv


Unnamed: 0,Avail,Player,IP,G,GS,QS,CG,W,L,S,...,WHIP_zips,K_zips,S_zips,W_zips,IP_adj,ER_adj,WHIP_adj,K_adj,S_adj,W_adj
0,SweepTheLegJohnny,Shohei Ohtani DH | LAA,159,24,24,17,0,13,6,0,...,1.132270,204.134615,0.0,13.750000,165.0,53.0,1.0,207.0,0.0,13.0
1,SweepTheLegJohnny,Gerrit Cole SP | NYY,191,31,31,22,1,15,8,0,...,1.136256,245.807541,0.0,15.362971,195.0,67.0,1.0,244.0,0.0,15.0
2,Droitwich Murdercocks,Corbin Burnes SP | MIL,177,30,30,21,0,10,7,0,...,1.123398,230.048766,0.0,12.839931,186.5,63.0,1.0,225.0,0.0,12.0
3,Strike Four,Max Scherzer SP | NYM,185,30,30,21,0,14,7,0,...,1.337897,225.803255,0.0,13.128096,185.5,64.0,1.0,222.0,0.0,13.0
4,Marjorie Taylor Greene's Only Fans Page,Justin Verlander SP | NYM,182,30,30,23,0,17,5,0,...,1.054159,193.627273,0.0,15.315152,180.5,64.0,1.0,195.0,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,W,Jack Labosky RP | TB,0,0,0,0,0,0,0,0,...,,,,,,0.0,0.0,0.0,0.0,0.0
2974,W,Dominic Hamel SP | NYM,0,0,0,0,0,0,0,0,...,,,,,,0.0,0.0,0.0,0.0,0.0
2975,W,Braden Shipley RP | SEA,0,0,0,0,0,0,0,0,...,,,,,,0.0,0.0,0.0,0.0,0.0
2976,W,Derek West RP | HOU,0,0,0,0,0,0,0,0,...,,,,,,0.0,0.0,0.0,0.0,0.0


In [None]:
from fantasy_baseball_draft import utils as fb_utils
cbs_hitter_names = [fb_utils.process_cbs_player(p) for p in cbs_hitters[cbs_hitters.PA > 50].Player]
cbs_hitter_names = [p[0] for p in cbs_hitter_names]
len(cbs_hitter_names)
[p for p in fg_hitters.Name if p not in set(cbs_hitter_names) ]
fg_ids = fg_hitters[~fg_hitters.playerid.str.startswith('sa')].playerid.astype(int)
len(np.intersect1d(cbs_hitters[cbs_hitters.PA > 50].playerid, fg_ids))
cbs_ids = set(cbs_hitters[cbs_hitters.PA > 50].playerid)
cbs_ids.difference(set(fg_ids))
set(fg_ids).difference(cbs_ids)
fg