# Sandbox

This is a top-level notebook for developing and experimenting with the code in the local directory.

It is not generally intended for sharing code; it is a space for exploratory notebook work.

In [None]:
import os
import sys
import warnings
import re

import pandas as pd

# Make the local phi-utils checkout importable (provides `philosofool`, imported below).
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
sys.path.append(r'/home/lenhart/Repos/phi-utils')

from philosofool.data_sources.utils import read_yml
from fantasy_baseball_draft.utils import StatSynonyms, load_cbs_data, DataLoader

# Root folder for local data files, read from the `paths.local_data` entry of local/config.yml.
data_path = read_yml('local/config.yml')['paths']['local_data']


In [None]:
class AssociatePlayers:
    """Link rows of two datasets that describe the same entities.

    Intended for two dataframes that cover the same players but share no
    single join key. Rows are paired on a combination of columns that,
    taken together, is assumed to identify each entity uniquely.

    Example: two tables of 2019 pitchers that spell names differently
    ('Mike' vs. 'Michael', etc.) can be aligned by treating two rows as
    the same pitcher when games started, walks, hits and strikeouts all
    agree.
    """
    syns = StatSynonyms()

    def associate(self, df1, df2, index_cols):
        """Merge ``df1`` and ``df2`` on the shared ``index_cols``.

        A warning is emitted for each input that contains fully duplicated
        rows. Both frames are passed through ``syns.normalize_df``, indexed
        by ``index_cols`` and then merged index-to-index with pandas'
        default (inner) join.

        Returns:
            The merged dataframe, indexed by ``index_cols``.
        """
        for label, frame in (("df1", df1), ("df2", df2)):
            if frame.duplicated().any():
                warnings.warn(f"Found duplicated in {label}.")

        normalized_left = self.syns.normalize_df(df1)
        normalized_right = self.syns.normalize_df(df2)
        left = normalized_left.set_index(index_cols)
        right = normalized_right.set_index(index_cols)
        return pd.merge(left, right, left_index=True, right_index=True)


In [None]:
# Shared matcher instance; build_id_map (defined in a later cell) calls associate.associate(...).
associate = AssociatePlayers()

# Candidate join keys. The second assignment immediately replaces the first,
# so only ['IP', 'W', 'G', 'K'] is bound to index_cols after this cell runs.
index_cols = ['H', 'BB', 'RBI', 'K']
index_cols = ['IP', 'W', 'G', 'K']

In [None]:
# syns = AssociatePlayers.syns

# Load the 2022 hitter files from the 'historical' data folder for inspection.
loader = DataLoader(os.path.join(data_path, 'historical'))
fg22 = loader.load_csv('fg_hitters_2022.csv')
cbs22 = loader.load_cbs_csv('cbs_hitters_2022.csv')
# Spot-check: display the fg22 rows whose Name contains 'France'.
fg22[fg22.Name.str.contains('France')]
# Disabled: the same spot-check against the CBS Player column.
#cbs22[cbs22.Player.str.contains('France')]

In [None]:
def build_id_map(cbs_path, fg_path, index_cols, duplicated=False):
    """Map CBS ``Player`` labels to ``playerid`` values by matching stat lines.

    Both files are loaded from the ``historical`` folder under the
    ``paths.local_data`` entry of ``local/config.yml``. CBS rows whose
    ``index_cols`` do not sum to more than zero are discarded. CBS rows
    sharing their ``index_cols`` values with another row are dropped
    entirely (and the number dropped is printed) before the remaining rows
    are associated with the second file via the module-level ``associate``.

    Args:
        cbs_path: File name passed to ``DataLoader.load_cbs_csv``.
        fg_path: File name passed to ``DataLoader.load_csv``.
        index_cols: Columns used together as the join key.
        duplicated: When true, skip the matching and instead return the
            CBS rows whose ``index_cols`` values are not unique.

    Returns:
        A dict of whitespace-stripped ``Player`` -> ``playerid``; or, when
        ``duplicated`` is true, a dataframe of the ambiguous CBS rows.
    """
    config = read_yml('local/config.yml')
    historical = DataLoader(os.path.join(config['paths']['local_data'], 'historical'))
    cbs_rows = historical.load_cbs_csv(cbs_path)
    fg_rows = historical.load_csv(fg_path)

    # Keep only rows with a positive total across the key columns.
    has_stats = cbs_rows[index_cols].sum(axis=1) > 0
    cbs_rows = cbs_rows[has_stats]

    if duplicated:
        ambiguous = cbs_rows.duplicated(subset=index_cols, keep=False)
        return cbs_rows[ambiguous]

    n_before = len(cbs_rows)
    # keep=False removes every member of a duplicated group, not just the extras.
    cbs_rows = cbs_rows.drop_duplicates(subset=index_cols, keep=False)
    print(f"Dropped {n_before - len(cbs_rows)} records.")

    matched = associate.associate(cbs_rows, fg_rows, index_cols)
    return dict(zip(matched.Player.str.strip(), matched.playerid))

# Disabled: write an id map out as JSON.
# import json
# with open(data_path + '/hitter_ids.json', 'w') as j:
#     j.write(json.dumps(cbs_to_fg_h))

# Build CBS-label -> playerid maps from the 2022 files, using a different
# set of key columns for pitchers and for hitters.
pitcher_ids = build_id_map('cbs_pitchers_2022.csv', 'fg_pitchers_2022.csv', ['IP', 'W', 'G', 'K'])
hitter_ids = build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'])
# Disabled: with duplicated=True the call returns the CBS rows whose key columns are not unique.
#build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'], True)
# Spot-check: show the hitter entries whose label contains 'Franc'.
{k: v for k, v in hitter_ids.items() if 'Franc' in k}

In [None]:
# Load the CBS and depth-chart projection files from the 'projections' data folder.
projections = DataLoader(os.path.join(data_path, 'projections'))
cbs_hitters = projections.load_cbs_csv('cbs_hitters.csv')
cbs_pitchers = projections.load_cbs_csv('cbs_pitchers.csv')
print(data_path)
fg_hitters = projections.load_csv('fg-depth-chart_hitters.csv')
fg_pitcher = projections.load_csv('fg-depth-chart_pitchers.csv')

# Attach playerid via the id maps; labels with no entry in the map get -1.
# NOTE(review): the map keys were whitespace-stripped in build_id_map, but Player
# is not stripped here -- confirm load_cbs_csv yields labels in the same form.
# NOTE(review): the manual hitter_ids additions in a later cell happen after this
# mapping, so this cell must be re-run for them to take effect.
cbs_hitters['playerid'] = cbs_hitters.Player.map(hitter_ids).fillna(-1).astype(int)
cbs_pitchers['playerid'] = cbs_pitchers.Player.map(pitcher_ids).fillna(-1).astype(int)
# Playing-time column for hitters, computed here as AB + BB only.
cbs_hitters['PA'] = cbs_hitters.AB + cbs_hitters.BB

In [None]:
# Spot-check: display the depth-chart hitter rows whose Name contains "Tatis".
fg_hitters[fg_hitters.Name.str.contains("Tatis")]

In [None]:
# Manually entered ids, keyed by the full CBS player label.
# NOTE(review): cbs_hitters['playerid'] is assigned from hitter_ids in an earlier
# cell; re-run that cell after this one so these entries are picked up.
hitter_ids['Ty France 1B | SEA'] = 17982
hitter_ids["Fernando Tatis SS | SD"] = 19709

In [None]:
# Derive an ER column from the projected ERA and IP: ER = ERA * IP / 9.
cbs_pitchers['ER'] = cbs_pitchers.ERA * cbs_pitchers.IP / 9

In [None]:
def merge_fg(cbs, base_path, cols, name_filter=lambda x: x):
    """Left-merge projection files found in ``base_path`` onto ``cbs``.

    Every file name in ``base_path`` that passes ``name_filter`` and does
    not contain ``'cbs'`` is loaded with ``DataLoader.load_csv``, reduced to
    the players present in ``cbs`` and merged on ``playerid``. Columns that
    already exist on the left keep their name; the incoming ones get a
    suffix built from the first five characters of the file name.

    Args:
        cbs: Dataframe with an integer-comparable ``playerid`` column. It is
            copied, never modified in place.
        base_path: Directory to scan for files.
        cols: Columns (including ``'playerid'``) to take from each file.
        name_filter: Predicate on the file name; truthy means "use it".

    Returns:
        A copy of ``cbs`` with the selected columns of every matching file
        merged in.

    Raises:
        AssertionError: If a non-empty file has no rows for any player in
            ``cbs`` after filtering.
    """
    merged = cbs.copy()
    wanted_ids = merged.playerid.to_numpy()
    loader = DataLoader(base_path)
    # os.listdir returns names in arbitrary order; sorting makes the merge
    # order (and therefore the resulting column layout) reproducible.
    for path in sorted(os.listdir(base_path)):
        if not name_filter(path) or 'cbs' in path:
            continue
        df = loader.load_csv(path)
        if len(df) == 0:
            continue
        # Drop ids starting with 'sa': they cannot be cast to int below.
        # The explicit copy avoids assigning into a slice of the loaded frame.
        df = df[~df.playerid.astype(str).str.startswith('sa')].copy()
        df['playerid'] = df['playerid'].astype(int)
        df = df[df.playerid.isin(wanted_ids)]
        assert len(df) > 0, 'df empty, which is sort of like wdf?'
        suffixes = ('', '_' + path[:5].strip('_'))
        merged = merged.merge(df[cols], on='playerid', suffixes=suffixes, how='left')
    return merged

def merge_fg_hitters(cbs, base_path):
    """Merge hitter projection columns from files in ``base_path`` onto ``cbs``.

    Delegates to ``merge_fg``, restricted to files whose name contains
    ``'hitter'``.
    """
    def is_hitter_file(filename):
        return 'hitter' in filename

    hitter_cols = ['playerid', 'PA', 'AB', 'H', 'HR', 'R', 'RBI', 'SB']
    return merge_fg(cbs, base_path, hitter_cols, is_hitter_file)

def merge_fg_pitchers(cbs, base_path):
    """Merge pitcher projection columns from files in ``base_path`` onto ``cbs``.

    Delegates to ``merge_fg``, restricted to files whose name contains
    ``'pitch'``.
    """
    def is_pitcher_file(filename):
        return 'pitch' in filename

    pitcher_cols = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
    return merge_fg(cbs, base_path, pitcher_cols, is_pitcher_file)

# Trial run of both merges. The results are not assigned, and merge_fg works on a
# copy, so cbs_hitters / cbs_pitchers are left unchanged by these calls.
merge_fg_hitters(cbs_hitters, data_path + '/projections')
merge_fg_pitchers(cbs_pitchers, data_path + '/projections')
#cbs_pitchers

In [None]:
def add_weighted_stats(df, stats, playtime_stat, playtime_suffix='fg-de'):
    """Rescale suffixed projections to a shared playing time and average them.

    Steps, all performed on a copy of ``df``:

    1. ``{playtime_stat}_adj`` is the mean of the base playtime column and
       the ``{playtime_stat}_{playtime_suffix}`` column.
    2. For every suffix that has a ``{playtime_stat}_{suffix}`` column, each
       ``{stat}_{suffix}`` column is converted to a per-playtime rate and
       scaled up to ``{playtime_stat}_adj``.
    3. ``{stat}_adj`` is the row-wise mean, ignoring missing values, of the
       base ``stat`` column and every column starting with ``{stat}_``.

    The base (unsuffixed) stat columns are not rescaled, and a suffixed stat
    column with no matching playtime column is averaged in unscaled.
    NOTE(review): confirm that averaging unscaled base stats is intended.

    Args:
        df: Dataframe holding base and suffixed stat columns.
        stats: Names of the base stat columns to process.
        playtime_stat: Name of the base playing-time column (e.g. ``'PA'``).
        playtime_suffix: Suffix of the playtime column that is averaged with
            the base playtime to form the adjusted playtime.

    Returns:
        A new dataframe with the rescaled and ``*_adj`` columns added.

    Raises:
        KeyError: If the reference playtime column, or a ``{stat}_{suffix}``
            column for a discovered suffix, is missing.
    """
    df = df.copy()
    pt = playtime_stat
    pt_prefix = f'{pt}_'
    # Discover suffixes from playtime columns only (before `{pt}_adj` is added),
    # so unrelated columns that merely contain '_' are ignored. Sorted for a
    # deterministic processing order.
    suffixes = sorted({c[len(pt_prefix):] for c in df.columns if c.startswith(pt_prefix)})
    df[f'{pt}_adj'] = (df[f'{pt}_{playtime_suffix}'] + df[pt]) / 2

    for suf in suffixes:
        for col in stats:
            # rate per unit of that source's playtime, times the adjusted playtime
            df[f'{col}_{suf}'] = df[f'{col}_{suf}'] / df[f'{pt}_{suf}'] * df[f'{pt}_adj']
    for col in stats:
        # Plain prefix test: stat names may contain regex metacharacters ('wRC+').
        adj_cols = [col] + [c for c in df.columns if c.startswith(f'{col}_')]
        df[f'{col}_adj'] = df[adj_cols].sum(axis=1) / df[adj_cols].notna().sum(axis=1)
    return df

#add_weighted_stats(cbs_hitters, cols[2:], 'PA')
# Column lists; the [2:] slices below skip 'playerid' and the playing-time
# column ('PA' / 'IP'), which is passed separately as the playtime stat.
hitter_stats = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
pitcher_stats = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
# Pipeline: merge the projection files onto the CBS frame, then add the
# rescaled and averaged (*_adj) columns.
full_hitters = (cbs_hitters
    .pipe(merge_fg_hitters, data_path + '/projections')
    .pipe(add_weighted_stats, hitter_stats[2:], 'PA'))
full_pitchers = (cbs_pitchers
    .pipe(merge_fg_pitchers, data_path + '/projections')
    .pipe(add_weighted_stats, pitcher_stats[2:], 'IP')
)

In [None]:
from fantasy_baseball_draft.spg import FantasyValuator, spgs_from_standings_html

# Build the valuator from the saved 2021 CBS standings page under the data folder.
valuator = FantasyValuator(spgs_from_standings_html(os.path.join(data_path, 'standings/cbs_2021_standings.html')))


In [None]:
# CBS eligibility table; its 'Player' and 'Eligible' columns are merged onto hitter_proj below.
elig = DataLoader(os.path.join(data_path, 'eligibility')).load_cbs_csv('eligibility.csv')

def extract_projection(df):
    """Return the identity columns plus the ``*_adj`` columns, suffix removed.

    Keeps ``'Avail'``, ``'Player'`` and ``'playerid'`` followed by every
    column whose name ends in ``'_adj'``, and renames the latter by dropping
    that trailing suffix (``'H_adj'`` -> ``'H'``). Only a trailing ``'_adj'``
    counts, so a column such as ``'Salary_adjusted'`` is neither selected nor
    renamed.

    Args:
        df: Dataframe containing the three identity columns.

    Returns:
        A new dataframe; ``df`` itself is not modified.

    Raises:
        KeyError: If an identity column is missing from ``df``.
    """
    suffix = '_adj'
    adjusted = [col for col in df.columns if col.endswith(suffix)]
    renames = {col: col[:-len(suffix)] for col in adjusted}
    return df[['Avail', 'Player', 'playerid'] + adjusted].rename(columns=renames)

# Hitters: keep the adjusted projections, attach eligibility by Player label
# (left join, so hitters missing from elig are kept), then add a value column.
# NOTE(review): the meaning of 16*12 is not visible here -- presumably a
# player-pool size (roster slots x teams); confirm against FantasyValuator.
hitter_proj = extract_projection(full_hitters)
hitter_proj = hitter_proj.merge(elig[['Player', 'Eligible']],  how='left', on='Player')
hitter_proj['fwar'] = valuator.valuate_hitters(hitter_proj, 16*12)

# Pitchers: recompute ERA from the adjusted ER and IP before valuing.
pitcher_proj = extract_projection(full_pitchers)
pitcher_proj['ERA'] = pitcher_proj.ER / pitcher_proj.IP * 9
pitcher_proj['fwar'] = valuator.valuate_pitchers(pitcher_proj, 16*12)

# Display the 20 hitters with the highest fwar.
hitter_proj.sort_values('fwar', ascending=False).head(20)

In [None]:
# Write both projection tables to the local/ folder; the dataframe index is
# written as a column labelled 'index'.
hitter_proj.to_csv('local/hitter_proj.csv', index_label='index')
pitcher_proj.to_csv('local/pitcher_proj.csv', index_label='index')