# Draft Prep

Prepare CSV files for the draft. Also prepare weighted CSV files for other uses.

In [None]:
import functools
from functools import partial
import os
import sys
import warnings
import re
from collections.abc import Callable, Iterable, Mapping


import pandas as pd
import numpy as np
from numpy.typing import ArrayLike
from scipy.stats import rankdata

from philosofool.data_sources.utils import read_yml  # type: ignore
from philosofool.data_science.graph import MetricGraph
from fantasy_baseball_draft.utils import StatSynonyms, load_cbs_data, DataLoader
from fantasy_baseball_draft.utils import cbs_player_col_to_df
from fantasy_baseball_draft.stats import StatCalculator

data_path = read_yml('local/config.yml')['paths']['local_data']


## ID Functions

Why is this so complicated? We need to join the players expected to play in the current year with Fangraphs ID data to create a unique ID column for merging.
TL;DR: we're working with disjoint lists to make it all work.
Historical data solves this problem simply for players who have already played.
But some players who are expected to play did not appear in the previous year.
For those, the player's name is used for the merge.
The result is that we need (1) a dataset of historical MLB play and (2) some preseason projections.
The players 

In [None]:
# Scoring categories, listed separately for pitchers and hitters and then
# combined (pitching categories first).
pitching_metrics = ['ERA', 'WHIP', 'W', 'S', 'K']
hitting_metrics = ['R', 'HR', 'RBI', 'SB', 'BA']
scoring_metrics = [*pitching_metrics, *hitting_metrics]

class AssociatePlayers:
    """Join two datasets that describe the same entities but share no key.

    When two dataframes cover the same players without a common single
    join key, a merge can still be built from a set of columns whose
    combined values are assumed to identify each player uniquely.

    Example: two dataframes of 2019 pitchers use different naming
    conventions ('Mike', 'Michael', etc.). They can be aligned by assuming
    that two pitchers are the same if they started the same number of
    games and had the same number of walks, hits and strikeouts.
    """
    synonyms = StatSynonyms()

    def associate(self, df1: pd.DataFrame, df2: pd.DataFrame, index_cols: list) -> pd.DataFrame:
        """Merge ``df1`` and ``df2`` on the combined values of ``index_cols``.

        A warning is issued for each input that contains fully duplicated
        rows. Both inputs are passed through ``StatSynonyms.normalize_df``
        before ``index_cols`` become the index used for the merge
        (presumably this maps column-name synonyms to a common name;
        confirm against ``StatSynonyms``).

        Returns
        -------
        The index-on-index merge of the two normalized frames (pandas'
        default ``how`` for ``merge``), indexed by ``index_cols``.
        """
        checks = (
            (df1, "Found duplicated in df1."),
            (df2, "Found duplicated in df2."),
        )
        for frame, message in checks:
            if frame.duplicated().sum():
                warnings.warn(message)

        left = self.synonyms.normalize_df(df1)
        right = self.synonyms.normalize_df(df2)
        return left.set_index(index_cols).merge(
            right.set_index(index_cols),
            left_index=True,
            right_index=True,
        )


In [None]:
def merge_on_name(cbs: pd.DataFrame, fg: pd.DataFrame) -> pd.DataFrame:
    """Attach Fangraphs ``playerid`` values to CBS players by name.

    The CBS ``Player`` column is expanded with ``cbs_player_col_to_df``
    (the result must contain a ``Name`` column, since that is the merge
    key), the raw ``Player`` string is re-attached by index, and the
    Fangraphs ``Name``/``playerid`` pairs are left-joined on ``Name``.
    Players without a name match keep a missing ``playerid``.
    """
    parsed = cbs_player_col_to_df(cbs.Player)
    with_player = parsed.merge(cbs[['Player']], left_index=True, right_index=True)
    fg_ids = fg[['Name', 'playerid']]
    return with_player.merge(fg_ids, on='Name', how='left')

def map_cbs_player_col_to_id_by_name(cbs: pd.DataFrame, fg: pd.DataFrame) -> pd.Series:
    """Use cbs "Player" field to return series of CBS player names to fg player ids.

    Rows are matched by name via ``merge_on_name``. Every row whose
    ``playerid`` occurs more than once in the merge result is discarded
    (``keep=False``), as are rows with no id.

    Returns
    -------
    Series indexed by the CBS ``Player`` string. Ids that start with
    ``'sa'`` are kept as strings; all other ids are converted to ``int``,
    so the result holds mixed str/int values.
    """
    merged = merge_on_name(cbs, fg).drop_duplicates(subset=['playerid'], keep=False)
    ids = merged.set_index('Player').playerid.dropna().astype(str)
    has_sa_prefix = ids.str.startswith('sa')
    # ``where`` keeps the string where the prefix is present and takes the
    # integer form everywhere else.
    return ids.where(has_sa_prefix, ids.str.strip('sa').astype(int))


In [None]:
def build_id_map(df: pd.DataFrame, fg_df: pd.DataFrame, ids: pd.Series) -> pd.Series:
    """Map the CBS ``Player`` column to Fangraphs ids.

    Known ids are applied first; players still unmatched are then looked
    up by name in the Fangraphs data.

    Parameters
    ----------
    df:
        CBS data with a ``Player`` column.
    fg_df:
        Fangraphs data.
    ids:
        Mapping of Player:fg_id; this is a mapping of known cases.

    Returns
    -------
    Series aligned with ``df`` holding the id of each player, or ``-1``
    where no id was found.
    """
    missing = -1
    cbs = df.copy()
    cbs['playerid'] = cbs.Player.map(ids).fillna(missing)

    # Only players the known mapping could not resolve go through name matching.
    unresolved = cbs[cbs.playerid == missing]
    by_name = map_cbs_player_col_to_id_by_name(unresolved, fg_df)
    by_name = by_name[~by_name.duplicated()]

    lookup = pd.concat([ids, by_name]).to_dict()
    return cbs.Player.map(lookup).fillna(missing)

In [None]:
def _build_id_map_from_stat_associations(cbs, fg, index_cols, duplicated=False) -> pd.Series:
    """Return series mapping CBS player "Player" column to a fangraphs ID column."""
    cbs_with_playtime = cbs[cbs[index_cols].sum(axis=1) > 0]
    n_players_with_pt = len(cbs_with_playtime)
    if duplicated:
        return cbs_with_playtime[cbs_with_playtime.duplicated(subset=index_cols, keep=False)]
    cbs_with_playtime = cbs_with_playtime.drop_duplicates(subset=index_cols, keep=False)
    print(f"Dropped {n_players_with_pt - len(cbs_with_playtime)} duplicated records.")

    cbs_to_fg = AssociatePlayers().associate(cbs_with_playtime, fg, index_cols)
    cbs_to_fg.Player = cbs_to_fg.Player.str.strip()
    as_dict = dict(zip(cbs_to_fg.Player.str.strip(), cbs_to_fg.playerid))
    return pd.Series(as_dict)


## Add Fangraphs Player Id to CBS data

In [None]:
# Stat columns whose combined values are used to match a CBS row to a
# Fangraphs row when building the id maps.
hitter_match = ['AB', 'H', 'BB', 'RBI', 'K']
# 'W' used to be listed twice here; a repeated key adds nothing to the match.
# NOTE(review): the second 'W' may have been meant to be walks ('BB') - confirm.
pitcher_match = ['IP', 'W', 'G', 'K', 'H']

In [None]:
# Build the CBS "Player" -> Fangraphs id maps from the 2023 files in the
# 'historical' folder of the configured local data path.
data_path = read_yml('local/config.yml')['paths']['local_data']
hist_path = os.path.join(data_path, 'historical')
loader = DataLoader(hist_path)

# Pitchers are matched on the columns listed in `pitcher_match`.
pitcher_ids = _build_id_map_from_stat_associations(
    loader.load_cbs_csv('cbs_pitchers_2023.csv'),
    loader.load_csv('fg_pitchers_2023.csv'), pitcher_match
)

# Hitters are matched on the columns listed in `hitter_match`.
hitter_ids = _build_id_map_from_stat_associations(
    loader.load_cbs_csv('cbs_hitters_2023.csv'),
    loader.load_csv('fg_hitters_2023.csv'), hitter_match
)
#build_id_map('cbs_hitters_2022.csv', 'fg_hitters_2022.csv', ['H', 'BB', 'RBI', 'K'], True)
#{k: v for k, v in hitter_ids.items() if 'Franc' in k}# [hitter_ids..str.contains('Franc')]
# Display the hitter map as the cell output.
hitter_ids

In [None]:
def weight_stats(df1, df2, stats: list[str]) -> dict:
    """Combine stats from two dataframes.

    For every stat in ``stats``, rows whose index label appears in both
    frames get the mean of the two values; rows found in only one frame
    keep that frame's value.

    Returns
    -------
    dict mapping each stat name to a Series ordered as: shared labels,
    labels only in ``df1``, labels only in ``df2``.
    """
    in_both = df2.index.intersection(df1.index)
    only_in_df1 = df1.index.difference(in_both)
    only_in_df2 = df2.index.difference(in_both)

    combined = {}
    for stat in stats:
        averaged = (df1.loc[in_both, stat] + df2.loc[in_both, stat]) / 2
        pieces = [averaged, df1.loc[only_in_df1, stat], df2.loc[only_in_df2, stat]]
        combined[stat] = pd.concat(pieces)
    return combined

# cbs_hitter_proj.assign(**weight_stats(cbs_hitter_proj, fg_hitter_proj, ['PA', 'AB', 'BB', 'H', 'HR', 'K', 'RBI', 'R', 'BA', 'SB']))


In [None]:
# Load position eligibility and the 2024 projection files.
elig = DataLoader(os.path.join(data_path, 'eligibility')).load_cbs_csv('eligibility.csv')
projections = DataLoader(os.path.join(data_path, 'projections/2024'))
fg_hitter_proj = projections.load_csv('fg_depth_hitters.csv')
fg_pitcher_proj = projections.load_csv('fg_depth_pitchers.csv')

# CBS hitter projections with a Fangraphs id (-1 when unmatched), plate
# appearances approximated as AB + BB, and the eligibility string attached.
cbs_hitter_proj = (
    projections
    .load_cbs_csv('cbs_hitters.csv')
    .assign(
        playerid=lambda df: build_id_map(df, fg_hitter_proj, hitter_ids),
        PA=lambda df: df.AB + df.BB
    )
    .merge(elig[["Player", "Eligible"]], how='left', on="Player")
)

# CBS pitcher projections with a Fangraphs id and earned runs recovered
# from ERA (ER = ERA * IP / 9).
cbs_pitchers_proj = (
    projections
    .load_cbs_csv('cbs_pitchers.csv')
    .assign(
        playerid=lambda df: build_id_map(df, fg_pitcher_proj, pitcher_ids),
        ER=lambda df: df.ERA * df.IP / 9
    )
)


In [None]:
# Preview the first three rows of the CBS pitcher projections.
cbs_pitchers_proj.head(3)


In [None]:
# Preview the first three rows of the CBS hitter projections.
cbs_hitter_proj.head(3)

In [None]:
from sklearn.linear_model import LinearRegression

def standings_html_to_df(standings: list[pd.DataFrame]) -> pd.DataFrame:
    """Combine per-category standings tables into one frame keyed by team.

    Each input table is expected to have integer column labels ``0`` and
    ``1`` and a header row whose column-0 value is ``'Team'``. Rows up to
    and including that header are discarded, the two columns are renamed
    from the header row, and the resulting tables are merged on ``'Team'``.

    Parameters
    ----------
    standings:
        Raw tables, e.g. as returned by ``pandas.read_html``.

    Returns
    -------
    One row per team with ``'Team'`` plus one float column per table.

    Raises
    ------
    ValueError
        If a table has no row with ``'Team'`` in column 0.
    TypeError
        If ``standings`` is empty (raised by ``functools.reduce``).
    """
    stat_standings = []
    for df in standings:
        header_idx = next(
            (i for i, row in df.iterrows() if row.loc[0] == 'Team'),
            None,
        )
        if header_idx is None:
            # Previously the last row was silently used as the header.
            raise ValueError("No header row with 'Team' in column 0 was found in a standings table.")
        # Label-based slices: rows after the header, columns 0 through 1 inclusive.
        table = df.loc[header_idx + 1:, 0:1].rename(columns=df.loc[header_idx])
        stat_standings.append(table)
    merged = functools.reduce(lambda a, b: a.merge(b, on='Team'), stat_standings)
    return merged.astype({col: float for col in merged.columns if col != 'Team'})


# Parse the saved 2023 standings page. The first five tables that
# `pd.read_html` finds are skipped (presumably they are not per-category
# standings - confirm against the saved HTML).
standings = standings_html_to_df(pd.read_html(os.path.join(data_path, 'standings/klf_2023.html'))[5:])


def weighted_ratio(x: ArrayLike, weight: ArrayLike, median: ArrayLike):
    """Return the difference between ``x`` and ``median``, scaled by ``weight``."""
    deviation = x - median
    return weight * deviation

def model_spg(arr: np.ndarray|pd.Series, low_better=False) -> Callable[[ArrayLike], np.ndarray]:
    """Fit standings points against a stat and return a function applying the slope.

    The values in ``arr`` are ranked with ``scipy.stats.rankdata`` (the
    negated values are ranked when ``low_better`` is true) and a linear
    regression of rank on value is fitted. Only the slope is kept; the
    intercept is discarded.

    Returns
    -------
    A function that multiplies its input by the fitted slope.
    """
    values = arr.values if isinstance(arr, pd.Series) else arr
    column = values.reshape(-1, 1)  # single-feature design matrix
    if low_better:
        points = rankdata(column * -1)
    else:
        points = rankdata(column)
    slope = LinearRegression().fit(column, points).coef_[0]

    def spg_value(x: ArrayLike) -> np.ndarray:
        return x * slope

    return spg_value


# Metric graph for the standings-points model. Each entry maps a metric
# name to a (function, input-metric-names) pair.
spg_model = MetricGraph.from_model({
    # 'IP': (lambda _: 1200., ('ERA',)),
    # 'AB': (lambda _: 5600., ('BA',)),
    # Medians of the ratio stats.
    'median_ERA': (np.median, ('ERA',)),
    'median_WHIP': (np.median, ('WHIP',)),
    'median_BA': (np.median, ('BA',)),
    # Ratio stats re-expressed as playing-time-weighted distance from the median.
    'xER': (weighted_ratio, ('ERA', 'IP', 'median_ERA')),
    'xWHIP': (weighted_ratio, ('WHIP', 'IP', 'median_WHIP')),
    'xH': (weighted_ratio, ('BA', 'AB', 'median_BA')),
    # Pitching categories; ERA and WHIP are fitted with low_better=True.
    'W_spg': (model_spg, ('W',)),
    'S_spg': (model_spg, ('S',)),
    'K_spg': (model_spg, ('K',)),
    'ERA_spg': (partial(model_spg, low_better=True), ('xER',)),
    'WHIP_spg': (partial(model_spg, low_better=True), ('xWHIP',)),
    # Hitting categories; BA is fitted on its weighted form xH.
    'R_spg': (model_spg, ('R',)),
    'HR_spg': (model_spg, ('HR',)),
    'RBI_spg': (model_spg, ('RBI',)),
    'SB_spg': (model_spg, ('SB',)),
    'BA_spg': (model_spg, ('xH',)),
})

def extract_model(df: pd.DataFrame, metric_graph: MetricGraph, metrics: Iterable) -> MetricGraph:
    """Build a new ``MetricGraph`` from metrics calculated on ``df``.

    ``metric_graph.calculate_metrics`` is evaluated for ``metrics`` and each
    calculated result becomes the function for that metric in the returned
    graph, paired with the metric's entry from
    ``metric_graph.dependency_graph``. The calculated results are therefore
    presumably callables, as the functions returned by ``model_spg`` are;
    confirm against ``MetricGraph``.

    Parameters
    ----------
    df:
        Data the metrics are calculated from.
    metric_graph:
        Graph that knows how to calculate ``metrics``.
    metrics:
        Names of the metrics to extract. Any iterable is accepted,
        including one-shot iterators.
    """
    # ``metrics`` is walked three times below; materialize it so that a
    # generator is not exhausted after the first pass.
    metrics = list(metrics)
    calculated_model = metric_graph.calculate_metrics(df, metrics)
    model_graph = {metric: metric_graph.dependency_graph[metric] for metric in metrics}
    model_fns = {metric: calculated_model[metric] for metric in metrics}
    return MetricGraph(model_graph, model_fns)


In [None]:
# Names of the per-category "_spg" metrics, pitching categories first.
spg_names = [f'{metric}_spg' for metric in pitching_metrics + hitting_metrics]

# Total value is the sum of a player's per-category "_spg" values.
fwar_model = {
    'pitcher_fWAR': (StatCalculator.reduce_sum, tuple(f'{metric}_spg' for metric in pitching_metrics)),
    'hitter_fWAR': (StatCalculator.reduce_sum, ('R_spg', 'RBI_spg', 'HR_spg', 'BA_spg', 'SB_spg'))
}

# Fit the "_spg" metrics on the standings (with constant AB=5600 and
# IP=1200 added to every row), then combine them with the weighted-ratio
# metrics and the fWAR sums into one calculator.
fantasy_stat_model = StatCalculator.from_model(
    extract_model(standings.assign(AB=lambda _: 5600, IP=lambda _: 1200), spg_model, spg_names).model()
    | spg_model.model(['xER', 'xH', 'xWHIP'])
    | fwar_model
)
# Medians of the ratio stats in the standings, needed as inputs when the
# model is applied to player projections.
median_stats = spg_model.calculate_metrics(standings, ['median_BA', 'median_ERA', 'median_WHIP'])

In [None]:
# Apply the model to the hitter projections, supplying the standings'
# median BA as a constant column, and display the result.
fantasy_stat_model.add_metrics(cbs_hitter_proj.assign(median_BA=lambda _: median_stats['median_BA']), metrics=['HR_spg', 'BA_spg', 'hitter_fWAR'])

In [None]:
# Scratch check of dict union: for a repeated key the right-most value
# wins, so this evaluates to {1: 3, 3: 4}.
{1: 2} | {3: 4} | {1: 3}

In [None]:
# Show which metrics 'hitter_fWAR' depends on.
print(fantasy_stat_model.get_metric_dependencies(['hitter_fWAR']))
#fantasy_stat_model.add_metrics(cbs_hitter_proj, ['hitter_fWAR'])

In [None]:
def position_value(hitter_fwar: ArrayLike, eligible: ArrayLike, pos: str, n_rostered: int) -> float:
    """Return the ``n_rostered``-th best value among players eligible at ``pos``.

    This is the value of the last player who would be rostered at the
    position if the ``n_rostered`` most valuable eligible players were taken.

    Parameters
    ----------
    hitter_fwar:
        Player values; aligned by position with ``eligible``.
    eligible:
        Comma-separated eligibility strings, one per player.
    pos:
        Position to evaluate, e.g. ``'C'``.
    n_rostered:
        Number of players rostered at the position.

    Raises
    ------
    ValueError
        If ``n_rostered`` is less than 1 or exceeds the number of eligible
        players that have a value.
    """
    if n_rostered < 1:
        raise ValueError("n_rostered must be at least 1.")
    values = np.asarray(hitter_fwar, dtype=float)
    candidates = values[matches_eligible(eligible, pos)]
    # NaN sorts last in np.partition and would otherwise rank as the best value.
    candidates = candidates[~np.isnan(candidates)]
    if candidates.size < n_rostered:
        raise ValueError(
            f"Only {candidates.size} players with a value are eligible at {pos}; "
            f"{n_rostered} are required."
        )
    # kth=-n puts the n-th largest value at position -n.
    return float(np.partition(candidates, -n_rostered)[-n_rostered])


def matches_eligbible(eligbible: ArrayLike, pos: str) -> np.ndarray:
    """Return a boolean array marking entries that list ``pos`` as a position.

    Each entry is a comma-separated list of positions; whitespace is
    ignored. ``pos`` must equal a whole item, so ``'C'`` does not match
    ``'CF'``. Missing entries never match.
    """
    # dtype=object keeps the ``.str`` accessor available for empty input.
    elig_series = pd.Series(eligbible, dtype=object)
    pattern = rf'(?:^|,){re.escape(pos)}(?:,|$)'
    clean_elig = elig_series.str.replace(r'\s', '', regex=True)
    return clean_elig.str.contains(pattern, regex=True, na=False).to_numpy(dtype=bool)


# Correctly spelled alias; the original (misspelled) name is kept for
# existing callers.
matches_eligible = matches_eligbible

def test_matches_eligible():
    """Check that position 'C' matches whole list items only (not 'CF')."""
    expected_by_entry = {
        'C': True,
        'C, CF': True,
        '1B, C': True,
        '1B, C, SS': True,
        'CF': False,
        'SS': False,
    }
    result = matches_eligbible(list(expected_by_entry), 'C')
    assert np.array_equal(result, list(expected_by_entry.values()))

# Run the check when the cell executes.
test_matches_eligible()

## Combine Projections

In [None]:
# I think everything below this needs to be jettisoned.
# Current state of above allows calculation of everything except replacement level.

In [None]:
def merge_fg(cbs, base_path, cols, name_filter=lambda x: x):
    """Left-merge every matching projection file in ``base_path`` onto ``cbs``.

    Each file in ``base_path`` that passes ``name_filter`` and does not have
    ``'cbs'`` in its name is loaded, restricted to players present in
    ``cbs``, and merged on ``playerid``. Columns that clash with existing
    ones get a suffix built from the first five characters of the file name
    (underscores stripped from both ends).

    Parameters
    ----------
    cbs:
        Base frame with a ``playerid`` column; it is not modified.
    base_path:
        Directory holding the projection files.
    cols:
        Columns (including ``'playerid'``) to take from each file.
    name_filter:
        Predicate on the file name; truthy means the file is used.

    Raises
    ------
    AssertionError
        If a non-empty file has no player in common with ``cbs``.
    """
    cbs = cbs.copy()
    ids = cbs.playerid.to_numpy()
    loader = DataLoader(base_path)
    for path in os.listdir(base_path):
        if not name_filter(path) or 'cbs' in path:
            continue
        df = loader.load_csv(path)
        if len(df) == 0:
            continue
        # Ids starting with 'sa' cannot be cast to int; drop those rows first.
        df = df[~df.playerid.astype(str).str.startswith('sa')]
        # ``assign`` returns a new frame, so the filtered slice is never
        # written through (avoids SettingWithCopyWarning).
        df = df.assign(playerid=df.playerid.astype(int))
        df = df[df.playerid.isin(ids)]
        if len(df) == 0:
            # Raised explicitly so that the check also runs under ``python -O``.
            raise AssertionError('df empty, which is sort of like wdf?')
        suffixes = ('', '_' + path[:5].strip('_'))
        cbs = cbs.merge(df[cols], on='playerid', suffixes=suffixes, how='left')
    return cbs

def merge_fg_hitters(cbs, base_path):
    """Merge the hitter projection files found in ``base_path`` onto ``cbs``.

    Only files with ``'hitter'`` in their name are considered.
    """
    hitter_cols = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']

    def is_hitter_file(filename):
        return 'hitter' in filename

    return merge_fg(cbs, base_path, hitter_cols, is_hitter_file)

def merge_fg_pitchers(cbs, base_path):
    """Merge the pitcher projection files found in ``base_path`` onto ``cbs``.

    Only files with ``'pitch'`` in their name are considered.
    """
    pitcher_cols = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']

    def is_pitcher_file(filename):
        return 'pitch' in filename

    return merge_fg(cbs, base_path, pitcher_cols, is_pitcher_file)


In [None]:
def add_weighted_stats(df, stats, playtime_stat):
    """Rescale projections to a common playing time and average them.

    ``df`` holds one unsuffixed projection (columns named like ``'HR'``)
    plus others whose columns carry a suffix (``'HR_<suffix>'``). The
    adjusted playing time ``'<playtime_stat>_adj'`` is the mean of the
    unsuffixed and ``'fg-de'`` playing time. Every suffixed stat is
    rescaled in place to that playing time, then ``'<stat>_adj'`` is set
    to the mean of the non-missing values across all projections.

    Every stat is rescaled by playing time, i.e. treated as a counting stat.

    Parameters
    ----------
    df:
        Merged projections; it is not modified.
    stats:
        Stat column names to rescale and average.
    playtime_stat:
        Name of the playing-time column, e.g. ``'PA'`` or ``'IP'``.

    Raises
    ------
    AssertionError
        If ``'<playtime_stat>_fg-de'`` is missing.
    KeyError
        If a suffixed projection lacks one of ``stats``.
    """
    df = df.copy()
    pt = playtime_stat
    if f'{pt}_fg-de' not in df.columns:
        # Raised explicitly so that the check also runs under ``python -O``.
        raise AssertionError(f"{pt} not in fg-de columns. Something is wrong.")

    # A projection suffix is identified by its playing-time column, so
    # unrelated columns that merely contain an underscore are ignored.
    pt_prefix = f'{pt}_'
    suffixes = {col[len(pt_prefix):] for col in df.columns if col.startswith(pt_prefix)}
    df[f'{pt}_adj'] = (df[f'{pt}_fg-de'] + df[pt]) / 2

    for suf in suffixes:
        for col in stats:
            df[f'{col}_{suf}'] = df[f'{col}_{suf}'] / df[f'{pt}_{suf}'] * df[f'{pt}_adj']
    for col in stats:
        adj_cols = [col] + [c for c in df.columns if c.startswith(f'{col}_')]
        # Mean over the projections that actually have a value for the player.
        df[f'{col}_adj'] = df[adj_cols].sum(axis=1) / df[adj_cols].notna().sum(axis=1)
    return df

#add_weighted_stats(cbs_hitters, cols[2:], 'PA')
# Columns taken from each Fangraphs projection file; everything from index
# 2 onward (i.e. after 'playerid' and the playing-time column) is rescaled
# and averaged by add_weighted_stats.
hitter_stats = ['playerid', 'PA',  'AB', 'H', 'HR', 'R', 'RBI', 'SB']
pitcher_stats = ['playerid', 'IP', 'ER', 'WHIP', 'K', 'S', 'W']
# Hitters: merge the projection files, then weight by plate appearances.
full_hitters = (cbs_hitter_proj
    .pipe(merge_fg_hitters, data_path + '/projections')
    .pipe(add_weighted_stats, hitter_stats[2:], 'PA')
    )
# Pitchers: merge the projection files, then weight by innings pitched.
full_pitchers = (cbs_pitchers_proj
    .pipe(merge_fg_pitchers, data_path + '/projections')
    .pipe(add_weighted_stats, pitcher_stats[2:], 'IP')
)

In [None]:
from fantasy_baseball_draft.spg import FantasyValuator, spgs_from_standings_html

# Valuator built from the saved 2021 CBS standings page.
# NOTE(review): the model earlier in this notebook uses the 2023 standings - confirm 2021 is intended here.
valuator = FantasyValuator(spgs_from_standings_html(os.path.join(data_path, 'standings/cbs_2021_standings.html')))


In [None]:

def extract_projection(df):
    """Reduce a merged projection frame to identifying columns and final stats.

    Keeps whichever of ``'Avail'``, ``'Player'``, ``'Eligible'`` and
    ``'playerid'`` are present, followed by every column containing
    ``'_adj'``, and removes ``'_adj'`` from the column names so the
    adjusted stats carry the plain stat names.
    """
    id_cols = [col for col in ['Avail', 'Player', 'Eligible', 'playerid'] if col in df.columns]
    adj_cols = [col for col in df.columns if '_adj' in col]
    keep = id_cols + adj_cols
    return df[keep].rename(columns={col: col.replace('_adj', '') for col in keep})

hitter_proj = extract_projection(full_hitters)
print(hitter_proj.PA.isna().sum())
# 'Eligible' is now carried through by extract_projection; fall back to
# merging it from the eligibility table only when the input lacked it, so
# the merge cannot create 'Eligible_x'/'Eligible_y' columns.
if 'Eligible' not in hitter_proj.columns:
    hitter_proj = hitter_proj.merge(elig[['Player', 'Eligible']],  how='left', on='Player')
# Players with no eligibility information are marked 'U'.
hitter_proj = hitter_proj.fillna({'Eligible': 'U'})
hitter_proj.isna().sum()
#hitter_proj[hitter_proj.Eligible.isna()]


In [None]:
# Value hitters; the second argument is 16*12 = 192 (presumably roster
# spots times teams - confirm against FantasyValuator).
hitter_proj['fwar'] = valuator.valuate_hitters(hitter_proj, 16*12)

# Pitchers: ERA is rebuilt from the averaged ER and IP before valuation.
pitcher_proj = extract_projection(full_pitchers)
pitcher_proj['ERA'] = pitcher_proj.ER / pitcher_proj.IP * 9
pitcher_proj['fwar'] = valuator.valuate_pitchers(pitcher_proj, 16*12)

# Display the five most valuable hitters.
hitter_proj.sort_values('fwar', ascending=False).head(5)

In [None]:
# Write the final projections to the 'local' folder; the frame index is
# written as a column named 'index'.
hitter_proj.to_csv('local/hitter_proj.csv', index_label='index')
pitcher_proj.to_csv('local/pitcher_proj.csv', index_label='index')

## Matching Unfound Players

Everything above is solid. WIP stuff to find more player ids.

In [None]:
# NOTE(review): `unfound_hitters`, `cbs22` and `fg22` are not defined in
# the visible part of this notebook; this cell cannot run as written.
france = unfound_hitters.loc[93, 'Player'] # in hitter_ids
cbs_player_col_to_df(unfound_hitters.Player)#.merge()
# Unique CBS team codes, with missing teams shown as '---'.
cbseam = cbs_player_col_to_df(cbs22.Player).Team.fillna('---').unique().tolist()
# Pair the two sorted code lists position by position; this is only
# meaningful if both sources sort their teams into the same order.
teams = list(zip(
    sorted(cbseam),
    sorted(fg22.Team.unique().tolist())
))
# Fangraphs code -> CBS code, for the pairs whose codes differ.
{b: a for a, b in teams if a != b}