In [1]:
import pandas as pd
import numpy as np
import altair as alt

This is our main function. It takes as input up to three player names describing what you want the Frankenstein player to be like. For example, you could tell the function that you want a player who hits like Aaron Judge, fields like Patrick Bailey and swings like Juan Soto. A similarity score is calculated for each category of statistics. The hit_like statistics are batted ball outcomes such as xwoba, xobp, gb rate and fb rate. The field_like statistics are the defensive statistics fielding run value and arm strength. Finally, the swing_like statistics are specific swing characteristics such as attack angle and attack direction. Each skillset carries a weight that can be adjusted in the function call. The batted and offense weights are both associated with the hit_like player, the swing weight is associated with the swing_like player and the defense weight is associated with the field_like player. The default settings weight offense at 0.6, the batted ball profile at 0.2, swing mechanics at 0.1 and defense at 0.1. For each skillset, the function calculates the cosine similarity between the player you input and every other player in the dataset for that skillset, and multiplies that cosine similarity by the specified weight. It then adds the weighted similarity for that skillset to each player's aggregate score. You do not need to input a player for all three archetypes: the function defaults to the hit_like player if no player is given for the field_like or swing_like inputs. For example, if you input Aaron Judge as hit_like and nothing as field_like, the function will use Aaron Judge as the field_like player. There is also an option to toggle the counterpart filter, which essentially acts as a way to filter down to more budget-level players who are not superstars.
So you may be able to find someone who provides a profile relatively similar to the ones you input, but at a much cheaper price tag than your typical superstar. To do this, set counterpart to True, choose a stat that typically signals overall player performance (such as xwoba) to filter the players by, and choose a percentage to filter out. For example, if the percentage is 5% (0.05), the function keeps only the players who are below the 95th percentile of the players currently in the dataset for that statistic. The final output is a chart of the top n players who are most similar to the skillsets that you specified in the function call. An example output is provided below which helps detail the inputs and outputs described here.

In [2]:
def frankenstein_recommend(
    hit_like,
    field_like=None,
    swing_like=None,
    X_scaled=None,
    top_n=15,
    weights=None,
    position=None,
    bats=None,
    min_shared=2,
    counterpart=False,
    counterpart_by="xwoba",
    counterpart_top_pct=0.05,
):
    """Chart the players most similar to a blend of up to three prototypes.

    For each feature group (offense, batted ball, swing path, defense) the
    cosine similarity between the relevant prototype and every player is
    computed over the features both players have (non-NaN), multiplied by the
    group's weight, and summed into a total score.

    Args:
        hit_like: Name of the prototype for the offense and batted-ball groups.
        field_like: Name of the prototype for the defense group; falls back to
            ``hit_like`` when None.
        swing_like: Name of the prototype for the swing-path group; falls back
            to ``hit_like`` when None.
        X_scaled: Player table. It may either carry a ``player_name_final``
            column or already be indexed by player name. When None, the table
            is read from ``../data/cleaned_player_data/cleaned_batters.csv``.
            The caller's frame is not modified.
        top_n: Number of best-scoring players to keep in the chart.
        weights: Mapping with the keys ``"offense"``, ``"batted"``,
            ``"swing"`` and ``"defense"``. Defaults to 0.6 / 0.2 / 0.1 / 0.1.
        position: If given, keep only rows whose ``position`` equals it.
        bats: If given, keep only rows whose ``side`` equals it.
        min_shared: Minimum number of features two players must both have for
            a similarity to be computed; otherwise the group contributes 0.
        counterpart: When True, keep only players strictly below the
            ``1 - counterpart_top_pct`` quantile of ``counterpart_by``.
        counterpart_by: Column used for the counterpart cutoff.
        counterpart_top_pct: Fraction at the top of ``counterpart_by`` to
            exclude when ``counterpart`` is True.

    Returns:
        An Altair stacked bar chart of the weighted per-prototype scores for
        the ``top_n`` players, prototypes themselves excluded.

    Raises:
        KeyError: If a prototype name is not present in the player table.
    """
    # Honour a caller-supplied table; only fall back to the CSV when none is
    # given (previously the argument was always overwritten).
    if X_scaled is None:
        X_scaled = pd.read_csv('../data/cleaned_player_data/cleaned_batters.csv')
    if 'player_name_final' in X_scaled.columns:
        X_scaled = X_scaled.set_index('player_name_final')
    else:
        # Already indexed by player: make sure reset_index() later yields the
        # column name the melt/chart code expects.
        X_scaled = X_scaled.rename_axis('player_name_final')
    df = X_scaled.drop(columns=['position', 'side'], errors='ignore')

    OFFENSE = [
        "xwoba", "xba", "xslg", "xiso", "xobp",
        "brl_percent", "hard_hit_percent",
        "exit_velocity", "max_ev",
        "k_percent", "bb_percent",
        "whiff_percent", "chase_percent",
    ]

    BATTED_BALL = [
        "gb_rate", "air_rate", "fb_rate", "ld_rate", "pu_rate",
        "pull_rate", "straight_rate", "oppo_rate",
    ]

    SWING_PATH = [
        "attack_angle",
        "attack_direction",
        "swing_tilt",
        "avg_bat_speed",
    ]

    DEFENSE = [
        "total_runs",
        "arm_strength_z",
    ]

    if weights is None:
        weights = {"offense": 0.6, "batted": 0.2, "swing": 0.1, "defense": 0.1}

    def row_of(player_name):
        # Positional lookup that stays a scalar even if the index has
        # duplicate names (Index.get_loc would return a slice/mask then).
        hits = np.flatnonzero(df.index == player_name)
        if hits.size == 0:
            raise KeyError(f"{player_name!r} is not in the player table")
        return hits[0]

    def sims_for(player_name, cols):
        # Similarity of `player_name` to every row over the feature set `cols`.
        cols = [c for c in cols if c in df.columns]  # tolerate missing columns
        if not cols:
            return np.zeros(len(df), dtype=float)

        B = df[cols].to_numpy(dtype=float)
        a_vec = B[row_of(player_name)]

        # Per-row mask of features present for both the prototype and the row;
        # zeroing the rest makes them drop out of the dot product and norms.
        shared = ~np.isnan(B) & ~np.isnan(a_vec)
        a_m = np.where(shared, a_vec, 0.0)
        b_m = np.where(shared, B, 0.0)
        denom = np.linalg.norm(a_m, axis=1) * np.linalg.norm(b_m, axis=1)
        usable = (shared.sum(axis=1) >= min_shared) & (denom != 0)

        # Rows without enough overlap (or with a zero vector) contribute 0 so
        # the other feature groups can still matter.
        sims = np.zeros(len(df), dtype=float)
        sims[usable] = (a_m * b_m).sum(axis=1)[usable] / denom[usable]
        return np.nan_to_num(sims, nan=0.0)

    field_proto = hit_like if field_like is None else field_like
    swing_proto = hit_like if swing_like is None else swing_like

    results = X_scaled.copy()
    results["score_hit"] = (
        weights["offense"] * sims_for(hit_like, OFFENSE)
        + weights["batted"] * sims_for(hit_like, BATTED_BALL)
    )
    results["score_field"] = weights["defense"] * sims_for(field_proto, DEFENSE)
    results["score_swing"] = weights["swing"] * sims_for(swing_proto, SWING_PATH)
    results['total_score'] = results["score_hit"] + results["score_field"] + results["score_swing"]

    # optional filters
    if position is not None:
        results = results[results["position"] == position]
    if bats is not None:
        results = results[results["side"] == bats]

    if counterpart:
        cutoff = results[counterpart_by].quantile(1 - counterpart_top_pct)
        results = results[results[counterpart_by] < cutoff]

    # exclude prototypes from output
    exclude = {x for x in (hit_like, field_like, swing_like) if x is not None}
    results = results.drop(index=[x for x in exclude if x in results.index], errors="ignore")

    results = (
        results[['score_hit', 'score_field', 'score_swing', 'total_score']]
        .sort_values("total_score", ascending=False)
        .head(top_n)
        .reset_index(drop=False)
    )

    result_melt = pd.melt(
        results,
        id_vars=['player_name_final', 'total_score'],
        value_vars=['score_hit', 'score_field', 'score_swing'],
    )

    # The title names which prototype drives which skill group.
    if field_like is not None and swing_like is not None:
        title = f'Hitters Most like {hit_like} (Offense), {field_like} (Fielding) and {swing_like} (Swing)'
    elif field_like is not None:
        title = f'Hitters Most like {hit_like} (Offense and Swing) and {field_like} (Fielding)'
    elif swing_like is not None:
        title = f'Hitters Most like {hit_like} (Offense and Fielding) and {swing_like} (Swing)'
    else:
        title = f'Hitters Most like {hit_like} (Offense, Fielding and Swing)'

    chart = alt.Chart(result_melt, title=title).mark_bar().encode(
        x=alt.X('sum(value):Q', title='Total Score'),
        y=alt.Y("player_name_final:N", title="Player Name").sort('-x'),
        color=alt.Color('variable', title='Prototype Type'),
        tooltip=['player_name_final', 'variable', 'value', 'total_score'],
    )

    return chart

Here is an example output of the function below. In this specific call we are looking for the players who have a combination of the most similar hitting outcomes to Aaron Judge, the defensive attributes of Patrick Bailey and a swing like Luis Arraez. We also adjust the weighting of the attributes by inputting a dictionary with the specified weights below. Therefore, this specific call will weight the player similarity evenly between the offense, batted ball and defense attributes (0.3 each) and weight the swing the least at 0.1. Based on the resulting chart, we can see that Wyatt Langford most closely resembles the Frankenstein player we are looking for. A majority of his score comes from his similarity to Aaron Judge in batted ball outcomes, with a similarity score in this set of statistics of 0.43 out of the total 0.67.

In [3]:
# Example blend: hit like Judge, field like Bailey, swing like Arraez.
chart = frankenstein_recommend(
    hit_like='Aaron Judge',
    field_like='Patrick Bailey',
    swing_like='Luis Arraez',
    X_scaled=None,
    top_n=15,
    # Offense, batted ball and defense weighted equally; swing weighted least.
    weights={"offense": 0.3, "batted": 0.3, "swing": 0.1, "defense": 0.3},
    position=None,
    bats=None,
    # With a top percentage of 0 the cutoff is the 1.0 quantile (the maximum
    # xwoba), so only players strictly below that maximum are kept.
    counterpart=True,
    counterpart_by='xwoba',
    counterpart_top_pct=0,
)

In [4]:
# Bare expression as the last line of the cell: presumably rendered by the
# notebook as the cell's output.
chart