In [2]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt

import psycopg2
import pymysql
import sqlalchemy_redshift
from configparser import ConfigParser
from sqlalchemy import create_engine
import seaborn as sns
from scipy.stats import gamma, lognorm, norm

from numpy import linalg as la
from scipy.stats import norm, poisson, gamma, rv_discrete

# Credentials live in a config file next to the notebook tree, not in the notebook.
parser = ConfigParser()
_ = parser.read("../notebook.cfg")

# --- zack_attack (MySQL via pymysql) ---
za_user, za_pwd = (parser.get("nffddev_2", key) for key in ("user", "password"))
za_engine = create_engine(
    f"mysql+pymysql://{za_user}:{za_pwd}@nffddev.numberfire.com/zack_attack",
    connect_args={"host": 'nffddev.numberfire.com', "port": 3306},
)
za_conn = za_engine.connect()

# --- Redshift (PostgreSQL protocol via psycopg2) ---
red_user, red_pwd = (parser.get("redshift", key) for key in ("user", "password"))
red_engine = create_engine(
    f"postgresql+psycopg2://{red_user}:{red_pwd}@rs1.usdfs.fdbox.net/fanduel",
    connect_args={"port": 5439},
)
red_conn = red_engine.connect()

In [3]:
def isPD(B):
    """Report whether ``B`` is positive-definite.

    The check is a Cholesky factorisation attempt: success means ``True``,
    a ``LinAlgError`` means ``False``.
    """
    try:
        la.cholesky(B)
    except la.LinAlgError:
        return False
    return True

def nearestPD(A):
    """Return a positive-definite matrix close to the input ``A``.

    This is a Python/NumPy port of John D'Errico's `nearestSPD` MATLAB code [1],
    which credits Higham [2].

    [1] https://www.mathworks.com/matlabcentral/fileexchange/42885-nearestspd
    [2] N.J. Higham, "Computing a nearest symmetric positive semidefinite
    matrix" (1988): https://doi.org/10.1016/0024-3795(88)90223-6
    """
    # Symmetrise the input, then average it with the factor rebuilt from its SVD.
    sym = (A + A.T) / 2
    _, sing_vals, Vh = la.svd(sym)
    polar = np.dot(Vh.T, np.dot(np.diag(sing_vals), Vh))
    candidate = (sym + polar) / 2
    # Symmetrise once more before testing.
    candidate = (candidate + candidate.T) / 2

    if isPD(candidate):
        return candidate

    # Still not PD: shift the diagonal by a growing multiple of the most
    # negative eigenvalue (plus a tiny spacing term) until Cholesky succeeds.
    eps = np.spacing(la.norm(A))
    identity = np.eye(A.shape[0])
    step = 1
    while not isPD(candidate):
        smallest_eig = np.min(np.real(la.eigvals(candidate)))
        candidate += identity * (-smallest_eig * step**2 + eps)
        step += 1
    return candidate

### Reception Cholesky

In [117]:
## NF proj
# Pulls one row per (player, team, game) projection from the current and the
# archived skill-projection tables (UNION, so exact duplicates collapse), with:
#   - mean_* / sd_* : projected mean and standard deviation per stat
#   - pass_yards, pass_completions, pass_attempts, rec : values from
#     nfl_statline_skill (NULL when no statline row matches, because of the
#     LEFT JOIN)
#   - tm : 'a' when the player's team is the home team, 'b' when it is the away team
# The WHERE clause keeps only rows whose team_id is one of the game's two teams.
nf_qry = '''
select
h.name as home_name,
a.name as away_name,
skill.*,
st.pass_yards,
st.pass_completions,
st.pass_attempts,
st.rec,
p.name as player_name,
p.position,
case
    when skill.team_id = g.home_team_id then 'a'
    when skill.team_id = g.away_team_id then 'b'
end as tm
from (
    select player_id, team_id, game_id,
    pass_yards as mean_pass_yards, sd_pass_yards,
    pass_completions as mean_pass_completions, sd_pass_completions,
    pass_attempts as mean_pass_attempts, sd_pass_attempts,
    rec as mean_rec, sd_rec
    from nfl_projection_skill
    union
    select player_id, team_id, game_id,
    pass_yards as mean_pass_yards, sd_pass_yards,
    pass_completions as mean_pass_completions, sd_pass_completions,
    pass_attempts as mean_pass_attempts, sd_pass_attempts,
    rec as mean_rec, sd_rec
    from nfl_projection_archive_skill
) skill
left join nfl_player p on skill.player_id = p.id
left join nfl_statline_skill st on st.player_id = skill.player_id and st.game_id = skill.game_id
left join nfl_game g on skill.game_id = g.id
left join nfl_team h on h.id = g.home_team_id
left join nfl_team a on a.id = g.away_team_id
where (skill.team_id = g.home_team_id or skill.team_id = g.away_team_id);
'''
# Executed against the zack_attack MySQL connection.
proj = pd.read_sql(nf_qry, za_conn)

# Reshape the wide projection frame to long format: one row per
# player / game / prop with columns line (projected mean), sd and value (actual).
prop_names = ["rec"]
proj_long = []
for prop_name in prop_names:
    proj_long_i = (
        proj.copy()[
            ["player_id", "player_name", "game_id", "position", "tm",
             f"sd_{prop_name}", f"mean_{prop_name}", prop_name]
        ]
        .rename(columns={f"sd_{prop_name}": "sd", f"mean_{prop_name}": "line", prop_name: "value"})
        .query("line > 0")
        .assign(prop_name=prop_name)
    )
    proj_long.append(
        proj_long_i[
            ["player_id", "player_name", "game_id", "position", "tm",
             "prop_name", "line", "sd", "value"]
        ]
    )
# Stack the per-prop frames; any missing value (e.g. no statline) becomes 0.
proj_long = pd.concat(proj_long, axis=0).fillna(0)

# Rank players within each side / game / prop by projected line, highest first;
# ties are broken by row order.
proj_long['rank'] = (
    proj_long
    .groupby(["tm", "game_id", "prop_name"])["line"]
    .rank(method="first", ascending=False)
)

# Keep the top 10 per group and build the variable key
# "<prop>_<position>_<rank>_<side>" used as pivot column later on.
proj_long = (
    proj_long
    .query("rank <= 10")
    .assign(
        var_name=lambda x: (
            x["prop_name"].astype(str)
            + "_" + x["position"].astype(str)
            + "_" + x["rank"].astype(int).astype(str)
            + "_" + x["tm"].astype(str)
        )
    )
)

# From here on `proj` refers to the long, ranked frame.
proj = proj_long
del proj_long

In [62]:
# Prop charts for the four props of interest. Later cells read
# `prop_charts.columns` and call `prop_charts.melt(...)`, so the frame must be
# loaded here; with the read commented out the notebook only worked when
# `prop_charts` was still alive in the kernel from an earlier session.
prop_charts_qry = '''
select *
from analyst_dev.nfl_prop_charts
where prop_name in ('pass_yards', 'pass_attempts', 'pass_completions', 'rec')
'''
prop_charts = pd.read_sql(prop_charts_qry, red_conn)

In [63]:
# Game lookup: id and date of every nfl_game row together with the ids and
# abbreviations of its home and away teams (inner joins, so games whose team ids
# have no nfl_team row are dropped).
game_qry = '''
select 
g.id as game_id,
g.date as gamedate, 
g.away_team_id,
a.abbrev as away_abbrev,
g.home_team_id,
h.abbrev as home_abbrev
from nfl_game g
inner join nfl_team h on h.id = g.home_team_id
inner join nfl_team a on a.id = g.away_team_id
'''
nfl_games = pd.read_sql(game_qry, za_conn)
# Parse gamedate to datetime; it is used as a merge key against prop["gamedate"] below.
nfl_games["gamedate"] = pd.to_datetime(nfl_games["gamedate"])

# Sportsbook prop lines for a single, hard-coded game date (2023-02-12).
# Current and archive tables are UNIONed; `line` is the first non-null book in
# the order fd, pinny, dk, czr, mgm (archive: fd, pinny only). The archive
# gamedate is cast from the first 10 characters of its value.
# Only the four props listed in the WHERE clause are returned, highest line first.
prop_qry = '''
with prop as (
    select
    gamedate, season, week, prop_name,
    position, name, id as player_id, team, opp,
    coalesce(fd, pinny, dk, czr, mgm) as line
    from analyst_dev.nfl_prop_data
    union
    select
    cast(left(gamedate,10) as date) gamedate, season, week, prop_name,
    position, name, id as player_id, team, opp,
    coalesce(fd, pinny) as line
    from analyst_dev.nfl_prop_data_archive
)
select * from prop
where gamedate = '2023-02-12'
and prop_name in ('pass_yards', 'pass_attempts', 'pass_completions', 'rec')
order by line desc
'''
# Read from Redshift and rename `name` so it lines up with the projection frame's player_name.
prop = pd.read_sql(prop_qry, red_conn).rename(columns={"name":"player_name"})
# Parse gamedate to datetime so it matches nfl_games["gamedate"] in the merge below.
prop["gamedate"] = pd.to_datetime(prop["gamedate"])

# Attach game_id / team_id to every prop row by matching (gamedate, team, opp).
# nfl_games is stacked twice so that a prop's `team` can match either the home
# or the away side of a game.
games_team_is_home = nfl_games.rename(
    columns={"home_abbrev":"team","away_abbrev":"opp", "home_team_id":"team_id"}
)
games_team_is_away = nfl_games.rename(
    columns={"home_abbrev":"opp","away_abbrev":"team", "away_team_id":"team_id"}
)
prop = (
    prop
    .merge(
        pd.concat([games_team_is_home, games_team_is_away]),
        on=["gamedate", "team", "opp"],
    )
    # The stacked frame leaves one of these half-empty; drop both and re-attach
    # the complete home/away columns from nfl_games by game_id.
    .drop(["home_team_id", "away_team_id"], axis=1)
    .merge(nfl_games.drop("gamedate", axis=1), on='game_id')
)

# Side label: 'a' when the player's team is the home team, otherwise 'b'.
prop["tm"] = prop.apply(
    lambda row: "a" if row["team_id"] == row["home_team_id"] else "b",
    axis=1,
)

# Rank players within each team / game / prop by line, highest first;
# ties are broken by row order.
prop['rank'] = (
    prop
    .groupby(["team_id", "game_id", "prop_name"])["line"]
    .rank(method="first", ascending=False)
)

# Keep the top 10 per group and build the "<prop>_<position>_<rank>_<side>" key.
prop = (
    prop
    .query("rank <= 10")
    .assign(
        var_name=lambda x: (
            x["prop_name"].astype(str)
            + "_" + x["position"].astype(str)
            + "_" + x["rank"].astype(int).astype(str)
            + "_" + x["tm"].astype(str)
        )
    )
)

# Chart columns are named "u_<n>"; everything else listed in id_cols identifies a chart.
line_cols = [col for col in prop_charts.columns if col[0:2] == "u_"]
id_cols = ["chart_type", "prop_name", "position", "mean"]

# Long format: one row per (chart, x) holding the chart's cdf value at x.
# Rows with any missing value are dropped.
prop_piv = (
    prop_charts
    .melt(id_vars=id_cols, value_vars=line_cols, var_name='x', value_name='cdf')
    .dropna()
)
# Strip the "u_" marker and keep the numeric part as an integer.
prop_piv["x"] = prop_piv["x"].str.replace("u_", "", regex=False).astype(int)

In [281]:
# Completed pass plays from the 2022 season caught by a WR, TE or RB:
# one row per matching nfl_plays row with the receiver's id, name, position and
# the play's total_yards (exposed as `yards`).
qry = '''
select
receiver_id as player_id,
player.name as player_name,
player.position,
total_yards as yards
from nfl_plays plays
inner join nfl_player player on player.id = plays.receiver_id
where category = 'PASS'
and is_complete_pass = 1
and season = 2022
and position in ('WR', 'TE', 'RB')
'''
# Executed against the zack_attack MySQL connection.
rec_df = pd.read_sql(qry, za_conn)

### Analysis

In [282]:
# Correlation of realised values between ranked receiver slots across games:
# rows are games, columns are var_name slots. Pairs with no defined correlation
# are set to 0.
receiver_values_by_game = (
    proj
    .query("position in ['WR', 'TE', 'RB']")
    .pivot(index='game_id', columns='var_name', values='value')
    .reset_index(drop=True)
)
corr_mat = receiver_values_by_game.corr().fillna(0)

In [283]:
# Player parameter dictionaries: one dict per reception prop row of game 8297
# (hard-coded game id; presumably the game on the 2023-02-12 date the prop query
# is filtered to — confirm).
sb_rec_props = prop.query("game_id == 8297 and prop_name == 'rec'")
sb_plrs = sb_rec_props.to_dict(orient="records")

In [284]:
# Position-level lognormal parameters for reception yards per catch,
# keyed by position; each value is the tuple returned by lognorm.fit.
# NOTE(review): `loc=0` is passed as `loc`, not `floc`; per SciPy's `fit` docs
# that is only a starting guess, so the fitted location is free to move.
# Confirm that is intended.
rec_df_pos = rec_df.query("yards > 0")
pos_lognorm_params = {}
for pos in ["WR", "TE", "RB"]:
    pos_yards = rec_df.query(f"yards > 0 and position == '{pos}'").yards
    pos_lognorm_params[pos] = lognorm.fit(pos_yards, loc=0)

def add_distributions(plr):
    """Attach a reception-count chart and a yards-per-catch fit to a player record.

    Args:
        plr: dict describing one player prop row. Must contain "player_id",
            "position" (a key of `pos_lognorm_params`) and "line".

    Returns:
        The same dict, mutated in place, with two extra keys:

        - "rec_dist": the rows of `prop_piv` with prop_name 'rec' whose chart
          `mean` is closest to the player's line, plus a `diff` column
          (|mean - line|) and a `pdf` column (first difference of `cdf`
          within each chart).
        - "rec_lognorm_params": the tuple returned by `lognorm.fit` on the
          player's positive reception yardages padded with 10 draws from the
          position-level lognormal.
    """
    plr_id = plr["player_id"]
    plr_pos = plr["position"]
    line = plr["line"]

    # Empirical receptions: pick the chart(s) whose mean is nearest the line.
    # NOTE(review): charts are not filtered by the player's position, so rows
    # from several position / chart_type charts may be selected together — confirm.
    dist = (
        prop_piv
        .query("prop_name == 'rec'")
        .sort_values(["mean", "x"])
        .assign(diff=lambda x: np.abs(x["mean"] - line))
    )
    dist = dist[dist["diff"] == dist["diff"].min()].copy()

    # pdf = cdf(x) - cdf(previous x) *within one chart*; the first x of a chart
    # keeps its cdf. Differencing the whole stacked table instead would subtract
    # the previous chart's last cdf from the next chart's first value.
    dist["pdf"] = (
        dist
        .groupby(["chart_type", "position", "mean"])["cdf"]
        .diff()
        .fillna(dist["cdf"])
    )
    plr["rec_dist"] = dist

    # Lognormal reception yards per catch. The player's own sample is padded
    # with 10 random draws from the position-level fit before fitting.
    # NOTE(review): `loc=0` is only the optimiser's starting guess (SciPy uses
    # `floc` to pin the location) — confirm that a free location is intended.
    plr_yrds = rec_df.query(f"yards > 0 and player_id == {plr_id}").yards.values
    pos_yrds = lognorm(*pos_lognorm_params[plr_pos]).rvs(10)
    plr["rec_lognorm_params"] = lognorm.fit(np.concatenate([plr_yrds, pos_yrds]), loc=0)

    return plr

# Enrich every player record with its reception chart and yards-per-catch fit.
sb_plrs = list(map(add_distributions, sb_plrs))

In [293]:
# Order of variables = order of players in sb_plrs; row i of every array below
# belongs to sb_plrs[i].
variables = [x["var_name"] for x in sb_plrs]

# Correlation sub-matrix for these players, nudged to the nearest
# positive-definite matrix so that it has a Cholesky factor.
c = nearestPD(corr_mat.loc[variables, variables])
L = np.linalg.cholesky(c)

# Seed NumPy's global RNG so the draws are repeatable under Restart & Run All.
# SciPy's `.rvs` uses the same global state when no `random_state` is passed,
# so draws made after this cell are repeatable as well for identical inputs.
SIM_SEED = 20230212
np.random.seed(SIM_SEED)

# Independent standard normals -> correlated normals via the Cholesky factor.
s_samples = 100_000
uncorrelated = np.random.standard_normal((len(variables), s_samples))
correlated = np.dot(L, uncorrelated)

# Standardise each player's row, then map through the normal CDF to get
# correlated uniform(0, 1) draws.
z = [(x - np.mean(x)) / np.std(x) for x in correlated]
cdf = [norm.cdf(zi) for zi in z]

In [294]:
# For every player: turn the correlated uniform draws into reception counts via
# the player's chart, then simulate the longest reception of each simulated game.
sim_results = []
for i in tqdm(range(len(sb_plrs))):
    rec_dist = sb_plrs[i]["rec_dist"]
    u = np.asarray(cdf[i])

    # Chart lookup: for each draw take the largest x whose cdf <= draw, or 0 when
    # no chart row qualifies. Sorting the chart by cdf once and keeping a running
    # max of x makes the rows with cdf <= draw a prefix, so a binary search
    # replaces one DataFrame filter per draw.
    order = np.argsort(rec_dist["cdf"].values, kind="stable")
    cdf_sorted = rec_dist["cdf"].values[order]
    x_running_max = np.maximum.accumulate(rec_dist["x"].values[order])
    if len(cdf_sorted) == 0:
        rec = np.zeros(len(u), dtype=int)
    else:
        # Number of chart rows with cdf <= draw.
        n_le = np.searchsorted(cdf_sorted, u, side="right")
        # NaN draws compare False against every cdf, so they map to 0 as well.
        has_match = (n_le > 0) & ~np.isnan(u)
        rec = np.where(has_match, x_running_max[np.maximum(n_le - 1, 0)], 0).astype(int)

    # Longest reception: max of `n_rec` draws from the player's lognormal
    # yards-per-catch distribution; 0 when the player has no receptions.
    rec_yrds_dist = lognorm(*sb_plrs[i]["rec_lognorm_params"])
    max_rec_yrds = [
        rec_yrds_dist.rvs(n_rec).max() if n_rec > 0 else 0
        for n_rec in rec
    ]

    sim_results.append(max_rec_yrds)

# Shape (n_simulations, n_players): one row per simulated game.
sim_results = np.vstack(sim_results).T

100%|██████████| 11/11 [03:49<00:00, 20.84s/it]


In [295]:
def any_60(x):
    """Return 1 when the largest value in ``x`` is strictly above 60, else 0."""
    exceeds = x.max() > 60
    return exceeds.astype(int)

# Share of simulated games in which at least one player's longest reception is above 60.
np.apply_along_axis(any_60, axis=1, arr=sim_results).mean()

0.19891

In [296]:
def is_longest(x):
    """One-hot vector marking the position of the maximum of ``x``.

    When the maximum occurs more than once, only its first occurrence is marked.
    """
    peak = np.max(x)
    first_hit = np.flatnonzero(x == peak)[0]
    indicator = np.zeros(len(x))
    indicator[first_hit] = 1
    return indicator

# One-hot "who had the longest reception" per simulated game, averaged over
# games to give each player's probability of having the longest reception.
longest_flags = np.apply_along_axis(is_longest, 1, sim_results)
prob_longest = longest_flags.mean(axis=0)

pd.DataFrame(
    {
        "player_name": [plr["player_name"] for plr in sb_plrs],
        "prob_longest": prob_longest,
    }
)

Unnamed: 0,player_name,prob_longest
0,Travis Kelce,0.15614
1,JuJu Smith-Schuster,0.05859
2,Jerick McKinnon,0.06493
3,Marquez Valdes-Scantling,0.1024
4,Isiah Pacheco,0.02917
5,DeVonta Smith,0.14158
6,A.J. Brown,0.31053
7,Dallas Goedert,0.08191
8,Kenneth Gainwell,0.0009
9,Quez Watkins,0.05325
