In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt
from scipy.special import logit, expit
import statsmodels.api as sm
import statsmodels.formula.api as smf

def softmax(x):
    """Return the softmax of `x`: exp(x) normalised so the result sums to 1.

    The maximum is subtracted before exponentiating. Softmax is invariant to
    a constant shift, so the result is mathematically unchanged, but large
    inputs no longer overflow `np.exp` and turn the output into NaN.

    Works on NumPy arrays as well as pandas Series / DataFrames (for a
    DataFrame the sum is whatever `np.sum` returns for it, as before).
    Empty input is returned as an empty result.
    """
    # np.max raises on an empty array, so fall back to a zero shift there.
    shift = np.max(x) if np.size(x) else 0
    exp_shifted = np.exp(x - shift)
    return exp_shifted / np.sum(exp_shifted)

def normalize(x):
    """Scale `x` by its total so that the entries sum to 1."""
    total = np.sum(x)
    return x / total

from configparser import ConfigParser
from urllib.parse import quote

import psycopg2
import pymysql
import sqlalchemy_redshift
from sqlalchemy import create_engine

# Database credentials are read from ../notebook.cfg rather than hard-coded here.
parser = ConfigParser()
_ = parser.read("../notebook.cfg")

# Connect to zack attack
za_user = parser.get("nffddev_2", "user")
za_pwd = parser.get("nffddev_2", "password")

# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", "/", "%" or "+" in a password would otherwise be mis-parsed by
# SQLAlchemy's URL parser. Plain alphanumeric credentials are left unchanged.
za_engine = create_engine(
    f"mysql+pymysql://{quote(za_user, safe='')}:{quote(za_pwd, safe='')}@nffddev.numberfire.com/zack_attack",
    connect_args = dict(host='nffddev.numberfire.com', port=3306)
)
za_conn = za_engine.connect()

# # connect to redshift
# red_user = parser.get("redshift", "user")
# red_pwd = parser.get("redshift", "password")

# red_engine = create_engine(
#     f"postgresql+psycopg2://{red_user}:{red_pwd}@rs1.usdfs.fdbox.net/fanduel",
#     connect_args = dict(port=5439)
# )
# red_conn = red_engine.connect()

### Prop Data

In [3]:
# # Team
# nfl_team = pd.read_sql("select * from nfl_team", za_conn)\
#     .rename(columns={"id":"team_id"})

# # Game
# nfl_game = pd.read_sql("select * from nfl_game", za_conn)\
#     .rename(columns={"id":"game_id", "date":"gamedate"})
# nfl_game["gamedate"] = pd.to_datetime(nfl_game["gamedate"])


# prop_qry = '''
# with nfl_prop_full as (
#     select
#     left(gamedate,10) gamedate,
#     season,week,prop_name,position,name,
#     id as player_id,team, opp,
#     fd, pinny, dk, czr, mgm
#     from analyst_dev.nfl_prop_data
#     union
#     select
#     left(gamedate,10) gamedate,
#     season,week,prop_name,position,name,
#     id as player_id,team, opp,
#     fd, pinny, NULL as dk, NULL as czr, mgm
#     from analyst_dev.nfl_prop_data_archive
# ),
# prop_filt as (
#     select *,
#     coalesce(fd, pinny, dk, czr, mgm) as line
#     from nfl_prop_full
#     where prop_name in ('rush_yards', 'rec_yards', 'pass_yards', 'rec')
# )
# select * from prop_filt where line is not null
# '''
# prop = pd.read_sql(prop_qry, red_conn)\
#     .merge(
#         nfl_team[["abbrev", "team_id"]]\
#             .rename(columns={"abbrev":"team"}), 
#         on = "team",
#         how = "left"
#     )
# prop["gamedate"] = pd.to_datetime(prop["gamedate"])

# # build game dataset
# nfl_prop_game = nfl_game.merge(
#     nfl_team[["team_id", "abbrev"]]\
#         .rename(columns={"team_id":"home_team_id", "abbrev":"home_team_abbrev"}),
#     on = ["home_team_id"]
# ).merge(
#     nfl_team[["team_id", "abbrev"]]\
#         .rename(columns={"team_id":"away_team_id", "abbrev":"away_team_abbrev"}),
#     on = ["away_team_id"]
# )[["game_id", "gamedate", "home_team_abbrev", "away_team_abbrev"]]

# nfl_prop_game = pd.concat([
#     nfl_prop_game.rename(columns={"home_team_abbrev":"team", "away_team_abbrev":"opp", "home_team_id":"team_id"}),
#     nfl_prop_game.rename(columns={"away_team_abbrev":"team", "home_team_abbrev":"opp", "away_team_id":"team_id"})
# ],axis=0)

# # join game data set
# prop = prop.merge(nfl_prop_game, on = ["gamedate", "team", "opp"])

# ## rank rushers and receivers
# prop["rank_team_pos"] = prop\
#     .groupby(["game_id", "team_id", "prop_name", "position"])["line"]\
#     .rank(method="first",ascending=False).astype(int)

# prop["rank_team"] = prop\
#     .groupby(["game_id", "team_id", "prop_name"])["line"]\
#     .rank(method="first",ascending=False).astype(int)

# # join on actual data
# prop = prop.merge(skill_df, on = ["game_id", "player_id", "prop_name"], how = "left").fillna(0)
# prop.head()

### Projection Data

In [4]:
# One row per player / game from the skill projection table, with the team
# name, player name, position and game date joined on.
proj_qry = """
select 
t.name as team_name,
p.name as player_name,
p.position,
s.player_id, s.team_id, s.game_id, g.date as gamedate,
s.rush_yards, s.pass_yards, s.rec_yards, s.rec
from nfl_projection_skill s
inner join nfl_team t on s.team_id = t.id
inner join nfl_player p on p.id = s.player_id
inner join nfl_game g on g.id = s.game_id
"""

# Reshape to long format: each stat column becomes its own row, with the stat
# name in `prop_name` and its value in `line`.
proj = (
    pd.read_sql(proj_qry, za_conn)
    .melt(
        id_vars=[
            "team_name", "player_name", "position",
            "player_id", "team_id", "game_id", "gamedate",
        ],
        value_vars=["rec", "rec_yards", "rush_yards", "pass_yards"],
    )
    .rename(columns={"variable": "prop_name", "value": "line"})
    .assign(gamedate=lambda frame: pd.to_datetime(frame["gamedate"]))
)

# Rank players by descending `line` (ties broken by row order) within their
# team and position for each game and stat ...
proj["rank_team_pos"] = (
    proj.groupby(["game_id", "team_id", "prop_name", "position"])["line"]
    .rank(ascending=False, method="first")
    .astype(int)
)

# ... and within their whole team, ignoring position.
proj["rank_team"] = (
    proj.groupby(["game_id", "team_id", "prop_name"])["line"]
    .rank(ascending=False, method="first")
    .astype(int)
)

### PBP Data

In [5]:
# Play-level stat rows: every completed pass yields a rec_yards row and a rec
# row (value 1) for the receiver plus a pass_yards row for the passer, and
# every rush yields a rush_yards row for the rusher. Each row also carries the
# highest play_number seen in its game as n_plays.
pbp_qry = '''
select yards.*,
mx_pl.n_plays
from (
    select game_id, offense_team_id as team_id,
    play_number, receiver_id as player_id,
    'rec_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
    union
    select game_id, offense_team_id as team_id,
    play_number, rusher_id as player_id,
    'rush_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'RUSH'
    union
    select game_id, offense_team_id as team_id,
    play_number, passer_id as player_id,
    'pass_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
    union
    select game_id, offense_team_id as team_id,
    play_number, receiver_id as player_id,
    'rec' as prop_name, 1 as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
) yards
inner join (
    select game_id, max(play_number) n_plays
    from nfl_plays group by 1
) mx_pl on yards.game_id = mx_pl.game_id
'''

# Missing player ids are mapped to 0 so the column can be stored as integers.
pbp_df = (
    pd.read_sql(pbp_qry, za_conn)
    .assign(player_id=lambda plays: plays["player_id"].fillna(0).astype("int"))
)

### Model Data

In [6]:
# Columns carried over from the projection frame into the modelling frame.
prop_cols = [
    "game_id", "player_id", "team_id", "gamedate",
    "position", "prop_name", "line", "rank_team", "rank_team_pos"
]
#mod_df = prop[prop_cols] change to this if you get redshift to work

# Keep each team's top-5 projected players per game and stat, then attach
# their play-by-play rows. The join is a left join, so players without any
# recorded play for that stat keep a single row with missing play columns.
mod_df = (
    proj[prop_cols]
    .query("rank_team <= 5")
    .merge(
        pbp_df,
        on=["game_id", "player_id", "team_id", "prop_name"],
        how="left",
    )
    .assign(
        # Impute the game's play count for players with no real stats: every
        # row gets the max n_plays seen for its game. Only the n_plays column
        # is transformed, not the whole frame.
        n_plays=lambda x: x.groupby("game_id")["n_plays"].transform("max")
    )
    .fillna(value={"play_number": 0, "value": 0})
)

mod_df["player_id"] = mod_df["player_id"].astype(int)

# "<prop_name>_<game_id>_<player_id>", built column-wise instead of formatting
# each row in Python.
mod_df["prop_id"] = (
    mod_df["prop_name"].astype(str)
    + "_" + mod_df["game_id"].astype(str)
    + "_" + mod_df["player_id"].astype(str)
)

# Running total of the stat per prop_id in play order. The result is aligned
# back onto mod_df by index, so mod_df itself is not reordered.
mod_df["value_cum"] = (
    mod_df
    .sort_values(["prop_id", "play_number"])
    .groupby(["prop_id"])["value"]
    .cumsum()
)

### Model

In [7]:
def build_multi_logit_model(prop_name, value, position):
    """Fit a logistic model for "first player in the game to pass `value`".

    Reads the module-level `mod_df` frame (one row per player play, with the
    running stat total in `value_cum`).

    Parameters
    ----------
    prop_name : str
        Stat to model; matched against the `prop_name` column of `mod_df`.
    value : number
        Threshold. A player "hits" on a play once `value_cum` is strictly
        greater than it.
    position : list of str
        Position codes to include.

    Returns
    -------
    (str, fitted model)
        The name "<positions joined by _>_<prop_name>_<value>" and the fitted
        binomial GLM of `fastest ~ line`.

    Notes
    -----
    `n_teammates` and `is_starter` are computed and kept in the training frame
    but are not part of the model formula.
    """
    # Rows for this stat, top-3 at their position, games dated before now.
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' and \
            rank_team_pos in [1,2,3] and \
            position in {position} and \
            gamedate < '{datetime.today()}'
        """)\
        .drop("position", axis=1)\
        .copy()

    # Determine the play on which each player crosses the threshold.
    # hit: 1 on every play where the running total is above the threshold.
    mod_df_sub["hit"] = (mod_df_sub["value_cum"] > value).astype(int)
    # no_hit: 1 on all rows of a player who never crossed it in that game.
    mod_df_sub["no_hit"] = 1 - mod_df_sub.groupby(["game_id", "player_id"])["hit"].transform("max")
    # t: play_number on hit rows (0 on rows before the crossing); for players
    # who never hit, the number of plays in the game.
    mod_df_sub["t"] = (mod_df_sub["play_number"] * mod_df_sub["hit"]).mask(
        mod_df_sub["no_hit"] == 1, mod_df_sub["n_plays"]
    )

    # Dropping t == 0 removes the pre-crossing rows; the min of what is left
    # is the first crossing (or n_plays for players who never hit).
    mod_df_filt = mod_df_sub\
        .query("t > 0")\
        .groupby(["game_id", "team_id", "player_id", "line", "rank_team_pos"])\
        .agg({"t": "min", "hit": "max"})\
        .reset_index()

    # fastest: 1 for the player with the smallest t in the game, provided that
    # player actually hit.
    mod_df_filt["t_min"] = mod_df_filt.groupby("game_id")["t"].transform("min")
    mod_df_filt["fastest"] = (mod_df_filt["t"] == mod_df_filt["t_min"]) * mod_df_filt["hit"]
    # Number of other rows sharing the same game and team.
    mod_df_filt["n_teammates"] = mod_df_filt.groupby(["game_id", "team_id"])["player_id"].transform("size") - 1
    mod_df_filt["is_starter"] = (mod_df_filt["rank_team_pos"] == 1).astype(int)

    # Only train on games in which at least one player hit.
    valid_games = mod_df_filt\
        .groupby("game_id")\
        .agg({"fastest": "sum"})\
        .reset_index()\
        .rename(columns={"fastest": "n_hit"})\
        .query("n_hit > 0")

    df = valid_games.merge(mod_df_filt, on="game_id")

    model = smf.glm(
        formula="fastest ~ line",
        data=df,
        family=sm.families.Binomial()
    ).fit()

    pos = "_".join(position)
    nme = f"{pos}_{prop_name}_{value}"

    return nme, model

In [8]:
# One (prop_name, threshold, positions) tuple per market to fit.
model_list = [
    # rushing-yard thresholds, running backs only
    ("rush_yards", 10, ["RB"]),
    ("rush_yards", 25, ["RB"]),
    ("rush_yards", 50, ["RB"]),
    # receiving-yard thresholds, WR / TE / RB together
    ("rec_yards", 20, ["WR", "TE", "RB"]),
    ("rec_yards", 40, ["WR", "TE", "RB"]),
    ("rec_yards", 60, ["WR", "TE", "RB"]),
    ("rec_yards", 25, ["WR", "TE", "RB"]),
]

# Fit every market and store the model under the name the builder returns.
multi_logit_models = {}
for mod_param in tqdm(model_list):
    nme, multi_logit_model = build_multi_logit_model(*mod_param)
    multi_logit_models.update({nme: multi_logit_model})

100%|██████████| 7/7 [00:00<00:00,  7.06it/s]


In [11]:
def get_multi_logit_predictions(market, models, proj):
    """Price a "first player to pass the threshold" market for one game.

    Parameters
    ----------
    market : dict
        Must contain `game_id`, `prop_name`, `position` (list of position
        codes) and `value` (the threshold).
    models : dict
        Fitted models keyed by "<positions joined by _>_<prop_name>_<value>",
        the names produced by `build_multi_logit_model`.
    proj : pandas.DataFrame
        Long-format projections with at least the columns game_id, position,
        prop_name, player_name, team_id, rank_team_pos and line.

    Returns
    -------
    pandas.DataFrame
        One row per selected player with player_name, position, team_id,
        rank_team_pos, line, Intercept, `p` (softmax of the model's linear
        predictor over the selected players) and `price` (1 / p).

    Raises
    ------
    KeyError
        If `market` lacks a required key or `models` has no entry for it.
    """
    # unpack market
    game_id = market["game_id"]
    prop_name = market["prop_name"]
    position = market["position"]
    value = market["value"]

    # .copy() gives an independent frame, so the columns added below are not
    # written onto a slice of `proj`.
    pred_df = proj.query(
        f"game_id == {game_id} and position in {position} and prop_name == '{prop_name}'"
    )[["player_name", "position", "team_id", "rank_team_pos", "line"]].copy()

    # assign vars
    pred_df["Intercept"] = 1
    #pred_df["n_teammates"] = pred_df.assign(cnt=1).groupby(["team_id"])['cnt'].transform("sum")-1
    #pred_df["is_starter"] = pred_df["rank_team_pos"].apply(lambda x: 1 if x == 1 else 0)

    pos = "_".join(position)
    nme = f"{pos}_{prop_name}_{value}"
    model = models[nme]

    # Design matrix with columns in the same order as the fitted coefficients.
    X = pred_df[model.params.index.to_list()]
    beta = np.array(model.params).reshape(-1, 1)

    # Softmax over the linear predictor, so probabilities sum to 1 across the
    # selected players.
    pred_df["p"] = softmax(X @ beta)
    pred_df["price"] = 1 / pred_df["p"]
    return pred_df


# Example market: receiving yards above 25 for WR / TE / RB in game 8297.
market = dict(
    game_id=8297,
    position=["WR", "TE", "RB"],
    prop_name="rec_yards",
    value=25,
)

# Show the priced players, highest projected line first.
(
    get_multi_logit_predictions(market, multi_logit_models, proj)
    .sort_values("line", ascending=False)
)

Unnamed: 0,player_name,position,team_id,rank_team_pos,line,Intercept,p,price
9673,Travis Kelce,TE,16,1,83.88,1,0.24763,4.038277
13322,A.J. Brown,WR,24,1,73.8,1,0.166654,6.000463
15697,DeVonta Smith,WR,24,2,65.96,1,0.122475,8.164955
12490,Marquez Valdes-Scantling,WR,16,1,57.49,1,0.087807,11.388646
12231,Dallas Goedert,TE,24,1,52.62,1,0.072516,13.79005
9914,Jerick McKinnon,RB,16,1,29.71,1,0.029481,33.920701
17259,Isiah Pacheco,RB,16,2,23.68,1,0.023262,42.988365
11042,JuJu Smith-Schuster,WR,16,2,21.51,1,0.021361,46.814056
17009,Skyy Moore,WR,16,3,20.44,1,0.020482,48.823961
15350,Quez Watkins,WR,24,3,18.2,1,0.018756,53.315407
