In [679]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt
from scipy.special import logit, expit
import statsmodels.api as sm
import statsmodels.formula.api as smf

def softmax(x):
    return np.exp(x)/np.sum(np.exp(x))

def normalize(x):
    return x/np.sum(x)

In [680]:
import psycopg2
import pymysql
import sqlalchemy_redshift
from configparser import ConfigParser
from sqlalchemy import create_engine

parser = ConfigParser()
_ = parser.read("../notebook.cfg")

# Connect to zack attack
za_user = parser.get("nffddev_2", "user")
za_pwd = parser.get("nffddev_2", "password")

za_engine = create_engine(
    f"mysql+pymysql://{za_user}:{za_pwd}@nffddev.numberfire.com/zack_attack",
    connect_args = dict(host='nffddev.numberfire.com', port=3306)
)
za_conn = za_engine.connect()

# connect to redshift
red_user = parser.get("redshift", "user")
red_pwd = parser.get("redshift", "password")

red_engine = create_engine(
    f"postgresql+psycopg2://{red_user}:{red_pwd}@rs1.usdfs.fdbox.net/fanduel",
    connect_args = dict(port=5439)
)
red_conn = red_engine.connect()

## Process Data

### NFL Line Data

In [21]:
total_implied = pd.read_csv("../data/fb_ou_implied_mean.csv")
spread_implied = pd.read_csv("../data/fb_spread_implied_mean.csv")


nfl_game = pd.read_sql("select * from nfl_game", za_conn)\
    .merge(total_implied, on = "ou", how = "left")\
    .merge(spread_implied, on = "spread", how = "left")\
    .rename(columns={"id":"game_id", "date":"gamedate"})
nfl_game["gamedate"] = pd.to_datetime(nfl_game["gamedate"])

# concatenate the home and away lines
line_cols = ["game_id", "team_id", "ou_equiv", "relative_spread"]
nfl_lines = pd.concat([
    # Home Lines
    nfl_game.rename(columns={"home_team_id":"team_id"})\
        .assign(relative_spread = lambda x: x.spread_equiv)[line_cols],
    # Away Lines
    nfl_game.rename(columns={"away_team_id":"team_id"})\
        .assign(relative_spread = lambda x: -1*x.spread_equiv)[line_cols]
    ], axis=0
)

### Team Data

In [22]:
nfl_team = pd.read_sql("select * from nfl_team", za_conn)\
    .rename(columns={"id":"team_id"})

### Prop Charts

In [23]:
qry = open("../data/prop_charts.sql", "r").read()
prop_charts = pd.read_sql(qry, red_conn)

### Skill Data

In [24]:
qry = open("../data/skill_data.sql", "r").read()
skill_df = pd.read_sql(qry, za_conn)

### First Possession Data

This was brought in incase the Bengals made the super bowl to add as a feature (they always take the ball first).

In [25]:
qry = open("../data/first_possession_data.sql", "r").read()
first_pos = pd.read_sql(qry, za_conn)

### Prop Line Data

In [232]:
qry = open("../data/prop_query.sql", "r").read()
prop = pd.read_sql(qry, red_conn)\
    .merge(
        nfl_team[["abbrev", "team_id"]]\
            .rename(columns={"abbrev":"team"}), 
        on = "team",
        how = "left"
    )
prop["gamedate"] = pd.to_datetime(prop["gamedate"])

# build game dataset
nfl_prop_game = nfl_game.merge(
    nfl_team[["team_id", "abbrev"]]\
        .rename(columns={"team_id":"home_team_id", "abbrev":"home_team_abbrev"}),
    on = ["home_team_id"]
).merge(
    nfl_team[["team_id", "abbrev"]]\
        .rename(columns={"team_id":"away_team_id", "abbrev":"away_team_abbrev"}),
    on = ["away_team_id"]
)[["game_id", "gamedate", "home_team_abbrev", "away_team_abbrev"]]

nfl_prop_game = pd.concat([
    nfl_prop_game.rename(columns={"home_team_abbrev":"team", "away_team_abbrev":"opp", "home_team_id":"team_id"}),
    nfl_prop_game.rename(columns={"away_team_abbrev":"team", "home_team_abbrev":"opp", "away_team_id":"team_id"})
],axis=0)

# join game data set
prop = prop.merge(nfl_prop_game, on = ["gamedate", "team", "opp"])

## rank rushers and recievers
prop["rank_team_pos"] = prop\
    .groupby(["game_id", "team_id", "prop_name", "position"])["line"]\
    .rank(method="first",ascending=False).astype(int)

prop["rank_team"] = prop\
    .groupby(["game_id", "team_id", "prop_name"])["line"]\
    .rank(method="first",ascending=False).astype(int)

# join on actual data
prop = prop.merge(skill_df, on = ["game_id", "player_id", "prop_name"], how = "left").fillna(0)
prop.head()

Unnamed: 0,gamedate,season,week,prop_name,position,name,player_id,team,opp,fd,pinny,dk,czr,mgm,line,team_id,game_id,rank_team_pos,rank_team,actual_yards
0,2021-10-04,2021,4,rec_yards,WR,Zay Jones,53424,LV,LAC,0.0,0.0,0.0,14.3,0.0,14.3,35,7780,4,7,5.0
1,2021-10-04,2021,4,rush_yards,RB,Peyton Barber,53296,LV,LAC,0.0,0.0,0.0,24.3,0.0,24.3,35,7780,3,3,0.0
2,2021-10-04,2021,4,rush_yards,QB,Derek Carr,52444,LV,LAC,11.2,0.0,0.0,10.5,0.0,11.2,35,7780,1,4,6.0
3,2021-10-04,2021,4,rec_yards,RB,Josh Jacobs,54151,LV,LAC,0.0,0.0,0.0,14.2,13.8,14.2,35,7780,2,8,17.0
4,2021-10-04,2021,4,rec,WR,Henry Ruggs,54534,LV,LAC,3.4,0.0,0.0,0.0,0.0,3.4,35,7780,2,2,0.0


### Projections

In [None]:
#query("game_id == 8297")
proj_qry = """
select 
t.name as team_name,
p.name as player_name,
p.position,
s.player_id, s.team_id, s.game_id, g.date as gamedate,
s.rush_yards, s.pass_yards, s.rec_yards, s.rec
from nfl_projection_skill s
inner join nfl_team t on s.team_id = t.id
inner join nfl_player p on p.id = s.player_id
inner join nfl_game g on g.id = s.game_id
where game_id = 8297
"""
proj = pd.read_sql(proj_qry, za_conn)\
.melt(
    id_vars=["team_name", "player_name", "position", "player_id", "team_id", "game_id", "gamedate"],
    value_vars=["rec", "rec_yards", "rush_yards", "pass_yards"]
).rename(columns={"variable":"prop_name", "value":"line"})

proj["rank_team_pos"] = proj\
    .groupby(["game_id", "team_id", "prop_name", "position"])["line"]\
    .rank(method="first",ascending=False).astype(int)

proj["rank_team"] = proj\
    .groupby(["game_id", "team_id", "prop_name"])["line"]\
    .rank(method="first",ascending=False).astype(int)

### Model Data

In [850]:
pbp_qry = open("../data/pbp_data.sql", "r").read()
pbp_df = pd.read_sql(pbp_qry, za_conn)

prop_cols = [
    "game_id", "player_id", "team_id", "gamedate",
    "position", "prop_name", "line", "rank_team", "rank_team_pos"
]
mod_df = proj[prop_cols]\
    .query("rank_team <= 5")\
    .merge(
        pbp_df,
        on = ["game_id", "player_id", "team_id", "prop_name"],
        how = 'left'
    )\
    .merge(first_pos, on = ["game_id", "team_id"])\
    .assign(
        n_plays_max = lambda x: x.groupby("game_id").transform("max")["n_plays"],
        n_plays = lambda x: x.n_plays_max # this imputes NA values for max plays for playes with no real stats
    )\
    .fillna(value = {"play_number":0, "value":0})\
    .drop("n_plays_max",axis=1)
del pbp_df

mod_df["player_id"] = mod_df["player_id"].astype(int)
mod_df["prop_id"] = mod_df.agg('{0[prop_name]}_{0[game_id]}_{0[player_id]}'.format, axis = 1)

mod_df["value_cum"] = mod_df\
    .sort_values(["prop_id", "play_number"])\
    .groupby(["prop_id"])['value'].cumsum()

## Model

In [735]:
### Logit Model
import statsmodels.api as sm
import statsmodels.formula.api as smf

def build_market_name(prop_name, value, position, rank):
    return f"plays_to_{value}_{prop_name}_{position}_{rank}"

def flatten_column_index(df):
    df = df.copy()
    level_one = df.columns.get_level_values(0).astype(str)
    level_two = df.columns.get_level_values(1).astype(str)
    column_separator = ['_' if x != '' else '' for x in level_two]
    df.columns = level_one + column_separator + level_two
    return df

class certain_model:

    def __init__(self):
        self.pred = [1]

    def predict(self, exog):
        pred = self.pred
        return pred

def build_logit_model(prop_name, value, position):
    #prop_name = 'rec'; position='RB'; value = 1
    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' and \
            rank_team_pos in [1,2] and \
            position == '{position}' and \
            gamedate < '{datetime.today()}'
        """)\
        .drop("position", axis=1)\
        .copy()

    # Deterine plays for player to cross threshold
    mod_df_sub["hit"] = mod_df_sub["value_cum"].apply(lambda x: 1 if x > value else 0)
    mod_df_sub["no_hit"] = 1-mod_df_sub.groupby(["game_id", "player_id"])['hit'].transform("max")
    # if hit: return play_number times hit (1 or 0) // if no_hit: return number of plays in game
    mod_df_sub["t"] = mod_df_sub.apply(lambda x: x["n_plays"] if x["no_hit"] == 1 else x["play_number"]*x["hit"], axis=1)
    # filter to where t > 0 (if no_hit, then it is n_plays)
    mod_df_filt = mod_df_sub\
        .query("t > 0")\
        .groupby(["game_id", "team_id", "player_id", "line", "rank_team_pos"])\
        .agg({"t":"min", "hit":"max"}).reset_index()

    # filter to games with more than 1 position line
    valid_games = mod_df_filt\
        .groupby("game_id")\
        .agg({"player_id":"count"})\
        .reset_index()\
        .rename(columns={"player_id":"n_players"})\
        .query("n_players > 1")

    # Join on itself to auto mirror every matchup
    mod_df_join = valid_games\
        .merge(mod_df_filt, on = 'game_id')\
        .merge( mod_df_filt, on = 'game_id',suffixes=["_1", "_2"])\
        .query("player_id_1 != player_id_2")

    mod_df_join["push"] = mod_df_join.apply(lambda x: x["hit_1"] + x["hit_2"] == 0, axis=1).astype("int")
    mod_df_join["player_1_win"] = mod_df_join.apply(lambda x: x["t_1"] < x["t_2"], axis=1).astype("int")
    mod_df_join["player_1_win"] = mod_df_join["player_1_win"] * mod_df_join["hit_1"]
    mod_df_join["same_team"] = mod_df_join.apply(lambda x: x["team_id_1"] == x["team_id_2"], axis=1).astype("int")

    # remove probability of push
    df = mod_df_join.query("push==0").copy()

    ## model 2 => predict who wins
    model = smf.glm(
        formula = "player_1_win ~ line_1 + line_2 + rank_team_pos_1 + rank_team_pos_2 + rank_team_pos_1:same_team",
        data = df[df["push"] == 0],
        family = sm.families.Binomial()
    ).fit()

    nme = f"{position}_{prop_name}_{value}"

    return nme, model

In [816]:
model_list =[
    # QB PAS YD MARKETS
    ("pass_yards",20, "QB"), 
    ("pass_yards",40, "QB"), 
    ("pass_yards",100,"QB"), 
    # RB RUSH YD MARKETS
    ("rush_yards",10,"RB"), 
    ("rush_yards",20,"RB"), 
    ("rush_yards",50,"RB"),
    ("rush_yards",10,"RB"), 
    ("rush_yards",20,"RB"), 
    ("rush_yards",50,"RB"),
    # WR REC YD MARKETS
    ("rec_yards", 10,"WR"),
    ("rec_yards", 20,"WR"),
    ("rec_yards", 30,"WR"),
    ("rec_yards", 50,"WR"),
    ("rec_yards", 60,"WR"),
    # RB REC YD MARKET
    ("rec_yards", 10,"RB"),
    ("rec_yards", 20,"RB"),
    ("rec_yards", 40,"RB"),
    # TE REC YD MARKETS
    ("rec_yards", 10, "TE"),
    ("rec_yards", 20, "TE"),
    ("rec_yards", 40, "TE"),
    # Reception Markets
    ("rec", 1, "WR"),
    ("rec", 1, "WR"),
    ("rec", 1, "WR"),
    ("rec", 1, "TE"),
    ("rec", 1, "RB"),
    ("rec", 1, "RB")
]
logit_models = {}
for mod_param in tqdm(model_list):
    nme, logit_model = build_logit_model(*mod_param)
    logit_models[nme] = logit_model

100%|██████████| 26/26 [00:20<00:00,  1.30it/s]


In [819]:
def get_logit_predictions(market, models, proj):
    # unpack market
    players = market["players"]
    prop_name = market["prop_name"]
    position = market["position"]
    value = market["value"]
    pred_df = proj.query(f"""
            player_name in {players}\
            and prop_name in '{prop_name}'
        """)[["player_name", "position", "team_id","rank_team_pos","line"]]

    def unpack_df(pred_df, ix = 1):
        pred_df_i = pred_df.iloc[(ix-1):ix,:]
        pred_df_i.index = [0]
        pred_df_i.columns = [f"{x}_{ix}" for x in pred_df_i.columns]
        return pred_df_i
    pred_df = pd.concat([unpack_df(pred_df, 1), unpack_df(pred_df, 2)],axis=1)
    # assign same team
    pred_df["same_team"] = pred_df.apply(lambda x: x["team_id_1"] == x["team_id_2"],axis=1).astype(int)

    model = logit_models[f"{position}_{prop_name}_{value}"]
    p_player_1_wins = model.predict(exog=pred_df)[0]

    return_df = pred_df.copy()\
        [["player_name_1", "player_name_2", "line_1", "line_2"]]\
        .assign(
            p_player_1 = p_player_1_wins,
            p_player_2 = 1-p_player_1_wins,
            price_player_1 = 1/p_player_1_wins,
            price_player_2 = 1/(1-p_player_1_wins)
        )
    return return_df
    
# params
market = {
    "players": ["Dallas Goedert", "Travis Kelce"],
    "position": 'TE',
    "prop_name": "rec_yards",
    "value": 40
}
get_predictions(market, logit_models, proj)

Unnamed: 0,player_name_1,player_name_2,line_1,line_2,p_player_1,p_player_2,price_player_1,price_player_2
0,Travis Kelce,Dallas Goedert,83.88,53.41,0.695158,0.304842,1.438522,3.280388


### OLD Head to Head Logit

In [686]:
### Logit Model
def build_market_name(prop_name, value, position, rank):
    return f"plays_to_{value}_{prop_name}_{position}_{rank}"

def flatten_column_index(df):
    df = df.copy()
    level_one = df.columns.get_level_values(0).astype(str)
    level_two = df.columns.get_level_values(1).astype(str)
    column_separator = ['_' if x != '' else '' for x in level_two]
    df.columns = level_one + column_separator + level_two
    return df

class certain_model:

    def __init__(self):
        self.pred = [1]

    def predict(self, exog):
        pred = self.pred
        return pred

def build_logit_model(prop_name, value, position, rank):

    # Define Market Name
    market_name = build_market_name(prop_name, value, position, rank)

    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' & \
            rank_team_pos == {rank} & \
            position == '{position}' &\
            gamedate < '{datetime.today()}'
        """).copy()

    ### DATA PREP
    data_cols = [
        "prop_id", "t", "outcome", "market", "game_id", 
        "team_id", "player_id", "line", "rank_team", "first_possession"
    ]
    # players that hit threshold
    hits = mod_df_sub\
        .query(f"value_cum >= {value}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", "line", "rank_team", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            market = market_name
        )\
        .reset_index()[data_cols]
    # players that missed the threshold
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"value_cum < {value}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            market = market_name
        )[data_cols]\
        .drop_duplicates()

    # concatenate hits and misses
    mod_df_sub_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    ## MIRROR DATA SET
    def build_mirror(df, asc = True):
        df_mir = df.copy()
        df_mir["game_rank"] = df_mir\
                .groupby(["game_id"])["line"]\
                .rank(method="first", ascending=asc).astype(int)
        df_mir = df_mir[df_mir["game_rank"] <= 2]
        df_mir = pd.pivot_table(
            df_mir, 
            values = ["line", "first_possession", "t", "outcome"], 
            index = ["game_id"], 
            columns="game_rank"
        ).reset_index()
        df_mir = flatten_column_index(df_mir)\
            .drop("first_possession_2",axis=1)
        return df_mir

    df_mir1 = build_mirror(mod_df_sub_concat)
    df_mir2 = build_mirror(mod_df_sub_concat, False)

    # Union Mirrored Data Sets
    df = pd.concat([df_mir1, df_mir2], axis = 0)\
        .sort_values("game_id")\
        .rename(columns = {"first_possession_1":"first_possession"})\
        .dropna(axis=0)

    # compute the probability of wins
    df["win"] = df.apply(lambda x: x.outcome_1 + x.outcome_2 > 0, axis = 1).astype(int)
    df["p1_wins"] = df.apply(lambda x: x.t_1 < x.t_2, axis = 1).astype(int)

    ## model 1 => predict a win occurs
    if np.sum(df["win"] == 0) == 0:
        model_1 = certain_model()
    else:
        model_1 = smf.glm(
            formula = "win ~ line_1 + line_2", 
            data = df, 
            family = sm.families.Binomial()
        ).fit()

    ## model 2 => predict who wins
    model_2 = smf.glm(
        formula = "p1_wins ~ line_1 + line_2",
        data = df[df["win"] == 1],
        family = sm.families.Binomial()
    ).fit()

    model = {"model_1": model_1, "model_2": model_2}

    return market_name, model

In [687]:
model_list =[
    # QB PAS YD MARKETS
    ("pass_yards",20,"QB",1), 
    ("pass_yards",40,"QB",1), 
    ("pass_yards",100,"QB",1), 
    # RB RUSH YD MARKETS
    ("rush_yards", 10,"RB",1), 
    ("rush_yards",20,"RB",1), 
    ("rush_yards",50,"RB",1),
    ("rush_yards", 10,"RB",2), 
    ("rush_yards",20,"RB",2), 
    ("rush_yards",50,"RB",2),
    # WR REC YD MARKETS
    ("rec_yards", 10,"WR",1),
    ("rec_yards", 20,"WR",1),
    ("rec_yards", 20,"WR",2),
    ("rec_yards", 30,"WR",1),
    ("rec_yards", 50,"WR",1),
    ("rec_yards", 60,"WR",1),
    ("rec_yards", 60,"WR",2),
    # RB REC YD MARKET
    ("rec_yards", 10,"RB",1),
    ("rec_yards", 20,"RB",1),
    ("rec_yards", 40,"RB",1),
    # TE REC YD MARKETS
    ("rec_yards", 10, "TE",1),
    ("rec_yards", 20, "TE",1),
    # Reception Markets
    ("rec", 1, "WR", 1),
    ("rec", 1, "WR", 2),
    ("rec", 1, "WR", 3),
    ("rec", 1, "TE", 1),
    ("rec", 1, "RB", 1),
    ("rec", 1, "RB", 2)
]
logit_models = {}
for mod_param in tqdm(model_list):
    nme, logit_model = build_logit_model(*mod_param)
    logit_models[nme] = logit_model

100%|██████████| 27/27 [00:02<00:00, 12.06it/s]


In [693]:
players = ["Dallas Goedert", "Travis Kelce"]
proj.query(f"player_name in {players}").query("prop_name == 'rec_yards'")

Unnamed: 0,team_name,player_name,position,player_id,team_id,game_id,prop_name,line,rank_team_pos,rank_team
33,Kansas City Chiefs,Travis Kelce,TE,52104,16,8297,rec_yards,83.88,1,1
40,Philadelphia Eagles,Dallas Goedert,TE,53795,24,8297,rec_yards,53.41,1,3


In [698]:
pred_dict = {
    "prop_name": "rec_yards",
    "value": 10,
    "position" : "TE",
    "rank" : 1,
    "line_1": 83.88,
    "line_2": 53.41
}

def market_logit_prediction(models, pred_dict):
  
    market_name = build_market_name(
        pred_dict["prop_name"],pred_dict["value"],
        pred_dict["position"],pred_dict["rank"]
    )
    p1 = models[market_name]["model_1"].predict(exog=pred_dict)[0]
    p2_1 = models[market_name]["model_2"].predict(exog=pred_dict)[0]
    p2_2 = 1-p2_1
    pred_dict.update({
        "p_plr_1": (p1*p2_1),
        "p_plr_2": (p1*p2_2),
        "p_push": (1-p1)
    })
    return pred_dict

market_logit_prediction(logit_models, pred_dict)

{'prop_name': 'rec_yards',
 'value': 10,
 'position': 'TE',
 'rank': 1,
 'line_1': 83.88,
 'line_2': 53.41,
 'p_plr_1': 0.6039936162282266,
 'p_plr_2': 0.39379668013159724,
 'p_push': 0.0022097036401761416}

In [237]:
mod_df

Unnamed: 0,game_id,player_id,team_id,gamedate,position,prop_name,line,rank_team,rank_team_pos,play_number,value,n_plays,first_possession,prop_id,value_cum
0,8051,54987,27,2022-11-21,WR,rec,1.7,5,3,0.0,0.0,183.0,0,rec_8051_54987,0.0
1,8051,54987,27,2022-11-21,WR,rec_yards,24.3,5,3,0.0,0.0,183.0,0,rec_yards_8051_54987,0.0
2,8051,53461,27,2022-11-21,TE,rec,4.6,3,1,40.0,1.0,183.0,0,rec_8051_53461,1.0
3,8051,53461,27,2022-11-21,TE,rec,4.6,3,1,60.0,1.0,183.0,0,rec_8051_53461,2.0
4,8051,53461,27,2022-11-21,TE,rec,4.6,3,1,63.0,1.0,183.0,0,rec_8051_53461,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104651,7444,52800,33,2019-12-29,RB,rush_yards,69.0,1,1,140.0,14.0,198.0,1,rush_yards_7444_52800,46.0
104652,7444,52800,33,2019-12-29,RB,rush_yards,69.0,1,1,145.0,5.0,198.0,1,rush_yards_7444_52800,51.0
104653,7444,52800,33,2019-12-29,RB,rush_yards,69.0,1,1,146.0,0.0,198.0,1,rush_yards_7444_52800,51.0
104654,7444,52800,33,2019-12-29,RB,rush_yards,69.0,1,1,158.0,8.0,198.0,1,rush_yards_7444_52800,59.0


In [238]:
def build_same_team_logit_model(prop_name, value, position, rank):

    # Define Market Name
    market_name = build_market_name(prop_name, value, position, rank)

    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' & \
            rank_team_pos == {rank} & \
            position == '{position}' &\
            gamedate < '{datetime.today()}'
        """).copy()

    ### DATA PREP
    data_cols = [
        "prop_id", "t", "outcome", "market", "game_id", 
        "team_id", "player_id", "line", "rank_team", "first_possession"
    ]
    # players that hit threshold
    hits = mod_df_sub\
        .query(f"value_cum >= {value}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", "line", "rank_team", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            market = market_name
        )\
        .reset_index()[data_cols]
    # players that missed the threshold
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"value_cum < {value}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            market = market_name
        )[data_cols]\
        .drop_duplicates()

    # concatenate hits and misses
    mod_df_sub_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    ## MIRROR DATA SET
    def build_mirror(df, asc = True):
        df_mir = df.copy()
        df_mir["game_rank"] = df_mir\
                .groupby(["game_id"])["line"]\
                .rank(method="first", ascending=asc).astype(int)
        df_mir = df_mir[df_mir["game_rank"] <= 2]
        df_mir = pd.pivot_table(
            df_mir, 
            values = ["line", "first_possession", "t", "outcome"], 
            index = ["game_id"], 
            columns="game_rank"
        ).reset_index()
        df_mir = flatten_column_index(df_mir)\
            .drop("first_possession_2",axis=1)
        return df_mir

    df_mir1 = build_mirror(mod_df_sub_concat)
    df_mir2 = build_mirror(mod_df_sub_concat, False)

    # Union Mirrored Data Sets
    df = pd.concat([df_mir1, df_mir2], axis = 0)\
        .sort_values("game_id")\
        .rename(columns = {"first_possession_1":"first_possession"})\
        .dropna(axis=0)

    # compute the probability of wins
    df["win"] = df.apply(lambda x: x.outcome_1 + x.outcome_2 > 0, axis = 1).astype(int)
    df["p1_wins"] = df.apply(lambda x: x.t_1 < x.t_2, axis = 1).astype(int)

    ## model 1 => predict a win occurs
    if np.sum(df["win"] == 0) == 0:
        model_1 = certain_model()
    else:
        model_1 = smf.glm(
            formula = "win ~ line_1 + line_2", 
            data = df, 
            family = sm.families.Binomial()
        ).fit()

    ## model 2 => predict who wins
    model_2 = smf.glm(
        formula = "p1_wins ~ line_1 + line_2",
        data = df[df["win"] == 1],
        family = sm.families.Binomial()
    ).fit()

    model = {"model_1": model_1, "model_2": model_2}

    return market_name, model

### Multi Logit

In [832]:
prop_name = 'rec_yards'
position = "TE"
value = 10

def build_multi_logit_model(prop_name, value, position):
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' and \
            rank_team_pos in [1,2,3] and \
            position in {position} and \
            gamedate < '{datetime.today()}'
        """)\
        .drop("position", axis=1)\
        .copy()

    # Deterine plays for player to cross threshold
    mod_df_sub["hit"] = mod_df_sub["value_cum"].apply(lambda x: 1 if x > value else 0)
    mod_df_sub["no_hit"] = 1-mod_df_sub.groupby(["game_id", "player_id"])['hit'].transform("max")
    # if hit: return play_number times hit (1 or 0) // if no_hit: return number of plays in game
    mod_df_sub["t"] = mod_df_sub.apply(lambda x: x["n_plays"] if x["no_hit"] == 1 else x["play_number"]*x["hit"], axis=1)
    # filter to where t > 0 (if no_hit, then it is n_plays)
    mod_df_filt = mod_df_sub\
        .query("t > 0")\
        .groupby(["game_id", "team_id", "player_id", "line", "rank_team_pos"])\
        .agg({"t":"min", "hit":"max"}).reset_index()

    mod_df_filt["t_min"] = mod_df_filt.groupby("game_id")["t"].transform("min")
    mod_df_filt["fastest"] = mod_df_filt.apply(lambda x: x["t"]==x["t_min"],axis=1) * mod_df_filt["hit"]
    mod_df_filt["n_teammates"] = mod_df_filt.assign(cnt=1).groupby(["game_id", "team_id"])['cnt'].transform("sum")-1
    mod_df_filt["is_starter"] = mod_df_filt["rank_team_pos"].apply(lambda x: 1 if x == 1 else 0)

    valid_games = mod_df_filt\
        .groupby("game_id")\
        .agg({"fastest":"sum"})\
        .reset_index()\
        .rename(columns={"fastest":"n_hit"})\
        .query("n_hit > 0")

    mod_df_join = valid_games.merge(mod_df_filt, on = 'game_id')
    df = mod_df_join.copy()

    model = smf.glm(
            formula = "fastest ~ line",
            data = df,
            family = sm.families.Binomial()
        ).fit()

    pos = "_".join(position)
    nme = f"{pos}_{prop_name}_{value}"

    return nme, model

In [834]:
model_list =[
    # RB RUSH YD MARKETS
    ("rush_yards", 10,["RB"]), 
    ("rush_yards",25,["RB"]), 
    ("rush_yards",50,["RB"]),
    # WR REC YD MARKETS
    ("rec_yards", 20,["WR", "TE", "RB"]),
    ("rec_yards", 40,["WR", "TE", "RB"]),
    ("rec_yards", 60,["WR", "TE", "RB"]),
    ("rec_yards", 25,["WR", "TE", "RB"])
]
multi_logit_models = {}
for mod_param in tqdm(model_list):
    nme, multi_logit_model = build_multi_logit_model(*mod_param)
    multi_logit_models[nme] = multi_logit_model

100%|██████████| 7/7 [00:05<00:00,  1.20it/s]


In [838]:
def get_multi_logit_predictions(market, models, proj):
    # unpack market
    prop_name = market["prop_name"]
    position = market["position"]
    value = market["value"]
    pred_df = proj.query(f"""
            position in {position}\
            and prop_name in '{prop_name}'
        """)\
        [["player_name", "position", "team_id","rank_team_pos","line"]]

    # assign vars
    pred_df["Intercept"] = 1
    #pred_df["n_teammates"] = pred_df.assign(cnt=1).groupby(["team_id"])['cnt'].transform("sum")-1
    #pred_df["is_starter"] = pred_df["rank_team_pos"].apply(lambda x: 1 if x == 1 else 0)

    pos = "_".join(position)
    nme = f"{pos}_{prop_name}_{value}"  
    model = models[nme]
    X = pred_df[model.params.index.to_list()]
    beta = np.array(model.params).reshape(-1,1)
    pred_df["p"] = normalize(expit(X @ beta))
    pred_df["p"] = softmax(X @ beta)
    pred_df["price"] = 1/pred_df["p"]
    return pred_df


market = {
    "position": ["WR", "TE", "RB"],
    "prop_name": "rec_yards",
    "value": 20
}
get_multi_logit_predictions(market, multi_logit_models, proj).sort_values("line",ascending=False)

Unnamed: 0,player_name,position,team_id,rank_team_pos,line,Intercept,p,price
33,Travis Kelce,TE,16,1,83.88,1,0.168636,5.929933
47,A.J. Brown,WR,24,1,74.91,1,0.13078,7.646447
53,DeVonta Smith,WR,24,2,63.35,1,0.094243,10.610829
43,Marquez Valdes-Scantling,WR,16,1,57.49,1,0.079822,12.527916
40,Dallas Goedert,TE,24,1,53.41,1,0.071105,14.063671
34,Jerick McKinnon,RB,16,1,29.71,1,0.036323,27.53069
61,Isiah Pacheco,RB,16,2,23.68,1,0.030617,32.661724
37,JuJu Smith-Schuster,WR,16,2,21.51,1,0.028791,34.733551
60,Skyy Moore,WR,16,3,20.44,1,0.027931,35.803015
51,Quez Watkins,WR,24,3,18.48,1,0.026421,37.848165


In [774]:
def build_multi_logit_model(prop_name, value, position):
    mod_df_sub = mod_df\
    .query(f"""
        prop_name == '{prop_name}' & \
        position == '{position}' &\
        gamedate < '{datetime.today()}'
    """).copy()

    data_cols = [
            "prop_id", "t", "outcome", "game_id", "team_id", "player_id", 
            "line", "rank_team_pos", "first_possession"
        ]
    hits = mod_df_sub\
        .query(f"value_cum >= {value}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", "line", "rank_team_pos", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1
        )\
        .reset_index()[data_cols]

    # players that missed the threshold
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"value_cum < {value}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0
        )[data_cols]\
        .drop_duplicates()

    # concatenate hits and misses
    df_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    df_concat["fastest"] = df_concat\
        .groupby(["game_id"])["t"]\
        .rank(method="first",ascending=True).astype(int)
    df_concat["fastest"] = df_concat["fastest"] * df_concat["outcome"]
    df_concat["fastest"] = df_concat["fastest"].apply(lambda x: 1 if x == 1 else 0)

    mod_1_df = df_concat\
        .groupby("game_id")\
        .agg({
            "fastest":"max",
            "line":"sum"
        }).reset_index()

    mod_2_df = df_concat.copy()
    fastest_games = mod_1_df.query("fastest==1").game_id.values
    mod_2_df = mod_2_df[mod_2_df.game_id.isin(fastest_games)]

    ## model 1 => predict anyone win occurs
    if np.sum(mod_1_df["fastest"] == 0) == 0:
        model_1 = certain_model()
    else:
        model_1 = smf.glm(
            formula = "fastest ~ line", 
            data = mod_1_df, 
            family = sm.families.Binomial()
        ).fit()

    ## model 2 => predict who wins
    model_2 = smf.glm(
        formula = "fastest ~ line",
        data = mod_2_df,
        family = sm.families.Binomial()
    ).fit()

    nme = f"{prop_name}_{value}_{position}"
    model = {"model_1": model_1, "model_2": model_2}

    return model

In [775]:
model_list =[
    # QB PAS YD MARKETS
    ("pass_yards",20,"QB"), 
    ("pass_yards",40,"QB"), 
    ("pass_yards",100,"QB"), 
    # RB RUSH YD MARKETS
    ("rush_yards", 10,"RB"), 
    ("rush_yards",20,"RB"), 
    ("rush_yards",50,"RB"),
    ("rush_yards", 10,"RB"), 
    ("rush_yards",20,"RB"), 
    ("rush_yards",50,"RB"),
    # WR REC YD MARKETS
    ("rec_yards", 10,"WR"),
    ("rec_yards", 20,"WR"),
    ("rec_yards", 20,"WR"),
    ("rec_yards", 30,"WR"),
    ("rec_yards", 50,"WR"),
    ("rec_yards", 60,"WR"),
    ("rec_yards", 60,"WR"),
    # RB REC YD MARKET
    ("rec_yards", 10,"RB"),
    ("rec_yards", 20,"RB"),
    ("rec_yards", 40,"RB"),
    # TE REC YD MARKETS
    ("rec_yards", 10, "TE"),
    ("rec_yards", 20, "TE"),
    # Reception Markets
    ("rec", 1, "WR"),
    ("rec", 1, "WR"),
    ("rec", 1, "WR"),
    ("rec", 1, "TE"),
    ("rec", 1, "RB"),
    ("rec", 1, "RB")
]
multi_logit_models = {}
for mod_param in tqdm(model_list):
    nme, multi_logit_model = build_multi_logit_model(*mod_param)
    multi_logit_models[nme] = multi_logit_model

100%|██████████| 27/27 [00:01<00:00, 18.79it/s]


In [779]:
market = {
    "position": "TE",
    "prop_name": "rec_yards",
    "value": 10
}

def market_multi_logit_predictions(market, models, proj):
    # unpack market
    prop_name = market["prop_name"]
    position = market["position"]
    value = market["value"]
    pred_df = proj.query(f"""
            position == '{position}'\
            and prop_name in '{prop_name}'
        """)\
        [["player_name", "position", "team_id","rank_team_pos","line"]]

    pred_df = proj.query(f"""
        prop_name == '{prop_name}' and\
        position == '{position}' and \
        line > 0
    """)\
    .assign(Intercept = 1)\
    [["player_name", "Intercept","line", "rank_team_pos"]]
    
    X = pred_df.drop("player_name",axis=1).values
    beta = np.array(mod.params).reshape(-1,1)
    pred_df["p"] = normalize(expit(X @ beta))
    #pred_df["p"] = softmax(X @ beta)
    pred_df["price"] = 1/pred_df["p"]
    return pred_df
market_multi_logit_predictions(market, multi_logit_models, proj)

Unnamed: 0,player_name,Intercept,line,rank_team_pos,p,price
33,Travis Kelce,1,83.88,1,0.58715,1.703143
36,Blake Bell,1,2.72,4,0.005209,191.992177
40,Dallas Goedert,1,53.41,1,0.324033,3.086105
56,Jack Stoll,1,4.0,2,0.031229,32.021455
57,Jody Fortson,1,2.75,3,0.012545,79.714532
58,Noah Gray,1,10.81,2,0.039835,25.103615


### Survival Model

In [708]:
from lifelines import CoxPHFitter
from sklearn.preprocessing import OneHotEncoder

def build_model(prop_name, value, position):

    market_name = f"plays_to_{value}_{prop_name}"+"_".join(position)
    
    # subset model df to desired position (if any)
    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' & \
            gamedate < '{datetime.today()}'
        """).copy()

    if len(position) > 0:
        mod_df_sub = mod_df_sub[mod_df_sub.position.isin(position)]
        position_columns = ["position"]
    else:
        encoder = OneHotEncoder(drop = 'first', sparse = False)
        pos = encoder.fit_transform(mod_df_sub["position"].values.reshape(-1,1))
        position_columns = list(encoder.categories_[0])[1:]
        pos = pd.DataFrame(pos, columns= position_columns)
        mod_df_sub = pd.concat([mod_df_sub, pos], axis = 1)
   
    ### DATA PREP
    data_cols = [
        "prop_id", "t", "outcome", "market", "game_id", 
        "team_id", "player_id", "line", "rank_team", "rank_team_pos", "first_possession"
    ] 
    # players that hit threshold
    hits = mod_df_sub\
        .query(f"value_cum >= {value}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", 
        "line", "rank_team", "rank_team_pos", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            market = market_name
        )\
        .reset_index()[data_cols]

    # players that missed the threshold
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"value_cum < {value}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            market = market_name
        )[data_cols]\
        .drop_duplicates()

    # concatenate hits and misses
    mod_df_sub_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    ## MODEL
    mod_cols = ["t", "outcome", "line", "rank_team_pos"]

    cph = CoxPHFitter()
    cph.fit(mod_df_sub_concat[mod_cols], duration_col='t', event_col='outcome')

    return market_name, cph

model_list =[
    ("rush_yards", 10,["RB"]), 
    ("rush_yards",20,["RB"]), 
    ("rush_yards",50,["RB"]),
    ("rec_yards", 20,["WR"]),
    ("rec_yards", 30,["WR"]),
    ("rec_yards", 50,["WR"]),
    ("rec_yards", 60,["WR"]),
    ("rec_yards", 100,["WR"]),
    ("rec_yards", 50, ["TE"]),
    ("rec_yards", 20, ["TE"]),
    ("rec_yards", 20, ["RB"]),
    ("rec", 1, ["RB"])
]
surv_models = {}
for mod_param in model_list:
    nme, surv_model = build_model(*mod_param)
    surv_models[nme] = surv_model
    

In [710]:
##### RUN PREDICTIONS ########
prop_name = "rec_yards"
value = 20
position = ["TE"]

df_pred = proj\
    .query(f"""
        prop_name == '{prop_name}' and \
        position in {position} and line > 0
    """)\
    .sort_values("line",axis=0,ascending=False)\
    [["player_name","line", "rank_team_pos"]]\
    .iloc[0:2,:]

n_plrs = df_pred.shape[0]

model = surv_models[f"plays_to_{value}_{prop_name}"+"_".join(position)]

# pull out cumulative hazard
cum_hzrd = model.predict_cumulative_hazard(df_pred)
cum_hzrd.columns = [f"H_{x}" for x in range(n_plrs)]

# compute hazard functions
hzrd = cum_hzrd - cum_hzrd.shift(1)
hzrd.iloc[0,:] = cum_hzrd.iloc[0,:]
hzrd.columns = [f"h_{x}" for x in range(n_plrs)]

# pull out survival functions
surv= model.predict_survival_function(df_pred)
surv.columns = [f"S_{x}" for x in range(n_plrs)]

cum_hzrd["i"] = list(cum_hzrd.index)
hzrd["i"] = list(hzrd.index)
surv["i"] = list(surv.index)

prob_df = hzrd.merge(cum_hzrd, on = 'i').merge(surv, on = 'i').drop(["i"], axis = 1)

for i in range(n_plrs):
    prob_df[f"f_{i}"] = prob_df[f"h_{i}"] * prob_df[f"S_{i}"]

for i in range(n_plrs):
    S_col_list = [f"S_{i}" for i in range(n_plrs)]
    S_col_list.remove(f"S_{i}")
    prob_df[f"p_{i}"] = prob_df[f"f_{i}"] * prob_df[S_col_list].product(axis=1).shift(1)


p_pred = prob_df[[x for x in prob_df.columns if "p_" in x]].sum()
#p_pred = normalize(p_pred)
df_pred["p"] = p_pred.values
df_pred["price"] = 1/p_pred.values
df_pred

Unnamed: 0,player_name,line,rank_team_pos,p,price
33,Travis Kelce,83.88,1,0.539332,1.854144
40,Dallas Goedert,53.41,1,0.442372,2.260541


In [474]:
1-df_pred.p.sum()

0.013940517823302523