In [18]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt


## Connect to DB

In [2]:
import psycopg2
import pymysql
import sqlalchemy_redshift
from configparser import ConfigParser
from sqlalchemy import create_engine

parser = ConfigParser()
_ = parser.read("../notebook.cfg")

# Connect to zack attack
za_user = parser.get("nffddev_2", "user")
za_pwd = parser.get("nffddev_2", "password")

za_engine = create_engine(
    f"mysql+pymysql://{za_user}:{za_pwd}@nffddev.numberfire.com/zack_attack",
    connect_args = dict(host='nffddev.numberfire.com', port=3306)
)
za_conn = za_engine.connect()

# connect to redshift
red_user = parser.get("redshift", "user")
red_pwd = parser.get("redshift", "password")

red_engine = create_engine(
    f"postgresql+psycopg2://{red_user}:{red_pwd}@rs1.usdfs.fdbox.net/fanduel",
    connect_args = dict(port=5439)
)
red_conn = red_engine.connect()

## Process Data

### NFL Line Data

In [3]:
total_implied = pd.read_csv("../data/fb_ou_implied_mean.csv")
spread_implied = pd.read_csv("../data/fb_spread_implied_mean.csv")

nfl_game = pd.read_sql("select * from nfl_game", za_conn)\
    .merge(total_implied, on = "ou", how = "left")\
    .merge(spread_implied, on = "spread", how = "left")\
    .rename(columns={"id":"game_id", "date":"gamedate"})
nfl_game["gamedate"] = pd.to_datetime(nfl_game["gamedate"])

# concatenate the home and away lines
line_cols = ["game_id", "team_id", "ou_equiv", "relative_spread"]
nfl_lines = pd.concat([
    # Home Lines
    nfl_game.rename(columns={"home_team_id":"team_id"})\
        .assign(relative_spread = lambda x: x.spread_equiv)[line_cols],
    # Away Lines
    nfl_game.rename(columns={"away_team_id":"team_id"})\
        .assign(relative_spread = lambda x: -1*x.spread_equiv)[line_cols]
    ], axis=0
)

### Tendency Data

In [4]:
qry = "select team_id, game_id, pass_to_rush_ratio from nfl_rate_stats_team_tendency"
nfl_tend = pd.read_sql(qry, za_conn)

### Team Data

In [5]:
nfl_team = pd.read_sql("select * from nfl_team", za_conn)\
    .rename(columns={"id":"team_id"})

### Prop Charts

In [6]:
qry = open("../data/prop_charts.sql", "r").read()
prop_charts = pd.read_sql(qry, red_conn)

### Skill Data

In [52]:
qry = '''
select player_id, game_id,
'rush_yards' as prop_name, 
rush_yards as actual_yards 
from nfl_statline_skill
union
select player_id, game_id,
'rec_yards' as prop_name, 
rec_yards as actual_yards 
from nfl_statline_skill
union
select player_id, game_id,
'pass_yards' as prop_name, 
pass_yards as actual_yards 
from nfl_statline_skill
'''
skill_df = pd.read_sql(qry, za_conn)

### First Possession Data

In [8]:
qry = '''
select game_id,
defense_team_id as team_id,
1 as "first_possession"
from nfl_plays
where quarter = 1 and time_left_in_quarter = '15:00'
and category = 'KICKOFF'
union
select game_id,
offense_team_id as team_id,
0 as "first_possession"
from nfl_plays
where quarter = 1 and time_left_in_quarter = '15:00'
and category = 'KICKOFF'
'''

first_pos = pd.read_sql(qry, za_conn)

### Prop Line Data

In [53]:
qry = open("../data/rush_rec_prop.sql", "r").read()
prop = pd.read_sql(qry, red_conn)\
    .merge(
        nfl_team[["abbrev", "team_id"]]\
            .rename(columns={"abbrev":"team"}), 
        on = "team",
        how = "left"
    )
prop["gamedate"] = pd.to_datetime(prop["gamedate"])

# join on prop game
nfl_prop_game = nfl_game.merge(
    nfl_team[["team_id", "abbrev"]]\
        .rename(columns={"team_id":"home_team_id", "abbrev":"home_team_abbrev"}),
    on = ["home_team_id"]
).merge(
    nfl_team[["team_id", "abbrev"]]\
        .rename(columns={"team_id":"away_team_id", "abbrev":"away_team_abbrev"}),
    on = ["away_team_id"]
)[["game_id", "gamedate", "home_team_abbrev", "away_team_abbrev"]]

nfl_prop_game = pd.concat([
    nfl_prop_game.rename(columns={"home_team_abbrev":"team", "away_team_abbrev":"opp"}),
    nfl_prop_game.rename(columns={"away_team_abbrev":"team", "home_team_abbrev":"opp"})
],axis=0)

prop = prop.merge(nfl_prop_game, on = ["gamedate", "team", "opp"])

## rank rushers and recievers
prop["rank"] = prop\
    .groupby(["game_id", "team_id", "prop_name", "position"])["line"]\
    .rank(method="first",ascending=False).astype(int)

# join on actual data
prop = prop.merge(skill_df, on = ["game_id", "player_id", "prop_name"], how = "left").fillna(0)

#prop.sort_values(["game_id", "team_id", "prop_name", "rank"]).query("game_id == 7206")


## Model

### Data

In [98]:
qry = '''
select yards.*,
mx_pl.n_plays
from (
    select game_id, offense_team_id as team_id,
    play_number, receiver_id as player_id,
    'rec_yards' as prop_name, total_yards
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
    union
    select game_id, offense_team_id as team_id,
    play_number, rusher_id as player_id,
    'rush_yards' as prop_name, total_yards
    from nfl_plays where category = 'RUSH'
    union
    select game_id, offense_team_id as team_id,
    play_number, passer_id as player_id,
    'pass_yards' as prop_name, total_yards
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
) yards
inner join (
    select game_id, max(play_number) n_plays
    from nfl_plays group by 1
) mx_pl on yards.game_id = mx_pl.game_id
'''

pbp_df = pd.read_sql(qry, za_conn)

prop_cols = [
    "game_id", "player_id", "team_id", "gamedate",
    "position", "prop_name", "line", "rank"
]
mod_df = prop[prop_cols]\
    .query("rank <= 5")\
    .merge(
        pbp_df,
        on = ["game_id", "player_id", "team_id", "prop_name"],
        how = 'left'
    )\
    .merge(first_pos, on = ["game_id", "team_id"])\
    .fillna(value = {"play_number":0, "total_yards":0,"n_plays":10000})
del pbp_df

mod_df["player_id"] = mod_df["player_id"].astype(int)
mod_df["prop_id"] = mod_df.agg('{0[prop_name]}_{0[game_id]}_{0[player_id]}'.format, axis = 1)

mod_df["total_yards_cum"] = mod_df\
    .sort_values(["prop_id", "play_number"])\
    .groupby(["prop_id"])['total_yards'].cumsum()

### Logit

In [111]:
### Logit Model
import statsmodels.api as sm
import statsmodels.formula.api as smf

def build_market_name(prop_name, yards, position, rank):
    return f"plays_to_{yards}_{prop_name}_{position}_{rank}"

def flatten_column_index(df):
    df = df.copy()
    level_one = df.columns.get_level_values(0).astype(str)
    level_two = df.columns.get_level_values(1).astype(str)
    column_separator = ['_' if x != '' else '' for x in level_two]
    df.columns = level_one + column_separator + level_two
    return df

class certain_model:

    def __init__(self):
        self.pred = [1]

    def predict(self, exog):
        pred = self.pred
        return pred

def build_model(prop_name, yards, position, rank):

    # Define Market Name
    market_name = build_market_name(prop_name, yards, position, rank)

    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' & \
            rank == {rank} & \
            position == '{position}' &\
            gamedate < '{datetime.today()}'
        """).copy()

    ### DATA PREP
    data_cols = [
        "prop_id", "t", "outcome", "market", "game_id", 
        "team_id", "player_id", "line", "rank", "first_possession"
    ] 
    # players that hit threshold
    hits = mod_df_sub\
        .query(f"total_yards_cum >= {yards}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", "line", "rank", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            market = market_name
        )\
        .reset_index()[data_cols]
    # players that missed the threshold
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"total_yards_cum < {yards}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            market = market_name
        )[data_cols]\
        .drop_duplicates()

    # concatenate hits and misses
    mod_df_sub_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    ## MIRROR DATA SET
    def build_mirror(df, asc = True):
        df_mir = df.copy()
        df_mir["game_rank"] = df_mir\
                .groupby(["game_id"])["line"]\
                .rank(method="first", ascending=asc).astype(int)
        df_mir = df_mir[df_mir["game_rank"] <= 2]
        df_mir = pd.pivot_table(
            df_mir, 
            values = ["line", "first_possession", "t", "outcome"], 
            index = ["game_id"], 
            columns="game_rank"
        ).reset_index()
        df_mir = flatten_column_index(df_mir)\
            .drop("first_possession_2",axis=1)
        return df_mir

    df_mir1 = build_mirror(mod_df_sub_concat)
    df_mir2 = build_mirror(mod_df_sub_concat, False)

    # Union Mirrored Data Sets
    df = pd.concat([df_mir1, df_mir2], axis = 0)\
        .sort_values("game_id")\
        .rename(columns = {"first_possession_1":"first_possession"})\
        .dropna(axis=0)

    # compute the probability of wins
    df["win"] = df.apply(lambda x: x.outcome_1 + x.outcome_2 > 0, axis = 1).astype(int)
    df["p1_wins"] = df.apply(lambda x: x.t_1 < x.t_2, axis = 1).astype(int)

    ## model 1 => predict a win occurs
    if np.sum(df["win"] == 0) == 0:
        model_1 = certain_model()
    else:
        model_1 = smf.glm(
            formula = "win ~ line_1 + line_2", 
            data = df, 
            family = sm.families.Binomial()
        ).fit()

    ## model 2 => predict who wins
    model_2 = smf.glm(
        formula = "p1_wins ~ line_1 + line_2",
        data = df[df["win"] == 1],
        family = sm.families.Binomial()
    ).fit()

    model = {"model_1": model_1, "model_2": model_2}

    return market_name, model

In [112]:
model_list =[
    # QB PAS YD MARKETS
    ("pass_yards",20,"QB",1), 
    ("pass_yards",40,"QB",1), 
    ("pass_yards",100,"QB",1), 
    # RB RUSH YD MARKETS
    ("rush_yards", 10,"RB",1), 
    ("rush_yards",20,"RB",1), 
    ("rush_yards",50,"RB",1),
    ("rush_yards", 10,"RB",2), 
    ("rush_yards",20,"RB",2), 
    ("rush_yards",50,"RB",2),
    # WR REC YD MARKETS
    ("rec_yards", 10,"WR",1),
    ("rec_yards", 20,"WR",1),
    ("rec_yards", 20,"WR",2),
    ("rec_yards", 30,"WR",1),
    ("rec_yards", 50,"WR",1),
    ("rec_yards", 60,"WR",1),
    ("rec_yards", 60,"WR",2),
    # RB REC YD MARKET
    ("rec_yards", 10,"RB",1),
    ("rec_yards", 20,"RB",1),
    ("rec_yards", 40,"RB",1),
    # TE REC YD MARKETS
    ("rec_yards", 10, "TE",1),
    ("rec_yards", 20, "TE",1)
]
models = {}
for mod_param in tqdm(model_list):
    nme, model = build_model(*mod_param)
    models[nme] = model

100%|██████████| 21/21 [00:02<00:00,  7.55it/s]


In [113]:
pred_dict ={
    "prop_name": "pass_yards",
    "yards": 100,
    "position" : "QB",
    "rank" : 1,
    "line_1": 250,
    "line_2": 250,
    "first_possession":1
}

def market_prediction(models, pred_dict):
    market_name = build_market_name(
        pred_dict["prop_name"],pred_dict["yards"],
        pred_dict["position"],pred_dict["rank"]
    )
    p1 = models[market_name]["model_1"].predict(exog=pred_dict)[0]
    p2_1 = models[market_name]["model_2"].predict(exog=pred_dict)[0]
    p2_2 = 1-p2_1
    return {
        "p_plr_1": p1*p2_1,
        "p_plr_2": p1*p2_2,
        "p_push": 1-p1
    }

market_prediction(models, pred_dict)

{'p_plr_1': 0.5002697292330999,
 'p_plr_2': 0.4989107768275682,
 'p_push': 0.0008194939393318945}

### Survival Model

In [202]:
from lifelines import CoxPHFitter


def build_model(prop_name, yards, position):

    eventnme = f"plays_to_{yards}_{prop_name}"+"_".join(position)
    
    # subset model df to desired position (if any)
    mod_df_sub = mod_df.copy()
    if len(position) > 0:
        mod_df_sub = mod_df_sub[mod_df_sub.position.isin(position)]
   
    ### DATA PREP
    df_cols = ["prop_id", "t", "outcome", "event", "game_id", "player_id", "line", "rank", "first_possession"]  
    hits = mod_df_sub\
        .query(f"prop_name == '{prop_name}' & total_yards_cum >= {yards}")\
        .groupby(["prop_id", "game_id", "player_id", "line", "rank", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            event = eventnme
        )\
        .reset_index()[df_cols]

    censored = mod_df_sub[~mod_df_sub.prop_id.isin(hits.prop_id)]\
        .query(f"prop_name == '{prop_name}' & total_yards_cum < {yards}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            event = f"plays_to_{yards}_{prop_name}"
        )[df_cols]\
        .drop_duplicates()

    joined_df = pd.concat([hits, censored],axis=0)

    ## MODEL
    mod_cols = ["t", "outcome", "line", "rank"]

    cph = CoxPHFitter()
    cph.fit(joined_df[mod_cols], duration_col='t', event_col='outcome')

    return eventnme, cph

model_list =[
    ("rush_yards", 10,["RB"]), 
    ("rush_yards",20,["RB"]), 
    ("rush_yards",50,["RB"]),
    ("rec_yards", 20,["WR"]),
    ("rec_yards", 30,["WR"]),
    ("rec_yards", 50,["WR"]),
    ("rec_yards", 60,["WR"]),
    ("rec_yards", 100,["WR"]),
    ("rec_yards", 50, ["TE"])
]
models = {}
for mod_param in model_list:
    nme, model = build_model(*mod_param)
    models[nme] = model

In [212]:
prop_name = "rush_yards"
yards = 20
position = ["RB"]

df_pred = pd.DataFrame({
    "line":[59, 50],
    "rank":[1,1],
    #"first_possession":[1,0]
})

model = models[f"plays_to_{yards}_{prop_name}"+"_".join(position)]

# pull out cumulative hazard
cum_hzrd = model.predict_cumulative_hazard(df_pred)\
    .rename(columns={0:"p1",1:"p2"})

# compute density functions
density = cum_hzrd - cum_hzrd.shift(1)
density.iloc[0,:] = cum_hzrd.iloc[0,:]
density["i"] = list(density.index)

# pull out survival functions
surv= model.predict_survival_function(df_pred)\
    .rename(columns={0:"s1",1:"s2"})
surv["i"] = list(surv.index)

prob_df = density.merge(surv, on = 'i')
prob_df["hit1"] = prob_df["p1"] * prob_df["s1"].shift(1) * prob_df["s2"]
prob_df["hit2"] = prob_df["p2"] * prob_df["s1"] * prob_df["s2"].shift(1)
prob_df = prob_df.query("hit1 != hit2") # remove ties

p_hit1 = prob_df.hit1.sum() #/ (prob_df.hit1.sum() + prob_df.hit2.sum())
p_hit2 = prob_df.hit2.sum() #/ (prob_df.hit1.sum() + prob_df.hit2.sum())
p_none = 1- p_hit1 - p_hit2

print(p_hit1)
print(p_hit2)
print(p_none)

# plt.plot(model.predict_survival_function(df_pred))


# prob_df

0.5186147237359033
0.42650997729133344
0.05487529897276322


In [446]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

def build_market_name(prop_name, yards, position, rank):
    return f"plays_to_{yards}_{prop_name}_{position}_{rank}"

def flatten_column_index(df):
    df = df.copy()
    level_one = df.columns.get_level_values(0).astype(str)
    level_two = df.columns.get_level_values(1).astype(str)
    column_separator = ['_' if x != '' else '' for x in level_two]
    df.columns = level_one + column_separator + level_two
    return df

class certain_model:

    def __init__(self):
        self.pred = [1]

    def predict(self, exog):
        pred = self.pred
        return pred

def build_model(prop_name, yards, position, rank):

# prop_name = 'rush_yards'
# yards = 10
# position = 'RB'
# rank = 1

    market_name = build_market_name(prop_name, yards, position, rank)

    # subset model df to desired position (if any)
    mod_df_sub = mod_df\
        .query(f"""
            prop_name == '{prop_name}' & \
            rank == {rank} & \
            position == '{position}'
        """).copy()

    ### DATA PREP
    df_cols = [
        "prop_id", "t", "outcome", "market", "game_id", 
        "team_id", "player_id", "line", "rank", "first_possession"
    ]  
    hits = mod_df_sub\
        .query(f"total_yards_cum >= {yards}")\
        .groupby(["prop_id", "game_id", "team_id", "player_id", "line", "rank", "first_possession"])\
        .agg({"play_number":"min"})\
        .rename(columns={"play_number":"t"})\
        .assign(
            outcome = 1,
            market = market_name
        )\
        .reset_index()[df_cols]

    # initialize censored data from props
    # censored = prop\
    #     .assign(play_number = 0,total_yards_cum = 0,n_plays = 1000, total_yards = 0)\
    #     .merge(first_pos, on = ["game_id", "team_id"])\
    #     .query(f"""
    #         prop_name == '{prop_name}' & \
    #         rank == {rank} & \
    #         position == '{position}'
    #     """)
    # censored["prop_id"] = censored.agg('{0[prop_name]}_{0[game_id]}_{0[player_id]}'.format, axis = 1)
    # censored = censored[mod_df.columns]\
    censored = mod_df_sub.copy()
    censored = censored[~censored.prop_id.isin(hits.prop_id)]\
        .query(f"total_yards_cum < {yards}")\
        .rename(columns={"n_plays":"t"})\
        .assign(
            outcome = 0,
            market = market_name
        )[df_cols]\
        .drop_duplicates()


    # concatenate hits and non-hits
    mod_df_sub_concat = pd.concat([hits, censored],axis=0)\
        .sort_values(["game_id", "line"], ascending=[True, False])

    ## MIRROR DATA SET
    def build_mirror(df, asc = True):
        df_mir = df.copy()
        df_mir["game_rank"] = df_mir\
                .groupby(["game_id"])["line"].rank(method="first", ascending=asc).astype(int)
        df_mir = df_mir[df_mir["game_rank"] <= 2]
        df_mir = pd.pivot_table(df_mir, values = ["line", "first_possession", "t", "outcome"], index = ["game_id"], columns="game_rank").reset_index()
        df_mir = flatten_column_index(df_mir).drop("first_possession_2",axis=1)
        return df_mir
    # Set 1
    df1 = build_mirror(mod_df_sub_concat)
    df2 = build_mirror(mod_df_sub_concat, False)

    df = pd.concat([df1, df2], axis = 0)\
        .sort_values("game_id")\
        .rename(columns = {"first_possession_1":"first_possession"})

    df["win"] = df.apply(lambda x: x.outcome_1 + x.outcome_2 > 0, axis = 1).astype(int)
    df["p1_wins"] = df.apply(lambda x: x.t_1 < x.t_2, axis = 1).astype(int)

    ## model 1 => predict a win occurs
    if np.sum(df["win"] == 0) == 0:
        model_1 = certain_model()
    else:
        model_1 = smf.glm(
            formula = "win ~ line_1 + line_2", 
            data = df, 
            family = sm.families.Binomial()
        ).fit()

    ## model 2 => predict what win happens
    model_2 = smf.glm(
        formula = "p1_wins ~ line_1 + line_2",
        data = df[df["win"] == 1],
        family = sm.families.Binomial()
    ).fit()

    model = {"model_1": model_1, "model_2": model_2}

    return market_name, model


In [447]:
model_list =[
    ("rush_yards", 10,"RB",1), 
    ("rush_yards",20,"RB",1), 
    ("rush_yards",50,"RB",1),
    ("rec_yards", 20,"WR",1),
    ("rec_yards", 30,"WR",1),
    ("rec_yards", 50,"WR",1),
    ("rec_yards", 60,"WR",1),
    ("rec_yards", 100,"WR",1),
    ("rec_yards", 50, "TE",1)
]
models = {}
for mod_param in model_list:
    print(mod_param)
    nme, model = build_model(*mod_param)
    models[nme] = model

('rush_yards', 10, 'RB', 1)
('rush_yards', 20, 'RB', 1)
('rush_yards', 50, 'RB', 1)
('rec_yards', 20, 'WR', 1)
('rec_yards', 30, 'WR', 1)
('rec_yards', 50, 'WR', 1)
('rec_yards', 60, 'WR', 1)
('rec_yards', 100, 'WR', 1)
('rec_yards', 50, 'TE', 1)


In [451]:
prop_name = "rush_yards"
yards = 10
position = "RB"
rank = 1

market_name = build_market_name(prop_name, yards, position, rank)

pred_dict ={
    "line_1": [50],
    "line_2": [50]
}
pred_df = pd.DataFrame(pred_dict, index=[0])

def market_prediction(models, market_name, df_pred):
    p1 = models[market_name]["model_1"].predict(exog=pred_dict)[0]
    p2_1 = models[market_name]["model_2"].predict(exog=pred_dict)[0]
    p2_2 = 1-p2_1
    return {
        "p_plr_1": p1*p2_1,
        "p_plr_2": p1*p2_2,
        "p_push": 1-p1
    }

market_prediction(models, market_name, df_pred)

{'p_plr_1': 0.501962290843242, 'p_plr_2': 0.49803770915675805, 'p_push': 0}

In [380]:
model_2 = smf.glm(
    formula = "p1_wins ~ line_1 + line_2",
    data = df[df["win"] == 1],
    family = sm.families.Binomial()
).fit()

In [385]:
model_2.predict(exog = {"line_1":[51], "line_2":[50]})

0    0.499047
dtype: float64