In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt
from scipy.special import logit, expit
import statsmodels.api as sm
import statsmodels.formula.api as smf

def softmax(x):
    """Return the softmax of ``x``: ``exp(x) / sum(exp(x))`` over all elements.

    The maximum of ``x`` is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the ratio into ``nan``) for large inputs.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    Array-like of the same shape as ``x`` whose entries sum to 1.
    """
    if np.size(x) == 0:
        # nothing to normalise (and np.max would raise on empty input)
        return np.exp(x)
    exp_shifted = np.exp(np.subtract(x, np.max(x)))
    return exp_shifted / np.sum(exp_shifted)

def normalize(x):
    """Scale ``x`` so its entries sum to 1 (each entry divided by the total)."""
    total = np.sum(x)
    return x / total

import psycopg2
import pymysql
import sqlalchemy_redshift
from configparser import ConfigParser
from sqlalchemy import create_engine

from urllib.parse import quote

# Credentials live in a config file next to the notebooks, not in the notebook itself.
parser = ConfigParser()
_ = parser.read("../notebook.cfg")

# Connect to zack attack
za_user = parser.get("nffddev_2", "user")
za_pwd = parser.get("nffddev_2", "password")
za_host = "nffddev.numberfire.com"

# Percent-encode the credentials before placing them in the URL: a raw
# "@", "/", ":" or similar character in the user name or password would
# otherwise be parsed as URL structure and break the connection string.
za_engine = create_engine(
    f"mysql+pymysql://{quote(za_user, safe='')}:{quote(za_pwd, safe='')}@{za_host}/zack_attack",
    connect_args = dict(host=za_host, port=3306)
)
za_conn = za_engine.connect()

# # connect to redshift
# red_user = parser.get("redshift", "user")
# red_pwd = parser.get("redshift", "password")

# red_engine = create_engine(
#     f"postgresql+psycopg2://{red_user}:{red_pwd}@rs1.usdfs.fdbox.net/fanduel",
#     connect_args = dict(port=5439)
# )
# red_conn = red_engine.connect()

### Prop Data

In [4]:
# # Team
# nfl_team = pd.read_sql("select * from nfl_team", za_conn)\
#     .rename(columns={"id":"team_id"})

# # Game
# nfl_game = pd.read_sql("select * from nfl_game", za_conn)\
#     .rename(columns={"id":"game_id", "date":"gamedate"})
# nfl_game["gamedate"] = pd.to_datetime(nfl_game["gamedate"])


# prop_qry = '''
# with nfl_prop_full as (
#     select
#     left(gamedate,10) gamedate,
#     season,week,prop_name,position,name,
#     id as player_id,team, opp,
#     fd, pinny, dk, czr, mgm
#     from analyst_dev.nfl_prop_data
#     union
#     select
#     left(gamedate,10) gamedate,
#     season,week,prop_name,position,name,
#     id as player_id,team, opp,
#     fd, pinny, NULL as dk, NULL as czr, mgm
#     from analyst_dev.nfl_prop_data_archive
# ),
# prop_filt as (
#     select *,
#     coalesce(fd, pinny, dk, czr, mgm) as line
#     from nfl_prop_full
#     where prop_name in ('rush_yards', 'rec_yards', 'pass_yards', 'rec')
# )
# select * from prop_filt where line is not null
# '''
# prop = pd.read_sql(prop_qry, red_conn)\
#     .merge(
#         nfl_team[["abbrev", "team_id"]]\
#             .rename(columns={"abbrev":"team"}), 
#         on = "team",
#         how = "left"
#     )
# prop["gamedate"] = pd.to_datetime(prop["gamedate"])

# # build game dataset
# nfl_prop_game = nfl_game.merge(
#     nfl_team[["team_id", "abbrev"]]\
#         .rename(columns={"team_id":"home_team_id", "abbrev":"home_team_abbrev"}),
#     on = ["home_team_id"]
# ).merge(
#     nfl_team[["team_id", "abbrev"]]\
#         .rename(columns={"team_id":"away_team_id", "abbrev":"away_team_abbrev"}),
#     on = ["away_team_id"]
# )[["game_id", "gamedate", "home_team_abbrev", "away_team_abbrev"]]

# nfl_prop_game = pd.concat([
#     nfl_prop_game.rename(columns={"home_team_abbrev":"team", "away_team_abbrev":"opp", "home_team_id":"team_id"}),
#     nfl_prop_game.rename(columns={"away_team_abbrev":"team", "home_team_abbrev":"opp", "away_team_id":"team_id"})
# ],axis=0)

# # join game data set
# prop = prop.merge(nfl_prop_game, on = ["gamedate", "team", "opp"])

# ## rank rushers and receivers
# prop["rank_team_pos"] = prop\
#     .groupby(["game_id", "team_id", "prop_name", "position"])["line"]\
#     .rank(method="first",ascending=False).astype(int)

# prop["rank_team"] = prop\
#     .groupby(["game_id", "team_id", "prop_name"])["line"]\
#     .rank(method="first",ascending=False).astype(int)

# # join on actual data
# prop = prop.merge(skill_df, on = ["game_id", "player_id", "prop_name"], how = "left").fillna(0)
# prop.head()

### Projection Data

In [5]:
# Skill-position projections joined to team, player and game for readable names and the game date.
proj_qry = """
select 
t.name as team_name,
p.name as player_name,
p.position,
s.player_id, s.team_id, s.game_id, g.date as gamedate,
s.rush_yards, s.pass_yards, s.rec_yards, s.rec
from nfl_projection_skill s
inner join nfl_team t on s.team_id = t.id
inner join nfl_player p on p.id = s.player_id
inner join nfl_game g on g.id = s.game_id
"""

# Wide -> long: one row per (player, game, prop_name) with the projected amount in `line`.
proj = (
    pd.read_sql(proj_qry, za_conn)
    .melt(
        id_vars=["team_name", "player_name", "position", "player_id", "team_id", "game_id", "gamedate"],
        value_vars=["rec", "rec_yards", "rush_yards", "pass_yards"],
    )
    .rename(columns={"variable": "prop_name", "value": "line"})
)

proj["gamedate"] = pd.to_datetime(proj["gamedate"])

# Rank players by projected line (1 = largest; ties broken by row order),
# first within team + position, then within the whole team.
for rank_col, rank_keys in (
    ("rank_team_pos", ["game_id", "team_id", "prop_name", "position"]),
    ("rank_team", ["game_id", "team_id", "prop_name"]),
):
    proj[rank_col] = (
        proj.groupby(rank_keys)["line"]
        .rank(method="first", ascending=False)
        .astype(int)
    )

### PBP Data

In [6]:
# Long-format play-by-play: one row per qualifying play and stat.
#   * completed passes yield rec_yards and rec (value 1) for the receiver
#     and pass_yards for the passer
#   * rushes yield rush_yards for the rusher
# n_plays is the highest play_number recorded for the game.
pbp_qry = '''
select yards.*,
mx_pl.n_plays
from (
    select game_id, offense_team_id as team_id,
    play_number, receiver_id as player_id,
    'rec_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
    union
    select game_id, offense_team_id as team_id,
    play_number, rusher_id as player_id,
    'rush_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'RUSH'
    union
    select game_id, offense_team_id as team_id,
    play_number, passer_id as player_id,
    'pass_yards' as prop_name, total_yards as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
    union
    select game_id, offense_team_id as team_id,
    play_number, receiver_id as player_id,
    'rec' as prop_name, 1 as value
    from nfl_plays where category = 'PASS'
    and is_complete_pass = 1
) yards
inner join (
    select game_id, max(play_number) n_plays
    from nfl_plays group by 1
) mx_pl on yards.game_id = mx_pl.game_id
'''
# Missing player ids are mapped to 0 so the column can hold plain integers.
pbp_df = pd.read_sql(pbp_qry, za_conn).assign(
    player_id=lambda plays: plays["player_id"].fillna(0).astype("int")
)

### Model Data

In [7]:
prop_cols = [
    "game_id", "player_id", "team_id", "gamedate",
    "position", "prop_name", "line", "rank_team", "rank_team_pos"
]
#mod_df = prop[prop_cols] change to this if you get redshift to work
# Top-5 projected players per team/prop, left-joined to their plays so that
# players without any recorded play are kept (with NaN play columns).
mod_df = (
    proj[prop_cols]
    .query("rank_team <= 5")
    .merge(
        pbp_df,
        on=["game_id", "player_id", "team_id", "prop_name"],
        how="left",
    )
    .assign(
        # Impute n_plays for players with no real stats from the game-level
        # maximum. Only the n_plays column is reduced; a frame-wide
        # transform("max") would also aggregate every string/datetime column.
        n_plays=lambda x: x.groupby("game_id")["n_plays"].transform("max")
    )
    # players with no plays count as play 0 with value 0
    .fillna(value={"play_number": 0, "value": 0})
)

mod_df["player_id"] = mod_df["player_id"].astype(int)

# "<prop_name>_<game_id>_<player_id>", built column-wise instead of row by row
mod_df["prop_id"] = (
    mod_df["prop_name"]
    + "_" + mod_df["game_id"].astype(str)
    + "_" + mod_df["player_id"].astype(str)
)

# running total of the stat within each prop_id, in play order
# (the result is aligned back to mod_df by index)
mod_df["value_cum"] = (
    mod_df
    .sort_values(["prop_id", "play_number"])
    .groupby(["prop_id"])["value"]
    .cumsum()
)

In [8]:
### Logit Model
import statsmodels.api as sm
import statsmodels.formula.api as smf

def build_market_name(prop_name, value, position, rank):
    """Compose the market label ``plays_to_<value>_<prop_name>_<position>_<rank>``."""
    pieces = ("plays_to", value, prop_name, position, rank)
    return "_".join("{}".format(piece) for piece in pieces)

def flatten_column_index(df):
    """Return a copy of ``df`` with its first two column levels merged into flat names.

    Each new name is ``<level 0>_<level 1>``; when the level-1 label is the
    empty string the name is just the level-0 label. The input frame is left
    untouched.
    """
    flat = df.copy()
    outer = flat.columns.get_level_values(0).astype(str)
    inner = flat.columns.get_level_values(1).astype(str)
    flat.columns = [
        top if bottom == '' else top + '_' + bottom
        for top, bottom in zip(outer, inner)
    ]
    return flat

class certain_model:
    """Model stand-in whose ``predict`` always answers ``[1]``.

    NOTE(review): presumably a drop-in for a fitted model in markets whose
    outcome is certain; confirm against the callers.
    """

    def __init__(self):
        # constant prediction handed back by every predict() call
        self.pred = [1]

    def predict(self, exog):
        """Ignore ``exog`` and return the stored prediction list."""
        return self.pred

def build_logit_model(prop_name, value, position):
    """Fit a head-to-head "who reaches the threshold first" logit model.

    Training data comes from the notebook-level ``mod_df``. For every past
    game, the top-two ranked players per team at ``position`` are paired with
    each other (both orderings), and a binomial GLM is fitted on whether
    player 1 got past ``value`` on ``prop_name`` in fewer plays than player 2.

    Parameters
    ----------
    prop_name : str
        Stat to model; matched against ``mod_df["prop_name"]``.
    value : number
        Threshold applied to the running total ``value_cum``.
    position : str
        Position to model; matched against ``mod_df["position"]``.

    Returns
    -------
    tuple
        ``(name, model)`` where ``name`` is ``"<position>_<prop_name>_<value>"``
        and ``model`` is the fitted GLM results object.

    Raises
    ------
    ValueError
        If no non-push matchup is available to fit on.
    """
    # subset model df to the requested prop/position, top-2 per team, past games only
    in_scope = (
        (mod_df["prop_name"] == prop_name)
        & mod_df["rank_team_pos"].isin([1, 2])
        & (mod_df["position"] == position)
        & (mod_df["gamedate"] < datetime.today())
    )
    mod_df_sub = mod_df.loc[in_scope].drop("position", axis=1).copy()

    # Determine plays for player to cross threshold.
    # NOTE(review): the comparison is strict, so a player only counts as a
    # hit once the running total exceeds `value` (value=1 on 'rec' needs a
    # second reception). Kept as in the original; confirm whether >= was meant.
    mod_df_sub["hit"] = (mod_df_sub["value_cum"] > value).astype(int)
    # 1 when the player never crossed the threshold in that game
    mod_df_sub["no_hit"] = 1 - mod_df_sub.groupby(["game_id", "player_id"])["hit"].transform("max")
    # if hit: play_number times hit (0 before the crossing) // if no_hit: number of plays in game
    mod_df_sub["t"] = np.where(
        mod_df_sub["no_hit"] == 1,
        mod_df_sub["n_plays"],
        mod_df_sub["play_number"] * mod_df_sub["hit"],
    )

    # keep t > 0 (for no_hit players that is n_plays) and take the first such play per player
    mod_df_filt = (
        mod_df_sub
        .query("t > 0")
        .groupby(["game_id", "team_id", "player_id", "line", "rank_team_pos"])
        .agg({"t": "min", "hit": "max"})
        .reset_index()
    )

    # filter to games with more than 1 position line
    valid_games = (
        mod_df_filt
        .groupby("game_id")
        .agg({"player_id": "count"})
        .reset_index()
        .rename(columns={"player_id": "n_players"})
        .query("n_players > 1")
    )

    # Join on itself to auto mirror every matchup; the flags are computed
    # column-wise, which also behaves on an empty frame (row-wise apply does not).
    mod_df_join = (
        valid_games
        .merge(mod_df_filt, on='game_id')
        .merge(mod_df_filt, on='game_id', suffixes=["_1", "_2"])
        .query("player_id_1 != player_id_2")
        .assign(
            # push: neither player crossed the threshold
            push=lambda d: ((d["hit_1"] + d["hit_2"]) == 0).astype("int"),
            # player 1 wins only if they actually hit, and did so in fewer plays
            player_1_win=lambda d: (d["t_1"] < d["t_2"]).astype("int") * d["hit_1"],
            same_team=lambda d: (d["team_id_1"] == d["team_id_2"]).astype("int"),
        )
    )

    # pushes carry no winner, so they are excluded from the fit
    df = mod_df_join.query("push==0").copy()
    if df.empty:
        raise ValueError(
            f"no matchups available to fit {position} {prop_name} {value}"
        )

    ## model 2 => predict who wins
    model = smf.glm(
        formula = "player_1_win ~ line_1 + line_2 + rank_team_pos_1 + rank_team_pos_2 + rank_team_pos_1:same_team",
        data = df,
        family = sm.families.Binomial()
    ).fit()

    nme = f"{position}_{prop_name}_{value}"

    return nme, model

In [9]:
# (prop_name, threshold value, position) for every market to fit.
# Each combination is listed once: fitted models are stored under the name
# returned by build_logit_model, so a repeated tuple would only refit and
# overwrite the identical model.
model_list = [
    # QB PASS YD MARKETS
    ("pass_yards", 20, "QB"),
    ("pass_yards", 40, "QB"),
    ("pass_yards", 100, "QB"),
    # RB RUSH YD MARKETS
    ("rush_yards", 10, "RB"),
    ("rush_yards", 20, "RB"),
    ("rush_yards", 50, "RB"),
    # WR REC YD MARKETS
    ("rec_yards", 10, "WR"),
    ("rec_yards", 20, "WR"),
    ("rec_yards", 30, "WR"),
    ("rec_yards", 50, "WR"),
    ("rec_yards", 60, "WR"),
    # RB REC YD MARKET
    ("rec_yards", 10, "RB"),
    ("rec_yards", 20, "RB"),
    ("rec_yards", 40, "RB"),
    # TE REC YD MARKETS
    ("rec_yards", 10, "TE"),
    ("rec_yards", 20, "TE"),
    ("rec_yards", 40, "TE"),
    # Reception Markets
    ("rec", 1, "WR"),
    ("rec", 1, "TE"),
    ("rec", 1, "RB"),
]

# fitted model per market, keyed by the name build_logit_model returns
logit_models = {}
for mod_param in tqdm(model_list):
    nme, logit_model = build_logit_model(*mod_param)
    logit_models[nme] = logit_model

100%|██████████| 26/26 [00:03<00:00,  7.56it/s]


In [27]:
def get_logit_predictions(market, models, proj):
    """Price a two-player head-to-head market with the matching fitted model.

    Parameters
    ----------
    market : dict
        Keys ``players`` (player names; the first listed becomes player 1,
        the second player 2), ``game_id``, ``prop_name``, ``position`` and
        ``value``.
    models : dict
        Models keyed by ``"<position>_<prop_name>_<value>"``. Each must offer
        ``predict(exog=...)`` returning something indexable with ``[0]``.
    proj : pandas.DataFrame
        Long-format projections with at least the columns ``player_name``,
        ``position``, ``team_id``, ``game_id``, ``prop_name``,
        ``rank_team_pos`` and ``line``.

    Returns
    -------
    pandas.DataFrame
        One row with both player names and lines, each player's win
        probability, and the corresponding ``1 / p`` prices (``inf`` when the
        probability is 0).

    Raises
    ------
    ValueError
        If fewer than two of the requested players are found in ``proj`` for
        this game and prop.
    KeyError
        If ``models`` holds no entry for the market.
    """
    # unpack market
    players = list(market["players"])
    game_id = market["game_id"]
    prop_name = market["prop_name"]
    position = market["position"]
    value = market["value"]

    feature_cols = ["player_name", "position", "team_id", "rank_team_pos", "line"]
    wanted = (
        proj["player_name"].isin(players)
        & (proj["game_id"] == game_id)
        & (proj["prop_name"] == prop_name)
    )
    # Order rows as requested in market["players"] so that "player 1" is the
    # first name the caller listed, not whichever row happens to come first.
    request_order = {name: i for i, name in enumerate(players)}
    rows = proj.loc[wanted, feature_cols].sort_values(
        "player_name",
        key=lambda names: names.map(request_order),
        kind="stable",
    )
    if len(rows) < 2:
        raise ValueError(
            f"expected two of {players} for game_id={game_id}, "
            f"prop_name={prop_name!r}; found {rows['player_name'].tolist()}"
        )

    def unpack_df(rows, ix = 1):
        # ix-th row (1-based) as a single-row frame with "_<ix>"-suffixed columns
        return rows.iloc[[ix - 1]].reset_index(drop=True).add_suffix(f"_{ix}")

    pred_df = pd.concat([unpack_df(rows, 1), unpack_df(rows, 2)], axis=1)
    # assign same team
    pred_df["same_team"] = (pred_df["team_id_1"] == pred_df["team_id_2"]).astype(int)

    # use the models passed in by the caller rather than a notebook global
    model = models[f"{position}_{prop_name}_{value}"]
    p_player_1_wins = model.predict(exog=pred_df)[0]
    p_player_2_wins = 1 - p_player_1_wins

    def fair_price(p):
        # decimal price 1/p; a zero probability has no finite price
        return float("inf") if p == 0 else 1 / p

    return_df = pred_df[["player_name_1", "player_name_2", "line_1", "line_2"]]\
        .assign(
            p_player_1 = p_player_1_wins,
            p_player_2 = p_player_2_wins,
            price_player_1 = fair_price(p_player_1_wins),
            price_player_2 = fair_price(p_player_2_wins)
        )
    return return_df
    
# params: example head-to-head market between two WRs in game 8297
market = dict(
    players=["A.J. Brown", "DeVonta Smith"],
    game_id=8297,
    position='WR',
    prop_name="rec",
    value=1,
)
get_logit_predictions(market, logit_models, proj)

Unnamed: 0,player_name_1,player_name_2,line_1,line_2,p_player_1,p_player_2,price_player_1,price_player_2
0,A.J. Brown,DeVonta Smith,5.31,5.37,0.467708,0.532292,2.138086,1.878668


In [22]:
# WR reception projections for game 8297, largest line first
(
    proj
    .query("game_id==8297 and position == 'WR' and prop_name == 'rec'")
    .sort_values("line", ascending=False)
)

Unnamed: 0,team_name,player_name,position,player_id,team_id,game_id,gamedate,prop_name,line,rank_team_pos,rank_team
6617,Philadelphia Eagles,DeVonta Smith,WR,54877,24,8297,2023-02-12,rec,5.37,1,1
4242,Philadelphia Eagles,A.J. Brown,WR,54161,24,8297,2023-02-12,rec,5.31,2,2
3410,Kansas City Chiefs,Marquez Valdes-Scantling,WR,53904,16,8297,2023-02-12,rec,3.45,1,3
1962,Kansas City Chiefs,JuJu Smith-Schuster,WR,53422,16,8297,2023-02-12,rec,1.81,2,5
7929,Kansas City Chiefs,Skyy Moore,WR,55256,16,8297,2023-02-12,rec,1.63,3,6
6270,Philadelphia Eagles,Quez Watkins,WR,54810,24,8297,2023-02-12,rec,1.6,3,5
4084,Kansas City Chiefs,Mecole Hardman,WR,54140,16,8297,2023-02-12,rec,0.98,4,8
6546,Kansas City Chiefs,Kadarius Toney,WR,54873,16,8297,2023-02-12,rec,0.96,5,9
3524,Kansas City Chiefs,Justin Watson,WR,53976,16,8297,2023-02-12,rec,0.44,6,10
3333,Philadelphia Eagles,Zach Pascal,WR,53868,24,8297,2023-02-12,rec,0.38,4,8
