In [1]:
from os.path import join
import json
import re
import os

import cfbd
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

OUTPUT_DIR = '../../app/data'
DATA_DIR = '../data'
YEARS = range(2013, 2025)

with open('../config.json') as f:
    configuration = cfbd.Configuration(
        access_token = json.load(f)['CFBD_API_KEY']
    )

In [2]:
def convert_to_snake_case(cols):
    cols_new = []
    for c in cols:
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', c)
        cols_new.append(re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower())
    return cols_new

In [3]:
df = pd.read_parquet('../data/predictions/predictions.parquet')
df.head()

Unnamed: 0,game_id,drive_id,play_id,offense,defense,period,clock_minutes,clock_seconds,offense_score,defense_score,offense_timeouts,defense_timeouts,yards_to_goal,down,distance,play_type,play_text,pct_game_played,score_diff,season,week,season_type,neutral_site,venue_id,completed,home_id,home_team,home_conference,away_id,away_team,away_conference,is_home_team,home_division,away_division,pregame_elo_diff,pregame_offense_elo,pregame_defense_elo,precipitation,wind_speed,temperature,game_indoors,offense_division,defense_division,pregame_spread,decision,offense_strength,defense_strength,fg_proba,exp_wp_fg,fourth_down_proba,exp_wp_go,punt_yards_to_goal,exp_wp_punt,cur_win_proba,offense_id,offense_team,defense_id,defense_team,offense_color,offense_alternate_color,offense_logos,offense_conference,defense_color,defense_alternate_color,defense_logos,defense_conference
0,333610120,33361012026,333610120256,Maryland,Marshall,4,0,50,20,31,0.0,2.0,66,4,4,Pass Incompletion,C.J. Brown pass incomplete to Levern Jacobs.,0.986111,-11,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,go,0.083596,-0.226068,0.0001,0.001001,0.4057,0.001755,75.524101,0.0029,0.0024,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA
1,333610120,33361012022,333610120226,Maryland,Marshall,4,5,53,20,24,3.0,3.0,87,4,5,Punt,"Nathan Renfro punt for 45 yards, returned by D...",0.901944,-4,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.167,0.3501,0.180295,55.771702,0.1891,0.2234,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA
2,333610120,33361012010,333610120093,Maryland,Marshall,2,7,49,10,17,3.0,3.0,68,4,3,Punt,"Nathan Renfro punt for 38 yards, fair catch by...",0.369722,-7,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0001,0.254037,0.5189,0.250828,74.180702,0.2735,0.2475,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA
3,333610120,33361012004,333610120027,Maryland,Marshall,1,8,9,0,0,3.0,3.0,97,4,8,Punt,"Nathan Renfro punt for 34 yards, returned by D...",0.114167,0,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.3275,0.41764,53.889999,0.432,0.4731,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA
4,333610120,33361012002,333610120012,Maryland,Marshall,1,11,33,0,0,3.0,3.0,86,4,18,Punt,"Nathan Renfro punt for 47 yards, returned by D...",0.0575,0,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.1483,0.406746,57.934299,0.4492,0.4777,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA


In [4]:
STRONG_RECOMMEND = 0.04
RECOMMEND = 0.015
SLIGHT_RECOMMEND = 0.005

df = (
    df
    .assign(
        exp_wp_go=lambda x: np.minimum(x['exp_wp_go'] + 0.005, 1.00),
    )
    .assign(
        wpa_fg=lambda x: x['exp_wp_fg'] - x['cur_win_proba'],
        wpa_go=lambda x: x['exp_wp_go'] - x['cur_win_proba'],
        wpa_punt=lambda x: x['exp_wp_punt'] - x['cur_win_proba'],
    )
    .assign(
        fg_diff_go=lambda x: x['wpa_fg'] - x['wpa_go'],
        fg_diff_punt=lambda x: x['wpa_fg'] - x['wpa_punt'],
        go_diff_fg=lambda x: x['wpa_go'] - x['wpa_fg'],
        go_diff_punt=lambda x: x['wpa_go'] - x['wpa_punt'],
        punt_diff_fg=lambda x: x['wpa_punt'] - x['wpa_fg'],
        punt_diff_go=lambda x: x['wpa_punt'] - x['wpa_go'],
    )
    .assign(
        recommendation=lambda x: np.select(
            [
                # Go
                (x['go_diff_fg'] > STRONG_RECOMMEND) & (x['go_diff_punt'] > STRONG_RECOMMEND),
                (x['go_diff_fg'] > RECOMMEND) & (x['go_diff_punt'] > RECOMMEND),
                (x['go_diff_fg'] > SLIGHT_RECOMMEND) & (x['go_diff_punt'] > SLIGHT_RECOMMEND),

                # Field Goal
                (x['fg_diff_go'] > STRONG_RECOMMEND) & (x['fg_diff_punt'] > STRONG_RECOMMEND),
                (x['fg_diff_go'] > RECOMMEND) & (x['fg_diff_punt'] > RECOMMEND),
                (x['fg_diff_go'] > SLIGHT_RECOMMEND) & (x['fg_diff_punt'] > SLIGHT_RECOMMEND),

                # Punt
                (x['punt_diff_go'] > STRONG_RECOMMEND) & (x['punt_diff_fg'] > STRONG_RECOMMEND),
                (x['punt_diff_go'] > RECOMMEND) & (x['punt_diff_fg'] > RECOMMEND),
                (x['punt_diff_go'] > SLIGHT_RECOMMEND) & (x['punt_diff_fg'] > SLIGHT_RECOMMEND),

                # Go or FG
                (np.abs(x['go_diff_fg']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_punt'], x['fg_diff_punt']) > STRONG_RECOMMEND),
                (np.abs(x['go_diff_fg']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_punt'], x['fg_diff_punt']) > RECOMMEND),
                (np.abs(x['go_diff_fg']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_punt'], x['fg_diff_punt']) > SLIGHT_RECOMMEND),

                # Go or Punt
                (np.abs(x['punt_diff_go']) < SLIGHT_RECOMMEND) & (np.maximum(x['fg_diff_go'], x['punt_diff_fg']) > STRONG_RECOMMEND),
                (np.abs(x['punt_diff_go']) < SLIGHT_RECOMMEND) & (np.maximum(x['fg_diff_go'], x['punt_diff_fg']) > RECOMMEND),
                (np.abs(x['punt_diff_go']) < SLIGHT_RECOMMEND) & (np.maximum(x['fg_diff_go'], x['punt_diff_fg']) > SLIGHT_RECOMMEND),
            
                # FG or Punt
                (np.abs(x['fg_diff_punt']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_fg'], x['punt_diff_go']) > STRONG_RECOMMEND),
                (np.abs(x['fg_diff_punt']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_fg'], x['punt_diff_go']) > RECOMMEND),
                (np.abs(x['fg_diff_punt']) < SLIGHT_RECOMMEND) & (np.maximum(x['go_diff_fg'], x['punt_diff_go']) > SLIGHT_RECOMMEND),

            ],
            [
                'go - strongly recommended',
                'go - recommended',
                'go - slightly recommended',
                'field goal - strongly recommended',
                'field goal - recommended',
                'field goal - slightly recommended',
                'punt - strongly recommended',
                'punt - recommended',
                'punt - slightly recommended',
                'go or field goal - strongly recommended',
                'go or field goal - recommended',
                'go or field goal - slightly recommended',
                'go or punt - strongly recommended',
                'go or punt - recommended',
                'go or punt - slightly recommended',
                'punt or field goal - strongly recommended',
                'punt or field goal - recommended',
                'punt or field goal - slightly recommended',
            ],
            default='any decision - no strong recommendation'
        )
    )
)

df.head()

Unnamed: 0,game_id,drive_id,play_id,offense,defense,period,clock_minutes,clock_seconds,offense_score,defense_score,offense_timeouts,defense_timeouts,yards_to_goal,down,distance,play_type,play_text,pct_game_played,score_diff,season,week,season_type,neutral_site,venue_id,completed,home_id,home_team,home_conference,away_id,away_team,away_conference,is_home_team,home_division,away_division,pregame_elo_diff,pregame_offense_elo,pregame_defense_elo,precipitation,wind_speed,temperature,game_indoors,offense_division,defense_division,pregame_spread,decision,offense_strength,defense_strength,fg_proba,exp_wp_fg,fourth_down_proba,exp_wp_go,punt_yards_to_goal,exp_wp_punt,cur_win_proba,offense_id,offense_team,defense_id,defense_team,offense_color,offense_alternate_color,offense_logos,offense_conference,defense_color,defense_alternate_color,defense_logos,defense_conference,wpa_fg,wpa_go,wpa_punt,fg_diff_go,fg_diff_punt,go_diff_fg,go_diff_punt,punt_diff_fg,punt_diff_go,recommendation
0,333610120,33361012026,333610120256,Maryland,Marshall,4,0,50,20,31,0.0,2.0,66,4,4,Pass Incompletion,C.J. Brown pass incomplete to Levern Jacobs.,0.986111,-11,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,go,0.083596,-0.226068,0.0001,0.001001,0.4057,0.006755,75.524101,0.0029,0.0024,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.001399,0.004355,0.0005,-0.005754,-0.001899,0.005754,0.003855,0.001899,-0.003855,punt or field goal - slightly recommended
1,333610120,33361012022,333610120226,Maryland,Marshall,4,5,53,20,24,3.0,3.0,87,4,5,Punt,"Nathan Renfro punt for 45 yards, returned by D...",0.901944,-4,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.167,0.3501,0.185295,55.771702,0.1891,0.2234,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0564,-0.038105,-0.0343,-0.018295,-0.0221,0.018295,-0.003805,0.0221,0.003805,go or punt - recommended
2,333610120,33361012010,333610120093,Maryland,Marshall,2,7,49,10,17,3.0,3.0,68,4,3,Punt,"Nathan Renfro punt for 38 yards, fair catch by...",0.369722,-7,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0001,0.254037,0.5189,0.255828,74.180702,0.2735,0.2475,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,0.006537,0.008328,0.026,-0.001791,-0.019463,0.001791,-0.017672,0.019463,0.017672,punt - recommended
3,333610120,33361012004,333610120027,Maryland,Marshall,1,8,9,0,0,3.0,3.0,97,4,8,Punt,"Nathan Renfro punt for 34 yards, returned by D...",0.114167,0,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.3275,0.42264,53.889999,0.432,0.4731,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0821,-0.05046,-0.0411,-0.03164,-0.041,0.03164,-0.00936,0.041,0.00936,punt - slightly recommended
4,333610120,33361012002,333610120012,Maryland,Marshall,1,11,33,0,0,3.0,3.0,86,4,18,Punt,"Nathan Renfro punt for 47 yards, returned by D...",0.0575,0,2013,1,postseason,True,3852.0,True,120,Maryland,ACC,276,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.1483,0.411746,57.934299,0.4492,0.4777,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0867,-0.065954,-0.0285,-0.020746,-0.0582,0.020746,-0.037454,0.0582,0.037454,punt - recommended


In [5]:
df['go_recommendation'] = df['recommendation'].fillna('').str.contains('go - ')

In [6]:
df.go_recommendation.value_counts()

go_recommendation
False    124027
True      31611
Name: count, dtype: int64

In [7]:
df.decision.value_counts()

decision
punt          92915
go            34249
field_goal    28474
Name: count, dtype: int64

In [8]:
df.recommendation.value_counts()

recommendation
go - slightly recommended                    25294
any decision - no strong recommendation      23525
punt - recommended                           19089
punt - slightly recommended                  12592
punt or field goal - slightly recommended    10002
field goal - recommended                      9774
go or field goal - slightly recommended       8012
punt - strongly recommended                   7827
go or punt - slightly recommended             7705
field goal - slightly recommended             6877
field goal - strongly recommended             6835
go or punt - recommended                      6652
go - recommended                              5408
go or punt - strongly recommended             2181
go or field goal - recommended                2018
go - strongly recommended                      909
punt or field goal - recommended               465
go or field goal - strongly recommended        450
punt or field goal - strongly recommended       23
Name: count, dty

In [9]:
df.recommendation.value_counts()

recommendation
go - slightly recommended                    25294
any decision - no strong recommendation      23525
punt - recommended                           19089
punt - slightly recommended                  12592
punt or field goal - slightly recommended    10002
field goal - recommended                      9774
go or field goal - slightly recommended       8012
punt - strongly recommended                   7827
go or punt - slightly recommended             7705
field goal - slightly recommended             6877
field goal - strongly recommended             6835
go or punt - recommended                      6652
go - recommended                              5408
go or punt - strongly recommended             2181
go or field goal - recommended                2018
go - strongly recommended                      909
punt or field goal - recommended               465
go or field goal - strongly recommended        450
punt or field goal - strongly recommended       23
Name: count, dty

In [10]:
# Load games data again because we missed the start date in the notebook previous load
all_dfs = []
id_cols = ['id', 'season', 'week', 'season_type', 'completed', 'neutral_site', 'venue_id','start_date']
home_cols = ['home_id', 'home_team', 'home_conference', 'home_division', 'home_points', 'home_pregame_elo']
away_cols = ['away_id', 'away_team', 'away_conference', 'away_division', 'away_points', 'away_pregame_elo']

games_dir = join(DATA_DIR, 'games')
if not os.path.exists(games_dir):
    os.makedirs(games_dir)


for year in YEARS:
    file_path = join(games_dir, f'{year}.parquet')
    if os.path.exists(file_path):
        print(f'Reading {file_path} from cached data')
        df_tmp = pd.read_parquet(file_path)
    else:
        print(f'Fetching {year} games data from CFBD API')
        with cfbd.ApiClient(configuration) as api_client:
            api_instance = cfbd.GamesApi(api_client)
            data = api_instance.get_games(year=year)
        df_tmp = pd.DataFrame([val.to_dict() for val in data])
        df_tmp.columns = convert_to_snake_case(df_tmp.columns)
        df_tmp = df_tmp[id_cols + [
                            'home_id','home_team','home_conference','home_classification','home_points','home_pregame_elo',
                            'away_id','away_team','away_conference','away_classification','away_points','away_pregame_elo']]
        df_tmp['season_type'] = df_tmp['season_type'].apply(lambda x: x.value)
        df_tmp = df_tmp.assign(
            home_division=df_tmp['home_classification'].apply(lambda x: x.value if x is not None else None),
            away_division=df_tmp['away_classification'].apply(lambda x: x.value if x is not None else None)
        ).drop(columns=['home_classification','away_classification'])
        df_tmp = df_tmp[id_cols + home_cols + away_cols]
        df_tmp.to_parquet(file_path)
    all_dfs.append(df_tmp)
del df_tmp

df_games = pd.concat(all_dfs, ignore_index=True).reset_index(drop=True)
df_games.head()

Reading ../data/games/2013.parquet from cached data
Reading ../data/games/2014.parquet from cached data
Reading ../data/games/2015.parquet from cached data
Reading ../data/games/2016.parquet from cached data
Reading ../data/games/2017.parquet from cached data
Reading ../data/games/2018.parquet from cached data
Reading ../data/games/2019.parquet from cached data
Reading ../data/games/2020.parquet from cached data
Reading ../data/games/2021.parquet from cached data
Reading ../data/games/2022.parquet from cached data
Reading ../data/games/2023.parquet from cached data
Reading ../data/games/2024.parquet from cached data


Unnamed: 0,id,season,week,season_type,completed,neutral_site,venue_id,start_date,home_id,home_team,home_conference,home_division,home_points,home_pregame_elo,away_id,away_team,away_conference,away_division,away_points,away_pregame_elo
0,332412309,2013,1,regular,True,False,3696.0,2013-08-29 22:00:00+00:00,2309,Kent State,Mid-American,fbs,17.0,1530.0,2335,Liberty,Big South,fcs,10.0,1467.0
1,332412579,2013,1,regular,True,False,3994.0,2013-08-29 22:00:00+00:00,2579,South Carolina,SEC,fbs,27.0,1759.0,153,North Carolina,ACC,fbs,10.0,1638.0
2,332410154,2013,1,regular,True,False,3630.0,2013-08-29 22:30:00+00:00,154,Wake Forest,ACC,fbs,31.0,,2506,Presbyterian,Big South,fcs,7.0,
3,332412710,2013,1,regular,True,False,3746.0,2013-08-29 23:00:00+00:00,2710,Western Illinois,MVFC,fcs,42.0,,2261,Hampton,MEAC,fcs,9.0,
4,332412050,2013,1,regular,True,False,3919.0,2013-08-29 23:00:00+00:00,2050,Ball State,Mid-American,fbs,51.0,,2287,Illinois State,MVFC,fcs,28.0,


In [11]:
# join start date
df = (
    df.drop(columns=['home_id','away_id']).merge(
        df_games[['season','week','home_id','away_id','start_date']],
        left_on=['season','week','offense_id','defense_id'],
        right_on=['season','week','home_id','away_id'],
        how='left'
    ).drop(columns=['home_id','away_id'])
    .merge(
        df_games[['season','week','home_id','away_id','start_date']],
        left_on=['season','week','defense_id','offense_id'],
        right_on=['season','week','home_id','away_id'],
        how='left',
        suffixes=('', '_x')
    ).drop(columns=['home_id','away_id'])
    .assign(
        start_date=lambda x: x.start_date.fillna(x.start_date_x),
    ).drop(columns=['start_date_x'])
)

# Create Team Tendencies Dataset

In [12]:
cols = [
    'season', 'week', 'season_type', 'offense_id', 'offense_team',
    'offense_division', 'offense_conference', 'offense_color', 'offense_alternate_color',
    'offense_logos', 'defense_id', 'defense_team', 'defense_division', 'defense_conference',
    'defense_color', 'defense_alternate_color', 'defense_logos',
    'wpa_fg', 'wpa_go', 'wpa_punt', 'decision', 'recommendation'
]

last_30_seconds = 1 - (30 / (4*15*60))

team_tendencies_go = (
    df.query('go_recommendation').copy()
    # Filter out the last 30 seconds of the game
    .query('offense_division == "fbs" and pct_game_played < @last_30_seconds')
    [cols]
    .assign(
        wp_lost = lambda x: np.select(
            [x.decision == 'field_goal', x.decision == 'punt'],
            [x.wpa_go - x.wpa_fg, x.wpa_go - x.wpa_punt],
            default=0
        ),
    )
    .reset_index(drop=True)
)
team_tendencies_go.head()

Unnamed: 0,season,week,season_type,offense_id,offense_team,offense_division,offense_conference,offense_color,offense_alternate_color,offense_logos,defense_id,defense_team,defense_division,defense_conference,defense_color,defense_alternate_color,defense_logos,wpa_fg,wpa_go,wpa_punt,decision,recommendation,wp_lost
0,2013,1,postseason,120,Maryland,fbs,ACC,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,276,Marshall,fbs,Conference USA,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,-0.064586,-0.030462,-0.09,go,go - recommended,0.0
1,2013,1,postseason,120,Maryland,fbs,ACC,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,276,Marshall,fbs,Conference USA,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,-0.097362,-0.029139,-0.0551,punt,go - recommended,0.025961
2,2013,1,postseason,2655,Tulane,fbs,Conference USA,#006547,#468ac9,http://a.espncdn.com/i/teamlogos/ncaa/500/2655...,309,Louisiana,fbs,Sun Belt,#ce181e,#ffffff,http://a.espncdn.com/i/teamlogos/ncaa/500/309.png,-0.0198,-0.003992,-0.0115,punt,go - slightly recommended,0.007508
3,2013,1,regular,98,Western Kentucky,fbs,Sun Belt,#F32026,#b3b5b8,http://a.espncdn.com/i/teamlogos/ncaa/500/98.png,96,Kentucky,fbs,SEC,#0033a0,#ffffff,http://a.espncdn.com/i/teamlogos/ncaa/500/96.png,-0.00681,0.000899,-0.0045,punt,go - slightly recommended,0.005399
4,2013,1,regular,98,Western Kentucky,fbs,Sun Belt,#F32026,#b3b5b8,http://a.espncdn.com/i/teamlogos/ncaa/500/98.png,96,Kentucky,fbs,SEC,#0033a0,#ffffff,http://a.espncdn.com/i/teamlogos/ncaa/500/96.png,-0.00597,0.001847,-0.0036,punt,go - slightly recommended,0.005447


In [13]:
team_tendencies_res = (
    team_tendencies_go
    .groupby(['season','offense_team', 'offense_conference', 'offense_color', 'offense_alternate_color', 'offense_logos'])
    .agg(
        n_go=('decision', lambda x: (x == 'go').sum()),
        n_go_rec=('offense_team', 'count'),
        net_wp_lost=('wp_lost', 'sum'),
    )
    .sort_values('n_go', ascending=False)
    .query('n_go_rec > 5')
    .reset_index()
)

team_tendencies_res.head()

Unnamed: 0,season,offense_team,offense_conference,offense_color,offense_alternate_color,offense_logos,n_go,n_go_rec,net_wp_lost
0,2023,Western Michigan,Mid-American,#532e1f,#8b7f79,http://a.espncdn.com/i/teamlogos/ncaa/500/2711...,22,39,0.144662
1,2023,North Texas,American Athletic,#00853E,#000000,http://a.espncdn.com/i/teamlogos/ncaa/500/249.png,20,33,0.147975
2,2023,Baylor,Big 12,#154734,#ffb81c,http://a.espncdn.com/i/teamlogos/ncaa/500/239.png,20,30,0.061254
3,2021,Navy,American Athletic,#00225b,#b5a67c,http://a.espncdn.com/i/teamlogos/ncaa/500/2426...,20,30,0.146593
4,2016,Baylor,Big 12,#154734,#ffb81c,http://a.espncdn.com/i/teamlogos/ncaa/500/239.png,20,26,0.073461


In [14]:
team_tendencies_res = (
    team_tendencies_res
    .replace(
        {
            "offense_color": {
                "#null": "#FFFFFF",
            },
            "offense_alternate_color": {
                "#null": "#000",
            },
        }
    )
)

# Fix: make the the darker color the bourder color column, and the lighter color the fill color
team_tendencies_res["fill_color"] = np.where(
    team_tendencies_res.offense_color > team_tendencies_res.offense_alternate_color,
    team_tendencies_res.offense_alternate_color,
    team_tendencies_res.offense_color
)
team_tendencies_res["border_color"] = np.where(
    team_tendencies_res.offense_color < team_tendencies_res.offense_alternate_color, 
    team_tendencies_res.offense_alternate_color, 
    team_tendencies_res.offense_color
)

# if bourder color is white, make light grey
team_tendencies_res["border_color"] = np.where(
    team_tendencies_res["border_color"] > "#fafafa",
    "#ebebeb",
    team_tendencies_res["border_color"]
)

team_tendencies_res.drop(columns=['offense_color', 'offense_alternate_color'], inplace=True)

In [15]:
team_tendencies_res.to_parquet(
    join(OUTPUT_DIR, 'team_tendencies.parquet')
    , index=False
)

# Create Coach Tendencies Dataset

In [16]:
# Load coachs data
all_dfs = []

coaches_dir = join(DATA_DIR, 'coaches')
if not os.path.exists(coaches_dir):
    os.makedirs(coaches_dir)

for year in YEARS:
    file_path = join(coaches_dir, f'{year}.parquet')
    if os.path.exists(file_path):
        print(f'Reading {file_path} from cached data')
        df_tmp = pd.read_parquet(file_path)
    else:
        print(f'Fetching {year} coaches data from CFBD API')
        with cfbd.ApiClient(configuration) as api_client:
            api_instance = cfbd.CoachesApi(api_client)
            data = api_instance.get_coaches(year=year)
        df_tmp = pd.DataFrame([val.to_dict() for val in data])
        # convert season list column with json inside to individual columns
        df_tmp = pd.concat([
            df_tmp.explode('seasons').reset_index(drop=True).drop(columns=['seasons']), 
            pd.json_normalize(df_tmp.explode('seasons').reset_index(drop=True)['seasons'])
        ], axis=1)
        df_tmp.columns = convert_to_snake_case(df_tmp.columns)
        df_tmp.to_parquet(file_path)
    all_dfs.append(df_tmp)
del df_tmp

df_coaches = pd.concat(all_dfs, ignore_index=True).reset_index(drop=True)

coach_cols = ['first_name','last_name','school','year','hire_date']
df_coaches = (
    df_coaches[coach_cols].drop_duplicates().reset_index(drop=True)
    .assign(
        coach_name=lambda x: x.first_name + ' ' + x.last_name,
    ).drop(columns=['first_name','last_name'])
)

df_coaches.head()

Reading ../data/coaches/2013.parquet from cached data
Reading ../data/coaches/2014.parquet from cached data
Reading ../data/coaches/2015.parquet from cached data
Reading ../data/coaches/2016.parquet from cached data
Reading ../data/coaches/2017.parquet from cached data
Reading ../data/coaches/2018.parquet from cached data
Reading ../data/coaches/2019.parquet from cached data
Reading ../data/coaches/2020.parquet from cached data
Reading ../data/coaches/2021.parquet from cached data
Reading ../data/coaches/2022.parquet from cached data
Reading ../data/coaches/2023.parquet from cached data
Reading ../data/coaches/2024.parquet from cached data


  df_coaches = pd.concat(all_dfs, ignore_index=True).reset_index(drop=True)


Unnamed: 0,school,year,hire_date,coach_name
0,Boston College,2013,2012-12-04 00:00:00+00:00,Steve Addazio
1,Wisconsin,2013,2012-12-21 00:00:00+00:00,Gary Andersen
2,Rice,2013,2007-01-19 00:00:00+00:00,David Bailiff
3,Miami (OH),2013,2013-10-06 00:00:00+00:00,Mike Bath
4,Virginia Tech,2013,NaT,Frank Beamer


In [17]:
cols = [
    'season', 'week', 'offense_id', 'offense_team',
    'offense_division', 'offense_conference', 'offense_color', 'offense_alternate_color',
    'offense_logos', 'defense_id', 'defense_team', 'defense_division', 'defense_conference',
    'defense_color', 'defense_alternate_color', 
    'wpa_fg', 'wpa_go', 'wpa_punt', 'decision', 'recommendation', 'start_date', 'row_id'
]



last_30_seconds = 1 - (30 / (4*15*60))

coach_tendencies_go = (
    df.query('go_recommendation').copy()
    .assign(
        row_id=lambda x: x.index,
    )
    # Filter out the last 30 seconds of the game
    .query('offense_division == "fbs" and pct_game_played < @last_30_seconds')
    [cols]
    .assign(
        wp_lost = lambda x: np.select(
            [x.decision == 'field_goal', x.decision == 'punt'],
            [x.wpa_go - x.wpa_fg, x.wpa_go - x.wpa_punt],
            default=0
        ),
    )
    .merge(
        df_coaches,
        left_on=['season', 'offense_team'],
        right_on=['year', 'school'],
        how='left'
    ).drop(columns=['year', 'school'])
    # Keept the most recenlty hired coach
    .query('start_date >= hire_date')
    .sort_values('hire_date', ascending=True)
    .drop_duplicates(subset=['row_id'])
    .drop(columns=['row_id','hire_date'])
    .reset_index(drop=True)
)
coach_tendencies_go.head()

Unnamed: 0,season,week,offense_id,offense_team,offense_division,offense_conference,offense_color,offense_alternate_color,offense_logos,defense_id,defense_team,defense_division,defense_conference,defense_color,defense_alternate_color,wpa_fg,wpa_go,wpa_punt,decision,recommendation,start_date,wp_lost,coach_name
0,2014,14,2653,Troy,fbs,Sun Belt,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,309,Louisiana,fbs,Sun Belt,#ce181e,#ffffff,-0.087702,-0.059787,-0.0651,go,go - slightly recommended,2014-11-29 17:30:00+00:00,0.0,Larry Blakeney
1,2013,12,2653,Troy,fbs,Sun Belt,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,145,Ole Miss,fbs,SEC,#13294b,#c8102e,-0.0001,0.005038,0.0,punt,go - slightly recommended,2013-11-16 17:00:00+00:00,0.005038,Larry Blakeney
2,2014,12,2653,Troy,fbs,Sun Belt,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,70,Idaho,fbs,Sun Belt,#null,#8c6e4a,-0.020035,-0.008933,-0.0301,go,go - slightly recommended,2014-11-15 22:00:00+00:00,0.0,Larry Blakeney
3,2013,12,2653,Troy,fbs,Sun Belt,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,145,Ole Miss,fbs,SEC,#13294b,#c8102e,-0.0058,0.001632,-0.0044,punt,go - slightly recommended,2013-11-16 17:00:00+00:00,0.006032,Larry Blakeney
4,2014,2,2653,Troy,fbs,Sun Belt,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,150,Duke,fbs,ACC,#013088,#ffffff,-0.0014,0.004214,-0.0013,punt,go - slightly recommended,2014-09-06 23:00:00+00:00,0.005514,Larry Blakeney


In [18]:
coach_tendencies_res = (
    coach_tendencies_go
    .groupby(['season','offense_team', 'offense_conference', 'offense_color', 'offense_alternate_color', 'coach_name'])
    .agg(
        n_go=('decision', lambda x: (x == 'go').sum()),
        n_go_rec=('offense_team', 'count'),
        net_wp_lost=('wp_lost', 'sum'),
    )
    .sort_values('n_go', ascending=False)
    .query('n_go_rec > 5')
    .reset_index()
)

coach_tendencies_res.head()

Unnamed: 0,season,offense_team,offense_conference,offense_color,offense_alternate_color,coach_name,n_go,n_go_rec,net_wp_lost
0,2023,Western Michigan,Mid-American,#532e1f,#8b7f79,Lance Taylor,22,39,0.144662
1,2023,Baylor,Big 12,#154734,#ffb81c,Dave Aranda,20,30,0.061254
2,2021,Navy,American Athletic,#00225b,#b5a67c,Ken Niumatalolo,20,30,0.146593
3,2023,North Texas,American Athletic,#00853E,#000000,Eric Morris,20,33,0.147975
4,2016,Baylor,Big 12,#154734,#ffb81c,Jim Grobe,20,26,0.073461


In [19]:
coach_tendencies_res = (
    coach_tendencies_res
    .replace(
        {
            "offense_color": {
                "#null": "#FFFFFF",
            },
            "offense_alternate_color": {
                "#null": "#000",
            },
        }
    )
)

# Fix: make the the darker color the bourder color column, and the lighter color the fill color
coach_tendencies_res["fill_color"] = np.where(
    coach_tendencies_res.offense_color > coach_tendencies_res.offense_alternate_color,
    coach_tendencies_res.offense_alternate_color,
    coach_tendencies_res.offense_color
)
coach_tendencies_res["border_color"] = np.where(
    coach_tendencies_res.offense_color < coach_tendencies_res.offense_alternate_color, 
    coach_tendencies_res.offense_alternate_color, 
    coach_tendencies_res.offense_color
)

# if bourder color is white, make light grey
coach_tendencies_res["border_color"] = np.where(
    coach_tendencies_res["border_color"] > "#fafafa",
    "#ebebeb",
    coach_tendencies_res["border_color"]
)

coach_tendencies_res.drop(columns=['offense_color', 'offense_alternate_color'], inplace=True)

In [20]:
coach_tendencies_res.head()

Unnamed: 0,season,offense_team,offense_conference,coach_name,n_go,n_go_rec,net_wp_lost,fill_color,border_color
0,2023,Western Michigan,Mid-American,Lance Taylor,22,39,0.144662,#532e1f,#8b7f79
1,2023,Baylor,Big 12,Dave Aranda,20,30,0.061254,#154734,#ebebeb
2,2021,Navy,American Athletic,Ken Niumatalolo,20,30,0.146593,#00225b,#b5a67c
3,2023,North Texas,American Athletic,Eric Morris,20,33,0.147975,#000000,#00853E
4,2016,Baylor,Big 12,Jim Grobe,20,26,0.073461,#154734,#ebebeb


In [21]:
coach_tendencies_res.to_parquet(
    join(OUTPUT_DIR, 'coach_tendencies.parquet')
    , index=False
)

# Create Game Decisions Dataset

In [22]:
# format as Q1 00:00
df['time'] = (
    'Q' + df['period'].astype(str) + ' ' +
    df['clock_minutes'].astype(str).str.zfill(2) + ':' +
    df['clock_seconds'].astype(str).str.zfill(2)
)
df.head()

Unnamed: 0,game_id,drive_id,play_id,offense,defense,period,clock_minutes,clock_seconds,offense_score,defense_score,offense_timeouts,defense_timeouts,yards_to_goal,down,distance,play_type,play_text,pct_game_played,score_diff,season,week,season_type,neutral_site,venue_id,completed,home_team,home_conference,away_team,away_conference,is_home_team,home_division,away_division,pregame_elo_diff,pregame_offense_elo,pregame_defense_elo,precipitation,wind_speed,temperature,game_indoors,offense_division,defense_division,pregame_spread,decision,offense_strength,defense_strength,fg_proba,exp_wp_fg,fourth_down_proba,exp_wp_go,punt_yards_to_goal,exp_wp_punt,cur_win_proba,offense_id,offense_team,defense_id,defense_team,offense_color,offense_alternate_color,offense_logos,offense_conference,defense_color,defense_alternate_color,defense_logos,defense_conference,wpa_fg,wpa_go,wpa_punt,fg_diff_go,fg_diff_punt,go_diff_fg,go_diff_punt,punt_diff_fg,punt_diff_go,recommendation,go_recommendation,start_date,time
0,333610120,33361012026,333610120256,Maryland,Marshall,4,0,50,20,31,0.0,2.0,66,4,4,Pass Incompletion,C.J. Brown pass incomplete to Levern Jacobs.,0.986111,-11,2013,1,postseason,True,3852.0,True,Maryland,ACC,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,go,0.083596,-0.226068,0.0001,0.001001,0.4057,0.006755,75.524101,0.0029,0.0024,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.001399,0.004355,0.0005,-0.005754,-0.001899,0.005754,0.003855,0.001899,-0.003855,punt or field goal - slightly recommended,False,2013-12-27 19:30:00+00:00,Q4 00:50
1,333610120,33361012022,333610120226,Maryland,Marshall,4,5,53,20,24,3.0,3.0,87,4,5,Punt,"Nathan Renfro punt for 45 yards, returned by D...",0.901944,-4,2013,1,postseason,True,3852.0,True,Maryland,ACC,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.167,0.3501,0.185295,55.771702,0.1891,0.2234,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0564,-0.038105,-0.0343,-0.018295,-0.0221,0.018295,-0.003805,0.0221,0.003805,go or punt - recommended,False,2013-12-27 19:30:00+00:00,Q4 05:53
2,333610120,33361012010,333610120093,Maryland,Marshall,2,7,49,10,17,3.0,3.0,68,4,3,Punt,"Nathan Renfro punt for 38 yards, fair catch by...",0.369722,-7,2013,1,postseason,True,3852.0,True,Maryland,ACC,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0001,0.254037,0.5189,0.255828,74.180702,0.2735,0.2475,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,0.006537,0.008328,0.026,-0.001791,-0.019463,0.001791,-0.017672,0.019463,0.017672,punt - recommended,False,2013-12-27 19:30:00+00:00,Q2 07:49
3,333610120,33361012004,333610120027,Maryland,Marshall,1,8,9,0,0,3.0,3.0,97,4,8,Punt,"Nathan Renfro punt for 34 yards, returned by D...",0.114167,0,2013,1,postseason,True,3852.0,True,Maryland,ACC,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.3275,0.42264,53.889999,0.432,0.4731,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0821,-0.05046,-0.0411,-0.03164,-0.041,0.03164,-0.00936,0.041,0.00936,punt - slightly recommended,False,2013-12-27 19:30:00+00:00,Q1 08:09
4,333610120,33361012002,333610120012,Maryland,Marshall,1,11,33,0,0,3.0,3.0,86,4,18,Punt,"Nathan Renfro punt for 47 yards, returned by D...",0.0575,0,2013,1,postseason,True,3852.0,True,Maryland,ACC,Marshall,Conference USA,0,fbs,fbs,218.124852,1921.246102,1703.121251,0.0,9.2,46.0,False,fbs,fbs,3.0,punt,0.083596,-0.226068,0.0,0.391,0.1483,0.411746,57.934299,0.4492,0.4777,120,Maryland,276,Marshall,#D5002B,#ffcd00,http://a.espncdn.com/i/teamlogos/ncaa/500/120.png,ACC,#00ae42,#be854c,http://a.espncdn.com/i/teamlogos/ncaa/500/276.png,Conference USA,-0.0867,-0.065954,-0.0285,-0.020746,-0.0582,0.020746,-0.037454,0.0582,0.037454,punt - recommended,False,2013-12-27 19:30:00+00:00,Q1 11:33


In [23]:
df = df.dropna().reset_index(drop=True)

In [24]:
df_coaches.head()

Unnamed: 0,school,year,hire_date,coach_name
0,Boston College,2013,2012-12-04 00:00:00+00:00,Steve Addazio
1,Wisconsin,2013,2012-12-21 00:00:00+00:00,Gary Andersen
2,Rice,2013,2007-01-19 00:00:00+00:00,David Bailiff
3,Miami (OH),2013,2013-10-06 00:00:00+00:00,Mike Bath
4,Virginia Tech,2013,NaT,Frank Beamer


In [25]:
# merge coach
df = (
    df.assign(
        row_id=lambda x: x.index
    )
    .merge(
        df_coaches,
        left_on=['season', 'offense_team'],
        right_on=['year', 'school'],
        how='left'
    ).drop(columns=['year', 'school'])
    # Keept the most recenlty hired coach
    .query('start_date >= hire_date')
    .sort_values('hire_date', ascending=True)
    .drop_duplicates(subset=['row_id'])
    .drop(columns=['row_id', 'start_date','hire_date'])
)
df.head()

Unnamed: 0,game_id,drive_id,play_id,offense,defense,period,clock_minutes,clock_seconds,offense_score,defense_score,offense_timeouts,defense_timeouts,yards_to_goal,down,distance,play_type,play_text,pct_game_played,score_diff,season,week,season_type,neutral_site,venue_id,completed,home_team,home_conference,away_team,away_conference,is_home_team,home_division,away_division,pregame_elo_diff,pregame_offense_elo,pregame_defense_elo,precipitation,wind_speed,temperature,game_indoors,offense_division,defense_division,pregame_spread,decision,offense_strength,defense_strength,fg_proba,exp_wp_fg,fourth_down_proba,exp_wp_go,punt_yards_to_goal,exp_wp_punt,cur_win_proba,offense_id,offense_team,defense_id,defense_team,offense_color,offense_alternate_color,offense_logos,offense_conference,defense_color,defense_alternate_color,defense_logos,defense_conference,wpa_fg,wpa_go,wpa_punt,fg_diff_go,fg_diff_punt,go_diff_fg,go_diff_punt,punt_diff_fg,punt_diff_go,recommendation,go_recommendation,time,coach_name
18166,400548417,40054841720,400548417103969407,Troy,UL Monroe,3,3,5,14,19,3.0,3.0,70,4,5,Punt,"Ryan Kay punt for 42 yds, fair catch by Rashon...",0.698611,-5,2014,5,regular,False,3817.0,True,UL Monroe,Sun Belt,Troy,Sun Belt,-1,fbs,fbs,-282.04493,1301.073074,1583.118005,0.0,3.4,82.9,False,fbs,fbs,10.5,punt,0.147351,-0.126023,0.0001,0.120021,0.3791,0.144429,72.441002,0.1556,0.1632,2653,Troy,2433,UL Monroe,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,Sun Belt,#231F20,#b18445,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,Sun Belt,-0.043179,-0.018771,-0.0076,-0.024409,-0.035579,0.024409,-0.011171,0.035579,0.011171,punt - slightly recommended,False,Q3 03:05,Larry Blakeney
8994,333042653,33304265301,333042653005,Troy,UL Monroe,1,13,44,0,0,3.0,3.0,66,4,1,Punt,"Will Scott punt for 44 yards, fair catch by Ra...",0.021111,0,2013,10,regular,False,3975.0,True,Troy,Sun Belt,UL Monroe,Sun Belt,1,fbs,fbs,-14.031905,1528.920312,1542.952217,0.0,5.8,73.0,False,fbs,fbs,-3.0,punt,0.199547,-0.159808,0.0001,0.428052,0.7446,0.508417,75.943298,0.5007,0.5293,2653,Troy,2433,UL Monroe,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,Sun Belt,#231F20,#b18445,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,Sun Belt,-0.101248,-0.020883,-0.0286,-0.080365,-0.072648,0.080365,0.007717,0.072648,-0.007717,go - slightly recommended,True,Q1 13:44,Larry Blakeney
8995,333042653,33304265307,333042653052,Troy,UL Monroe,1,4,2,0,7,3.0,3.0,64,4,1,Punt,"Will Scott punt for 51 yards, downed at the La...",0.182778,-7,2013,10,regular,False,3975.0,True,Troy,Sun Belt,UL Monroe,Sun Belt,1,fbs,fbs,-14.031905,1528.920312,1542.952217,0.0,5.8,73.0,False,fbs,fbs,-3.0,punt,0.199547,-0.159808,0.01,0.29321,0.7371,0.349194,77.735001,0.3175,0.3629,2653,Troy,2433,UL Monroe,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,Sun Belt,#231F20,#b18445,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,Sun Belt,-0.06969,-0.013706,-0.0454,-0.055984,-0.02429,0.055984,0.031694,0.02429,-0.031694,go - recommended,True,Q1 04:02,Larry Blakeney
8996,333042653,33304265327,333042653257,Troy,UL Monroe,4,6,0,36,42,3.0,3.0,3,4,3,Rush,Corey Robinson rush for 3 yards for a TOUCHDOWN.,0.9,-6,2013,10,regular,False,3975.0,True,Troy,Sun Belt,UL Monroe,Sun Belt,1,fbs,fbs,-14.031905,1528.920312,1542.952217,0.0,5.8,73.0,False,fbs,fbs,-3.0,go,0.199547,-0.159808,0.9072,0.299432,0.3109,0.235298,89.0,0.1961,0.3062,2653,Troy,2433,UL Monroe,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,Sun Belt,#231F20,#b18445,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,Sun Belt,-0.006768,-0.070902,-0.1101,0.064134,0.103332,-0.064134,0.039198,-0.103332,-0.039198,field goal - strongly recommended,False,Q4 06:00,Larry Blakeney
6579,332852247,33285224702,332852247013,Troy,Georgia State,1,13,13,0,0,3.0,3.0,59,4,8,Punt,"Will Scott punt for 40 yards, fair catch by Al...",0.029722,0,2013,7,regular,False,3495.0,True,Georgia State,Sun Belt,Troy,Sun Belt,-1,fbs,fbs,520.736175,1440.616374,919.880198,0.0,11.4,80.1,False,fbs,fbs,-16.0,punt,0.2282,-0.325687,0.03,0.82343,0.3924,0.850746,81.286301,0.8743,0.8797,2653,Troy,2247,Georgia State,#AE0210,#88898c,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,Sun Belt,#1e539a,#ebebeb,http://a.espncdn.com/i/teamlogos/ncaa/500/2247...,Sun Belt,-0.05627,-0.028954,-0.0054,-0.027316,-0.05087,0.027316,-0.023554,0.05087,0.023554,punt - recommended,False,Q1 13:13,Larry Blakeney


In [26]:
df['distance'] = df.distance.replace(0, 1)

In [27]:
# Coach, offense team, defense team, offense logo, defense logo, time (Q, Min:Sec), pregame off elo, pregame def elo, down + distance, yards to goal, off points, def points, decision, recommendation, wp diff, 

cols = [
    'season', 'week', 
    'offense_team', 'offense_conference', 'offense_division', 'offense_logos', 'offense_score', 'coach_name',
    'defense_team', 'defense_logos', 'defense_score',
    'exp_wp_go', 'exp_wp_fg', 'exp_wp_punt',
    'time','pregame_offense_elo','pregame_defense_elo','down','distance','yards_to_goal', 'recommendation', 'decision',
    'play_text'
]

rename_dict = {
    'season': 'Season',
    'week': 'Week',
    'offense_team': 'Offense Team',
    'offense_conference': 'Offense Conference',
    'offense_division': 'Offense Division',
    'offense_logos': 'Offense Logo',
    'offense_score': 'Offense Score',
    'coach_name': 'Offense Coach Name',
    'defense_team': 'Defense Team',
    'defense_logos': 'Defense Logo',
    'defense_score': 'Defense Score',
    'wp_diff': 'Win Probability Diff',
    'time': 'Time',
    'pregame_offense_elo': 'Pregame Offense Elo',
    'pregame_defense_elo': 'Pregame Defense Elo',
    'down': 'Down',
    'distance': 'Distance',
    'yards_to_goal': 'Yards to Goal',
    'recommendation': 'Recommendation',
    'decision': 'Decision',
    'play_text': 'Desc',
    'exp_wp_go': 'Win Probability Go',
    'exp_wp_fg': 'Win Probability Field Goal',
    'exp_wp_punt': 'Win Probability Punt',
}

df_decisions = (
    df.query('offense_division == "fbs"')
    .assign(
        pregame_defense_elo=lambda x: x['pregame_defense_elo'].astype(int),
        pregame_offense_elo=lambda x: x['pregame_offense_elo'].astype(int),
    )
    [cols]
    .rename(columns=rename_dict)
    .drop(columns=['Offense Division'])
    .assign(
        Decision=lambda x: np.select([x.Decision=="field_goal", x.Decision=="go", x.Decision=="punt"],
            ['Field Goal', 'Go', 'Punt'],
            default='Any Decision'
        ),
    )
)        
df_decisions.head()

Unnamed: 0,Season,Week,Offense Team,Offense Conference,Offense Logo,Offense Score,Offense Coach Name,Defense Team,Defense Logo,Defense Score,Win Probability Go,Win Probability Field Goal,Win Probability Punt,Time,Pregame Offense Elo,Pregame Defense Elo,Down,Distance,Yards to Goal,Recommendation,Decision,Desc
18166,2014,5,Troy,Sun Belt,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,14,Larry Blakeney,UL Monroe,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,19,0.144429,0.120021,0.1556,Q3 03:05,1301,1583,4,5,70,punt - slightly recommended,Punt,"Ryan Kay punt for 42 yds, fair catch by Rashon..."
8994,2013,10,Troy,Sun Belt,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,0,Larry Blakeney,UL Monroe,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,0,0.508417,0.428052,0.5007,Q1 13:44,1528,1542,4,1,66,go - slightly recommended,Punt,"Will Scott punt for 44 yards, fair catch by Ra..."
8995,2013,10,Troy,Sun Belt,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,0,Larry Blakeney,UL Monroe,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,7,0.349194,0.29321,0.3175,Q1 04:02,1528,1542,4,1,64,go - recommended,Punt,"Will Scott punt for 51 yards, downed at the La..."
8996,2013,10,Troy,Sun Belt,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,36,Larry Blakeney,UL Monroe,http://a.espncdn.com/i/teamlogos/ncaa/500/2433...,42,0.235298,0.299432,0.1961,Q4 06:00,1528,1542,4,3,3,field goal - strongly recommended,Go,Corey Robinson rush for 3 yards for a TOUCHDOWN.
6579,2013,7,Troy,Sun Belt,http://a.espncdn.com/i/teamlogos/ncaa/500/2653...,0,Larry Blakeney,Georgia State,http://a.espncdn.com/i/teamlogos/ncaa/500/2247...,0,0.850746,0.82343,0.8743,Q1 13:13,1440,919,4,8,59,punt - recommended,Punt,"Will Scott punt for 40 yards, fair catch by Al..."


In [28]:
down_map = {
    1: '1st',
    2: '2nd',
    3: '3rd',
    4: '4th'
}

df_decisions['Down & Distance'] = df_decisions['Down'].map(down_map) + ' & ' + df_decisions['Distance'].astype(str)

In [29]:
# write to parquet
df_decisions.to_parquet(
    join(OUTPUT_DIR, 'plays_tendencies.parquet')
    , index=False
)