In [79]:
import pandas as pd
import numpy as np
import time
import json
import os
# Work from the repository root so that relative paths such as ``data_dir`` resolve.
# The root can be overridden through the NFL_WP_PROJECT_DIR environment variable; the
# default is the checkout location this notebook was originally written against.
# When that directory does not exist (e.g. on another machine) the current working
# directory is kept, so the notebook still runs when started from the repository root.
project_dir = os.environ.get('NFL_WP_PROJECT_DIR', '/Users/lucashaupt/Documents/GitHub/nfl-live-win-probability')
if os.path.isdir(project_dir):
    os.chdir(project_dir)
data_dir = "data/"

In [80]:
def aggregate_match_results_by_team(results, return_detail=False):
    """Aggregate played games into one row of totals per team.

    Every game is expanded into two rows (one from the home team's point of
    view, one from the away team's) and the rows are summed per ``team_id``.

    Parameters
    ----------
    results : list of dict or DataFrame-like
        One record per played game with the fields ``home_team_id``,
        ``away_team_id``, ``game_code``, ``home_score`` and ``away_score``.
        May be empty (no games played yet).
    return_detail : bool, default False
        When True, also return the per-team-per-game frame.

    Returns
    -------
    agg_df : pandas.DataFrame
        Indexed by ``team_id`` with the summed columns ``G`` (games), ``W``,
        ``D``, ``L``, ``points`` (2 per win, 1 per draw, 0 per loss), ``GF``
        (scored), ``GA`` (conceded) and ``GF_away`` (scored in away games).
    results_df : pandas.DataFrame
        Only when ``return_detail`` is True: one row per team and game.
    """
    agg_cols = ['G', 'W', 'D', 'L', 'points', 'GF', 'GA', 'GF_away']
    detail_cols = ['team_id', 'opponent_id', 'game_code', 'outcome', 'GF', 'GA', 'is_home',
                   'G', 'W', 'D', 'L', 'points', 'GF_away']

    results_df = pd.DataFrame(results)
    if results_df.empty:
        # No games played yet: return empty frames with the usual layout instead of
        # failing on the missing score columns.
        agg_df = pd.DataFrame(0, index=pd.Index([], name='team_id'), columns=agg_cols)
        if return_detail:
            return agg_df, pd.DataFrame(0, index=pd.RangeIndex(0), columns=detail_cols)
        return agg_df

    # Outcome from the home team's perspective: 0 = loss, 1 = draw, 2 = win.
    # ``assign`` builds a new frame, so the caller's data is never modified.
    results_df = results_df.assign(outcome=np.sign(results_df['home_score'] - results_df['away_score']) + 1)

    home_df = (
        results_df[['home_team_id', 'away_team_id', 'game_code', 'outcome', 'home_score', 'away_score']]
        .rename(columns={'home_team_id': 'team_id', 'away_team_id': 'opponent_id', 'home_score': 'GF', 'away_score': 'GA'})
        .assign(is_home=1)
    )
    away_df = (
        results_df[['away_team_id', 'home_team_id', 'game_code', 'outcome', 'home_score', 'away_score']]
        .rename(columns={'away_team_id': 'team_id', 'home_team_id': 'opponent_id', 'away_score': 'GF', 'home_score': 'GA'})
        # Mirror the outcome for the away side: a home win is an away loss and vice versa
        .assign(outcome=lambda frame: frame['outcome'].map({0: 2, 1: 1, 2: 0}), is_home=0)
    )

    # Columns are aligned by name, so the away frame's GA/GF order does not matter
    results_df = pd.concat([home_df, away_df], axis=0)
    results_df['G'] = 1
    results_df['W'] = (results_df['outcome'] == 2).astype(int)
    results_df['D'] = (results_df['outcome'] == 1).astype(int)
    results_df['L'] = (results_df['outcome'] == 0).astype(int)
    # The 0/1/2 outcome code doubles as the points awarded for the game
    results_df['points'] = results_df['outcome']
    results_df['GF_away'] = results_df['GF'].where(results_df['is_home'] == 0, 0)
    agg_df = results_df.groupby('team_id')[agg_cols].sum()
    if return_detail:
        return agg_df, results_df
    return agg_df
    
def simulate_season_standing_with_tiebreakers(match_preds, match_results, team_names, pred_params, n_runs, seed=None):
    """Monte-Carlo simulate the remaining games and summarise the final standings.

    For each remaining game a score pair is sampled ``n_runs`` times from the
    game's predicted score distribution and added to the game's current score.
    The simulated games are combined with the already played results, teams are
    ranked within every run (points, then point difference, then points scored,
    then head-to-head points and head-to-head away points among tied teams) and
    the distribution of ranks, points and wins per team is returned.

    Parameters
    ----------
    match_preds : list of dict
        Remaining games. Each record provides ``home_team_id``, ``away_team_id``,
        ``current_score`` (home, away) and ``pred_exact_score``: a flat vector of
        weights over score pairs, where flat index ``h * (max_away_score + 1) + a``
        stands for ``h`` additional home points and ``a`` additional away points.
    match_results : list of dict
        Played games, in the format expected by ``aggregate_match_results_by_team``.
    team_names : list of dict
        Records with the fields ``id`` and ``name``.
    pred_params : dict
        Provides ``max_home_score`` and ``max_away_score`` (only read when there
        are games left to simulate).
    n_runs : int
        Number of simulated seasons.
    seed : int, optional
        When given, a dedicated ``numpy.random.RandomState(seed)`` is used so the
        simulation is reproducible. The default (None) draws from NumPy's global
        random state, as before.

    Returns
    -------
    list of dict
        One entry per team with the keys ``id``, ``name``, ``current`` (observed
        totals: g, w, d, l, points, gF, gA) and ``predicted`` (average points,
        rank, wins, losses and ties plus the ``rank``, ``points`` and ``wins``
        distributions, each mapping a value to the share of runs). All values are
        plain Python objects so the result can be passed to ``json.dump``.
    """
    def _native(value):
        # json.dump does not accept NumPy scalars; unwrap them to plain Python objects
        return value.item() if isinstance(value, np.generic) else value

    team_names = pd.DataFrame(team_names).set_index('id')['name']
    start_time = time.time()
    print('Simulating season ({0:d} runs)'.format(n_runs))
    # ``np.random`` (module) and ``RandomState`` expose the same ``choice`` method
    rng = np.random if seed is None else np.random.RandomState(seed)
    run_list = []
    no_games_left = len(match_preds) == 0

    if not no_games_left:
        # Look-up tables translating a flat score index into (home, away) points.
        # They only depend on ``pred_params``, so they are built once for all games.
        n_home = pred_params['max_home_score'] + 1
        n_away = pred_params['max_away_score'] + 1
        score_grid_home = np.repeat(np.arange(n_home), n_away)
        score_grid_away = np.tile(np.arange(n_away), n_home)
    run_ids = np.arange(1, n_runs + 1).reshape(-1, 1)

    for match in match_preds:
        exact_score_probs = np.array(match['pred_exact_score']) / np.sum(match['pred_exact_score'])
        # Draw one score index per run directly; this avoids materialising an
        # (n_runs x n_scores) one-hot matrix only to take its argmax.
        ind_scores = rng.choice(exact_score_probs.size, size=n_runs, p=exact_score_probs)
        home_scores = score_grid_home[ind_scores] + match['current_score'][0]
        away_scores = score_grid_away[ind_scores] + match['current_score'][1]
        # One-hot (W, D, L) from the away side: sign + 1 is 2 when the home team wins,
        # which is the away team's "L" column. Flipping the columns gives the home side.
        away_outcomes = np.zeros((n_runs, 3)).astype(int)
        away_outcomes[np.arange(n_runs), np.sign(home_scores - away_scores) + 1] = 1
        home_outcomes = np.fliplr(away_outcomes)

        home_df = pd.DataFrame(
            data=np.hstack((run_ids, home_outcomes, home_scores.reshape(-1, 1), away_scores.reshape(-1, 1))),
            columns=['run', 'W', 'D', 'L', 'GF', 'GA']
        )
        home_df['team_id'] = match['home_team_id']
        home_df['opponent_id'] = match['away_team_id']
        home_df['is_home'] = 1
        away_df = pd.DataFrame(
            data=np.hstack((run_ids, away_outcomes, away_scores.reshape(-1, 1), home_scores.reshape(-1, 1))),
            columns=['run', 'W', 'D', 'L', 'GF', 'GA']
        )
        away_df['team_id'] = match['away_team_id']
        away_df['opponent_id'] = match['home_team_id']
        away_df['is_home'] = 0
        run_df = pd.concat([home_df, away_df], axis=0)
        run_list.append(run_df)

    if no_games_left:
        # Nothing to simulate: keep an empty frame with the expected columns and
        # treat every team as "missing" so it is added with zero simulated totals.
        empty_run_df_cols = ['run', 'team_id', 'opponent_id', 'W', 'D', 'L', 'GF', 'GA', 'is_home', 'G', 'points', 'GF_away']
        all_runs_df = pd.DataFrame({col: pd.Series([], dtype='str' if col in ['team_id', 'opponent_id'] else 'int64') for col in empty_run_df_cols})
        missing_team_ids = team_names.index.unique()
    else:
        all_runs_df = pd.concat(run_list, axis=0).reset_index(drop=True)
        all_runs_df['G'] = 1
        all_runs_df['points'] = (all_runs_df[['W', 'D', 'L']] * np.array([2, 1, 0])).sum(axis=1)
        all_runs_df['GF_away'] = all_runs_df['GF'].where(all_runs_df['is_home'] == 0, 0)
        # Aggregate simulated game results by run
        agg_runs_df = all_runs_df.drop(columns='opponent_id').groupby(['run', 'team_id']).sum().reset_index(drop=False)
        missing_team_ids = np.setdiff1d(team_names.index, agg_runs_df.team_id)

    # Ranking criteria
    ranking_criteria = ['points', 'GD', 'GF', 'H2H_points', 'H2H_GF_away']

    # Preliminary criteria (anything that does not involve head-to-head subsetting)
    preliminary_criteria = []
    for i in ranking_criteria:
        if i.find('H2H_') == 0:
            break
        else:
            preliminary_criteria.append(i)

    # Aggregate previous results
    agg_match_results, match_team_results = aggregate_match_results_by_team(match_results, return_detail=True)

    # Add missing teams (teams present in observed results but without any remaining games)
    if missing_team_ids.size > 0:
        aux_ind = pd.MultiIndex.from_product([np.arange(1, n_runs + 1), missing_team_ids], names=['run', 'team_id'])
        missing_df = pd.DataFrame(columns=np.setdiff1d(all_runs_df.columns, ['run', 'team_id', 'opponent_id']).tolist(),
                                  data=0, index=aux_ind).reset_index(drop=False)
        if no_games_left:
            agg_runs_df = missing_df
        else:
            agg_runs_df = pd.concat([agg_runs_df, missing_df], axis=0)

    # Add observed results from past games (teams without past games contribute zeros)
    sum_cols = ['G', 'W', 'D', 'L', 'GF', 'GA', 'points', 'GF_away']
    prev_df = agg_match_results.reindex(agg_runs_df.team_id.values, columns=sum_cols).fillna(0).astype(int)
    agg_runs_df[sum_cols] = agg_runs_df[sum_cols].values + prev_df.values
    agg_runs_df['GD'] = agg_runs_df['GF'] - agg_runs_df['GA']
    agg_runs_df = agg_runs_df.sort_values(by=['run'] + preliminary_criteria, ascending=False).reset_index(drop=True)

    # Identify ties (2 or more teams with identical values in the (preliminary) sorting features).
    # Rows are sorted, so the first row of every distinct combination starts a new tie group
    # and the cumulative sum turns those markers into consecutive group ids.
    aux = agg_runs_df.drop_duplicates(subset=['run'] + preliminary_criteria).index.values
    agg_runs_df['tie_id'] = 0
    agg_runs_df.loc[aux, 'tie_id'] = 1
    agg_runs_df['tie_id'] = np.cumsum(agg_runs_df['tie_id'])
    tie_size = agg_runs_df.groupby(['tie_id'])['team_id'].count().rename('tie_size').reset_index(drop=False)
    agg_runs_df = agg_runs_df.merge(right=tie_size, on='tie_id', how='left')

    # Head-to-head data: every ordered pair of distinct teams sharing a tie group
    h2h_df = agg_runs_df[['run', 'tie_id', 'team_id']]
    h2h_df = h2h_df.merge(right=h2h_df, on=['run', 'tie_id'], how='left')
    h2h_df = h2h_df.loc[h2h_df.team_id_x != h2h_df.team_id_y]
    h2h_df = h2h_df.rename(columns={'team_id_x': 'team_id', 'team_id_y': 'opponent_id'})
    # Relevant results
    h2h_past_df = h2h_df.merge(
        right=match_team_results[['team_id', 'opponent_id', 'is_home'] + sum_cols],
        on=['team_id', 'opponent_id'],
        how='inner'
    )
    # Relevant simulations
    h2h_pred_df = h2h_df.merge(
        right=all_runs_df[['run', 'team_id', 'opponent_id', 'is_home'] + sum_cols],
        on=['run', 'team_id', 'opponent_id'],
        how='inner'
    )
    # Join results and predictions
    h2h_df = pd.concat([h2h_past_df, h2h_pred_df], axis=0).reset_index(drop=True)
    h2h_df['GD'] = h2h_df['GF'] - h2h_df['GA']
    h2h_df['GF_away'] = h2h_df['GF'].where(h2h_df['is_home'] == 0, 0)
    h2h_df = h2h_df.drop(columns='opponent_id')

    # Aggregate "head-to-head runs"
    agg_h2h_df = h2h_df.groupby(['run', 'tie_id', 'team_id']).sum().reset_index(drop=False)
    agg_h2h_df = agg_h2h_df.rename(columns={i: 'H2H_' + i for i in ['points', 'GD', 'GF', 'GF_away']})

    # Add tie breaker values to the main data frame (teams without head-to-head games get 0)
    h2h_cols = [i for i in agg_h2h_df.columns if i.find('H2H_') == 0]
    agg_runs_df = agg_runs_df.merge(
        right=agg_h2h_df[['run', 'tie_id', 'team_id'] + h2h_cols],
        on=['run', 'tie_id', 'team_id'],
        how='left'
    )
    agg_runs_df[h2h_cols] = agg_runs_df[h2h_cols].fillna(0)

    # Sort main data frame again, now with all tie-breaking data included
    agg_runs_df = agg_runs_df.sort_values(by=['run'] + ranking_criteria, ascending=False).reset_index(drop=True)

    # Calculate rank ("cheap" method, repeating a [1, 2, 3, ..., N] array as many times as runs).
    # This relies on every run containing exactly one row per team in ``team_names``.
    agg_runs_df['rank'] = np.arange(1, team_names.size + 1).tolist() * n_runs

    # Distribution of simulated end-of-season rankings
    rank_dist = agg_runs_df.groupby(['team_id', 'rank'])[['run']].count().rename(columns={'run': 'n'})
    rank_dist['p'] = rank_dist['n'] / n_runs
    # Distribution of simulated end-of-season league points
    points_dist = agg_runs_df.groupby(['team_id', 'points'])[['run']].count().rename(columns={'run': 'n'})
    points_dist['p'] = points_dist['n'] / n_runs
    # Distribution of simulated end-of-season wins
    wins_dist = agg_runs_df.groupby(['team_id', 'W'])[['run']].count().rename(columns={'run': 'n'})
    wins_dist['p'] = wins_dist['n'] / n_runs

    # Average number of points per team
    avg_points_df = agg_runs_df.groupby('team_id')[['points', 'rank', 'W', 'L', 'D']].mean().sort_values(by='points', ascending=False).astype(float)
    avg_points_df['team_name'] = team_names.loc[avg_points_df.index].values

    # Output order: current standings when every team has played, otherwise the
    # simulated average points
    if agg_match_results.shape[0] == team_names.shape[0]:
        aux = agg_match_results[['points', 'GF', 'GA']].copy()
        aux['GD'] = aux['GF'] - aux['GA']
        ordered_team_ids = aux.sort_values(by=['points', 'GD', 'GF'], ascending=False).index.values
    else:
        ordered_team_ids = avg_points_df.index.values

    # Prepare "agg_match_results" before formatting output
    for team_id in ordered_team_ids:
        if team_id not in agg_match_results.index:
            agg_match_results.loc[team_id] = 0
    agg_match_results = agg_match_results.drop(columns='GF_away').rename(columns={
        'G': 'g',
        'W': 'w',
        'D': 'd',
        'L': 'l',
        'GF': 'gF',
        'GA': 'gA',
    }).astype(int)

    print('   Done! Time elapsed: {0:.4f} seconds'.format(time.time() - start_time))

    # Merge results into a single list of dictionaries (plain Python values only)
    final_list = [{'id': _native(team_id),
                   'name': avg_points_df.loc[team_id, 'team_name'],
                   'current': {key: int(value) for key, value in agg_match_results.loc[team_id].items()},
                   'predicted': {'averagePoints': float(avg_points_df.loc[team_id, 'points']),
                                 'averageRank': float(avg_points_df.loc[team_id, 'rank']),
                                 'averageWins': float(avg_points_df.loc[team_id, 'W']),
                                 'averageLosses': float(avg_points_df.loc[team_id, 'L']),
                                 'averageTies': float(avg_points_df.loc[team_id, 'D']),
                                 'rank': {}, 'points': {}, 'wins': {}}}
                  for team_id in ordered_team_ids]

    # Fill the three distributions; entries are looked up by team id
    entry_by_team = {entry['id']: entry for entry in final_list}
    for dist_key, dist_df in (('rank', rank_dist), ('points', points_dist), ('wins', wins_dist)):
        for (team_id, value), sim_results in dist_df.to_dict(orient='index').items():
            entry_by_team[team_id]['predicted'][dist_key][_native(value)] = float(sim_results['p'])
    return final_list

In [81]:
# Load the simulation inputs; the keys 'predictions', 'results', 'teams' and
# 'prediction_params' are read from this file below
inputs_path = os.path.join(data_dir, 'simulation_inputs.json')
with open(inputs_path, 'r') as f:
    sim_data = json.load(f)

# Simulate the season
simulation_outputs = simulate_season_standing_with_tiebreakers(
    match_preds=sim_data['predictions'],
    match_results=sim_data['results'],
    team_names=sim_data['teams'],
    pred_params=sim_data['prediction_params'],
    n_runs=10000,
)

# Store the simulation summary next to the inputs
outputs_path = os.path.join(data_dir, 'simulation_outputs.json')
with open(outputs_path, 'w') as f:
    json.dump(simulation_outputs, f)

Simulating season (10000 runs)
   Done! Time elapsed: 35.0765 seconds


In [82]:
# Audible cue that the long-running simulation cell has finished
# (`say` is presumably the macOS text-to-speech command); the exit status is displayed
say_exit_status = os.system('say "done"')
say_exit_status

0

In [83]:
# Flatten the 'predicted' block of the entry at position 3, transpose it so each
# flattened field becomes a row, and copy the table to the system clipboard
team_prediction_table = pd.json_normalize(simulation_outputs[3]['predicted']).T
team_prediction_table.to_clipboard()

In [84]:
# Flatten the complete simulation output (one row per team) and copy it to the
# system clipboard
all_predictions_table = pd.json_normalize(simulation_outputs)
all_predictions_table.to_clipboard()

In [85]:
# Inspect the raw per-game prediction records that were passed to the simulation
# as `match_preds` (the full list is displayed)
sim_data["predictions"]

[{'game_code': 2337503,
  'home_team_id': 350,
  'away_team_id': 331,
  'pred_exact_score': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0004027150961826617,
   0.0,
   0.0,
   4.655778139344003e-05,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0006994343475079425,
   0.0004448222519273885,
   0.0,
   0.0,
   0.002316785883001015,
   1.753285537865202e-43,
   0.0,
   1.593786699599354e-05,
   0.00032458960940297813,
   0.0001747636840196885,
   0.0,
   4.630496005317596e-05,
   0.0,
   0.0,
   0.0,
   0.0006328110120473045,
   0.0,
   5.843416605630649e-20,
   0.0,
   0.0006909658561588101,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   3.735946857039683e-33,
   0.0,
   0.0019794445006229245,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
