In [109]:
import pandas as pd
import arviz as az
import numpy as np
import pymc as pm
from pymc.math import exp
import math


In [2]:
# Read the three tournament CSV exports (bonuses, buzzes, tossup metadata)
# in a single pass; the files are read in the order listed.
bonuses_df, buzzes_df, lengths_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "2023-acf-nationals-all-bonuses.csv",
        "2023-acf-nationals-all-buzzes.csv",
        "tossup-meta.csv",
    )
)

In [3]:
# Attach each tossup's word count to its buzzes, then express every buzz
# position as a percentage of the tossup's words.
# Any `words` column left over from a previous run of this cell is dropped
# first, so re-running does not create `words_x` / `words_y` duplicates.
buzzes_df = pd.merge(
    buzzes_df.drop(columns=['words'], errors='ignore'),
    lengths_df[['round', 'tossup', 'words']],
    how='left',
    on=['round', 'tossup'],
)

# Buzz positions beyond the recorded word count are capped at 100%.
buzzes_df['pct_read'] = (buzzes_df['buzz_position'] / buzzes_df['words'] * 100).clip(upper=100)

buzzes_df.head()

Unnamed: 0,game_id,round,tossup,team,player,opponent,category,subcategory,answer,buzz_position,value,words,pct_read
0,232,1,1,Chicago A,Matt Jackson,Houston A,History,European History,German university students,91,10,149.0,61.073826
1,232,1,2,Chicago A,Adam Fine,Houston A,Science,Physics,gravitational lensing,64,10,138.0,46.376812
2,232,1,3,Houston A,John Broussard,Chicago A,Arts,Painting/Sculpture,Kara Walker,111,10,148.0,75.0
3,232,1,4,Chicago A,Matt Jackson,Houston A,History,American History,George Crook,147,10,144.0,100.0
4,232,1,5,Chicago A,Matt Jackson,Houston A,Thought,Social Science,communication,110,10,133.0,82.706767


In [None]:
def get_player_survival_data(df, player_name, category):
    """Return one row per tossup, in ``category``, from the games of a player's team.

    The player's team is taken from the first row of ``df`` whose ``player``
    equals ``player_name``. All rows from games in which that team appears
    (as ``team`` or ``opponent``) are collected, a ``player_buzz`` flag is
    added (1 when the row's buzz belongs to ``player_name``, else 0), only the
    first row per ``(round, tossup)`` is kept, and finally rows are filtered
    to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Buzz table with at least the columns ``player``, ``team``,
        ``opponent``, ``game_id``, ``round``, ``tossup`` and ``category``.
    player_name : str
        Player whose buzzes are flagged.
    category : str
        Value of the ``category`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Filtered rows with the extra ``player_buzz`` column. ``df`` itself is
        not modified.

    Raises
    ------
    IndexError
        If ``player_name`` does not occur in ``df['player']``.
    """
    team_name = df.loc[df['player'] == player_name, 'team'].iloc[0]

    involves_team = (df['team'] == team_name) | (df['opponent'] == team_name)
    game_ids = df.loc[involves_team, 'game_id'].unique()

    # Work on an explicit copy so that adding a column does not trigger
    # SettingWithCopyWarning and can never write through to the caller's frame.
    player_df = df[df['game_id'].isin(game_ids)].copy()
    player_df['player_buzz'] = (player_df['player'] == player_name).astype(int)

    # Remove buzzes after negs: keep only the first row per (round, tossup).
    # NOTE(review): this relies on rows being ordered so the first row is the
    # earliest buzz -- confirm against the CSV export.
    player_df = player_df.drop_duplicates(subset=['round', 'tossup'], keep='first').reset_index(drop=True)

    return player_df[player_df['category'] == category]

# Example: Science tossups from the games of Geoffrey Chen's team.
player_df = get_player_survival_data(
    df=buzzes_df,
    player_name='Geoffrey Chen',
    category='Science',
)

player_df.head()

In [None]:
# Example: History tossups from the games of Hari Parameswaran's team.
get_player_survival_data(
    df=buzzes_df,
    player_name="Hari Parameswaran",
    category="History",
)

In [None]:
# Sum of the `value` column per (player, category) pair.
player_buzzes = buzzes_df.groupby(['player', 'category'])['value'].sum()

In [None]:
# Unique player names, taken from the first level of the groupby index.
# Reading them from the index (instead of round-tripping through `set`)
# keeps the order identical from one kernel session to the next.
players = list(player_buzzes.index.get_level_values(0).unique())

# One record per player who has a Science entry, holding that Science total.
is_science = player_buzzes.index.get_level_values(1) == 'Science'
science_points = [
    {'Player': player, "Points": points}
    for (player, _category), points in player_buzzes[is_science].items()
]

In [None]:
# Fit a Weibull model of History buzz positions (`pct_read`) for each player
# in `player_list`. Tossups where the player did not buzz are treated as
# censored at the observed `pct_read`.
log2 = np.log(2)

player_list = ['Hari Parameswaran']

for player in player_list:
    
    player_df = get_player_survival_data(buzzes_df, player, 'History')
    
    # True where the retained buzz on the tossup belongs to this player.
    observed_mask = player_df['player_buzz'] == 1
    
    # pct_read on tossups the player did not buzz on: used below as the lower
    # bound for the imputed (censored) buzz positions.
    censored_vals = player_df['pct_read'].copy()
    censored = censored_vals[~observed_mask]
    
        
    # pct_read on tossups the player did buzz on: the observed data.
    y_uncensored = player_df['pct_read'].copy()
    
    y_uncensored = y_uncensored[observed_mask]
    
    # Upper bound (110) for every imputed censored value.
    upper = np.full(np.shape(censored), 110)
    
    with pm.Model() as m:
        # Priors: Normal with precision tau=0.01 on beta0; Exponential(1/2) on α.
        beta0 = pm.Normal("beta0", 0, tau=0.01)
        α = pm.Exponential("α", 1/2)

        # λ = exp(beta0) is converted to the Weibull `beta` argument as λ^(-1/α).
        λ = exp(beta0)
        β = λ ** (-1 / α)

        # Latent buzz positions for censored tossups, bounded between the
        # observed pct_read and 110.
        impute_censored = pm.Bound("impute_censored", pm.Weibull.dist(alpha=α, beta=β), lower=censored, upper=upper, shape=censored.shape[0])
        
        # Likelihood of the player's own buzzes.
        # NOTE(review): the truncation bound here is 105, whereas the censored
        # imputations above are bounded at 110 -- confirm this is intended.
        likelihood = pm.Truncated("likelihood", pm.Weibull.dist(
        alpha=α,
        beta=β), 
        observed=y_uncensored,
        shape=y_uncensored.shape[0],
        upper=105
    )
        

        # (ln 2 / λ)^(1/α); presumably the Weibull median, assuming `alpha` is
        # the shape and `beta` the scale -- verify against the PyMC docs.
        median0 = pm.Deterministic("median0", (log2 * exp(-beta0)) ** (1 / α))

        # `trace` is rebound on every loop iteration, so only the last
        # player's trace survives the loop.
        trace = pm.sample(
            3000,
            tune=500,
            init="jitter+adapt_diag_grad",
            target_accept=0.9,
        )
        
        print("Player: " + player)
        
        print(az.summary(trace))

In [None]:
# Trace plot for whichever `trace` was assigned most recently in the kernel.
az.plot_trace(trace)

In [4]:
# Define get_team_survival_data, used to build the per-team model inputs.
def get_team_survival_data(df, team_name):
    """Return one row per tossup from a team's games, with category indicator columns.

    Rows of ``df`` where ``team_name`` appears as ``team`` or ``opponent`` are
    selected, a ``team_buzz`` flag is added (1 when the row's buzz belongs to
    ``team_name``, else 0), only the first row per ``(round, tossup)`` is
    kept, and one indicator column per distinct ``category`` value is appended
    via ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Buzz table with at least the columns ``team``, ``opponent``,
        ``round``, ``tossup`` and ``category``.
    team_name : str
        Team whose games are selected.

    Returns
    -------
    pandas.DataFrame
        The selected rows (index reset) with ``team_buzz`` and the indicator
        columns appended. Only categories present in the selected rows get a
        column. ``df`` itself is not modified.
    """
    involves_team = (df['team'] == team_name) | (df['opponent'] == team_name)

    # Work on an explicit copy so that adding a column does not trigger
    # SettingWithCopyWarning and can never write through to the caller's frame.
    team_df = df[involves_team].copy()
    team_df['team_buzz'] = (team_df['team'] == team_name).astype(int)

    # Remove buzzes after negs: keep only the first row per (round, tossup).
    # NOTE(review): this relies on rows being ordered so the first row is the
    # earliest buzz -- confirm against the CSV export.
    team_df = team_df.drop_duplicates(subset=['round', 'tossup'], keep='first').reset_index(drop=True)

    cat_df = pd.get_dummies(team_df['category'])

    return pd.concat([team_df, cat_df], axis=1)

In [None]:
# Example: per-tossup rows for Cornell A's games.
team_df = get_team_survival_data(df=buzzes_df, team_name='Cornell A')

team_df.head()

In [5]:
# Fit, for each team in `team_list`, a Weibull model of buzz position
# (`pct_read`) with one coefficient per category, and keep the per-category
# prediction draws in `trace_data[team]`.
log2 = np.log(2)

team_list = ['Cornell A', 'Chicago A']

# team name -> {category label -> array of posterior draws}
trace_data = {}

for team in team_list:
    
    
    team_df = get_team_survival_data(buzzes_df, team)
    
    # Category indicator columns, transposed so each row of X is a category
    # and each column is a tossup.
    X = np.array(team_df[['Arts', 'Beliefs and Other', 'History', 'Literature', 'Science', 'Thought']]).T    
    
    # True where the retained buzz on the tossup belongs to this team.
    observed_mask = team_df['team_buzz'] == 1
    
    censored_vals = team_df['pct_read'].copy()
    
    
    # pct_read on tossups the team did not buzz on: lower bound for the
    # imputed (censored) buzz positions below.
    censored = censored_vals[~observed_mask]
    
    with pm.Model() as m:
    
        
    
        # Split the indicator matrix by tossup into censored / observed columns.
        x_censored = X[:, ~observed_mask]
        x_uncensored = X[:, observed_mask]


        # pct_read on tossups the team did buzz on: the observed data.
        y_uncensored = team_df['pct_read'].copy()
    
        y_uncensored = y_uncensored[observed_mask]
    
    
    
        # Upper bound (110) for every imputed censored value.
        upper = np.full(np.shape(censored), 110)
        
        beta_0 = pm.Normal("beta_0", 0, tau=.01)
    
        
        
        
        # One coefficient per row of X (i.e. per category).
        # NOTE(review): an intercept plus a coefficient for every one-hot
        # category may be only weakly identified; the sampler output reports
        # low effective sample size -- consider dropping one of them.
        betas = pm.Normal("beta", 0, tau=0.1, shape=X.shape[0])
        
        α = pm.Exponential("α", 1/2)

        # Per-tossup λ = exp(beta_0 + betas · x), converted to the Weibull
        # `beta` argument as λ^(-1/α).
        λ_censored = exp(beta_0 + pm.math.dot(betas, x_censored))
        
        λ_uncensored = exp(beta_0 + pm.math.dot(betas, x_uncensored))
        
        β_censored = λ_censored ** (-1 / α)

        β_uncensored = λ_uncensored ** (-1 / α)
         

        # Latent buzz positions for censored tossups, bounded between the
        # observed pct_read and 110.
        impute_censored = pm.Bound("impute_censored", pm.Weibull.dist(alpha=α, beta=β_censored), lower=censored, upper=upper, shape=censored.shape[0])
        
        # Likelihood of the team's own buzzes.
        # NOTE(review): `shape=1` is passed although `y_uncensored` is a
        # vector, and the truncation bound is 100 while pct_read was capped
        # at 100 upstream and the imputation bound is 110 -- confirm both.
        likelihood = pm.Truncated("likelihood", pm.Weibull.dist(
        alpha=α,
        beta=β_uncensored), 
        observed=y_uncensored,
        shape=1,  
        upper=100                   
    )
        

        #median0 = pm.Deterministic("median0", pm.math.exp((100 / σ) ** α - ((100 + p) / σ) ** α))
        
        # Per-category prediction variables. The betas index follows the
        # column order used to build X: 0=Arts, 1=Beliefs and Other,
        # 2=History, 3=Literature, 4=Science, 5=Thought.
        # NOTE(review): pm.Deterministic is handed a distribution object
        # here rather than an expression; the summary output suggests these
        # behave as per-draw predicted buzz positions -- confirm this is the
        # intended way to obtain predictions.
        lit_pred = pm.Deterministic('lit_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[3]) ** (-1 / α)))
        arts_pred = pm.Deterministic('art_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[0]) ** (-1 / α)))
        belief_pred = pm.Deterministic('belief_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[1]) ** (-1 / α)))
        hist_pred = pm.Deterministic('hist_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[2]) ** (-1 / α)))
        sci_pred = pm.Deterministic('sci_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[4]) ** (-1 / α)))
        thought_pred = pm.Deterministic('thought_prediction', pm.Weibull.dist(alpha=α, beta=pm.math.exp(beta_0 + betas[5]) ** (-1 / α)))

        # `trace` is rebound on every loop iteration; only the arrays copied
        # into `trace_data` below are kept per team.
        trace = pm.sample(
            3000,
            tune=500,
            init="jitter+adapt_diag_grad",
            target_accept=0.9,
        )
        
        # Keys here ("Lit", "Arts", ...) are the category labels that
        # sim_game looks up.
        team_data = {"Lit": np.array(trace.posterior['lit_prediction']), "Arts": np.array(trace.posterior['art_prediction']), 
                    "Belief": np.array(trace.posterior['belief_prediction']), "History": np.array(trace.posterior['hist_prediction']), 
                    "Science": np.array(trace.posterior['sci_prediction']), "Thought": np.array(trace.posterior['thought_prediction'])}
        
        trace_data[team] = team_data
        
        print("Team: " + team)
        
        print(az.summary(trace))
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['team_buzz'] = [1 if team == team_name else 0 for team in team_df['team']]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta, α, impute_censored]


Sampling 4 chains for 500 tune and 3_000 draw iterations (2_000 + 12_000 draws total) took 348 seconds.
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Team: Cornell A
                      mean      sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  \
beta_0             -16.241   1.503 -18.996  -13.458      0.075    0.053   
beta[0]             -0.017   1.285  -2.550    2.198      0.065    0.046   
beta[1]             -0.558   1.278  -2.958    1.767      0.064    0.045   
beta[2]              0.232   1.279  -2.109    2.588      0.065    0.046   
beta[3]             -0.323   1.278  -2.668    2.036      0.064    0.046   
...                    ...     ...     ...      ...        ...      ...   
art_prediction      73.657  22.360  31.792  114.939      0.202    0.143   
belief_prediction   84.520  25.896  37.661  133.842      0.237    0.167   
hist_prediction     68.556  21.061  29.125  107.234      0.195    0.138   
sci_prediction      73.171  22.549  29.692  113.914      0.205    0.145   
thought_prediction  84.637  26.022  37.250  134.024      0.235    0.166   

                    ess_bulk  ess_tail  r_hat  
beta_0                 404.0    101

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['team_buzz'] = [1 if team == team_name else 0 for team in team_df['team']]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_0, beta, α, impute_censored]


Sampling 4 chains for 500 tune and 3_000 draw iterations (2_000 + 12_000 draws total) took 303 seconds.
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Team: Chicago A
                      mean      sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  \
beta_0             -18.206   1.538 -21.111  -15.363      0.078    0.055   
beta[0]             -0.680   1.275  -3.051    1.704      0.066    0.047   
beta[1]             -0.032   1.274  -2.341    2.375      0.066    0.047   
beta[2]             -0.136   1.272  -2.475    2.261      0.066    0.047   
beta[3]             -0.269   1.273  -2.528    2.207      0.066    0.047   
...                    ...     ...     ...      ...        ...      ...   
art_prediction      81.690  22.189  39.785  122.741      0.203    0.144   
belief_prediction   69.862  19.324  32.530  104.638      0.177    0.125   
hist_prediction     71.448  19.746  33.291  106.635      0.179    0.127   
sci_prediction      77.287  21.082  37.972  116.261      0.194    0.137   
thought_prediction  71.145  19.380  33.385  106.275      0.180    0.129   

                    ess_bulk  ess_tail  r_hat  
beta_0                 388.0     99

In [114]:

def sim_game(first_buzzes, second_buzzes, first_neg_rate, second_neg_rate, first_ppb, second_ppb,
             rng=None, no_buzz_threshold=110):
    """Simulate one 20-tossup game between two teams and return both scores.

    For every tossup a buzz position is drawn for each team from that team's
    pool of draws for the tossup's category. The earlier buzz gets the first
    attempt: with probability ``neg_rate`` it is a neg (-5, and the other
    team converts the tossup if it had a buzz at all), otherwise the team
    scores 10 plus bonus points. A drawn position at or above
    ``no_buzz_threshold`` is treated as "never buzzed"; if both teams are in
    that state the tossup goes dead.

    Parameters
    ----------
    first_buzzes, second_buzzes : mapping of str -> array-like
        Draws of buzz positions keyed by "Lit", "Arts", "Belief", "History",
        "Science" and "Thought". Arrays of any shape are flattened.
    first_neg_rate, second_neg_rate : float
        Probability in [0, 1] that a team's first buzz is a neg.
    first_ppb, second_ppb : float
        Points per bonus. Bonus points are drawn as a multiple of 10 by
        stochastic rounding of ``ppb / 10``.
    rng : numpy.random.Generator, optional
        Random source. Defaults to NumPy's global RNG (``np.random``), which
        matches the previous behaviour; pass a seeded Generator for
        reproducible simulations.
    no_buzz_threshold : float, optional
        Buzz positions greater than or equal to this value count as no buzz.

    Returns
    -------
    tuple of (int, int)
        ``(first_score, second_score)``.
    """
    if rng is None:
        # The legacy global RNG module exposes the same `choice` / `random`
        # functions that a Generator does.
        rng = np.random

    tossups = ["Lit"] * 4 + ["Arts"] * 3 + ['Belief'] * 3 + ["History"] * 4 + ["Science"] * 4 + ["Thought"] * 2

    # Flatten each category's draws once per game rather than copying the
    # whole posterior array for every tossup.
    categories = set(tossups)
    first_pool = {cat: np.asarray(first_buzzes[cat]).reshape(-1) for cat in categories}
    second_pool = {cat: np.asarray(second_buzzes[cat]).reshape(-1) for cat in categories}

    def _bonus_points(ppb):
        # Stochastic rounding: the expected value equals ppb, the result is a multiple of 10.
        return math.floor(ppb / 10 + rng.random()) * 10

    def _is_neg(neg_rate):
        return rng.choice([0, 1], p=[1 - neg_rate, neg_rate])

    first_score = 0
    second_score = 0

    for tossup in tossups:
        buzz_one = rng.choice(first_pool[tossup])
        buzz_two = rng.choice(second_pool[tossup])

        # Drawn up front for both teams so the random stream is consumed in
        # the same order regardless of who ends up scoring.
        first_bonus_points = _bonus_points(first_ppb)
        second_bonus_points = _bonus_points(second_ppb)

        if buzz_one >= no_buzz_threshold and buzz_two >= no_buzz_threshold:
            # Neither team buzzes: dead tossup.
            continue

        if buzz_one < buzz_two:
            if _is_neg(first_neg_rate):
                first_score -= 5
                # The rebound is only available if the other team had a buzz
                # (same threshold test as the dead-tossup check above).
                if buzz_two < no_buzz_threshold:
                    second_score += 10 + second_bonus_points
            else:
                first_score += 10 + first_bonus_points

        elif buzz_two <= buzz_one:
            if _is_neg(second_neg_rate):
                second_score -= 5
                if buzz_one < no_buzz_threshold:
                    first_score += 10 + first_bonus_points
            else:
                second_score += 10 + second_bonus_points

    return (first_score, second_score)

0

In [120]:
# Simulate 1000 games of Chicago A (first team) against Cornell A (second
# team) and count the wins of each; tied games are credited to neither side.
wins = dict.fromkeys(("Chicago A", "Cornell A"), 0)

for i in range(1000):
    scores = sim_game(trace_data['Chicago A'], trace_data['Cornell A'], .13, .15, 19.5, 17.5)

    chicago_margin = scores[0] - scores[1]
    if chicago_margin < 0:
        wins['Cornell A'] += 1
    elif chicago_margin > 0:
        wins['Chicago A'] += 1

print("Chicago A: " + str(wins["Chicago A"]), "Cornell A: " + str(wins['Cornell A']))

Chicago A: 626 Cornell A: 360


In [None]:
# NOTE(review): `ppc` is not assigned in any cell visible in this notebook, so
# this cell fails on a fresh kernel; `sns` is imported here but not used in
# this cell.
import seaborn as sns
# Flatten the posterior-predictive draws of `likelihood` into one 1-D array.
pred_array = np.array(ppc.posterior_predictive.likelihood).flatten()
# 5% and 94% quantiles of the flattened draws.
# NOTE(review): .94 is asymmetric with .05 -- confirm .95 was not intended.
np.quantile(pred_array, [.05, .94])

In [None]:
# get_team_survival_data takes (df, team_name) only, so the category filter is
# applied to its result rather than passed as a third argument (which raised
# TypeError).
# NOTE(review): `team` is whatever value the model loop left behind -- confirm
# which team is meant here.
team_df = get_team_survival_data(buzzes_df, team)
team_df = team_df[team_df['category'] == 'Science']
print(list(team_df['value']))

In [None]:
# Keep only buzzes with 8 <= round < 20.
in_round_window = (buzzes_df['round'] >= 8) & (buzzes_df['round'] < 20)
buzzes_df = buzzes_df[in_round_window]

In [None]:
# Tabulate the per-player Science records (columns: Player, Points).
science_df = pd.DataFrame(science_points)

In [None]:
# Order players by Science points, highest first, and list the top ten names.
science_df = science_df.sort_values(by='Points', ascending=False)

science_df['Player'].head(10).tolist()

In [None]:
# `x_censored` is left over from the last iteration of the team model loop,
# where it is X[:, ~observed_mask] with categories as rows; len() therefore
# returns the number of category rows, not the number of censored tossups.
len(x_censored)

In [None]:
# Team of the first row recorded for Matt Bollinger, then the ids of every
# game in which that team appears as buzzer or opponent.
is_bollinger = buzzes_df['player'] == 'Matt Bollinger'
team_name = buzzes_df.loc[is_bollinger, 'team'].iloc[0]

involves_team = (buzzes_df['team'] == team_name) | (buzzes_df['opponent'] == team_name)
game_ids = buzzes_df.loc[involves_team, 'game_id'].unique()

In [None]:
# Rows in which `team_name` is either the buzzing team or the opponent.
# Boolean Series must be combined element-wise; Python's `and` raises
# "truth value of a Series is ambiguous". `|` matches the game_ids lookup,
# since a row's team and opponent are never the same team.
buzzes_df[(buzzes_df['team'] == team_name) | (buzzes_df['opponent'] == team_name)]

In [None]:
# Rows belonging to any of the games in `game_ids`. Membership has to be
# tested element-wise with `.isin`; `Series in array` does not yield a
# boolean mask.
buzzes_df.loc[lambda frame: frame['game_id'].isin(game_ids)]

In [None]:
# Scratch check: category indicator matrix for Cornell A with a leading
# column of ones, one row per tossup.
team_df = get_team_survival_data(df=buzzes_df, team_name='Cornell A')

category_cols = ['Arts', 'Beliefs and Other', 'History', 'Literature', 'Science', 'Thought']
X = team_df[category_cols]

ones_column = np.ones((X.shape[0], 1))
X = np.concatenate((ones_column, X), axis=1)
X.shape

In [None]:
# Scratch check: the same indicator matrix with a leading column of ones,
# transposed so that each row is a predictor and each column a tossup.
team_df = get_team_survival_data(df=buzzes_df, team_name='Cornell A')

category_cols = ['Arts', 'Beliefs and Other', 'History', 'Literature', 'Science', 'Thought']
X = team_df[category_cols]

ones_column = np.ones((X.shape[0], 1))
X = np.transpose(np.concatenate((ones_column, X), axis=1))

X

In [None]:
# Scratch cell: list concatenation, evaluates to [2, 3]. Unrelated to the analysis.
[2] + [3]