In [1]:
from hart_tools import (
    prep_manifest,
    read_hart_cvr, 
    read_cvrs, 
    check_for_contest,
    filter_cvr_contest,
    tabulate_styles
)

from IPython.core.interactiveshell import InteractiveShell

from assertion_audit_utils import \
    Assertion, Assorter, CVR, TestNonnegMean, check_audit_parameters, find_margins,\
    find_p_values, find_sample_size, new_sample_size, summarize_status,\
    write_audit_parameters, sort_cvr_sample_num, consistent_sampling

import os
import io
import re
import numpy as np
import math
import csv
import pandas as pd
import warnings
import copy
import xml.etree.ElementTree as ET
import xml.dom.minidom
import cryptorandom
from cryptorandom.cryptorandom import SHA256, int_from_hash_py3, int_from_hash
from cryptorandom.sample import random_permutation, sample_by_index
from numpy.random import choice

In [2]:
from hart_tools import (
    prep_manifest,
    read_hart_cvr, 
    read_cvrs, 
    check_for_contest,
    filter_cvr_contest,
    tabulate_styles
)

In [3]:
# Read the CVRs from the test-export folder (path is relative to the notebook's working directory).
cvr_list = read_cvrs(cvr_folder = "Data/hart/OC2021/oc_cvrs_for_testing_v2")
# Read the sample ballot manifest into a pandas DataFrame.
manifest = pd.read_csv("Data/hart/OC2021/oc_manifest_sample.csv")
# Display the number of CVRs that were read.
len(cvr_list)

9

In [4]:
### Function to generate fake CVRs ###
def generate_fake_cvrs(contest_dict, style_dict):
    """
    Generate synthetic CVRs: one CVR per card, with one randomly chosen
    candidate marked in every contest that appears on the card's style.

    Parameters
    ----------
    contest_dict : dict
        Maps contest name -> {'candidates': sequence of candidates,
        'p': sequence of selection probabilities, one per candidate}.
    style_dict : dict
        Maps style name -> {'contests': contest names on cards of that style,
        'cards': number of cards of that style to generate}.

    Returns
    -------
    list of CVR
        One CVR per generated card, in the iteration order of `style_dict`.
        Each CVR receives {contest: {chosen_candidate: True}} for every
        contest in its style via `CVR.set_votes`.

    Raises
    ------
    KeyError
        If a style lists a contest that is missing from `contest_dict`.
    ValueError
        Raised by numpy if 'p' is not a valid probability vector whose length
        matches the number of candidates.
    """
    fake_cvr_list = []
    # loop through each style
    for style in style_dict.values():
        # one CVR for each card of that style
        for _ in range(style['cards']):
            cvr = CVR(id = None, votes = {}, phantom=False, sample_num=None, p=None)
            for contest in style['contests']:
                candidates = contest_dict[contest]['candidates']
                probs = contest_dict[contest]['p']
                # `p` must be passed by keyword: the third positional parameter of
                # numpy.random.choice is `replace`, so passing the probabilities
                # positionally silently produced uniform draws.
                # Drawing an index (rather than the candidate itself) keeps the
                # vote key the original Python object instead of a numpy scalar.
                picked = candidates[choice(len(candidates), p=probs)]
                cvr.set_votes({contest : {picked : True}})
            # add cvr to list
            fake_cvr_list.append(cvr)
    # return the list of CVRs generated
    return fake_cvr_list
            
            
## Q: what if the margin for a contest varies by style?
## Maybe just give the contest a different name, like "Contest 1 Region A"?

# Every contest has the same two candidates; only the selection probabilities differ.
contest_dict = {
    contest_name: {'candidates': ['Candidate A', 'Candidate B'], 'p': list(shares)}
    for contest_name, shares in (
        ('Contest 1', (0.55, 0.45)),
        ('Contest 2', (0.7, 0.3)),
        ('Contest 3', (0.6, 0.4)),
        ('Contest 4', (0.2, 0.8)),
        ('Contest 5', (0.34, 0.66)),
    )
}

# For each card style: the contests it contains and the number of cards to generate.
style_dict = dict(
    style_1={'contests': ['Contest 1', 'Contest 2'], 'cards': 10},
    style_2={'contests': ['Contest 3', 'Contest 4', 'Contest 5'], 'cards': 2},
    style_3={'contests': [f'Contest {k}' for k in range(1, 6)], 'cards': 15},
)

fake_cvr_list = generate_fake_cvrs(contest_dict, style_dict)

In [5]:
# Inspect the votes recorded on the first generated CVR.
fake_cvr_list[0].votes

{'Contest 1': {'Candidate A': True}, 'Contest 2': {'Candidate A': True}}

In [6]:
# Audit settings.
# Seed: use, e.g., 20 rolls of a 10-sided die. The seed doesn't have to be numeric.
seed = 1234567890
replacement = False

risk_function = "alpha_mart"


def risk_fn(x, m, N):
    """Evaluate TestNonnegMean.alpha_mart on x with N=N, eta=(m+1)/2 and f=.1."""
    # because this is a comparison audit, f is set to bias alpha towards u
    return TestNonnegMean.alpha_mart(x, N=N, eta=(m+1)/2, f=.1)


g = 0.1
max_cards = 14
error_rate = 0.002

In [7]:
# Contests to audit, keyed by contest name.
# 'cards' is 6 even though only 5 cards in the CVR list actually contain this contest.
contests = {
    'PRESIDENT AND VICE PRESIDENT': dict(
        risk_limit=0.05,
        cards=6,
        choice_function='plurality',
        n_winners=1,
        candidates=[
            'JOSEPH R. BIDEN\nKAMALA D. HARRIS',
            'DONALD J. TRUMP\nMICHAEL R. PENCE',
        ],
        reported_winners=['DONALD J. TRUMP\nMICHAEL R. PENCE'],
    ),
}

In [8]:
# Build the assertions for every contest in `contests`; the return value is kept in all_assertions.
# NOTE(review): later cells pass only `contests`, so this call presumably also records the
# assertions on `contests` itself -- confirm against Assertion.make_all_assertions.
all_assertions = Assertion.make_all_assertions(contests)

In [9]:
# Create phantom records as needed given max_cards and the per-contest card counts;
# make_phantoms returns the updated CVR list and the number of phantoms created.
cvr_list, phantom_vrs = CVR.make_phantoms(max_cards, cvr_list, contests, use_style=True, prefix='phantom-1-')
print(f"Created {phantom_vrs} phantom records")
# assign random sample nums including phantoms.
# The PRNG is seeded with the `seed` chosen in the settings cell (previously a
# hard-coded 32 was used here, so the configured seed had no effect on the sample).
CVR.assign_sample_nums(cvr_list, prng=SHA256(seed))

Created 1 phantom records


True

In [10]:
# Compute the margins for the contests from the CVRs (with style information) and
# display the value returned by find_margins.
min_margin = find_margins(contests, cvr_list, use_style=True)
min_margin

0.16666666666666674

In [11]:
# Validate the audit settings (risk function name, g, error rate, contests) before sampling.
check_audit_parameters(risk_function, g, error_rate, contests)

In [12]:
# find initial sample size
def rf(x, m, N):
    """Return only the second value returned by risk_fn (p_history)."""
    return risk_fn(x, m, N)[1]


def ss_fn(m, r, N):
    """Initial sample size for margin m and N, with r passed as alpha."""
    return TestNonnegMean.initial_sample_size(
        risk_function=rf, N=N, margin=m, polling=False,
        error_rate=error_rate, alpha=r, reps=10,  # change for comparison audits
    )


total_sample_size, sample_size_contests = find_sample_size(
    contests,
    sample_size_function=ss_fn,
    use_style=True,
    cvr_list=cvr_list,
)
print(total_sample_size)

6.0


In [13]:
# Spot-check two records: does each contain the audited contest, and what is its `p`?
for idx in (4, 2):
    print(cvr_list[idx].has_contest('PRESIDENT AND VICE PRESIDENT'))
    print(cvr_list[idx].p)

False
0
True
1.0


In [14]:
# Select which records to sample, given the per-contest sample sizes computed above.
sample_indices = consistent_sampling(
    cvr_list, 
    contests = contests, 
    sample_size_dict = sample_size_contests)

In [15]:
# Placeholder for real manual interpretations: for now mvr_list is an independent deep
# copy of cvr_list, so the two lists hold identical records in the same order.
# TODO: confirm what order the MVRs must be in relative to the sample.
mvr_list = copy.deepcopy(cvr_list)

In [16]:
# Display the ballot manifest loaded earlier.
manifest

Unnamed: 0,Container,Tabulator,Batch Name,Number of Ballots
0,Mail,1,1,60
1,Mail,1,2,21
2,Mail,1,3,123
3,Mail,1,4,59
4,Mail,1,5,87
...,...,...,...,...
4412,In-Person,In Person - 5,514,418
4413,In-Person,In Person - 5,515,381
4414,In-Person,In Person - 5,516,240
4415,In-Person,In Person - 5,517,403


In [17]:
# Pull the sampled records out of the CVR and MVR lists, in sample order.
# NOTE(review): `i-1` assumes consistent_sampling returns 1-based indices. If it returns
# 0-based positions, this is off by one and index 0 silently wraps around to the last
# record in the list -- confirm against consistent_sampling.
sampled_cvrs = [cvr_list[i-1] for i in sample_indices]
sampled_mvrs = [mvr_list[i-1] for i in sample_indices]

In [18]:
# Compute p-values for the contests' assertions from the sampled MVR/CVR pairs.
# NOTE(review): p_max is presumably the largest p-value over all assertions -- confirm.
p_max = find_p_values(
    contests = contests, 
    mvr_sample = sampled_mvrs, 
    cvr_sample = sampled_cvrs, 
    use_style = True, 
    risk_function=risk_fn)
# Print the audit status of each contest and display the value summarize_status returns.
summarize_status(contests)

p-values for assertions in contest PRESIDENT AND VICE PRESIDENT
DONALD J. TRUMP
MICHAEL R. PENCE v JOSEPH R. BIDEN
KAMALA D. HARRIS 0.8325187510665614

contest PRESIDENT AND VICE PRESIDENT audit INCOMPLETE at risk limit 0.05. Attained risk 0.8325187510665614
assertions remaining to be proved:
DONALD J. TRUMP
MICHAEL R. PENCE v JOSEPH R. BIDEN
KAMALA D. HARRIS: current risk 0.8325187510665614


False

In [19]:
# TODO: determine how to escalate the audit when using consistent sampling:
#   - replace sample_by_index() in new_sample_size() with consistent sampling
#   - keep track of incremental samples (rounds)?
# NOTE: this call is very, very slow.
new_sample_size(
    contests = contests, 
    mvr_sample = sampled_mvrs, 
    cvr_sample = sampled_cvrs,
    cvr_list = cvr_list,
    use_style = True,
    risk_function = risk_fn
)

(6.0, {'PRESIDENT AND VICE PRESIDENT': 6})