In [1]:
from hart_tools import (
    prep_manifest,
    read_hart_cvr, 
    read_cvrs, 
    check_for_contest,
    filter_cvr_contest,
    tabulate_styles
)

from IPython.core.interactiveshell import InteractiveShell

from assertion_audit_utils import \
    Assertion, Assorter, CVR, TestNonnegMean, check_audit_parameters, find_margins,\
    find_p_values, find_sample_size, new_sample_size, summarize_status,\
    write_audit_parameters, sort_cvr_sample_num, consistent_sampling

import os
import io
import re
import numpy as np
import math
import csv
import pandas as pd
import warnings
import copy
import xml.etree.ElementTree as ET
import xml.dom.minidom
import cryptorandom
from cryptorandom.cryptorandom import SHA256, int_from_hash_py3, int_from_hash
#from pandas.io.parsers import ParserError

In [2]:
# parse XMLs
cvr_list = read_cvrs(cvr_folder = "Data/hart/OC2021/oc_cvrs_for_testing_v2")

In [3]:
# assign random sample nums to all CVRs
CVR.assign_sample_nums(cvr_list, prng=SHA256(32))

True

In [4]:
sort_cvr_sample_num(cvr_list)

True

In [5]:
[cvr.sample_num / 10**76 for cvr in cvr_list]

[0.2251359480598143,
 1.4634438778906222,
 1.6903930211582798,
 3.7641573363644016,
 4.3584788139091595,
 6.621239707455629,
 8.371248423455292,
 9.038615680692303,
 11.07705760395957]

In [6]:
sampled_cvrs = consistent_sampling(cvr_list, 
                    sample_size_dict = {'Proportion 17' : 2,
                                       'Proposition 20' : 3},
                    sampled_cvrs = []
                   )

In [7]:
print(sampled_cvrs[0].sample_num / 10**76)
print(sampled_cvrs[1].sample_num / 10**76)
print(sampled_cvrs[2].sample_num / 10**76)

0.2251359480598143
1.4634438778906222
3.7641573363644016


In [8]:
cvr_list[2].votes

{'Proposition 24': {'Yes': True}, 'Proposition 25': {'Yes': True}}

In [15]:
# print out CVR attributes
print(cvr_list[0].votes)
print(cvr_list[0].id)
print(cvr_list[0].phantom)
print(cvr_list[0].sample_num)
print(cvr_list[0].p)

{'PRESIDENT AND VICE PRESIDENT': {'JOSEPH R. BIDEN\nKAMALA D. HARRIS': True}, 'UNITED STATES REPRESENTATIVE\n48th District': {'HARLEY ROUDA': True}, 'MEMBER OF THE STATE ASSEMBLY\n72nd District': {'JANET NGUYEN': True}, 'RANCHO SANTIAGO COMMUNITY COLLEGE DISTRICT\nGoverning Board Member,\nTrustee Area 5': {'BRETT ELLIOTT FRANKLIN': True}, 'County Supervisor, 1st District': {'SERGIO CONTRERAS': True}, 'CITY OF SANTA ANA\nMayor': {'JOSE SOLORIO': True}, 'CITY OF SANTA ANA\nMember, City Council, Ward 1': {'CYNTHIA CONTRERAS': True}, 'Proposition 14': {'Yes': True}, 'Proposition 15': {'Yes': True}, 'Proposition 16': {'No': True}, 'Proposition 17': {'Yes': True}, 'Proposition 18': {'Yes': True}, 'Proposition 19': {'Yes': True}, 'Proposition 20': {'No': True}}
109_1
False
90386156806923029215443444739281896702112175362115299667967653824984627439082
None


In [3]:
# read in manifest
manifest = pd.read_csv("manifest-CARCL2021.csv")
manifest.head()

Unnamed: 0,Container,Tabulator,Batch Name,Number of Ballots
0,Mail,1,1,60
1,Mail,1,2,21
2,Mail,1,3,123
3,Mail,1,4,59
4,Mail,1,5,87


In [11]:
assertions = Assertion.make_all_assertions(contests)

In [14]:
find_margins(contests, assertions, cvr_list)

AttributeError: 'list' object has no attribute 'votes'

In [6]:
risk_function = "kaplan_kolmogorov" 
alpha = .05
contests = vote_count_df["contest"].unique()
error_rate = .001


#aggregate across styles to get sampling fraction to verify each contest
contest_totals_df = vote_count_df.groupby(["contest","vote"])["num_votes"].sum().reset_index()
sample_fractions = []
margins = []
#ballots = []
for i in range(len(contests)):
    valid_votes = sorted(contest_totals_df["num_votes"][(contest_totals_df["contest"] == contests[i]) & (contest_totals_df["vote"] != "NA")].tolist(), reverse = True)
    ballots_cast = sum(contest_totals_df["num_votes"][contest_totals_df["contest"] == contests[i]])
    #ballots = ballots.append(ballots_cast)
    #if there's only one ballot in the contest, check it.
    if ballots_cast == 1:
        sample_fractions.append(1)
        margins.append(0)
        continue
    #if there is only one option with valid votes, the next option received 0 (though we don't know what it is)
    if len(valid_votes) == 1:
        valid_votes.append(0)
    #Margins eventually need to be able to accomodate multiple winners
    m = (valid_votes[0] - valid_votes[1]) / ballots_cast
    margins.append(m)
    if m == 0:
        sample_fractions.append(1)
        continue
    #N might need to be more general to account for phantoms
    if risk_function == "kaplan_markov":
        risk_fn = lambda x: TestNonnegMean.kaplan_markov(x, g = .1)
    elif risk_function == "kaplan_wald":
        risk_fn = lambda x: TestNonnegMean.kaplan_wald(x, g = .1)
    elif risk_function == "kaplan_kolmogorov":
        risk_fn = lambda x: TestNonnegMean.kaplan_kolmogorov(x, N = ballots_cast, g = .1)
    elif risk_function == "kaplan_martingale":
        risk_fn = lambda x: TestNonnegMean.kaplan_martingale(x, N = ballots_cast, g = .1)[0]
    else:
        "Input a valid risk_function."

    sample_fractions.append(TestNonnegMean.initial_sample_size(risk_function = risk_fn, margin = m, N = ballots_cast, alpha = alpha, error_rate = error_rate, u = 1, t = 1/2) / ballots_cast)
#Uses S4, eventually we will want to use a more efficient method
dict(zip(contests, sample_fractions))

{'AA-City of Orange': 1.0,
 'ANAHEIM ELEMENTARY SCHOOL DISTRICT\nGoverning Board Member,\nTrustee Area 3': 1.0,
 'BB-City of San Clemente': 0.9166666666666666,
 'BREA OLINDA UNIFIED SCHOOL DISTRICT\nGoverning Board Member,\nTrustee Area 5': 1,
 'BUENA PARK LIBRARY DISTRICT\nTrustee': 1.0,
 'BUENA PARK LIBRARY DISTRICTTrustee': 1,
 'CAPISTRANO UNIFIED SCHOOL DISTRICT\nGoverning Board Member,\nTrustee Area 2': 0.9230769230769231,
 'CAPISTRANO UNIFIED SCHOOL DISTRICT\nGoverning Board Member,\nTrustee Area 3': 1.0,
 'CAPISTRANO UNIFIED SCHOOL DISTRICT\nGoverning Board Member,\nTrustee Area 5': 1,
 'CC-City of Tustin': 0.6428571428571429,
 'CITY OF ALISO VIEJO\nMember, City Council': 1,
 'CITY OF ANAHEIM\nMember, City Council, District 1': 1.0,
 'CITY OF ANAHEIM\nMember, City Council, District 4': 1.0,
 'CITY OF ANAHEIM\nMember, City Council, District 5': 1.0,
 'CITY OF ANAHEIMMember, City Council, District 5': 1,
 'CITY OF BREA\nCity Treasurer': 1.0,
 'CITY OF BREA\nMember, City Council': 

In [5]:
get_expected_sample_size(cvr_list, risk_function = "kaplan_kolmogorov", error_rate = 0, alpha = .05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


689.0428398005711

In [6]:
get_expected_sample_size(cvr_list, risk_function = "kaplan_kolmogorov", error_rate = .001, alpha = .05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


702.0676967683497

In [7]:
get_expected_sample_size(cvr_list, risk_function = "kaplan_markov", error_rate = 0, alpha = .05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


703.2841084058059

In [8]:
get_expected_sample_size(cvr_list, risk_function = "kaplan_markov", error_rate = .001, alpha = .05)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


718.6366677980894

In [None]:
# example return ballots
sorted_cvr_list = assign_random_number(cvr_list)
get_ballots_threshold(sorted_cvr_list, sample_size_dict = {'Z-City of Newport Beach' : 5, 
                                                    'UNITED STATES REPRESENTATIVE46th District' : 5},
                     sampled_CVRs = [])

In [31]:
N = 5000
alpha = 0.05
m_null = 1 / 2

true = np.concatenate((np.repeat(0, 990), np.repeat(1, 1010)))
reported = np.concatenate((np.repeat(0, 990), np.repeat(1, 1010)))
omega = reported - true
v = 2 * np.mean(reported) - 1
b = (1 - omega) / (2 - v)
mu_0 = 1/2

In [32]:
get_workloads(
    workload_dict = {"SqKelly": lambda x: get_workload_from_mart(
        x,
        mart_fn=lambda y: sqKelly_martingale(y, 1 / 2, N=N, D=20, beta=1),
        alpha=alpha,
    )},
    data = b
)

{'SqKelly': array([2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,
        2000])}

In [7]:
vote_count_df = votes_df
contests = vote_count_df["contest"].unique()
#aggregate across styles to get sampling fraction to verify each contest
contest_totals_df = vote_count_df.groupby(["contest","vote"])["num_votes"].sum().reset_index()
sample_fractions = []
for i in range(len(contests)):
#construct a vector with N_w 1s, N_l 0s, and N_u 1/2s
#N_w is the number of votes for the winner, N_l is the number of votes for the loser who almost one
    valid_votes = sorted(contest_totals_df["num_votes"][(contest_totals_df["contest"] == contests[i]) & (contest_totals_df["vote"] != "NA")].tolist(), reverse = True)
    ballots_cast = sum(contest_totals_df["num_votes"][contest_totals_df["contest"] == contests[i]])
    reported_votes = np.concatenate(
        (np.repeat(0, valid_votes[1]),
         np.repeat(1/2, ballots_cast - valid_votes[0] - valid_votes[1]),
         np.repeat(1, valid_votes[0]))
        )
    sample_fractions.append(get_sample_size_kelly(reported_votes, alpha = .05) / ballots_cast)
   

   

TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'